diff --git a/.gitignore b/.gitignore index 7ea986bd04538cf36023aa51a5468ca963c513fe..964bfa4e48ee8e7c9387339d5775a3df90c63eb4 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,12 @@ build cmake-build-debug cmake-build-release +#ios demo +demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/ +demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg +demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a +*.xcuserstate +/tools/quantification/quantify # metal Podfile.lock @@ -78,12 +84,3 @@ SwiftProtobuf.framework paddle-mobile.xcworkspace metal/models/ metal/images/ - - - - - - - - - diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..f81f3717a4ead833784b63da35185f2d07409983 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "src/operators/kernel/mali/ACL_Android"] + path = src/operators/kernel/mali/ACL_Android + url = https://github.com/halsay/ACL_Android.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 9c3ff01e5d3e20923021904cdbe9008a11cc30ce..4ccf73763c08a748b53027d7f4a0f254774a1843 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.0) +cmake_minimum_required(VERSION 3.6) project(paddle-mobile) option(DEBUGING "enable debug mode" ON) @@ -6,41 +6,30 @@ option(USE_OPENMP "openmp support" OFF) option(USE_EXCEPTION "use std exception" ON) option(LOG_PROFILE "log profile" ON) # select the platform to build -option(CPU "cpu" ON) -option(MALI_GPU "mali gpu" ON) +option(CPU "armv7 with neon" ON) +option(MALI_GPU "mali gpu" OFF) option(FPGA "fpga" OFF) -if (CPU) - add_definitions(-DPADDLE_MOBILE_CPU) -endif() - -if (MALI_GPU) - add_definitions(-DPADDLE_MOBILE_MALI_GPU) -endif() +file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm) +file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) +include_directories(src/) -if(FPGA) - add_definitions(-DPADDLE_MOBILE_FPGA) +if(IS_IOS) + set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}") +else() + set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}") endif() -set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}") if (DEBUGING) - set(CMAKE_BUILD_TYPE Debug) - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS}") -else() + message(STATUS "debug") set(CMAKE_BUILD_TYPE Release) -endif () - -if(DEBUGING) - message(STATUS "debuging") + set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG") add_definitions(-DPADDLE_MOBILE_DEBUG) - if(ANDROID) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog") - endif() - -else() - message(STATUS "releasing") +else () + set(CMAKE_BUILD_TYPE Release) + set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG") add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) -endif() +endif () if (USE_EXCEPTION) message(STATUS "use exception") @@ -54,115 +43,123 @@ if (LOG_PROFILE) add_definitions(-DPADDLE_MOBILE_PROFILE) endif() -if(IS_MAC) - add_definitions(-DX86) -elseif(IS_IOS) - add_definitions(-DIOS) -elseif(V7) - add_definitions(-DARMV7) -elseif(V8) - add_definitions(-DARMV8) -else () - add_definitions(-DX86) +if(USE_OPENMP AND NOT IS_IOS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") + add_definitions(-DPADDLE_MOBILE_USE_OPENMP) endif() -set(CMAKE_VERBOSE_MAKEFILE ON) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build) - -file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c) 
-file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) - -if (NOT ANDROID) -list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.cpp) -list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.h) -list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h) +# platform control +if (ARM_LINUX) + include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake") endif () -include_directories(src/) +if (CPU) + add_definitions(-DPADDLE_MOBILE_CPU) +else() + file(GLOB_RECURSE _tmp_list src/operators/kernel/arm/*.cpp src/operators/kernel/arm/*.cc) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + endforeach() + + file(GLOB_RECURSE _tmp_list_h src/operators/kernel/arm/*.h) + foreach(f ${_tmp_list_h}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) + endforeach() +endif() -if(USE_OPENMP) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") - add_definitions(-DPADDLE_MOBILE_USE_OPENMP) +if (MALI_GPU) + add_definitions(-DPADDLE_MOBILE_MALI_GPU) + add_definitions(-DUSE_ACL=1) + add_definitions(-DUSE_OPENCL) + set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android) + include_directories(${ACL_ROOT} ${ACL_ROOT}/include) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1") +else() + file(GLOB_RECURSE _tmp_list src/operators/kernel/mali/*.cpp src/operators/kernel/mali/*.cc) + foreach(f ${_tmp_list}) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) + endforeach() + + file(GLOB_RECURSE _tmp_list_h src/operators/kernel/mali/*.h) + foreach(f ${_tmp_list_h}) + list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) + endforeach() endif() -if (googlenet) - add_definitions(-DCONCAT_OP) - add_definitions(-DCONV_OP) - add_definitions(-DLRN_OP) - add_definitions(-DMUL_OP) - add_definitions(-DELEMENTWISEADD_OP) - add_definitions(-DFUSION_FC_OP) - add_definitions(-DPOOL_OP) - add_definitions(-DRELU_OP) - add_definitions(-DFUSION_CONVADD_OP) - add_definitions(-DFUSION_CONVADD_RELU_OP) -elseif (mobilenet) - add_definitions(-DCONV_OP) - add_definitions(-DELEMENTWISEADD_OP) - add_definitions(-DRELU_OP) - add_definitions(-DSOFTMAX_OP) - add_definitions(-DSOFTMAX_OP) - add_definitions(-DDEPTHWISECONV_OP) - add_definitions(-DBATCHNORM_OP) - add_definitions(-DPOOL_OP) - add_definitions(-DRESHAPE_OP) -elseif (yolo) - add_definitions(-DBATCHNORM_OP) - add_definitions(-DCONV_OP) - add_definitions(-DRELU_OP) - add_definitions(-DELEMENTWISEADD_OP) -elseif (squeezenet) - add_definitions(-DCONCAT_OP) - add_definitions(-DCONV_OP) - add_definitions(-DRELU_OP) - add_definitions(-DELEMENTWISEADD_OP) - add_definitions(-DPOOL_OP) - add_definitions(-DRESHAPE_OP) - add_definitions(-DSOFTMAX_OP) -elseif(resnet) - add_definitions(-DCONV_OP) - add_definitions(-DBATCHNORM_OP) - add_definitions(-DELEMENTWISEADD_OP) - add_definitions(-DSOFTMAX_OP) - add_definitions(-DMUL_OP) - add_definitions(-DPOOL_OP) - add_definitions(-DRELU_OP) -else () - add_definitions(-DBATCHNORM_OP) - add_definitions(-DBOXCODER_OP) - add_definitions(-DCONCAT_OP) - add_definitions(-DCONV_OP) - add_definitions(-DDEPTHWISECONV_OP) - add_definitions(-DELEMENTWISEADD_OP) - add_definitions(-DFUSION_CONVADD_OP) - 
add_definitions(-DCONVADDRELU_OP)
- add_definitions(-DFUSION_FC_OP)
- add_definitions(-DLRN_OP)
- add_definitions(-DMUL_OP)
- add_definitions(-DMULTICLASSNMS_OP)
- add_definitions(-DPOOL_OP)
- add_definitions(-DPRIORBOX_OP)
- add_definitions(-DRELU_OP)
- add_definitions(-DRESHAPE_OP)
- add_definitions(-DSIGMOID_OP)
- add_definitions(-DSOFTMAX_OP)
- add_definitions(-DTRANSPOSE_OP)
- add_definitions(-DFUSION_CONVADD_RELU_OP)
+if(FPGA)
+  add_definitions(-DPADDLE_MOBILE_FPGA)
+else()
+  file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
+  foreach(f ${_tmp_list})
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+  endforeach()
+
+  file(GLOB_RECURSE _tmp_list_h src/operators/kernel/fpga/*.h)
+  foreach(f ${_tmp_list_h})
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+  endforeach()
+
+
+  file(GLOB_RECURSE _tmp_list src/fpga/*.cpp src/fpga/*.cc)
+  foreach(f ${_tmp_list})
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+  endforeach()
+
+  file(GLOB_RECURSE _tmp_list_h src/fpga/*.h)
+  foreach(f ${_tmp_list_h})
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+  endforeach()
 endif()

+if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
+else()
+  list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
+  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
+  list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
+endif ()
+
 if (IS_IOS)
-  add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
-elseif(ANDROID)
-  add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
 else()
+  list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.h)
+  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.mm)
+  list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
+endif ()
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
+
+# NET default
+set(NET "default" CACHE STRING "select net type")
+set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets")
+include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
+
+
+# build library
+if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
+  list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
+  add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
+elseif(IS_IOS)
+  add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
+else ()
   add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
 endif ()

+# unit test
 if(DEBUGING)
-  add_subdirectory(test)
+  if(IS_IOS)
+  else()
+    add_subdirectory(test)
+  endif()
 endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1a25d65e02afb09dabc96e1ec241346cff34f6f2..a33db73e109042276b686e8ab74261273df87390 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -183,6 +183,9 @@ upstream
 Next, wait for review. If changes are requested, update the corresponding branch in origin following the steps above.

+![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294877166787.jpg)
+After that you can submit your code.
+
 ## Deleting the remote branch

 After the PR is merged into the main repository, you can delete the remote branch on the PR page.
@@ -219,7 +222,7 @@ upstream
 - Reason: if you modify only one file but submit a dozen commits, each making a small change, it burdens the reviewer, who has to inspect every commit to understand what changed; commits may even overwrite each other's changes.
 - Suggestion: keep the number of commits per submission as small as possible; you can use `git commit --amend` to amend the previous commit. For multiple commits already pushed to the remote repository, see [squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed).
 - Please mind the name of each commit: it should reflect the content of the commit, not be chosen casually.
-3. If your change fixes an issue, add `fix #issue_number` in the **first** comment box of the Pull Request, so the issue is closed automatically when the PUll Request is merged. Keywords include: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved; pick the appropriate word. For details see [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages).
+3. If your change fixes an issue, add `fix #issue_number` in the **first** comment box of the Pull Request, so the issue is closed automatically when the Pull Request is merged. Keywords include: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved; pick the appropriate word. For details see [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages).

 In addition, when replying to reviewers' comments, please follow these conventions:
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..f4fa3abcd29f613fe5f7a90f22a9736a3006bf3f
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,38 @@
+FROM ubuntu:16.04
+
+RUN echo '\
+deb <mirror> <version> main restricted universe multiverse\n\
+deb <mirror> <version>-updates main restricted universe multiverse\n\
+deb <mirror> <version>-backports main restricted universe multiverse\n\
+deb <mirror> <version>-security main restricted universe multiverse\n'\
+> /etc/apt/sources.list
+RUN sed -ie 's|<mirror>|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|' /etc/apt/sources.list
+RUN sed -ie 's|<version>|xenial|' /etc/apt/sources.list
+
+RUN apt-get update && apt-get upgrade -y
+RUN apt-get install -y --no-install-recommends \
+        curl \
+        unzip \
+        git \
+        make \
+        cmake-curses-gui \
+        python \
+        python-pip \
+        python-setuptools \
+        clang-format-5.0 \
+        graphviz \
+        g++-arm-linux-gnueabi \
+        gcc-arm-linux-gnueabi
+RUN apt-get autoremove -y && apt-get clean
+RUN ln -s clang-format-5.0 /usr/bin/clang-format
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit
+RUN cd /tmp && curl -O http://mirrors.neusoft.edu.cn/android/repository/android-ndk-r17b-linux-x86_64.zip
+RUN curl -O https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \
+    tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \
+    mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \
+    mv /usr/bin/cmake /usr/bin/cmake.bak && ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \
+    mv /usr/bin/ccmake /usr/bin/ccmake.bak && ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake
+RUN cd /opt && unzip /tmp/android-ndk-r17b-linux-x86_64.zip
+ENV NDK_ROOT /opt/android-ndk-r17b
diff --git a/README.md b/README.md
index b6ae2beed999d146c64ffc9ee495373d9b77a175..05a109a81791ac85d975138dcd76f7f71716624a 100644
--- a/README.md
+++ b/README.md
@@ -1,36 +1,121 @@
-# Paddle-Mobile
-
+# Paddle-Mobile
 [![Build Status](https://travis-ci.org/PaddlePaddle/paddle-mobile.svg?branch=develop&longCache=true&style=flat-square)](https://travis-ci.org/PaddlePaddle/paddle-mobile)
-[![License](https://img.shields.io/badge/license-Apache%202-brightgreen.svg)](LICENSE)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/doc)
+[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
+
+
+
+Welcome to the Paddle-Mobile GitHub project.
+
+Paddle-Mobile is a project under the PaddlePaddle organization: a deep learning framework dedicated to embedded platforms. Its design stays closely aligned with the latest fluid version of PaddlePaddle, while adding extensive optimizations for embedded use. Performance, binary size, power consumption, and hardware-platform coverage were all considered from the very start of the design.
+
+## Live results in the Simple Search app
+
+The gif below shows the in-production main-object detection feature of the Simple Search app:
+
+![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)
+
+## Demo directory
+
+[Click here](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)
+
+## Features
+
+- **ARM CPU**
+
+|mobilenet arm v7|1 thread|2 threads|4 threads|
+|------------|----|-----|-----|
+|Kirin 960 (ms)|110.586|70.897|47.474|
+|||||
+|mobilenetssd arm v7|1 thread|2 threads|4 threads|
+|Kirin 960 (ms)|222.124|138.952|90.856|
+|||||
+|googlenet (v1) arm v7|1 thread|2 threads|4 threads|
+|Kirin 960 (ms)|348.018|240.304|169.998|
+|||||
+|squeezenet arm v7|1 thread|2 threads|4 threads|
+|Kirin 960 (ms)|84.685|56.544|38.833|
+|||||
+|yolo arm v7|1 thread|2 threads|4 threads|
+|Kirin 960 (ms)|131.831|88.990|60.905|
+
+ The ARM CPU is paddle-mobile's primary target, and the CPU's generality has always been its strength. Embedded deep learning demands a large amount of CPU assembly, and we are coding at full speed to squeeze every bit of acceleration out of the hardware.
+ ARM CPU optimization is still in progress; so far only conventional CPU optimizations are applied. On an ARM A73, paddle-mobile arm-v7 currently runs one single-core pass of MobileNet 1.0 in 110+ ms. That is clearly not our final goal: we are rewriting hot paths in assembly, so there is still huge room for improvement. Only armv7 is supported at the moment; armv8 support will follow.
+
+- **Mali GPU**
+
+ Mali GPU support is developed jointly by Baidu and ARM; both teams have recently been working on running Paddle ops seamlessly on ACL (ARM Compute Library). Several network models such as squeezenet, googlenet, and resnet are already supported, and the effort will continue until all mobile Paddle ops run efficiently on Mali GPUs.
+
+- **GPU Metal implementation for Apple devices**
+
+ A GPU inference library for Apple devices based on Metal is also being implemented; a runnable version will be available soon.
+
+- **FPGA**
+
+ The FPGA implementation is in progress, targeting the Xilinx ZU5 development board.
+
+- **Flexibility**
+
+ * The paddle-mobile CPU build depends on no third-party libraries and can be integrated quickly.
+ * Template specialization is used for platform switching, allowing flexible switching among CPU, GPU, and other co-processors.
+ * Only the ops a given common network needs can be compiled, reducing build time and binary size.
+ * Docker-based builds provide a uniform build environment.
+ * Highly extensible: other co-processors are easy to add, and high-performance ARM operator implementations are provided, making integration convenient for co-processor developers.
+ * Directly compatible with paddle-fluid models; no extra conversion step is needed.
+
+- **Size**
+
+ paddle-mobile has taken mobile binary size seriously since its inception; the CPU implementation has no external dependencies. During compilation, ops the chosen network does not need are simply never linked in, and build-flag tuning helps compress the binary further.
+ Beyond the binary itself, we work hard to keep the code base small; the whole repository is quite compact.
+
+
+## Documentation
+
+### Design documentation
+
+The paddle-mobile design documentation is linked below if you want to learn more; many early designs and discussions can be found in the [issues](https://github.com/PaddlePaddle/paddle-mobile/issues).
+[Design documentation link](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/design_doc.md)
+
+### Development documentation
+
+The development documentation mainly covers building and running. As a developer, you can use it together with the contribution guide.
+[Development documentation link](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_doc.md)
+### Contribution documentation
+- [Contribution documentation link](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
+- The document above covers the main code-contribution workflow; if you hit other problems in practice, please file an [issue](https://github.com/PaddlePaddle/paddle-mobile/issues) and we will deal with it as soon as we see it.

-This project is used to develop the next version deep learning freamwork for mobile device.

-# Development
+## Obtaining models
+Currently Paddle-Mobile only supports models trained with Paddle fluid. A model of any other kind must be converted before it can run.
+### 1. Train directly with Paddle Fluid
+This is the most reliable way and the recommended one.
+### 2. Convert a Caffe model to a Paddle Fluid model
+[Link](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
+### 3. ONNX
+ONNX stands for "Open Neural Network Exchange". The project's goal is to let different neural network frameworks interoperate.

-[Used model in development](https://mms-mis.cdn.bcebos.com/paddle-mobile/models.zip)
+Besides training fluid models with PaddlePaddle directly, individual Paddle fluid models can also be obtained through ONNX conversion.

-## cross-compilation to android
+Baidu is also working on ONNX support; the conversion project lives here: [paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx).

-* NDK is required
-* ANDROID_NDK environment variable is required
+![](http://7xop3k.com1.z0.glb.clouddn.com/15311951836000.jpg)

-```bash
-sh build.sh android
-```
+### 4. Download some test models and test images
+[Download link](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)

-## build for x86
-paddle-mobile is to run on arm platform. x86 only used to test not arm assembly code. So do not recommend compiling x86.
+## Problems and issues

-Now only support osx.
+Raising or solving our issues is welcome; if you have a question, file one at [Github Issues](https://github.com/PaddlePaddle/paddle-mobile/issues).

-```
-sh build.sh mac
-```
+## Copyright and License
+Paddle-Mobile is released under the relatively permissive Apache-2.0 open-source license [Apache-2.0 license](LICENSE).
-
-## Old Version of Mobile-Deep-Learning
-The old version of MDL was I moved to here [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
+## Legacy Mobile-Deep-Learning
+The original MDL (Mobile-Deep-Learning) project has been moved to [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
diff --git a/demo/ReadMe.md b/demo/ReadMe.md
new file mode 100644
index 0000000000000000000000000000000000000000..aa71f75cb7526234bb0bb32e2e5e1f93c1789711
--- /dev/null
+++ b/demo/ReadMe.md
@@ -0,0 +1,11 @@
+## How to run the demos
+- Android demo download URL:
+  http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip
+- iOS demo download URL:
+  http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip
+
+Run the demo download script in the demo directory:
+```
+sh getDemo.sh
+```
+The demo projects are downloaded and unpacked into the current directory.
\ No newline at end of file
diff --git a/demo/getDemo.sh b/demo/getDemo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b69461e01c710c30ce9a44714ed2d0cdae0c9819
--- /dev/null
+++ b/demo/getDemo.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip
+wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip
+unzip paddle-mobile%2FPaddleMobile_Android.zip
+unzip paddle-mobile%2FPaddleMobileDemo_iOS.zip
+rm -rf paddle-mobile%2FPaddleMobile_Android.zip
+rm -rf paddle-mobile%2FPaddleMobileDemo_iOS.zip
+rm -rf __MACOSX
\ No newline at end of file
diff --git a/doc/build.md b/doc/build.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c1c906458a0dd5f525c9d5153d48356b907b23b
--- /dev/null
+++ b/doc/build.md
@@ -0,0 +1,64 @@
+# Environment setup
+## Using docker
+### 1. Install docker
+To install docker, refer to the official documentation: [https://docs.docker.com/install/](https://docs.docker.com/install/)
+### 2. Set up the build environment with docker
+First enter the paddle-mobile directory and run `docker build`.
+Using Linux/Mac as an example (on Windows, running inside the 'Docker Quickstart Terminal' is recommended):
+```
+$ docker build -t paddle-mobile:dev - < Dockerfile
+```
+`docker images` shows the image we just built:
+```
+$ docker images
+REPOSITORY      TAG     IMAGE ID      CREATED       SIZE
+paddle-mobile   dev     33b146787711  45 hours ago  372MB
+```
+### 3. Build with docker
+Enter the paddle-mobile directory and run docker run:
+```
+$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
+root@5affd29d4fc5:/ # cd /paddle-mobile
+###
+### paddle-mobile supports various platforms on the ARM architecture, including
+### android and linux; different toolchain files generate the Makefile you need
+###
+# generate the Makefile for android builds
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
+
+# generate the Makefile for linux builds
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
+```
+### 4. Set build options
+Build options can be set through ccmake:
+```
+root@5affd29d4fc5:/ # ccmake .
+                                                     Page 1 of 1
+ CMAKE_ASM_FLAGS
+ CMAKE_ASM_FLAGS_DEBUG
+ CMAKE_ASM_FLAGS_RELEASE
+ CMAKE_BUILD_TYPE
+ CMAKE_INSTALL_PREFIX             /usr/local
+ CMAKE_TOOLCHAIN_FILE             /paddle-mobile/tools/toolchains/arm-android-neon.cmake
+ CPU                              ON
+ DEBUGING                         ON
+ FPGA                             OFF
+ LOG_PROFILE                      ON
+ MALI_GPU                         OFF
+ NET                              googlenet
+ USE_EXCEPTION                    ON
+ USE_OPENMP                       OFF
+```
+After changing options, press `c`, then `g` to regenerate the Makefile.
+### 5. Build
+Build with the make command:
+```
+root@5affd29d4fc5:/ # make
+```
+### 6. Inspect the build artifacts
+The build artifacts can be inspected on the host machine under the paddle-mobile directory, in build and in test/build; use adb or scp to push them to the device and run them there.
+
+## Without docker
+Without docker, you can generate the Makefile directly with cmake and then build. Building the android library with the NDK requires NDK_ROOT to be set correctly. Building linux binaries requires arm-linux-gnueabi-gcc or a similar cross-compiler; you may need to set the CC and CXX environment variables, modify arm-linux-gnueabi.cmake under tools/toolchains/, or add a toolchain file of your own.
diff --git a/doc/design_doc.md b/doc/design_doc.md
new file mode 100644
index 0000000000000000000000000000000000000000..bf5f78e8d805465418cad8989945f2afa7ab5587
--- /dev/null
+++ b/doc/design_doc.md
@@ -0,0 +1,182 @@
+# paddle-mobile design document
+
+
+#### The execution flow of the paddle-mobile code:
+
+![Execution flow chart](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305189473720.png)
+
+
+#### The main modules: Loader, Program, Executor, op, kernel, and the scope/variable/Tensor module
+
+#### The role and design of each module is laid out below.
+
+### 1. Loader
+First, a look at the model. Models come in two layouts:
+In the first, the parameter files are separate. In the figure below, the red box marks the protobuf file describing the model structure; the remaining files are parameter files.
+
+![Model description](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305190629577.png)
+
+
+In the second, the parameters are combined into one file. In the figure below, the red box marks the protobuf file describing the model structure; the other file holds the combined parameters.
+
+![Model description, combined](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191057130.png)
+
+
+The loader module loads the model structure information (the protobuf file in the red box) into memory and optimizes the model structure, e.g. fusing several fine-grained ops into a coarse-grained one, such as fusing conv, add, batchnorm, and relu into conv\_add\_batchnorm\_relu. This makes algorithmic optimization practical.
+
+__So why does fusion enable algorithmic optimization?__
+
+Unfused, the conv add batchnorm relu computation looks like this:
+
+```
+[n]
+[conv_res] = conv([n])
+
+for &res in conv_res {
+	res = add_biase(res)
+}
+
+for &res in conv_res {
+	res = batchnorm(res)
+}
+
+for &res in conv_res {
+	res = relu(res)
+}
+
+```
+After fusion, conv\_add\_batchnorm\_relu computes:
+
+```
+[n]
+[conv_res] = conv([n])
+
+for &res in conv_res {
+	res = relu(batchnorm(add_biase(res)))
+}
+
+```
+Since conv can be converted into a multiplication of two large matrices, and further split into many small row-by-column products, the final computation becomes:
+
+```
+[n]
+for &res in [res] {
+	res = relu(batchnorm(add_biase(A * B)))
+}
+
+where A and B are 1 * k and k * 1 matrices
+
+```
+
+
+
+### 2. Program
+
+The program is the loader module's output. It contains the model-structure object both before and after optimization, and corresponds closely to the structure of a Paddle model. For definitions of Paddle model concepts and the detailed design, see [program.md](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). A brief overview:
+
+* A programDesc contains several nestable blocks (common models such as googlenet, mobilenet, yolo, squeezenet, and resnet have only one). An op in the first block may execute a series of ops in later blocks (this concept only applies when there are multiple blocks).
+* A block contains ops and vars.
+* ops is a list of op descriptions: each op's type, inputs and outputs, and required attributes.
+* vars describes all the parameters needed by the op computations.
+
+### 3. Executor
+
+The executor handles the top-level scheduling of op computation. It has two main operations: executor instantiation and the predict method exposed to callers.
+
+* During instantiation, the executor mainly:
+	1. initializes operator objects from the program produced by the loader;
+	2. allocates all memory that will be needed, including every op's inputs, outputs, and weights (the weight files are currently laid out in NCHW, and the ops' intermediate input/output matrices are NCHW as well);
+	3. calls each op's init method — the place where op implementers preprocess parameters, which helps cut predict latency.
+
+* predict takes the external input, calls each op's run method in order, and returns the final result.
+
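[Editorial note] To make the loader/executor flow above concrete, here is a minimal hedged sketch of the caller's side, built around the public facade whose `Load` signature is quoted in doc/quantification.md later in this diff. The `Predict` overload, the dims argument, and the header path are assumptions drawn from that interface, not a verified API listing.

```c++
// Hedged sketch: the Loader + Executor flow as seen from the caller
// (names such as Predict and the header path are assumptions).
#include <cstdint>
#include <vector>
#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  // Loader phase: parse the protobuf model description and fuse ops.
  if (!paddle_mobile.Load("../models/mobilenet", /*optimize=*/true)) {
    return -1;
  }
  // Executor phase: one NCHW float input; ops run in order, result returned.
  std::vector<float> input(1 * 3 * 224 * 224, 1.f);
  std::vector<int64_t> dims{1, 3, 224, 224};
  std::vector<float> output = paddle_mobile.Predict(input, dims);
  return 0;
}
```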
+### 4. op
+For the detailed design of the op module code, see [the operator code design](https://github.com/PaddlePaddle/paddle-mobile/issues/300). An operator mainly holds a kernel for the computation and a param that stores its attributes, and has three operations: Init, RunImpl, and InferShape.
+
+* Init: mainly preprocesses parameters. For example, batchNorm parameters can be preprocessed so that the batchNorm computation becomes one of the form a * x + b. This function also calls the kernel's Init function to initialize the kernel.
+* RunImpl: calls its own kernel's Compute method to run the computation.
+* InferShape: derives the output shape from the inputs and attributes; it is called during executor instantiation, before memory initialization.
+
+Every operator must be registered before it can be used. Taking conv as an example, write this at the bottom of conv_op.cpp:
+
+```c++
+// the conv op is registered on all three platforms
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(conv2d);
+REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
+#endif
+
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(conv2d);
+REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
+#endif
+
+#ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(conv2d);
+REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
+#endif
+
+```
+
+__An optimization for binary size__:
+
+Each operator's compilation is guarded by a macro, e.g. conv_op.h (besides conv_op.h, the same guard is also needed in conv_op.cpp, conv_kernel.h, and conv_kernel.cpp):
+
+```c++
+
+#ifdef CONV_OP  // this macro controls whether conv_op is compiled; conv_op.cpp, conv_kernel.h, and conv_kernel.cpp need the same guard
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/conv_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+template <typename DeviceType, typename T>
+class ConvOp
+	//impl
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
+
+```
+The goal is to compile only the ops a given type of network needs; the per-network macros are already configured in cmake. To build with support for yolo models, just run:
+
+```sh
+cd tools
+sh build.sh android yolo
+
+```
+This compiles only the four ops yolo uses, greatly reducing both binary size and build time.
+
+### 5. kernel
+The kernel is the low-level implementation of an op's computation. It has two main functions, Init and Compute, for initialization/preprocessing and for the computation itself. Notably, kernels are specialized to different platforms through template specialization, as shown:
+
+![Device specialization](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191401976.png)
+
+The kernel implementations for different platforms are different template specializations of the same kernel class. There are three platforms so far: arm, mali, and fpga. The central-arm-func\ directory in the figure holds the ARM implementations of op kernels and backs the kernels in the arm\ directory. Since the ARM processor is the central processor, central-arm-func\ can also serve as the low-level implementation for other co-processors: if some fpga op kernel has no FPGA implementation yet, it can simply call the ARM implementation here.
+
+__If you are interested in adding a co-processor implementation, you can add a kernel directory here and provide one; for any kernel you have not finished, you can fall back directly on the ARM implementation__
+
+### 6. scope, variable, Tensor
+* A scope stores and manages all the variables that are needed (variables hold objects of different types, mainly Tensor matrices — that is, the scope manages all the parameter matrices and input/output matrices used during op computation). Think of a scope as a map; wrapping the map in a scope concept makes memory management easier.
+* A variable can hold objects of different types; paddle-mobile mainly uses it to hold Tensor matrices.
+* A tensor represents a matrix and, via templates, can store matrices of different element types. Note that the type written and the type read back must match: with a mismatched type, fetching a pointer through inline const T \*data() const fails the type check, while fetching one through inline T \*mutable_data() re-allocates the memory. A few small Tensor concepts:
+	1. DDim: stores the matrix's dimension information.
+	2. Slice(): takes a slice along the N dimension (the N of NCHW).
+	3. If the instance has no memory allocated yet, calling inline T *mutable_data() allocates it.
+
+
+
+
+
+
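[Editorial note] As a quick illustration of the read/write contract described in the Tensor section above, here is a minimal hedged sketch; the `Resize`/`make_ddim` calls follow the fluid-style Tensor interface the doc describes, but treat the exact signatures as assumptions rather than a verified listing.

```c++
// Hedged sketch of the Tensor typed-access contract described above.
#include "framework/ddim.h"
#include "framework/tensor.h"

void TensorExample() {
  paddle_mobile::framework::Tensor t;
  t.Resize(paddle_mobile::framework::make_ddim({1, 3, 224, 224}));
  float *buf = t.mutable_data<float>();  // first call allocates the memory
  buf[0] = 1.0f;
  const float *same = t.data<float>();   // fine: same element type as written
  (void)same;
  // t.data<int8_t>() would fail the type check: the read type must match
  // the type the memory was written with.
}
```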
diff --git a/doc/development_doc.md b/doc/development_doc.md
new file mode 100644
index 0000000000000000000000000000000000000000..3f45f956f00e78c23b60b4c108b8c90cf4065e04
--- /dev/null
+++ b/doc/development_doc.md
@@ -0,0 +1,252 @@
+### iOS & Android development documentation
+
+# iOS development documentation
+
+## Building
+
+```sh
+
+# from the paddle-mobile directory:
+cd tools
+
+sh build.sh ios
+
+# to compile only the ops of one specific model, run:
+sh build.sh ios googlenet
+
+# the generated .a library can be found in this folder:
+cd ../build/release/ios/build
+
+```
+#### FAQ:
+
+1. No iOS SDK's found in default search path ...
+
+	This happens because tools/ios-cmake/ios.toolchain.cmake cannot find the iOS SDK path you last used, so you have to specify it yourself.
+	Taking my current environment as an example: before line 143 of tools/ios-cmake/ios.toolchain.cmake, add my local iOS SDK path: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")
+
+## Integration
+
+```
+Generated in the previous step:
+libpaddle-mobile.a
+
+and, from /src/ios_io/:
+PaddleMobile.h
+```
+Drag both into your project.
+
+#### Objective-C interface
+
+The interface is as follows:
+
+```
+/*
+	create the instance
+*/
+- (instancetype)init;
+
+/*
+	load the model and allocate memory
+*/
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+
+/*
+	run prediction; means and scale are the preprocessing parameters used when
+	the model was trained — if no such preprocessing was done, use the plain predict
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale;
+
+/*
+	run prediction
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim;
+
+/*
+	release memory
+*/
+- (void)clear;
+
+```
+
+
+# Android development documentation
+
+The paddle-mobile library for the Android platform can be cross-compiled in either of two ways:
+
+- building in a Docker container
+- cross-compiling on Linux
+
+
+## Building in a Docker container
+### 1. Install docker
+To install docker, refer to the official documentation: [https://docs.docker.com/install/](https://docs.docker.com/install/)
+### 2. Set up the build environment with docker
+First enter the paddle-mobile directory and run `docker build`.
+Using Linux/Mac as an example (on Windows, running inside the 'Docker Quickstart Terminal' is recommended):
+
+```
+$ docker build -t paddle-mobile:dev - < Dockerfile
+```
+`docker images` shows the image we just built:
+
+```
+$ docker images
+REPOSITORY      TAG     IMAGE ID      CREATED       SIZE
+paddle-mobile   dev     33b146787711  45 hours ago  372MB
+```
+### 3. Build with docker
+Enter the paddle-mobile directory and run docker run:
+
+```
+$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
+root@5affd29d4fc5:/ # cd /paddle-mobile
+# generate the Makefile for android builds
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
+# generate the Makefile for linux builds
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
+```
+### 4. Set build options
+Build options can be set through ccmake:
+
+```
+root@5affd29d4fc5:/ # ccmake .
+                                                     Page 1 of 1
+ CMAKE_ASM_FLAGS
+ CMAKE_ASM_FLAGS_DEBUG
+ CMAKE_ASM_FLAGS_RELEASE
+ CMAKE_BUILD_TYPE
+ CMAKE_INSTALL_PREFIX             /usr/local
+ CMAKE_TOOLCHAIN_FILE             /paddle-mobile/tools/toolchains/arm-android-neon.cmake
+ CPU                              ON
+ DEBUGING                         ON
+ FPGA                             OFF
+ LOG_PROFILE                      ON
+ MALI_GPU                         OFF
+ NET                              googlenet
+ USE_EXCEPTION                    ON
+ USE_OPENMP                       OFF
+```
+After changing options, press `c`, then `g` to regenerate the Makefile.
+### 5. Build
+Build with the make command:
+
+```
+root@5affd29d4fc5:/ # make
+```
+### 6. Inspect the build artifacts
+The build artifacts can be inspected on the host machine under the paddle-mobile directory, in build and in test/build; use adb or scp to push them to the device and run them there.
+
+## Cross-compiling on Linux
+### Preparing the cross-compilation environment
+##### Download the Android NDK
+
+To cross-compile paddle-mobile from source, prepare the cross-compilation environment first. The C/C++ cross-compilation toolchain for the Android platform is the [Android NDK](https://developer.android.com/ndk/); you can download it yourself, or fetch it with the following commands:
+- Mac
+```
+wget https://dl.google.com/android/repository/android-ndk-r17b-darwin-x86_64.zip
+unzip android-ndk-r17b-darwin-x86_64.zip
+
+```
+- Linux
+```
+wget https://dl.google.com/android/repository/android-ndk-r17b-linux-x86_64.zip
+unzip android-ndk-r17b-linux-x86_64.zip
+```
+
+##### Set the environment variable
+The standalone toolchain bundled with the project locates the NDK through the NDK_ROOT environment variable, so configure it (note there must be no spaces around `=` in shell):
+
+```
+export NDK_ROOT="path to ndk"
+```
+### Run the build
+From the paddle-mobile root directory, run:
+
+```
+cd tools
+sh build.sh android
+
+```
+When it finishes, the generated .so is in the build directory and the unit-test executables are in test/build.
+##### Tips:
+To get a smaller library, you can build one that supports only a specific model structure, e.g.:
+
+```
+sh build.sh android googlenet
+```
+This produces a smaller library that supports googlenet.
+
+## Testing
+After the build, we provide automated test scripts that push the models and library files the unit tests need onto the Android device. Run:
+
+```
+cd tools/android-debug-script
+sh run_on_android.sh (npm)   # optional argument npm: whether to push the model files to the phone
+```
+The following prompt appears:
+
+```
+**** choose OP or NET to test ****
+which to test :
+```
+Type a name to run the corresponding test.
+
+## Deployment
+Android applications call the underlying C/C++ through JNI; the JNI interfaces paddle-mobile exposes are:
+
+##### 1. load: load model parameters
+- for models whose parameter files are separate
+```
+/**
+ * Load separated parameters
+ * @param modelDir
+ * @return
+ */
+ public static native boolean load(String modelDir);
+```
+- for models whose parameters are combined into one file
+```
+/**
+ * Load combined parameters
+ * @param modelPath
+ * @param paramPath
+ * @return
+ */
+ public static native boolean loadCombined(String modelPath,String paramPath);
+
+```
+##### 2. predict: run prediction
+- predict interface taking a preprocessed RGB array
+```
+/**
+ * @param buf input data
+ * @return output data
+ */
+JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
+    JNIEnv *env, jclass thiz, jfloatArray buf);
+```
+- predict interface taking raw YUV data
+```
+ /**
+  *
+  * @param buf byte array in yuv420 format
+  * @param imgWidth width of the yuv data
+  * @param imgHeight height of the yuv data
+  * @param ddims shape of the input data
+  * @param meanValues per-channel means used when training the model
+  * @return
+  */
+
+ public static native float[] predictYuv(byte[] buf, int imgWidth, int imgHeight, int[] ddims, float[]meanValues);
+
+```
+##### 3. clear: destroy the instance and release memory
+
+```
+JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env,
+                                                       jclass thiz);
+```
+
+
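[Editorial note] As a hedged illustration of what sits behind JNI declarations like the ones above, the sketch below shows a typical C++ marshalling pattern for a predictImage-style entry point. `RunModel` is a hypothetical placeholder for the actual predictor call and is not part of paddle-mobile's API; only the JNI array handling is the point here.

```c++
// Hypothetical sketch of the C++ side of a JNI predict call.
#include <jni.h>
#include <vector>

std::vector<float> RunModel(const float *input, int len);  // assumed predictor hook

extern "C" JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
    JNIEnv *env, jclass thiz, jfloatArray buf) {
  jsize len = env->GetArrayLength(buf);
  jfloat *input = env->GetFloatArrayElements(buf, nullptr);
  std::vector<float> out = RunModel(input, len);
  env->ReleaseFloatArrayElements(buf, input, JNI_ABORT);  // input was read-only
  jfloatArray result = env->NewFloatArray(out.size());
  env->SetFloatArrayRegion(result, 0, out.size(), out.data());
  return result;
}
```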
diff --git a/doc/images/devices.png b/doc/images/devices.png
new file mode 100644
index 0000000000000000000000000000000000000000..413d32c249972ee96f678d50a5cd0b36a2a03e29
Binary files /dev/null and b/doc/images/devices.png differ
diff --git a/doc/images/flow_chart.png b/doc/images/flow_chart.png
new file mode 100644
index 0000000000000000000000000000000000000000..c747230da43e2e688d7460704268631758d34596
Binary files /dev/null and b/doc/images/flow_chart.png differ
diff --git a/doc/images/model_desc.png b/doc/images/model_desc.png
new file mode 100644
index 0000000000000000000000000000000000000000..3c026b6192c8e1d84b3a82c3db91e022f35358c2
Binary files /dev/null and b/doc/images/model_desc.png differ
diff --git a/doc/images/model_desc_combined.png b/doc/images/model_desc_combined.png
new file mode 100644
index 0000000000000000000000000000000000000000..38e7388efcfdcad53f4e80ce0ac5d3b993eb986c
Binary files /dev/null and b/doc/images/model_desc_combined.png differ
diff --git a/doc/quantification.md b/doc/quantification.md
new file mode 100644
index 0000000000000000000000000000000000000000..04a93116a08c094ef71861cec1bb3262304c4cb7
--- /dev/null
+++ b/doc/quantification.md
@@ -0,0 +1,39 @@
+# Quantification: model quantization and dequantization
+
+## Background
+Models trained from some networks, such as AlexNet, are fairly large and ill-suited to mobile devices.
+
+
+## Ways to deal with oversized models
+1. Choose a mobile-friendly model structure, such as mobilenet, googlenet, yolo, or squeezenet;
+2. Use the quantization tool we provide, which shrinks a float32 model to about 1/4 of its original size with almost no loss of accuracy.
+
+- - - - -
+## The quantization tool
+
+### Model conversion tool locations:
+
+- [Quantization tool directory](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/tools/quantification)
+
+- [Model conversion tool](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/convert.cpp)
+
+#### Usage
+- [Tool usage](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/README.md)
+
+## How to load a quantized model
+A quantification parameter was added to the load method, false by default. To load a quantized model, just pass it accordingly.
+
+[Source code](https://github.com/PaddlePaddle/paddle-mobile/blob/55302b33ea3bd68c9797d8f65e527544792b8095/src/io/paddle_mobile.h)
+
+```c++
+bool Load(const std::string &dirname, bool optimize = false,
+          bool quantification = false, int batch_size = 1);
+```
+
+- - - - -
+
+
+
+
+
+
diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
index 7c3ec5c2249673f1e749fe70d0094b4b2f0bec99..6ab6f7c05e30049e850170409efcd6f049c73abe 100644
--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
@@ -42,4 +42,3 @@ class AppDelegate: UIResponder, UIApplicationDelegate {

 }

-
diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
index 8a1f82939544df42f8601f38dc2bee60e6201589..d57b610e4d10f02d2eace4892a6d55eda8f2c9b9 100644
--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
@@ -17,4 +17,3 @@ class ViewController: UIViewController {
     }
 }

-
diff --git a/metal/paddle-mobile/paddle-mobile/paddle_mobile.h b/metal/paddle-mobile/paddle-mobile/paddle_mobile.h
index f9daa6a97ace697ab967c7a6eba1605860e8485f..ffa44be38a4c3a1f3109c51b3d15506591f2de2e 100644
--- a/metal/paddle-mobile/paddle-mobile/paddle_mobile.h
+++ b/metal/paddle-mobile/paddle-mobile/paddle_mobile.h
@@ -12,6 +12,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

+#pragma once
+
 #import <UIKit/UIKit.h>

 //! Project version number for paddle_mobile.
@@ -20,6 +22,4 @@ FOUNDATION_EXPORT double paddle_mobileVersionNumber;

 //! Project version string for paddle_mobile.
 FOUNDATION_EXPORT const unsigned char paddle_mobileVersionString[];

-// In this header, you should import all the public headers of your framework using statements like #import <paddle_mobile/PublicHeader.h>
-
diff --git a/src/common/common.h b/src/common/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..12157b5e946490d041f0cc0d235142a13a3a2527
--- /dev/null
+++ b/src/common/common.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <chrono>
+
+using Time = decltype(std::chrono::high_resolution_clock::now());
+
+inline Time time() { return std::chrono::high_resolution_clock::now(); }
+
+inline double time_diff(Time t1, Time t2) {
+  typedef std::chrono::microseconds ms;
+  auto diff = t2 - t1;
+  ms counter = std::chrono::duration_cast<ms>(diff);
+  return counter.count() / 1000.0;
+}
diff --git a/src/common/depCore.h b/src/common/dep_core.h
similarity index 100%
rename from src/common/depCore.h
rename to src/common/dep_core.h
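[Editorial note] The timing helpers added in src/common/common.h above are used like this — a minimal sketch, assuming only the `time`/`time_diff` signatures shown directly above; the measured region is illustrative.

```c++
// Sketch: profiling a code region with the helpers from src/common/common.h.
#include <iostream>
#include "common/common.h"

void TimedRun() {
  Time start = time();
  // ... run the code to be measured here, e.g. one predict pass ...
  Time end = time();
  std::cout << "cost: " << time_diff(start, end) << " ms" << std::endl;
}
```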
diff --git a/src/common/enforce.h b/src/common/enforce.h
index 51d2110e32433686d1b3353bc63b92a564a13e9d..aebe2a58031cb1341596f07dbf653be4a5e01900 100644
--- a/src/common/enforce.h
+++ b/src/common/enforce.h
@@ -61,7 +61,14 @@ struct PaddleMobileException : public std::exception {
   }
 #else
 #define PADDLE_MOBILE_THROW_EXCEPTION(...)
-#define PADDLE_MOBILE_ENFORCE(stat, ...)
+
+#define PADDLE_MOBILE_ENFORCE(stat, ...) \
+  {                                      \
+    if (stat) {                          \
+    } else {                             \
+    }                                    \
+  }
+
 #endif
 }  // namespace paddle_mobile
diff --git a/src/common/log.h b/src/common/log.h
index 07afdb39d04f2bf3ba083f79e812fb951a6194be..d964d9c1b39a7e72e3d757ef2be0737fd1d25f94 100644
--- a/src/common/log.h
+++ b/src/common/log.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include
 #ifdef PADDLE_MOBILE_DEBUG
+#include <cstring>
 #include <iostream>
 #include <sstream>
 #include <string>
@@ -115,26 +116,29 @@ struct ToLog {
   Print printer_;
 };

-#define LOG(level)                                                             \
-  if (level > paddle_mobile::log_level) {                                      \
-  } else                                                                       \
-    paddle_mobile::ToLog(                                                      \
-        level,                                                                 \
-        (std::stringstream()                                                   \
-         << "[file: "                                                          \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
-         << "] [line: " << __LINE__ << "] ")                                   \
-            .str())
-
-#define DLOG                                                                   \
-  if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) {                  \
-  } else                                                                       \
-    paddle_mobile::ToLog(                                                      \
-        paddle_mobile::kLOG_DEBUG,                                             \
-        (std::stringstream()                                                   \
-         << "[file: "                                                          \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
-         << "] [line: " << __LINE__ << "] ")                                   \
             .str())
+#define LOG(level)                                                        \
+  if (level > paddle_mobile::log_level) {                                 \
+  } else                                                                  \
+    paddle_mobile::ToLog(                                                 \
+        level, static_cast<const std::stringstream &>(                    \
+                   std::stringstream()                                    \
+                   << "[file: "                                           \
+                   << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \
+                                              : __FILE__)                 \
+                   << "] [line: " << __LINE__ << "] ")                    \
+                   .str())
+
+#define DLOG                                                              \
+  if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) {             \
+  } else                                                                  \
+    paddle_mobile::ToLog(                                                 \
+        paddle_mobile::kLOG_DEBUG,                                        \
+        static_cast<const std::stringstream &>(                           \
+            std::stringstream()                                           \
+            << "[file: "                                                  \
+            << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1)     \
+                                       : __FILE__)                        \
+            << "] [line: " << __LINE__ << "] ")                           \
             .str())

 #define LOGF(level, format, ...) \
@@ -170,7 +174,10 @@ struct ToLog;
 struct Print {
   friend struct ToLog;
   template <typename T>
-  Print &operator<<(T const &value) {}
+  Print &operator<<(T const &value) {
+    // swallow the value in release builds; return *this so chaining still works
+    return *this;
+  }

  private:
 };
diff --git a/src/common/type_define.h b/src/common/type_define.h
index c26cdd91e0694d44cca9443503d3e263ee21f201..389f9a715f8cec3f0b494ae3b43b3952e49677f8 100644
--- a/src/common/type_define.h
+++ b/src/common/type_define.h
@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ -#pragma once; +#pragma once +#include #include #include #include diff --git a/src/common/types.cpp b/src/common/types.cpp index a6f32762d3c8a492c3347ebfe65cb50f39425976..b6387503856f438acd74b8d147da13a2b009f2a1 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -17,34 +17,47 @@ limitations under the License. */ namespace paddle_mobile { -const std::string G_OP_TYPE_CONV = "conv2d"; -const std::string G_OP_TYPE_BATCHNORM = "batch_norm"; -const std::string G_OP_TYPE_BOX_CODER = "box_coder"; -const std::string G_OP_TYPE_CONCAT = "concat"; -const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; -const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu"; -const std::string G_OP_TYPE_FC = "fc"; -const std::string G_OP_TYPE_CONV_ADD = "conv_add"; -const std::string G_OP_TYPE_LRN = "lrn"; -const std::string G_OP_TYPE_MUL = "mul"; -const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms"; -const std::string G_OP_TYPE_POOL2D = "pool2d"; -const std::string G_OP_TYPE_PRIOR_BOX = "prior_box"; -const std::string G_OP_TYPE_RELU = "relu"; -const std::string G_OP_TYPE_RESHAPE = "reshape"; -const std::string G_OP_TYPE_SIGMOID = "sigmoid"; -const std::string G_OP_TYPE_SOFTMAX = "softmax"; -const std::string G_OP_TYPE_TRANSPOSE = "transpose"; -const std::string G_OP_TYPE_SPLIT = "split"; -const std::string G_OP_TYPE_FEED = "feed"; -const std::string G_OP_TYPE_FETCH = "fetch"; -const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d"; +const char *G_OP_TYPE_CONV = "conv2d"; +const char *G_OP_TYPE_BATCHNORM = "batch_norm"; +const char *G_OP_TYPE_BOX_CODER = "box_coder"; +const char *G_OP_TYPE_CONCAT = "concat"; +const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; +const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu"; +const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu"; +const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu"; +const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu"; +const char *G_OP_TYPE_FC = "fusion_fc"; +const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add"; +const char *G_OP_TYPE_LRN = "lrn"; +const char *G_OP_TYPE_MUL = "mul"; +const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms"; +const char *G_OP_TYPE_POOL2D = "pool2d"; +const char *G_OP_TYPE_PRIOR_BOX = "prior_box"; +const char *G_OP_TYPE_RELU = "relu"; +const char *G_OP_TYPE_RESHAPE = "reshape"; +const char *G_OP_TYPE_SIGMOID = "sigmoid"; +const char *G_OP_TYPE_SOFTMAX = "softmax"; +const char *G_OP_TYPE_TRANSPOSE = "transpose"; +const char *G_OP_TYPE_SPLIT = "split"; +const char *G_OP_TYPE_FEED = "feed"; +const char *G_OP_TYPE_FETCH = "fetch"; +const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d"; +const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence"; +const char *G_OP_TYPE_DROPOUT = "dropout"; +const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn"; +const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn"; +const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU = + "fusion_elementwise_add_relu"; +const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu"; +const char *G_OP_TYPE_REGION = "region"; std::unordered_map< std::string, std::pair, std::vector>> op_input_output_key = { {G_OP_TYPE_CONV, {{"Input"}, {"Output"}}}, - {G_OP_TYPE_CONV_ADD, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_DWCONV_BN_RELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_BN_RELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}}, {G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, {G_OP_TYPE_SOFTMAX, {{"X"}, 
{"Out"}}}, {G_OP_TYPE_MUL, {{"X"}, {"Out"}}}, @@ -59,11 +72,19 @@ std::unordered_map< {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}}, {G_OP_TYPE_BOX_CODER, {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}}, + {G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}}, {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, - {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}}; + {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_ADD_BN, {{"Input"}, {"Y"}}}, + {G_OP_TYPE_FUSION_POOL_BN, {{"X"}, {"Y"}}}, + {G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU, {{"X", "Y"}, {"Out"}}}, + {G_OP_TYPE_FUSION_FC_RELU, {{"X", "Y", "Z"}, {"Out"}}}, + {G_OP_TYPE_REGION, {{"X"}, {"Out"}}}}; } // namespace paddle_mobile diff --git a/src/common/types.h b/src/common/types.h index 30a0663eeef899e3b8ff35bcb062824417362efc..6066879305d5ea7d1b6dcb0bb618c234338cc171 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -12,13 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once; +#pragma once #include #include +#include +#include namespace paddle_mobile { -enum class Precision : int { FP32 = 0 }; +enum class Precision : int { FP32 = 0, FP16 = 1 }; + +typedef int16_t half; template struct PrecisionTrait { @@ -29,6 +33,10 @@ template <> struct PrecisionTrait { typedef float ptype; }; +template <> +struct PrecisionTrait { + typedef half ptype; +}; //! device type enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; @@ -71,28 +79,40 @@ enum PMStatus { PMWrongDevice = 0x08 /*!< un-correct device. 
*/ }; -extern const std::string G_OP_TYPE_CONV; -extern const std::string G_OP_TYPE_BATCHNORM; -extern const std::string G_OP_TYPE_BOX_CODER; -extern const std::string G_OP_TYPE_CONCAT; -extern const std::string G_OP_TYPE_ELEMENTWISE_ADD; -extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU; -extern const std::string G_OP_TYPE_FC; -extern const std::string G_OP_TYPE_CONV_ADD; -extern const std::string G_OP_TYPE_LRN; -extern const std::string G_OP_TYPE_MUL; -extern const std::string G_OP_TYPE_MULTICLASS_NMS; -extern const std::string G_OP_TYPE_POOL2D; -extern const std::string G_OP_TYPE_PRIOR_BOX; -extern const std::string G_OP_TYPE_RELU; -extern const std::string G_OP_TYPE_RESHAPE; -extern const std::string G_OP_TYPE_SIGMOID; -extern const std::string G_OP_TYPE_SOFTMAX; -extern const std::string G_OP_TYPE_TRANSPOSE; -extern const std::string G_OP_TYPE_SPLIT; -extern const std::string G_OP_TYPE_FEED; -extern const std::string G_OP_TYPE_FETCH; -extern const std::string G_OP_TYPE_DEPTHWISE_CONV; +extern const char *G_OP_TYPE_CONV; +extern const char *G_OP_TYPE_BATCHNORM; +extern const char *G_OP_TYPE_BOX_CODER; +extern const char *G_OP_TYPE_CONCAT; +extern const char *G_OP_TYPE_ELEMENTWISE_ADD; +extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU; +extern const char *G_OP_TYPE_FC; +extern const char *G_OP_TYPE_FUSION_CONV_ADD; +extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; +extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU; +extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU; + +extern const char *G_OP_TYPE_LRN; +extern const char *G_OP_TYPE_MUL; +extern const char *G_OP_TYPE_MULTICLASS_NMS; +extern const char *G_OP_TYPE_POOL2D; +extern const char *G_OP_TYPE_PRIOR_BOX; +extern const char *G_OP_TYPE_RELU; +extern const char *G_OP_TYPE_RESHAPE; +extern const char *G_OP_TYPE_SIGMOID; +extern const char *G_OP_TYPE_SOFTMAX; +extern const char *G_OP_TYPE_TRANSPOSE; +extern const char *G_OP_TYPE_SPLIT; +extern const char *G_OP_TYPE_FEED; +extern const char *G_OP_TYPE_FETCH; +extern const char *G_OP_TYPE_DEPTHWISE_CONV; +extern const char *G_OP_TYPE_IM2SEQUENCE; +extern const char *G_OP_TYPE_DROPOUT; + +extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN; +extern const char *G_OP_TYPE_FUSION_POOL_BN; +extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU; +extern const char *G_OP_TYPE_FUSION_FC_RELU; +extern const char *G_OP_TYPE_REGION; extern std::unordered_map< std::string, std::pair, std::vector>> diff --git a/src/common/variant.h b/src/common/variant.h index 7fbf0ec0772f102165770dc9c8e053f469965f10..00b8eb985d8f7fc22bb93a3e229aa387c358e257 100644 --- a/src/common/variant.h +++ b/src/common/variant.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include + #include "common/enforce.h" #include "common/log.h" @@ -82,7 +84,8 @@ struct Variant { if (type_id == typeid(T).hash_code()) { return *const_cast(reinterpret_cast(&data)); } else { - PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant "); + PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant"); + exit(0); } } diff --git a/src/fpga/api/fpga_api.cpp b/src/fpga/api/fpga_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..779c846d1f3c465e5113f805b2b3856a1a7894c5 --- /dev/null +++ b/src/fpga/api/fpga_api.cpp @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// headers needed by open/ioctl/mmap64/memcpy below (the original include list
+// was lost in transit; this is a minimal plausible set)
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <cstddef>
+#include <cstring>
+
+#include "fpga/api/fpga_api.h"
+
+namespace paddle_mobile {
+namespace fpga {
+
+static int fd = -1;
+static const char *device_path = "/dev/fpgadrv0";
+
+static inline int do_ioctl(int req, const void *arg) {
+  // forward to ioctl() on the opened driver fd, passing the argument pointer
+  // (the original `ioctl(req, (unsigned int64_t)arg)` dropped the fd and used
+  // an invalid type)
+  return ioctl(fd, req, arg);
+}
+
+int open_device() {
+  if (fd == -1) {
+    fd = open(device_path, O_RDWR);
+  }
+  return fd;
+}
+
+// memory management;
+void *fpga_malloc(size_t size) {
+  return reinterpret_cast<void *>(
+      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
+}
+
+void fpga_free(void *ptr) { munmap(ptr, 0); }
+
+void fpga_copy(void *dest, const void *src, size_t num) {
+  memcpy(dest, src, num);
+}
+
+int ComputeFpgaConv(const struct ConvArgs &args) {
+  return do_ioctl(IOCTL_CONFIG_CONV, &args);
+}
+int ComputeFpgaPool(const struct PoolingArgs &args) {
+  return do_ioctl(IOCTL_CONFIG_POOLING, &args);
+}
+int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
+  return do_ioctl(IOCTL_CONFIG_EW, &args);
+}
+int PerformBypass(const struct BypassArgs &args) {
+  return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/api/fpga_api.h b/src/fpga/api/fpga_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..0823e19a7f9dfaba709b6ad2723e3228c27e2e0f
--- /dev/null
+++ b/src/fpga/api/fpga_api.h
@@ -0,0 +1,178 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once + +#include +#include +#include +#include + +// memory management; + +namespace paddle_mobile { +namespace fpga { + +int open_device(); +int close_device(); + +void* fpga_malloc(size_t size); +void fpga_free(void* ptr); +void fpga_copy(void* dst, const void* src, size_t num); + +enum DataConvertType { + DATA_NO_CONVERT = 0, + DATA_FP32_TO_FP16 = 1, + DATA_FP16_TO_FP32 = 2, +}; + +enum LayoutConvertType { + LAYOUT_NO_CONVERT = 0, + LAYOUT_CHW_TO_HWC = 1, + LAYOUT_HWC_TO_CHW = 2, +}; + +struct VersionArgs { + void* buffer; +}; + +struct MemoryCopyArgs { + void* src; + void* dest; + size_t size; +}; + +struct BNArgs { + bool enabled; + void* bias_address; + void* scale_address; +}; + +/** +Conv and Pooling kernel +*/ +struct KernelArgs { + uint32_t width; + uint32_t height; + uint32_t stride_w; + uint32_t stride_h; +}; + +struct ImageInputArgs { + void* address; // input featuremap virtual address + float* scale_address; // input scale address; + uint32_t channels; + uint32_t width; // featuremap width + uint32_t height; + uint32_t pad_width; // padding width; + uint32_t pad_height; +}; + +struct ImageOutputArgs { + void* address; // output result address; + float* scale_address; // output scale address; +}; + +struct ConvArgs { + bool relu_enabled; + void* sb_address; // scale and bias are interlaced; + void* filter_address; + float* filter_scale_address; + uint32_t filter_num; + uint32_t group_num; + + struct KernelArgs kernel; + struct ImageInputArgs image; // input image; + struct ImageOutputArgs output; +}; + +struct PoolingArgs { + struct KernelArgs kernel; + struct ImageInputArgs image; // input image; + struct ImageOutputArgs output; +}; + +// elementwise add arguments +struct EWAddArgs { + bool relu_enabled; + + float const0; // output0 = const0 x input0 + const1 x input1; + float const1; + struct ImageInputArgs image0; + struct ImageInputArgs image1; + struct ImageOutputArgs output; +}; + +struct BypassArgs { + enum DataConvertType convert_type; + enum LayoutConvertType layout_type; + struct ImageInputArgs image; + struct ImageOutputArgs output; +}; + +struct FpgaRegWriteArgs { + uint64_t address; // + uint64_t value; +}; + +#define IOCTL_FPGA_MAGIC 'FPGA' + +#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs) + +#define IOCTL_SEPARATOR_0 10 + +#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs) + +#define IOCTL_SEPARATOR_1 20 + +#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs) +#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs) +#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs) +#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs) +#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs) +#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs) + +enum FPGA_ERR_TYPE { + ERR_IOCTL_CMD = -1, + ERR_TIMEOUT = -2, + ERR_COMPLETION_TIMEOUT = -3, + ERR_INVALID_FPGA_ADDR = -4, + ERR_NOMEM = -5, + ERR_NO_RESERVE_MEM = -6, + ERR_COPY_FROM_USER = -7, + ERR_COPY_TO_USER = -8, + ERR_DEL_TIMER = -9, + ERR_ENABLE_MSI = -10, + ERR_REGISTER_IRQ = -11, + ERR_PCIE_REGISTER = -12, + ERR_PCIE_PROBE = -13, + ERR_REGISTER_BLOCK = -14, + ERR_ALLOC_GENDISK = -15, + ERR_INIT_QUEUE = -16, + ERR_WAIT = -17, + ERR_ECC_ERROR = -31, + ERR_FPGA_FAIL_STOP = -64, + ERR_FPGA_DEBUG_STOP = -113, + DEV_TMP_UNAVAILABLE = -128 +}; + +//============================== API ============================= + +int PerformBypass(const struct 
BypassArgs& args);
+int ComputeFpgaConv(const struct ConvArgs& args);
+int ComputeFpgaPool(const struct PoolingArgs& args);
+int ComputeFpgaEWAdd(const struct EWAddArgs& args);
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/fpga_quantilization.cpp b/src/fpga/fpga_quantilization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dee3d3abc19e020304ff9e658d40797b6681c43b
--- /dev/null
+++ b/src/fpga/fpga_quantilization.cpp
@@ -0,0 +1,95 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/fpga_quantilization.h"
+#include <algorithm>
+
+namespace paddle_mobile {
+namespace fpga {
+
+template <typename Dtype>
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
+                       int height, int width) {
+  for (int n = 0; n < num; n++) {
+    int amount_per_row = width * channel;
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        int offset_height = h * amount_per_row;
+        for (int w = 0; w < width; w++) {
+          *(data_out + offset_height + w * channel + c) = *(data_in++);
+        }
+      }
+    }
+    // advance the output base by one whole image, not by the batch count
+    data_out += channel * height * width;
+  }
+}
+
+template <typename Dtype>
+static Dtype find_max(Dtype* data, int num) {
+  Dtype max = 0;
+  for (int i = 0; i < num; ++i) {
+    max = std::max(max, data[i]);
+  }
+  return max;
+}
+
+// template <typename Dtype>
+framework::Tensor* quantify_filter(framework::Tensor* filter) {
+  float scale = 0;
+  float fix_range = static_cast<float>((1 << (8 - 1)) - 1);
+
+  const int batch_size = filter->dims()[0];
+  const int channel = filter->dims()[1];
+  const int height = filter->dims()[2];
+  const int width = filter->dims()[3];
+
+  int8_t* int_data = nullptr;
+  int8_t* tmp_data = new int8_t[filter->numel()];
+
+  // 32bit filter -> 8bit filter;
+  if (filter->type() == typeid(float)) {
+    float* float_data = filter->data<float>();
+    float max = find_max<float>(float_data, filter->numel());
+
+    scale = (max / fix_range);
+
+    framework::Tensor* quant_filter = new framework::Tensor();
+    quant_filter->Resize(filter->dims());  // shape must be set before mutable_data
+    int_data = quant_filter->mutable_data<int8_t>();
+    for (int i = 0; i < filter->numel(); ++i) {
+      // quantize: divide by the scale, then narrow into the int8 range
+      tmp_data[i] = static_cast<int8_t>(float_data[i] / scale);
+    }
+    filter = quant_filter;
+  } else {
+    int8_t max = find_max<int8_t>(filter->data<int8_t>(), filter->numel());
+    scale = (max / fix_range);
+
+    int_data = filter->data<int8_t>();
+    for (int i = 0; i < filter->numel(); ++i) {
+      tmp_data[i] = int_data[i];
+    }
+    int_data = filter->mutable_data<int8_t>();
+  }
+  // NCHW -> NHWC;
+  chw_to_hwc<int8_t>(tmp_data, int_data, batch_size, channel, height, width);
+  delete[] tmp_data;
+  *(filter->fpga_args().scale_pointer()) = scale;
+  return filter;
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
diff --git a/src/fpga/fpga_quantilization.h b/src/fpga/fpga_quantilization.h
new file mode 100644
index 0000000000000000000000000000000000000000..56e14f89ac0e7d21e7bbb704df838374be84fbcd
--- /dev/null
+++ b/src/fpga/fpga_quantilization.h
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "common/types.h" +#include "framework/lod_tensor.h" +#include "framework/tensor.h" + +namespace paddle_mobile { +namespace fpga { + +template +static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel, + int height, int width); + +// template +framework::Tensor* quantify_filter(framework::Tensor* filter); +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/framework/attribute.h b/src/framework/attribute.h index 3b6608cf03e7f786ad8c087dc869516cb6220edb..ed264057be6810d8bae29e0117fa4f6d91067cc1 100644 --- a/src/framework/attribute.h +++ b/src/framework/attribute.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include @@ -128,6 +130,7 @@ class Attribute { return vistor(attr.variant_.Get()); } else { PADDLE_MOBILE_THROW_EXCEPTION("type not support"); + exit(0); } } diff --git a/src/framework/data_layout.h b/src/framework/data_layout.h index 3b31445707a887a2715afd0b9e7192ad76724351..0ba31ef9b7016b453b34cc4a023b0841b2110540 100644 --- a/src/framework/data_layout.h +++ b/src/framework/data_layout.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include namespace paddle_mobile { @@ -40,6 +41,7 @@ inline DataLayout StringToDataLayout(const std::string &str) { return DataLayout::kAnyLayout; } else { PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str()) + exit(0); } } @@ -52,6 +54,8 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) { case DataLayout::kAnyLayout: return "ANY_LAYOUT"; default: + PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ") + exit(0); break; } } diff --git a/src/framework/ddim.h b/src/framework/ddim.h index c1d917dff612de3a42168c47d0bacd3ac7bdd3ad..db240b260185bb8ac2ba1fe84d3390bedac5c36d 100644 --- a/src/framework/ddim.h +++ b/src/framework/ddim.h @@ -14,8 +14,11 @@ limitations under the License. */ #pragma once +#include #include +#include #include + #include "common/enforce.h" #include "common/variant.h" #include "dim.h" @@ -57,7 +60,8 @@ struct DDim { } else if (d.var.TypeId() == typeid(Dim<9>).hash_code()) { return vistor(d.var.Get>()); } else { - DLOG << " dim not support"; + PADDLE_MOBILE_ENFORCE(false, " dim not support"); + exit(0); } } diff --git a/src/framework/dim.h b/src/framework/dim.h index 38e62df99519c3e869dc0fd2ae71beed28370122..0d3e86e92289da155843e1a9959d5ea67a73c060 100644 --- a/src/framework/dim.h +++ b/src/framework/dim.h @@ -14,6 +14,7 @@ limitations under the License. 
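For reference, the int8 scheme implemented by quantify_filter above reduces to: scale = max|w| / 127 and q[i] = w[i] / scale. A standalone sketch under that reading (the free function quantize_int8 and its names are illustrative, not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// scale = max(|w|) / 127; q[i] = round(w[i] / scale).
std::vector<int8_t> quantize_int8(const std::vector<float>& w, float* scale) {
  float max_abs = 0.0f;
  for (float v : w) max_abs = std::max(max_abs, std::fabs(v));
  *scale = max_abs / 127.0f;          // fix_range = (1 << 7) - 1
  if (*scale == 0.0f) *scale = 1.0f;  // all-zero filter: avoid dividing by 0
  std::vector<int8_t> q(w.size());
  for (size_t i = 0; i < w.size(); ++i) {
    q[i] = static_cast<int8_t>(std::round(w[i] / *scale));
  }
  return q;
}

The FPGA kernels then consume the int8 weights together with the per-filter scale stored via fpga_args().scale_pointer().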
*/ #pragma once +#include #include "common/enforce.h" namespace paddle_mobile { namespace framework { @@ -129,6 +130,7 @@ int64_t &indexer(Dim &dim, int idx) { template <> int64_t &indexer<0>(Dim<0> &dim, int idx) { PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") + exit(0); } template @@ -145,6 +147,7 @@ int64_t indexer(const Dim &dim, int idx) { template <> int64_t indexer<0>(const Dim<0> &dim, int idx) { PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") + exit(0); } } // namespace diff --git a/src/framework/lod_tensor.cpp b/src/framework/lod_tensor.cpp index 0a57d29a0c05c009299d43b3b9f5a59b2c3dc341..e165e55507ed04a9b63e4ad5eb002f206c71d96c 100644 --- a/src/framework/lod_tensor.cpp +++ b/src/framework/lod_tensor.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "lod_tensor.h" +#include namespace paddle_mobile { namespace framework { diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index 36b4663cb603d29bb60cfc297899d1c300e8ca91..765103c241a82ac224d707340f8b66ace827e335 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -28,6 +28,16 @@ vector OperatorBase::GetOutKeys() const { return it->second.second; } +template +vector OperatorBase::GetInputKeys() const { + auto it = op_input_output_key.find(type_); + if (it == op_input_output_key.end()) { + DLOG << type_ << " has no outputs"; + return {}; + } + return it->second.first; +} + template OperatorBase::OperatorBase(const std::string &type, const VariableNameMap &inputs, @@ -49,6 +59,11 @@ template void OperatorBase::Run() const { RunImpl(); #ifdef PADDLE_MOBILE_DEBUG + vector input_keys = GetInputKeys(); + for (const auto key : input_keys) { + Tensor *input = GetVarValue(key, inputs_, *scope_); + DLOG << type_ << " input- " << key << "=" << *input; + } vector output_keys = GetOutKeys(); for (const auto key : output_keys) { Tensor *out_ = GetVarValue(key, outputs_, *scope_); diff --git a/src/framework/operator.h b/src/framework/operator.h index 6a89884e9beb23878b422f77a5ed2851da6950e6..084ac3c81185fe489fe1ca67589c1e8edb1d4fdf 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -61,8 +61,10 @@ class OperatorBase { virtual ~OperatorBase() {} void Run() const; std::vector GetOutKeys() const; + std::vector GetInputKeys() const; virtual void RunImpl() const = 0; + virtual void Init() = 0; /* * @b op 运算所需的输入, 如上一层的输出结果、卷积核 * */ @@ -110,15 +112,21 @@ class OperatorWithKernel : public OperatorBase { const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope) : OperatorBase(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, *scope) { - PADDLE_MOBILE_ENFORCE(kernel_.Init(param_), " %s kernel init failed", - this->type_.c_str()); - } + param_(inputs, outputs, attrs, *scope) {} virtual void RunImpl() const { this->kernel_.Compute(this->param_); } virtual void InferShape() const = 0; + void Init() { + // for (auto i : this->inputs_) { + // DLOG << i.first; + // DLOG << i.second; + // } + PADDLE_MOBILE_ENFORCE(kernel_.Init(¶m_), " %s kernel init failed", + this->type_.c_str()); + } + protected: KernelType kernel_; ParamType param_; @@ -135,9 +143,21 @@ class OpKernelBase { * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体, * 所有结构体存在与: paddle-mobile/src/operators/op_param.h * */ +#ifdef PADDLE_MOBILE_MALI_GPU + OpKernelBase() { acl_op_ = nullptr; } + void *GetAclOp() const { return acl_op_; } + void SetAclOp(void *op, void *ob) const { + reinterpret_cast *>(ob)->acl_op_ = op; + } 
+#endif virtual void Compute(const P ¶) const = 0; - virtual bool Init(const P ¶) const { return true; }; + virtual bool Init(P *para) { return true; } virtual ~OpKernelBase() = default; + + private: +#ifdef PADDLE_MOBILE_MALI_GPU + void *acl_op_; +#endif }; #define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \ diff --git a/src/framework/program/block_desc.cpp b/src/framework/program/block_desc.cpp index 4b45ab305bf0f353f017674773b5fc51203bfef8..4e3eb79d07d0c8c363a6c3a9556cf718ebdc08f2 100644 --- a/src/framework/program/block_desc.cpp +++ b/src/framework/program/block_desc.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "block_desc.h" +#include namespace paddle_mobile { namespace framework { diff --git a/src/framework/program/program-optimize/fusion_op_register.h b/src/framework/program/program-optimize/fusion_op_register.h index 4cc83f8c80ab86ee6dcc1e3c395f872419da2be7..a5890d34c600f6c4f4838ec94c202801b3044d3f 100644 --- a/src/framework/program/program-optimize/fusion_op_register.h +++ b/src/framework/program/program-optimize/fusion_op_register.h @@ -14,11 +14,13 @@ limitations under the License. */ #pragma once +#include #include #include +#include #include "framework/operator.h" -#include "node.h" +#include "framework/program/program-optimize/node.h" namespace paddle_mobile { namespace framework { @@ -34,12 +36,25 @@ class FusionOpRegister { } void regist(FusionOpMatcher* matcher) { + if (matchers_.find(matcher->Type()) != matchers_.end()) { + return; + } + std::shared_ptr shared_matcher(matcher); matchers_[matcher->Type()] = shared_matcher; } - const std::map> Matchers() { - return matchers_; + const std::vector> Matchers() { + std::vector> matchers; + for (const auto& match : matchers_) { + matchers.push_back(match.second); + } + std::sort(matchers.begin(), matchers.end(), + [](std::shared_ptr first, + std::shared_ptr second) { + return first->BeginNode().Depth() > second->BeginNode().Depth(); + }); + return matchers; } private: diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp index 4ea45ec0a859ef8aa3ab4e34de8279e732706803..a4e1db506da362df4fb61b39827d5e77ebc425eb 100644 --- a/src/framework/program/program-optimize/node.cpp +++ b/src/framework/program/program-optimize/node.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "framework/program/program-optimize/node.h" +#include #include "framework/operator.h" namespace paddle_mobile { @@ -43,23 +44,6 @@ bool Node::operator==(const Node &in) { return true; } -std::vector> Node::OpDescs(int size) { - std::vector> op_descs; - OpDescs(size - 1, &op_descs); - return op_descs; -} - -void Node::OpDescs(int index, - std::vector> *op_desc) { - if (index == 0) { - return; - } - op_desc->push_back(this->op_desc_); - for (auto &output : outputs_) { - output->OpDescs(index, op_desc); - } -} - std::shared_ptr Node::To(int size) { std::shared_ptr node = std::make_shared(); this->To(size - 1, node); @@ -92,7 +76,8 @@ int Node::Depth(int begin) { Node &Node::Folder( int size, std::string type, - std::map> change, + std::map>> + change, std::vector> *removed_nodes) { std::shared_ptr op_desc = std::make_shared(); @@ -109,12 +94,15 @@ Node &Node::Folder( void Node::Folder( std::shared_ptr op_desc, std::vector> *outputs, int index, - std::map> *change, + std::map>> + *change, Node *begin_node, std::vector> *removed_nodes) { if (change->find(this->type_) != change->end()) { - auto change_pair = (*change)[this->type_]; - op_desc->GetInputs()[change_pair.second] = - this->op_desc_->GetInputs()[change_pair.first]; + auto change_pairs = (*change)[this->type_]; + for (const auto &change_pair : change_pairs) { + op_desc->GetInputs()[change_pair.second] = + this->op_desc_->GetInputs()[change_pair.first]; + } } for (auto &attr_pair : this->op_desc_->attrs_) { diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h index 7236ffdd1782dfb39af73195da9b3756030c9117..7eb179c243c28fe2668c3cf2f8f28f81312c0988 100644 --- a/src/framework/program/program-optimize/node.h +++ b/src/framework/program/program-optimize/node.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include #include "common/log.h" #include "framework/program/op_desc.h" @@ -43,20 +44,19 @@ class Node { int Depth(int begin = 0); Node &Folder( int size, std::string type, - std::map> change_map, + std::map>> + change, std::vector> *removed_nodes); - std::vector> OpDescs(int size); std::shared_ptr OpDescOfNode() { return op_desc_; } std::string Type() { return type_; } private: - void OpDescs(int size, - std::vector> *op_desc); void To(int index, std::shared_ptr); void Folder( std::shared_ptr op_desc, std::vector> *outputs, int index, - std::map> *change, + std::map>> + *change, Node *begin_node, std::vector> *removed_nodes); std::shared_ptr op_desc_; #ifdef PADDLE_MOBILE_DEBUG diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp index e9b5cc5187bef7c9963e23b05187c09e2c789dc2..82d33bc65d864e010fbe41b270b71ed98a21b33e 100644 --- a/src/framework/program/program-optimize/program_optimize.cpp +++ b/src/framework/program/program-optimize/program_optimize.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
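A note on the Matchers() change above: returning a vector sorted by BeginNode().Depth() in descending order makes the optimizer try the longest fusion pattern first, so e.g. a conv+add+bn+relu matcher wins over a plain conv+add prefix. A minimal sketch of that ordering (the Matcher struct here is an illustrative stand-in for FusionOpMatcher):

#include <algorithm>
#include <memory>
#include <string>
#include <vector>

struct Matcher {
  std::string type;
  int depth;  // number of ops in the fusion pattern
};

void SortDeepestFirst(std::vector<std::shared_ptr<Matcher>>* matchers) {
  std::sort(matchers->begin(), matchers->end(),
            [](const std::shared_ptr<Matcher>& a,
               const std::shared_ptr<Matcher>& b) {
              return a->depth > b->depth;  // deepest pattern matches first
            });
}

The related Node::Folder change (one pair per op type becoming a vector of pairs) lets a folded op pull several inputs from the same source node instead of just one.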
*/ #include "framework/program/program-optimize/program_optimize.h" +#include #include "framework/program/program-optimize/fusion_op_register.h" namespace paddle_mobile { @@ -77,9 +78,8 @@ std::shared_ptr ProgramOptimize::FusionOptimize( } for (auto ®isted : FusionOpRegister::Instance()->Matchers()) { - std::string fusion_type = registed.first; - std::shared_ptr matcher = registed.second; - // DLOG << " registed node \n " << matcher->BeginNode(); + std::string fusion_type = registed->Type(); + std::shared_ptr matcher = registed; auto match_vector = type_map[matcher->BeginType()]; diff --git a/src/framework/program/program.h b/src/framework/program/program.h index bb82fa7334a7d1941734dcd846c8e66befdbdd10..e500d500344d83204bf388401541259b90ea2f78 100644 --- a/src/framework/program/program.h +++ b/src/framework/program/program.h @@ -29,7 +29,8 @@ class Program { std::shared_ptr scope; std::string model_path; std::string para_path; - bool is_commbine = false; + bool combined = false; + bool quantification = false; private: }; diff --git a/src/framework/scope.cpp b/src/framework/scope.cpp index 2f7ff247b846f0a5f3e59c5c2f317a59598fc643..a1f5789aa52d2a70f54cef5c622c3a15907a4683 100644 --- a/src/framework/scope.cpp +++ b/src/framework/scope.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #include "framework/scope.h" +#include #include #include #include diff --git a/src/framework/scope.h b/src/framework/scope.h index d714f61af3bd443c09fcef7aacee2416b90b5e02..054f141ff68895e0879fd31e15d90c76ea038135 100644 --- a/src/framework/scope.h +++ b/src/framework/scope.h @@ -23,7 +23,17 @@ namespace framework { class Scope { public: Scope() = default; - ~Scope() = default; + + ~Scope() { + for (auto &var : vars_) { + delete var.second; + } + vars_.clear(); + for (auto kid : kids_) { + delete kid; + } + kids_.clear(); + } Scope &NewScope() const; diff --git a/src/framework/tensor.h b/src/framework/tensor.h index a221a26aa1435000646cf7d58321df28f3322834..797fcf5bffbe5e738fe352d1ca84602f0e5d86a0 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -16,13 +16,15 @@ limitations under the License. */ #include #include +#include #include +#include #include #include #include -#include "common/enforce.h" #include "common/enforce.h" +#include "common/types.h" #include "framework/data_layout.h" #include "framework/ddim.h" #include "memory/t_malloc.h" @@ -62,7 +64,8 @@ struct SizeOfTypeFunctor { }; static inline size_t SizeOfType(std::type_index type) { - SizeOfTypeFunctor functor; + SizeOfTypeFunctor + functor; size_t size = functor(type); PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); @@ -131,11 +134,27 @@ class Tensor { return reinterpret_cast(mutable_data(typeid(T))); } +#ifdef PADDLE_MOBILE_DEBUG + template + inline void dump(std::string filename) const { + const T *dataptr = data(); + std::ofstream out(filename.c_str()); + for (int i = 0; i < numel(); ++i) { + out << dataptr[i] << " "; + } + out << "形状:"; + for (int j = 0; j < dims_.size(); ++j) { + out << dims_[j] << " "; + } + out.close(); + } +#endif + inline void *mutable_data(std::type_index type) { if (holder_ != nullptr) { holder_->set_type(type); } - PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor'snumel must >=0.") + PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") int64_t size = numel() * SizeOfType(type); if (holder_ == nullptr || holder_->size() < size + offset_) { holder_.reset(new PlaceholderImpl(size, type)); @@ -234,6 +253,18 @@ class Tensor { "Tensor's dims_ is out of bound. 
"); } +#ifdef PADDLE_MOBILE_FPGA + struct FPGAArgs { + float scale; + + inline float *scale_pointer() { return &scale; } + }; + + struct FPGAArgs fpga_args() const { + return fpgaArgs_; + } +#endif + private: /** * @note Placeholder hides type T, so it doesn't appear as a @@ -300,6 +331,10 @@ class Tensor { * begins. */ size_t offset_; + +#ifdef PADDLE_MOBILE_FPGA + FPGAArgs fpgaArgs_; +#endif }; #ifdef PADDLE_MOBILE_DEBUG diff --git a/src/io/api.cc b/src/io/api.cc new file mode 100644 index 0000000000000000000000000000000000000000..0e254aa15ac06083038773d89c23d40242847782 --- /dev/null +++ b/src/io/api.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "cstring" +#include "io/paddle_inference_api.h" + +namespace paddle_mobile { + +int PaddleDtypeSize(PaddleDType dtype) { + switch (dtype) { + case PaddleDType::FLOAT32: + return sizeof(float); + case PaddleDType::INT64: + return sizeof(int64_t); + default: + assert(false); + return -1; + } +} + +PaddleBuf::PaddleBuf(PaddleBuf&& other) + : data_(other.data_), + length_(other.length_), + memory_owned_(other.memory_owned_) { + other.memory_owned_ = false; + other.data_ = nullptr; + other.length_ = 0; +} + +PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } + +PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { + // only the buffer with external memory can be copied + if (!other.memory_owned_) { + data_ = other.data_; + length_ = other.length_; + memory_owned_ = other.memory_owned_; + } else { + Resize(other.length()); + memcpy(data_, other.data(), other.length()); + length_ = other.length(); + memory_owned_ = true; + } + return *this; +} + +void PaddleBuf::Resize(size_t length) { + // Only the owned memory can be reset, the external memory can't be changed. + if (length_ == length) return; + if (memory_owned_) { + Free(); + } + data_ = new char[length]; + length_ = length; + memory_owned_ = true; +} + +void PaddleBuf::Reset(void* data, size_t length) { + Free(); + memory_owned_ = false; + data_ = data; + length_ = length; +} + +void PaddleBuf::Free() { + if (memory_owned_ && data_) { + assert(length_ > 0); + delete[] static_cast(data_); + data_ = nullptr; + length_ = 0; + } +} + +} // namespace paddle_mobile diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc new file mode 100644 index 0000000000000000000000000000000000000000..4609438ec9fbdb5b5030b56a4bf18b9437bf7c2e --- /dev/null +++ b/src/io/api_paddle_mobile.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "io/api_paddle_mobile.h" +#include +#include "framework/tensor.h" + +namespace paddle_mobile { + +template +PaddleMobilePredictor::PaddleMobilePredictor( + const PaddleMobileConfig &config) { + PADDLE_MOBILE_ENFORCE(Init(config) == true, + "paddle mobile predictor init failed!"); + config_ = config; +} + +template +bool PaddleMobilePredictor::Init(const PaddleMobileConfig &config) { + paddle_mobile_.reset(new PaddleMobile()); + if (!config.model_dir.empty()) { + paddle_mobile_->Load(config.model_dir, config.optimize, + config.quantification, config.batch_size); + } else if (!config.prog_file.empty() && !config.param_file.empty()) { + paddle_mobile_->Load(config.prog_file, config.param_file, config.optimize, + config.quantification, config.batch_size); + } else { + LOG(kLOG_ERROR) << "fail to load inference model!"; + return false; + } + // If the openmp is open, set the thread num + paddle_mobile_->SetThreadNum(config.thread_num); + return true; +} + +template +bool PaddleMobilePredictor::Run( + const std::vector &inputs, + std::vector *output_data, int batch_size) { + if (inputs.empty()) { + LOG(kLOG_ERROR) << "At least one output should be set with tensors' names."; + return false; + } + auto input = inputs[0]; + + if (input.shape.size() != 4) { + LOG(kLOG_ERROR) << "input shape not equal to 4!"; + return false; + } + std::vector dims; + for (auto d : input.shape) { + dims.push_back(static_cast(d)); + } + + // use tensor + framework::DDim ddim = + framework::make_ddim({dims[0], dims[1], dims[2], dims[3]}); + + framework::Tensor input_tensor; + input_tensor.Resize(ddim); + int input_length = framework::product(ddim); + typedef typename PrecisionTrait
<P>
::ptype PType; + auto input_ptr = input_tensor.mutable_data(); + + memcpy(input_ptr, static_cast(input.data.data()), + input_length * sizeof(PType)); + auto output_tensor = paddle_mobile_->Predict(input_tensor); + + if (output_data->empty()) { + LOG(kLOG_ERROR) << "At least one output should be set with tensors' names."; + return false; + } + + auto &output = (*output_data)[0]; + int output_length = output_tensor->numel(); + std::vector tensor_shape = + framework::vectorize(output_tensor->dims()); + + for (auto d : tensor_shape) { + output.shape.push_back(static_cast(d)); + } + + if (output.data.length() < output_length * sizeof(PType)) { + output.data.Resize(output_length * sizeof(PType)); + } + + memcpy(output.data.data(), output_tensor->template data(), + output_length * sizeof(PType)); + + return true; +} + +// A factory to help create difference predictor. +template <> +std::unique_ptr +CreatePaddlePredictor( + const PaddleMobileConfig &config) { + std::unique_ptr x; + if (config.precision == PaddleMobileConfig::FP32) { + if (config.device == PaddleMobileConfig::kCPU) { + x.reset(new PaddleMobilePredictor(config)); + } else if (config.device == PaddleMobileConfig::kFPGA) { + x.reset(new PaddleMobilePredictor(config)); + } else if (config.device == PaddleMobileConfig::kGPU_MALI) { + x.reset(new PaddleMobilePredictor(config)); + } else { + LOG(kLOG_ERROR) << "unsupport device type!"; + return nullptr; + } + } else { + LOG(kLOG_ERROR) << "unsupport precision type!"; + return nullptr; + } + return std::move(x); +} + +} // namespace paddle_mobile diff --git a/src/io/api_paddle_mobile.h b/src/io/api_paddle_mobile.h new file mode 100644 index 0000000000000000000000000000000000000000..66c6a4d5d9f8fc81b96642c6d5b62757dd581bc3 --- /dev/null +++ b/src/io/api_paddle_mobile.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains the implementation of inference API with Anakin engine + * embeded, this API can only support Anakin models. 
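A usage sketch for the factory specialized above (the model path and shapes are illustrative; the config fields come from PaddleMobileConfig in io/paddle_inference_api.h):

#include <vector>
#include "io/paddle_inference_api.h"

void run_once() {
  paddle_mobile::PaddleMobileConfig config;
  config.precision = paddle_mobile::PaddleMobileConfig::FP32;
  config.device = paddle_mobile::PaddleMobileConfig::kCPU;
  config.model_dir = "./mobilenet";  // illustrative path
  config.thread_num = 4;

  auto predictor = paddle_mobile::CreatePaddlePredictor<
      paddle_mobile::PaddleMobileConfig,
      paddle_mobile::PaddleEngineKind::kPaddleMobile>(config);

  paddle_mobile::PaddleTensor input;
  input.shape = {1, 3, 224, 224};  // NCHW
  input.dtype = paddle_mobile::PaddleDType::FLOAT32;
  input.data.Resize(1 * 3 * 224 * 224 * sizeof(float));
  // ... fill input.data.data() with image values here ...

  std::vector<paddle_mobile::PaddleTensor> outputs(1);  // Run() rejects an
  predictor->Run({input}, &outputs);                    // empty output vector
}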
+ */ + +#pragma once + +#include +#include "io/paddle_inference_api.h" + +// from paddle_mobile +#include "common/enforce.h" +#include "common/types.h" +#include "io/paddle_mobile.h" + +namespace paddle_mobile { + +template +class PaddleMobilePredictor : public PaddlePredictor { + public: + PaddleMobilePredictor() {} + + explicit PaddleMobilePredictor(const PaddleMobileConfig& config); + + bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) override; + + ~PaddleMobilePredictor() override{}; + + private: + std::unique_ptr> paddle_mobile_; + bool Init(const PaddleMobileConfig& config); + + PaddleMobileConfig config_; +}; + +} // namespace paddle_mobile diff --git a/src/io/io.cpp b/src/io/executor.cpp similarity index 70% rename from src/io/io.cpp rename to src/io/executor.cpp index 7931432bd1d4528ab9b0cda7ab05ab13c14dfcfe..d6434b64aa752fd62bc637a882298228d59880b8 100644 --- a/src/io/io.cpp +++ b/src/io/executor.cpp @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "io/io.h" +#include "io/executor.h" +#include +#include #include #include "common/enforce.h" #include "common/log.h" @@ -25,7 +27,6 @@ limitations under the License. */ #include "framework/scope.h" #include "framework/tensor.h" #ifdef PADDLE_EXECUTOR_MULTITHREAD -#include #include #include #include "common/threadpool.h" @@ -39,7 +40,7 @@ char *Get_binary_data(std::string filename) { PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", filename.c_str()); fseek(file, 0, SEEK_END); - long size = ftell(file); + int64_t size = ftell(file); PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); rewind(file); char *data = new char[size]; @@ -50,116 +51,6 @@ char *Get_binary_data(std::string filename) { return data; } -static size_t ReadBuffer(const char *file_name, uint8_t **out) { - printf("%s \n", file_name); - FILE *fp; - fp = fopen(file_name, "rb"); - PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name); - - fseek(fp, 0, SEEK_END); - size_t size = ftell(fp); - rewind(fp); - - DLOG << "model size: " << size; - - *out = reinterpret_cast(malloc(size)); - - size_t cur_len = 0; - size_t nread; - while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) { - cur_len += nread; - } - fclose(fp); - return cur_len; -} - -template -const framework::Program Loader::Load( - const std::string &dirname, bool optimize, bool can_add_split) { - auto program = - this->LoadProgram(dirname + "/__model__", optimize, can_add_split); - program.model_path = dirname; - return program; -} - -template -const framework::Program Loader::Load( - const std::string &model_path, const std::string ¶_path, - bool optimize) { - auto program = this->LoadProgram(model_path, optimize); - program.para_path = para_path; - program.is_commbine = true; - return program; -} - -template -const framework::Program Loader::LoadProgram( - const std::string &model_path, bool optimize, bool can_add_split) { - std::string model_filename = model_path; - PaddleMobile__Framework__Proto__ProgramDesc *c_program; - uint8_t *buf = NULL; - size_t read_size = ReadBuffer(model_filename.c_str(), &buf); - - PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null"); - - c_program = paddle_mobile__framework__proto__program_desc__unpack( - NULL, read_size, buf); - // - PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null"); - // - DLOG << "n_ops: " << 
(*c_program->blocks)->n_ops; - // - auto originProgramDesc = std::make_shared(c_program); - - framework::Program program; - program.originProgram = originProgramDesc; - - auto scope = std::make_shared(); - program.scope = scope; - - for (const auto &block : originProgramDesc->Blocks()) { - for (auto var_desc : block->Vars()) { - auto var = scope->Var(var_desc->Name()); - - if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { - if (var_desc->Persistable() && - var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH && - var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) { - auto dim = var_desc->Tensor_desc().Dims(); - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); - } else { - auto dim = var_desc->Tensor_desc().Dims(); - PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); - dim[0] = 1; - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); - } - } else { - // TODO(codeWorm): some. - } - } - } - - if (optimize) { - framework::ProgramOptimize program_optimize; - program.optimizeProgram = - program_optimize.FusionOptimize(originProgramDesc, can_add_split); - } - if (optimize) { - program.optimizeProgram->Description("optimize: "); - } else { - originProgramDesc->Description("program: "); - } - - paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); - return program; -} - -template class Loader; -template class Loader; -template class Loader; - #pragma mark - executor template Executor::Executor(const framework::Program p, int batch_size, @@ -193,39 +84,46 @@ Executor::Executor(const framework::Program p, int batch_size, #endif } } - if (program_.is_commbine) { + if (program_.combined) { InitCombineMemory(); } else { InitMemory(); } + + std::shared_ptr to_predict_block = + to_predict_program_->Block(0); + auto &ops = ops_of_block_[*to_predict_block.get()]; + for (const auto &op : ops) { + op->Init(); + } } template void Executor::LoadMemory(const framework::VarDesc var_desc, - framework::LoDTensor *tensor, char *&data) { + framework::LoDTensor *tensor, char **data) { // 1. version - uint32_t version = *(uint32_t *)data; - data += sizeof(uint32_t); + uint32_t version = *reinterpret_cast(*data); + + (*data) += sizeof(uint32_t); // 2 Lod information uint64_t *lod_level_ptr = new uint64_t(); - memcpy(lod_level_ptr, data, sizeof(uint64_t)); + memcpy(lod_level_ptr, (*data), sizeof(uint64_t)); uint64_t lod_level = *lod_level_ptr; delete lod_level_ptr; - data += sizeof(uint64_t); + (*data) += sizeof(uint64_t); auto &lod = *tensor->mutable_lod(); lod.resize(lod_level); for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size = *(uint64_t *)data; - data += sizeof(uint64_t); + uint64_t size = *reinterpret_cast(*data); + (*data) += sizeof(uint64_t); DLOG << "lod size: " << i << size; std::vector tmp(size / sizeof(size_t)); for (int k = 0; k < tmp.size(); ++k) { - tmp[k] = *(size_t *)data; - DLOG << "tmp[k]: " << k << *(size_t *)data; - data += sizeof(size_t); + tmp[k] = *reinterpret_cast(*data); + (*data) += sizeof(size_t); } for (auto j : tmp) { @@ -235,18 +133,18 @@ void Executor::LoadMemory(const framework::VarDesc var_desc, } // 3. tensor version - uint32_t tensor_version = *(uint32_t *)data; - data += sizeof(uint32_t); + uint32_t tensor_version = *reinterpret_cast(*data); + (*data) += sizeof(uint32_t); // 4. 
tensor desc - int32_t size = *(int32_t *)data; - data += sizeof(int32_t); + int32_t size = *reinterpret_cast(*data); + (*data) += sizeof(int32_t); std::unique_ptr buf(new char[size]); for (int m = 0; m < size; ++m) { - buf.get()[m] = data[m]; + buf.get()[m] = (*data)[m]; } - data += (sizeof(char) * size); + (*data) += (sizeof(char) * size); const framework::TensorDesc &desc = var_desc.Tensor_desc(); int memory_size = 1; @@ -256,7 +154,7 @@ void Executor::LoadMemory(const framework::VarDesc var_desc, tensor->Resize(framework::make_ddim(desc.Dims())); - void *memory = tensor; + void *memory = nullptr; int type_size = 0; switch (desc.DataType()) { case framework::VARTYPE_TYPE_FP16: @@ -281,11 +179,25 @@ void Executor::LoadMemory(const framework::VarDesc var_desc, default: break; } - - for (int n = 0; n < memory_size * type_size; ++n) { - static_cast(memory)[n] = data[n]; + if (program_.quantification) { + float min_value; + float max_value; + + memcpy(&min_value, *data, sizeof(float)); + memcpy(&max_value, *data + sizeof(float), sizeof(float)); + *data += 2 * sizeof(float); + const float factor = (max_value - min_value) / 255.0; + uint8_t *uint8_data = reinterpret_cast(*data); + for (int k = 0; k < memory_size; ++k) { + static_cast(memory)[k] = uint8_data[k] * factor + min_value; + } + *data += (memory_size * sizeof(uint8_t)); + } else { + for (int n = 0; n < memory_size * type_size; ++n) { + static_cast(memory)[n] = (*data)[n]; + } + (*data) += (sizeof(char) * memory_size * type_size); } - data += (sizeof(char) * memory_size * type_size); } template @@ -302,7 +214,7 @@ void Executor::InitMemory() { char *origin_data = Get_binary_data(program_.model_path + "/" + var_desc->Name()); char *data = origin_data; - LoadMemory(*var_desc, tensor, data); + LoadMemory(*var_desc, tensor, &data); delete origin_data; } else { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { @@ -328,7 +240,7 @@ void Executor::InitCombineMemory() { if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { continue; } - LoadMemory(*var_desc, tensor, data); + LoadMemory(*var_desc, tensor, &data); } else { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { auto tensor = var->template GetMutable(); @@ -416,6 +328,8 @@ std::shared_ptr Executor::Predict( clock_gettime(CLOCK_MONOTONIC, &ts); profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; #endif + + // to Run ops[i]->Run(); #ifdef PADDLE_MOBILE_PROFILE clock_gettime(CLOCK_MONOTONIC, &ts); @@ -433,7 +347,8 @@ std::shared_ptr Executor::Predict( *(program_.scope)); #ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_EXECUTOR_MULTITHREAD - // TODO expose profile info as an interface, user can get them to analysis + // TODO(haipeng): expose profile info as an interface, user can get them to + // analysis // the performance of their deepnet. 
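The quantification branch added to LoadMemory above stores each persistable tensor as two floats (min, max) followed by one uint8 per element. The sketch below isolates that decode step (the free function name is illustrative):

#include <cstdint>
#include <cstring>

// Layout in `blob`: [min (float)][max (float)][q (uint8) x memory_size];
// each byte maps back to v = q * (max - min) / 255 + min.
void dequantize_uint8(const char* blob, float* out, int memory_size) {
  float min_value;
  float max_value;
  std::memcpy(&min_value, blob, sizeof(float));
  std::memcpy(&max_value, blob + sizeof(float), sizeof(float));
  const uint8_t* q =
      reinterpret_cast<const uint8_t*>(blob + 2 * sizeof(float));
  const float factor = (max_value - min_value) / 255.0f;
  for (int k = 0; k < memory_size; ++k) {
    out[k] = q[k] * factor + min_value;
  }
}

This matches the program_.quantification path, after which the read cursor advances by 2 * sizeof(float) + memory_size bytes.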
FILE *df = fopen("net.dot", "w"); fprintf(df, "digraph {\n"); @@ -448,16 +363,19 @@ std::shared_ptr Executor::Predict( fprintf(df, "}\n"); fclose(df); #endif - FILE *pf = fopen("profile.out", "w"); + + // FILE *pf = fopen("profile.out", "w"); std::unordered_map _tp; for (int i = 0; i < profile.size(); i++) { const auto &pInfo = profile[i]; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; _tp[ops[i]->Type()] += timeCost; - fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, ops[i]->Type().c_str(), - pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost); + // fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, + // ops[i]->Type().c_str(), + // pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost); } - fclose(pf); + // fclose(pf); + printf("====================[ profile ]======================\n"); using prof_t = std::pair; std::vector _tv(_tp.begin(), _tp.end()); @@ -471,8 +389,9 @@ std::shared_ptr Executor::Predict( std::sort(_tv.begin(), _tv.end(), compf); _tv.push_back(std::make_pair("total", _ptotal)); for (auto const &p : _tv) { - printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), (float)p.second, - (float)p.second / _ptotal * 100.0); + printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), + static_cast(p.second), + static_cast(p.second) / _ptotal * 100.0); } printf("====================[---------]======================\n"); #endif @@ -500,7 +419,7 @@ std::vector::Ptype> Executor::Predict( } template class Executor; -template class Executor; template class Executor; +template class Executor; } // namespace paddle_mobile diff --git a/src/io/io.h b/src/io/executor.h similarity index 70% rename from src/io/io.h rename to src/io/executor.h index ff520dd628406eae47f76196dbe66a0992dfe735..f8f2a8ad5657fdb3cf6cb249e32537bd5e866913 100644 --- a/src/io/io.h +++ b/src/io/executor.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "common/types.h" #include "framework/lod_tensor.h" #include "framework/operator.h" @@ -27,36 +28,11 @@ limitations under the License. */ #include #include #include -#include "common/depCore.h" +#include "common/dep_core.h" #endif namespace paddle_mobile { -template -class Loader { - public: - /* - * @b load separate format fluid model - * @b 加载分开形式的 fluid 模型 - * */ - const framework::Program Load(const std::string &dirname, - bool optimize = false, - bool can_add_split = false); - - /* - * @b load combine format fluid mode - * @b 加载结合在一起格式的模型 - * */ - const framework::Program Load(const std::string &model_path, - const std::string ¶_path, - bool optimize = false); - - private: - const framework::Program LoadProgram(const std::string &model_path, - bool optimize = false, - bool can_add_split = false); -}; - template class Executor { public: @@ -86,7 +62,7 @@ class Executor { Executor() = default; void InitMemory(); void LoadMemory(const framework::VarDesc var_desc, - framework::LoDTensor *tensor, char *&data); + framework::LoDTensor *tensor, char **data); void InitCombineMemory(); framework::Program program_; int batch_size_ = 1; diff --git a/src/io/loader.cpp b/src/io/loader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cdcecf02ab8af22dec0e32113052ac26e9c5fcfc --- /dev/null +++ b/src/io/loader.cpp @@ -0,0 +1,137 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
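On the LoadMemory signature change visible above (char *&data becoming char **data): passing the cursor as char** makes it obvious at every call site, e.g. LoadMemory(*var_desc, tensor, &data), that the callee advances the caller's pointer. The same pattern in isolation (a hypothetical reader, not code from the patch):

#include <cstdint>
#include <cstring>

// Reads one u32 from the buffer and advances the caller's cursor.
static uint32_t read_u32(char** cursor) {
  uint32_t value;
  std::memcpy(&value, *cursor, sizeof(uint32_t));
  (*cursor) += sizeof(uint32_t);
  return value;
}

// char *data = origin_data;
// uint32_t version = read_u32(&data);  // data now points past the field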
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "io/loader.h" + +#include "framework/lod_tensor.h" +#include "framework/program/program-optimize/program_optimize.h" + +namespace paddle_mobile { +using framework::Variable; + +static size_t ReadBuffer(const char *file_name, uint8_t **out) { + FILE *fp; + fp = fopen(file_name, "rb"); + PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name); + + fseek(fp, 0, SEEK_END); + size_t size = ftell(fp); + rewind(fp); + + DLOG << "model size: " << size; + + *out = reinterpret_cast(malloc(size)); + + size_t cur_len = 0; + size_t nread; + while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) { + cur_len += nread; + } + fclose(fp); + return cur_len; +} + +template +const framework::Program Loader::Load( + const std::string &dirname, bool optimize, bool quantification, + bool can_add_split) { + auto program = this->LoadProgram(dirname + "/__model__", optimize, + quantification, can_add_split); + program.model_path = dirname; + return program; +} + +template +const framework::Program Loader::Load( + const std::string &model_path, const std::string ¶_path, bool optimize, + bool quantification) { + auto program = this->LoadProgram(model_path, optimize, quantification); + + program.para_path = para_path; + program.combined = true; + program.quantification = quantification; + return program; +} + +template +const framework::Program Loader::LoadProgram( + const std::string &model_path, bool optimize, bool quantification, + bool can_add_split) { + std::string model_filename = model_path; + PaddleMobile__Framework__Proto__ProgramDesc *c_program; + uint8_t *buf = NULL; + size_t read_size = ReadBuffer(model_filename.c_str(), &buf); + + PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null"); + + c_program = paddle_mobile__framework__proto__program_desc__unpack( + NULL, read_size, buf); + // + PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null"); + // + DLOG << "n_ops: " << (*c_program->blocks)->n_ops; + // + auto originProgramDesc = std::make_shared(c_program); + + framework::Program program; + program.originProgram = originProgramDesc; + program.quantification = quantification; + + auto scope = std::make_shared(); + program.scope = scope; + + for (const auto &block : originProgramDesc->Blocks()) { + for (auto var_desc : block->Vars()) { + auto var = scope->Var(var_desc->Name()); + + if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + if (var_desc->Persistable() && + var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH && + var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) { + auto dim = var_desc->Tensor_desc().Dims(); + auto tensor = var->GetMutable(); + tensor->Resize(framework::make_ddim(dim)); + } else { + auto dim = var_desc->Tensor_desc().Dims(); + PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); + dim[0] = 1; + auto tensor = var->GetMutable(); + tensor->Resize(framework::make_ddim(dim)); + } + } else { + // TODO(codeWorm): some. 
+ } + } + } + + if (optimize) { + framework::ProgramOptimize program_optimize; + program.optimizeProgram = + program_optimize.FusionOptimize(originProgramDesc, can_add_split); + } + if (optimize) { + program.optimizeProgram->Description("optimize: "); + } else { + originProgramDesc->Description("program: "); + } + + paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); + return program; +} + +template class Loader; +template class Loader; +template class Loader; + +} // namespace paddle_mobile diff --git a/src/io/loader.h b/src/io/loader.h new file mode 100644 index 0000000000000000000000000000000000000000..512cee831f0a09f8223c07c531eb9d1c74e75d92 --- /dev/null +++ b/src/io/loader.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "common/types.h" +#include "framework/program/program.h" + +namespace paddle_mobile { + +template +class Loader { + public: + /* + * @b load separate format fluid model + * @b 加载分开形式的 fluid 模型 + * */ + const framework::Program Load(const std::string &dirname, + bool optimize = false, + bool quantification = false, + bool can_add_split = false); + + /* + * @b load combine format fluid mode + * @b 加载结合在一起格式的模型 + * */ + const framework::Program Load(const std::string &model_path, + const std::string ¶_path, + bool optimize = false, + bool quantification = false); + + private: + const framework::Program LoadProgram(const std::string &model_path, + bool optimize = false, + bool quantification = false, + bool can_add_split = false); +}; + +} // namespace paddle_mobile diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h new file mode 100644 index 0000000000000000000000000000000000000000..97564f4132d2e43cf736c2eb4a95d437584be24f --- /dev/null +++ b/src/io/paddle_inference_api.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains the definition of a simple Inference API for Paddle. + * + * ATTENTION: It requires some C++ features, for lower version C++ or C, we + * might release another API. + */ + +#pragma once + +#include +#include +#include +#include + +namespace paddle_mobile { + +enum PaddleDType { + FLOAT32, + INT64, +}; + +class PaddleBuf { + public: + PaddleBuf() = default; + PaddleBuf(PaddleBuf&& other); + // Copy only available when memory is managed externally. 
+ explicit PaddleBuf(const PaddleBuf&); + PaddleBuf& operator=(const PaddleBuf&); + // Do not own the memory. + PaddleBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_{false} {} + // Own memory. + PaddleBuf(size_t length) + : data_(new char[length]), length_(length), memory_owned_(true) {} + // Resize to `length` bytes. + void Resize(size_t length); + // Reset to external memory. + void Reset(void* data, size_t length); + bool empty() const { return length_ == 0; } + void* data() const { return data_; } + size_t length() const { return length_; } + + ~PaddleBuf() { Free(); } + + private: + void Free(); + void* data_{nullptr}; // pointer to the data memory. + size_t length_{0}; // number of memory bytes. + bool memory_owned_{true}; +}; + +struct PaddleTensor { + PaddleTensor() = default; + std::string name; // variable name. + std::vector shape; + // TODO(Superjomn) for LoD support, add a vector> field if needed. + PaddleBuf data; // blob of data. + PaddleDType dtype; +}; + +enum class PaddleEngineKind { + kPaddleMobile, + // TODO(Superjomn) support following engines latter. + // kTensorRT, // Use TensorRT for inference. + // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. + // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. +}; + +/* + * A simple Inference API for Paddle. Currently this API can be used by + * non-sequence scenerios. + */ +class PaddlePredictor { + public: + struct Config; + PaddlePredictor() = default; + PaddlePredictor(const PaddlePredictor&) = delete; + PaddlePredictor& operator=(const PaddlePredictor&) = delete; + + // Predict an record. + // The caller should be responsible for allocating and releasing the memory of + // `inputs`. `inputs` should be available until Run returns. Caller should be + // responsible for the output tensor's buffer, either allocated or passed from + // outside. + virtual bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) = 0; + + // Destroy the Predictor. + virtual ~PaddlePredictor() = default; + + // The common configs for all the predictors. + struct Config { + std::string model_dir; // path to the model directory. + }; +}; + +struct PaddleMobileConfig : public PaddlePredictor::Config { + enum Precision { FP32 = 0 }; + enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; + + enum Precision precision; + enum Device device; + + int batch_size = 1; + bool optimize = true; + bool quantification = false; + int thread_num = 1; + std::string prog_file; + std::string param_file; +}; + +// A factory to help create different predictors. +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +} // namespace paddle_mobile diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5e2e209d64aa7a00b56a5bdbbff88cb3097b7b94 --- /dev/null +++ b/src/io/paddle_mobile.cpp @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
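The ownership rules implemented by PaddleBuf above, shown as a usage sketch (the demo function is illustrative):

#include "io/paddle_inference_api.h"

void buffer_ownership_demo() {
  using paddle_mobile::PaddleBuf;

  PaddleBuf owned(16);  // owns 16 bytes, released by the destructor

  static float external[4] = {0.f, 1.f, 2.f, 3.f};
  PaddleBuf view(external, sizeof(external));  // points at memory it never frees

  // Resize() frees only owned memory before allocating a fresh owned block;
  // `view` simply abandons the external pointer.
  view.Resize(32);

  // Reset() frees owned memory first, then adopts the external pointer
  // without taking ownership.
  owned.Reset(external, sizeof(external));
}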
*/ + +#include "io/paddle_mobile.h" + +namespace paddle_mobile { + +template +void PaddleMobile::SetThreadNum(int num) { +#ifdef _OPENMP + // omp_set_dynamic(0); + omp_set_num_threads(num); +#endif +}; + +template +bool PaddleMobile::Load(const std::string &dirname, bool optimize, + bool quantification, int batch_size) { + if (loader_.get() == nullptr) { + loader_ = std::make_shared>(); + } else { + LOG(kLOG_INFO) << "loader inited"; + } + + if (executor_.get() == nullptr) { + executor_ = std::make_shared>( + loader_->Load(dirname, optimize, quantification), batch_size, optimize); + } else { + LOG(kLOG_INFO) << "executor inited"; + } + + return true; +} + +template +bool PaddleMobile::Load(const std::string &model_path, + const std::string ¶_path, bool optimize, + bool quantification, int batch_size) { + if (loader_.get() == nullptr) { + loader_ = std::make_shared>(); + } else { + LOG(kLOG_INFO) << "loader inited"; + } + + if (executor_.get() == nullptr) { + executor_ = std::make_shared>( + loader_->Load(model_path, para_path, optimize, quantification), + batch_size, optimize); + } else { + LOG(kLOG_INFO) << "executor inited"; + } + + return true; +} + +template +std::shared_ptr PaddleMobile::Predict( + const framework::Tensor &t) { + return executor_->Predict(t); +} + +template +std::vector::Ptype> +PaddleMobile::Predict(const std::vector &input, + const std::vector &dims) { + return executor_->Predict(input, dims); +} + +template +void PaddleMobile::Clear() { + executor_ = nullptr; + loader_ = nullptr; +} + +template +PaddleMobile::~PaddleMobile() { + executor_ = nullptr; + loader_ = nullptr; +} + +template class PaddleMobile; + +template class PaddleMobile; + +template class PaddleMobile; + +} // namespace paddle_mobile diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h new file mode 100644 index 0000000000000000000000000000000000000000..5dc3ccb21dd7e67fbe9b5032d01046b12728dc64 --- /dev/null +++ b/src/io/paddle_mobile.h @@ -0,0 +1,78 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#ifdef _OPENMP +#include +#endif // _OPENMP + +#include "common/types.h" +#include "framework/tensor.h" +#include "io/executor.h" +#include "io/loader.h" + +namespace paddle_mobile { + +template +class PaddleMobile { + typedef typename PrecisionTrait
<P>
::ptype Ptype; + + public: + PaddleMobile() {} + /* + * @b load separate format fluid model + * @b 加载分开形式的 fluid 模型 + * */ + bool Load(const std::string &dirname, bool optimize = false, + bool quantification = false, int batch_size = 1); + + /* + * @b load combine format fluid mode + * @b 加载结合在一起格式的模型 + * */ + bool Load(const std::string &model_path, const std::string ¶_path, + bool optimize = false, bool quantification = false, + int batch_size = 1); + /* + * @b 设置线程数, 当 cmake 中开启 openmp 时生效 + * */ + void SetThreadNum(int num); + + /* + * @b to predict + * */ + std::shared_ptr Predict(const framework::Tensor &t); + + /* + * @b to predict with vector and dim + * + * @b 使用 输入 和 输入的维度信息 进行预测 + * */ + std::vector Predict(const std::vector &input, + const std::vector &dims); + + void Clear(); + + ~PaddleMobile(); + + private: + std::shared_ptr> loader_; + std::shared_ptr> executor_; +}; + +} // namespace paddle_mobile diff --git a/src/ios_io/PaddleMobile.h b/src/ios_io/PaddleMobile.h new file mode 100644 index 0000000000000000000000000000000000000000..5854c5c3a4d4c899feb88822b2f7993860d1ed76 --- /dev/null +++ b/src/ios_io/PaddleMobile.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#import +#import + +@interface PaddleMobile : NSObject + +/* + 创建对象 +*/ +- (instancetype)init; + +/* + load 模型, 开辟内存 +*/ +- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath; + +/* + 加载散开形式的模型, 需传入模型的目录 +*/ +- (BOOL)load:(NSString *)modelAndWeightPath; + +/* + 进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict +*/ +- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale; + +/* + 进行预测 +*/ +- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim; + +/* + 清理内存 +*/ +- (void)clear; + +@end diff --git a/src/ios_io/PaddleMobile.mm b/src/ios_io/PaddleMobile.mm new file mode 100644 index 0000000000000000000000000000000000000000..e3ed909394a1057302fb0f747b582b944c89cc65 --- /dev/null +++ b/src/ios_io/PaddleMobile.mm @@ -0,0 +1,179 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
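End to end, the new facade replaces the removed direct Loader/Executor wiring. A hedged sketch of typical use (paths and shapes illustrative; CPU comes from common/types.h, the precision template parameter is left at its default, and the dims vector is assumed to be int64_t to match framework::make_ddim):

#include <vector>
#include "io/paddle_mobile.h"

int run_mobilenet() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(4);  // effective only in OpenMP builds

  // Combined-format model: one model file plus one parameter file.
  if (!paddle_mobile.Load("./model", "./params", /*optimize=*/true,
                          /*quantification=*/false, /*batch_size=*/1)) {
    return -1;
  }

  std::vector<float> input(1 * 3 * 224 * 224, 0.f);
  std::vector<int64_t> dims{1, 3, 224, 224};
  std::vector<float> output = paddle_mobile.Predict(input, dims);
  return static_cast<int>(output.size());
}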
*/ + +#import "PaddleMobile.h" +#import "op_symbols.h" +#import "io/paddle_mobile.h" + +#import +#import + +@interface PaddleMobile() +{ + paddle_mobile::PaddleMobile *pam_; + BOOL loaded_; +} +@end + +@implementation PaddleMobile + +static std::mutex shared_mutex; + +- (instancetype)init { + if (self = [super init]) { + pam_ = new paddle_mobile::PaddleMobile(); + } + return self; +} + +- (void)dealloc { + if (pam_) { + delete pam_; + } +} + ++ (instancetype)sharedInstance{ + static dispatch_once_t onceToken; + static id sharedManager = nil; + dispatch_once(&onceToken, ^{ + sharedManager = [[[self class] alloc] init]; + }); + return sharedManager; +} + +- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{ + std::string model_path_str = std::string([modelPath UTF8String]); + std::string weights_path_str = std::string([weighsPath UTF8String]); + if (loaded_ = pam_->Load(model_path_str, weights_path_str, false)) { + return YES; + } else { + return NO; + } +} + +- (BOOL)load:(NSString *)modelAndWeightPath{ + std::string model_path_str = std::string([modelAndWeightPath UTF8String]); + if (loaded_ = pam_->Load(model_path_str)) { + return YES; + } else { + return NO; + } +} + +-(void)preprocess:(const UInt8 *)input output:(float *)output imageWidth:(int)imageWidth imageHeight:(int)imageHeight imageChannels:(int)imageChannels means:(NSArray *)means scale:(float)scale dim:(std::vector)dim{ + if (means == nil) { + means = @[@0, @0, @0]; + } + + int wanted_input_width = dim[3]; + int wanted_input_height = dim[2]; + int wanted_input_channels = dim[1]; + + for (int c = 0; c < wanted_input_channels; ++c) { + float *out_channel = output + c * wanted_input_height * wanted_input_width; + for (int y = 0; y < wanted_input_height; ++y) { + float *out_row = out_channel + y * wanted_input_width; + for (int x = 0; x < wanted_input_width; ++x) { + int in_row = (y * imageHeight) / wanted_input_height; + int in_col = (x * imageWidth) / wanted_input_width; + const UInt8 *in_pixel = input + (in_row * imageWidth * imageChannels) + (in_col * imageChannels); + float *out_pos = out_row + x; + if (c == 0) { + *out_pos = (in_pixel[c] - means[c].floatValue) * scale; + }else if (c == 1){ + *out_pos = (in_pixel[c] - means[c].floatValue) * scale; + }else if (c == 2){ + *out_pos = (in_pixel[c] - means[c].floatValue) * scale; + } + } + } + } +} + +- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale{ + std::lock_guard lock(shared_mutex); + if (!loaded_) { + printf("PaddleMobile doesn't be loaded yet"); + return nil; + } + + if (dim.count != 4) { + printf("dim must have 4 elements"); + return nil; + } + + // dim to c++ vector, get numel + std::vector dim_vec; + int numel = 1; + for (int k = 0; k < dim.count; ++k) { + int d = dim[k].intValue; + numel *= d; + dim_vec.push_back(d); + } + + const int sourceRowBytes = CGImageGetBytesPerRow(image); + const int image_width = CGImageGetWidth(image); + const int image_height = CGImageGetHeight(image); + const int image_channels = 4; + CGDataProviderRef provider = CGImageGetDataProvider(image); + CFDataRef cfData = CGDataProviderCopyData(provider); + const UInt8 *input = CFDataGetBytePtr(cfData); + + // sample image + float *output = (float *)malloc(numel*sizeof(float)); + [self preprocess:input output:output imageWidth:image_width imageHeight:image_height imageChannels:image_channels means:means scale:scale dim:dim_vec]; + float *dataPointer = nullptr; + if (nullptr != output) { + dataPointer = output; + } else { + return 
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale {
+  std::lock_guard<std::mutex> lock(shared_mutex);
+  if (!loaded_) {
+    printf("PaddleMobile is not loaded yet\n");
+    return nil;
+  }
+
+  if (dim.count != 4) {
+    printf("dim must have 4 elements\n");
+    return nil;
+  }
+
+  // dim to c++ vector, get numel
+  std::vector<int64_t> dim_vec;
+  int numel = 1;
+  for (int k = 0; k < dim.count; ++k) {
+    int d = dim[k].intValue;
+    numel *= d;
+    dim_vec.push_back(d);
+  }
+
+  const int sourceRowBytes = CGImageGetBytesPerRow(image);
+  const int image_width = CGImageGetWidth(image);
+  const int image_height = CGImageGetHeight(image);
+  const int image_channels = 4;
+  CGDataProviderRef provider = CGImageGetDataProvider(image);
+  CFDataRef cfData = CGDataProviderCopyData(provider);
+  const UInt8 *input = CFDataGetBytePtr(cfData);
+
+  // sample image
+  float *output = (float *)malloc(numel * sizeof(float));
+  if (nullptr == output) {
+    CFRelease(cfData);  // don't leak the copied pixel data on failure
+    return nil;
+  }
+  [self preprocess:input output:output imageWidth:image_width imageHeight:image_height imageChannels:image_channels means:means scale:scale dim:dim_vec];
+
+  // input
+  std::vector<float> predict_input;
+  for (int j = 0; j < numel; ++j) {
+    predict_input.push_back(output[j]);
+  }
+
+  // predict
+  std::vector<float> cpp_result = pam_->Predict(predict_input, dim_vec);
+
+  // result
+  long count = cpp_result.size();
+  NSMutableArray *result = [[NSMutableArray alloc] init];
+  for (int i = 0; i < count; i++) {
+    [result addObject:[NSNumber numberWithFloat:cpp_result[i]]];
+  }
+
+  free(output);
+
+  // To be verified: this release may only be needed on iOS < 11
+  // if ([UIDevice currentDevice].systemVersion.doubleValue < 11.0) {
+  CFRelease(cfData);
+  cfData = NULL;
+  // }
+
+  return result;
+}
+
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim {
+  return [self predict:image dim:dim means:nil scale:1];
+}
+
+- (void)clear {
+  pam_->Clear();
+}
+
+@end
diff --git a/src/ios_io/op_symbols.h b/src/ios_io/op_symbols.h
new file mode 100644
index 0000000000000000000000000000000000000000..b2825b90e67c4e20030509358f468c9c0190f727
--- /dev/null
+++ b/src/ios_io/op_symbols.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include "operators/batchnorm_op.h"
+#include "operators/box_coder_op.h"
+#include "operators/concat_op.h"
+#include "operators/conv_op.h"
+#include "operators/depthwise_conv_op.h"
+#include "operators/dropout_op.h"
+#include "operators/elementwise_add_op.h"
+#include "operators/feed_op.h"
+#include "operators/fetch_op.h"
+#include "operators/fusion_conv_add.h"
+#include "operators/fusion_conv_add_bn_relu_op.h"
+#include "operators/fusion_fc_op.h"
+#include "operators/im2sequence_op.h"
+#include "operators/lrn_op.h"
+#include "operators/mul_op.h"
+#include "operators/multiclass_nms_op.h"
+#include "operators/pool_op.h"
+#include "operators/prior_box_op.h"
+#include "operators/relu_op.h"
+#include "operators/reshape_op.h"
+#include "operators/sigmoid_op.h"
+#include "operators/softmax_op.h"
+#include "operators/transpose_op.h"
diff --git a/src/jni/paddle_mobile_jni.cpp b/src/jni/paddle_mobile_jni.cpp
index f663b78fd490f2c9f0af525c7dabd2cc513c3a53..c8ed491672920d85adafa28316663ede64a6dcc9 100644
--- a/src/jni/paddle_mobile_jni.cpp
+++ b/src/jni/paddle_mobile_jni.cpp
@@ -15,6 +15,11 @@ limitations under the License.
*/ #ifdef ANDROID #include "paddle_mobile_jni.h" +#include +#include "common/log.h" +#include "framework/tensor.h" +#include "io/paddle_mobile.h" + #ifdef __cplusplus extern "C" { #endif @@ -28,17 +33,16 @@ using std::string; extern const char *ANDROID_LOG_TAG = "paddle_mobile LOG built on " __DATE__ " " __TIME__; -static Executor *shared_executor_instance = nullptr; +static PaddleMobile *shared_paddle_mobile_instance = nullptr; // toDo mutex lock // static std::mutex shared_mutex; -Executor *getExecutorInstance(const Program p, int batch_size, - bool use_optimize) { - if (nullptr == shared_executor_instance) { - shared_executor_instance = new Executor(p, batch_size, use_optimize); +PaddleMobile *getPaddleMobileInstance() { + if (nullptr == shared_paddle_mobile_instance) { + shared_paddle_mobile_instance = new PaddleMobile(); } - return shared_executor_instance; + return shared_paddle_mobile_instance; } string jstring2cppstring(JNIEnv *env, jstring jstr) { @@ -51,15 +55,51 @@ string jstring2cppstring(JNIEnv *env, jstring jstr) { JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, jclass thiz, jstring modelPath) { - paddle_mobile::Loader loader; + ANDROIDLOGI("load invoked"); + bool optimize = true; + return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + optimize); +} + +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified( + JNIEnv *env, jclass thiz, jstring modelPath) { + ANDROIDLOGI("loadQualified invoked"); + bool optimize = true; + bool qualified = true; + return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + optimize, qualified); +} + +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined( + JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) { + ANDROIDLOGI("loadCombined invoked"); + bool optimize = true; + return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + jstring2cppstring(env, paramPath), + optimize); +} + +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified( + JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) { + ANDROIDLOGI("loadCombinedQualified invoked"); bool optimize = true; - auto program = loader.Load(jstring2cppstring(env, modelPath), optimize); - shared_executor_instance = getExecutorInstance(program, 1, optimize); - return shared_executor_instance != nullptr ? 
JNI_TRUE : JNI_FALSE;
+  bool qualified = true;
+  return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
+                                         jstring2cppstring(env, paramPath),
+                                         optimize, qualified);
 }
 
 JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
-    JNIEnv *env, jclass thiz, jfloatArray buf) {
+    JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims) {
+  ANDROIDLOGI("predictImage invoked");
+  jsize ddim_size = env->GetArrayLength(ddims);
+  if (ddim_size != 4) {
+    ANDROIDLOGE("ddims size not equal to 4");
+    return NULL;  // bail out instead of reading past the array below
+  }
+  jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL);
+  framework::DDim ddim = framework::make_ddim(
+      {ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]});
+  int length = framework::product(ddim);
   jfloatArray result = NULL;
   int count = 0;
   float *dataPointer = nullptr;
@@ -67,21 +107,116 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
     dataPointer = env->GetFloatArrayElements(buf, NULL);
   }
   framework::Tensor input;
-  framework::DDim ddim = framework::make_ddim({1, 3, 224, 224});
   input.Resize(ddim);
   auto input_ptr = input.mutable_data<float>();
-  for (int i = 0; i < framework::product(ddim); i++) {
+  for (int i = 0; i < length; i++) {
     input_ptr[i] = dataPointer[i];
   }
-  auto output = shared_executor_instance->Predict(input);
+  auto output = shared_paddle_mobile_instance->Predict(input);
   count = output->numel();
   result = env->NewFloatArray(count);
   env->SetFloatArrayRegion(result, 0, count, output->data<float>());
+  env->ReleaseIntArrayElements(ddims, ddim_ptr, 0);
+  ANDROIDLOGI("predictImage finished");
   return result;
 }
 
+inline int yuv_to_rgb(int y, int u, int v, float *r, float *g, float *b) {
+  int r1 = (int)(y + 1.370705 * (v - 128));
+  int g1 = (int)(y - 0.698001 * (u - 128) - 0.703125 * (v - 128));
+  int b1 = (int)(y + 1.732446 * (u - 128));
+
+  r1 = (int)fminf(255, fmaxf(0, r1));
+  g1 = (int)fminf(255, fmaxf(0, g1));
+  b1 = (int)fminf(255, fmaxf(0, b1));
+  *r = r1;
+  *g = g1;
+  *b = b1;
+
+  return 0;
+}
+
+void convert_nv21_to_matrix(uint8_t *nv21, float *matrix, int width, int height,
+                            int targetWidth, int targetHeight, float *means) {
+  // guard against a null means array (predictYuv may pass none)
+  const float mean_r = means ? means[0] : 0.f;
+  const float mean_g = means ? means[1] : 0.f;
+  const float mean_b = means ? means[2] : 0.f;
+
+  const uint8_t *yData = nv21;
+  const uint8_t *vuData = nv21 + width * height;
+
+  const int yRowStride = width;
+  const int vuRowStride = width;
+
+  float scale_x = width * 1.0 / targetWidth;
+  float scale_y = height * 1.0 / targetHeight;
+
+  for (int j = 0; j < targetHeight; ++j) {
+    int y = j * scale_y;
+    const uint8_t *pY = yData + y * yRowStride;
+    const uint8_t *pVU = vuData + (y >> 1) * vuRowStride;
+    for (int i = 0; i < targetWidth; ++i) {
+      int x = i * scale_x;
+      const int offset = ((x >> 1) << 1);
+      float r = 0;
+      float g = 0;
+      float b = 0;
+      yuv_to_rgb(pY[x], pVU[offset + 1], pVU[offset], &r, &g, &b);
+      int r_index = j * targetWidth + i;
+      int g_index = r_index + targetWidth * targetHeight;
+      int b_index = g_index + targetWidth * targetHeight;
+      matrix[r_index] = r - mean_r;
+      matrix[g_index] = g - mean_g;
+      matrix[b_index] = b - mean_b;
+    }
+  }
+}
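// [Hedged aside, not part of the original commit] Sanity checks for the
// yuv_to_rgb() constants above, by direct arithmetic:
//   mid grey (y=128, u=128, v=128): all chroma terms vanish -> r = g = b = 128
//   red-ish  (y= 81, u= 90, v=240):
//     r = 81 + 1.370705 * (240 - 128)                        ~ 234
//     g = 81 - 0.698001 * (90 - 128) - 0.703125 * (240 - 128) ~ 28
//     b = 81 + 1.732446 * (90 - 128)                          ~ 15
// i.e. the usual BT.601-style YUV->RGB coefficients, with the fminf/fmaxf
// pair clamping each result into [0, 255].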
+JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
+    JNIEnv *env, jclass thiz, jbyteArray yuv_, jint imgwidth, jint imgHeight,
+    jintArray ddims, jfloatArray meanValues) {
+  ANDROIDLOGI("predictYuv invoked");
+  jsize ddim_size = env->GetArrayLength(ddims);
+  if (ddim_size != 4) {
+    ANDROIDLOGE("ddims size not equal to 4");
+    return NULL;
+  }
+  jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL);
+  framework::DDim ddim = framework::make_ddim(
+      {ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]});
+  int length = framework::product(ddim);
+  // heap storage instead of a variable-length array (non-standard in C++)
+  std::vector<float> matrix(length);
+  jbyte *yuv = env->GetByteArrayElements(yuv_, NULL);
+  float *meansPointer = nullptr;
+  if (nullptr != meanValues) {
+    meansPointer = env->GetFloatArrayElements(meanValues, NULL);
+  }
+  convert_nv21_to_matrix((uint8_t *)yuv, matrix.data(), imgwidth, imgHeight,
+                         ddim[3], ddim[2], meansPointer);
+  jfloatArray result = NULL;
+  int count = 0;
+  framework::Tensor input;
+  input.Resize(ddim);
+  auto input_ptr = input.mutable_data<float>();
+  for (int i = 0; i < length; i++) {
+    input_ptr[i] = matrix[i];
+  }
+  auto output = shared_paddle_mobile_instance->Predict(input);
+  count = output->numel();
+  result = env->NewFloatArray(count);
+  env->SetFloatArrayRegion(result, 0, count, output->data<float>());
+  env->ReleaseByteArrayElements(yuv_, yuv, 0);
+  env->ReleaseIntArrayElements(ddims, ddim_ptr, 0);
+  if (nullptr != meansPointer) {
+    env->ReleaseFloatArrayElements(meanValues, meansPointer, 0);
+  }
+  ANDROIDLOGI("predictYuv finished");
+  return result;
+}
+
+JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_setThread(JNIEnv *env,
+                                                           jclass thiz,
+                                                           jint threadCount) {
+  ANDROIDLOGI("setThreadCount %d", threadCount);
+  getPaddleMobileInstance()->SetThreadNum((int)threadCount);
+}
+
 JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env,
-                                                       jclass thiz) {}
+                                                       jclass thiz) {
+  getPaddleMobileInstance()->Clear();
+}
 }  // namespace jni
 }  // namespace paddle_mobile
diff --git a/src/jni/paddle_mobile_jni.h b/src/jni/paddle_mobile_jni.h
index a262d4070c37013977e869fa816d52d78fbfa485..4fd62a6d56c71dfc748cc967244bc830abb74a80 100644
--- a/src/jni/paddle_mobile_jni.h
+++ b/src/jni/paddle_mobile_jni.h
@@ -15,9 +15,6 @@ limitations under the License. */
 #pragma once
 #ifdef ANDROID
 #include <jni.h>
-#include "common/log.h"
-#include "framework/tensor.h"
-#include "io/io.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -25,18 +22,54 @@ extern "C" {
 namespace paddle_mobile {
 namespace jni {
 /**
- * load model & params of the net for android
+ * load separated model for android
  */
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
                                                           jclass thiz,
                                                           jstring modelPath);
+/**
+ * load separated qualified model for android
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
+    JNIEnv *env, jclass thiz, jstring modelPath);
+/**
+ * load combined model for android
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
+    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
+
+/**
+ * load combined qualified model for android
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
+    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
+
 /**
  * object detection for android
  */
 JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
-    JNIEnv *env, jclass thiz, jfloatArray buf);
+    JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims);
+
+/**
+ * object detection for android, taking NV21 (YUV) camera input
+ */
+JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
+    JNIEnv *env, jclass thiz, jbyteArray yuv, jint imgwidth, jint imgHeight,
+    jintArray ddims, jfloatArray meanValues);
+/**
+ * run prediction on a preprocessed float buffer for android
+ */
+JNIEXPORT jfloatArray JNICALL
+Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf);
+
+/**
+ * set the thread count for multi-threaded prediction
+ */
+JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_setThread(JNIEnv *env,
+                                                           jclass thiz,
+                                                           jint threadCount);
 /**
  * clear data of the net when destroy for android
  */
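[Hedged aside, not part of the original commit.] Both the iOS wrapper and the
JNI bridge above reduce to the same few calls on io/paddle_mobile.h. A minimal
host-side sketch; the template argument, file names, and the int64_t dim type
are assumptions inferred from the call sites in this diff:

#include <vector>
#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> pm;
  pm.SetThreadNum(4);                                // cf. PML.setThread
  if (!pm.Load("model", "params", true)) return 1;   // combined, optimized
  std::vector<int64_t> dims = {1, 3, 224, 224};      // NCHW
  std::vector<float> input(1 * 3 * 224 * 224, 0.f);  // preprocessed pixels
  std::vector<float> out = pm.Predict(input, dims);  // cf. predictImage
  pm.Clear();
  return out.empty() ? 1 : 0;
}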
diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp
index 62e929024d7232ba4bee6b9e95ee895c2badb95e..42b8c4551871c58955251d94845ca13576d7735b 100644
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -12,16 +12,36 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#pragma once
-
 #include "memory/t_malloc.h"
 #include <cstdlib>
 #include <cstring>
 
+#ifdef PADDLE_MOBILE_FPGA
+
+#include "fpga/api/fpga_api.h"
+
+#endif
+
 namespace paddle_mobile {
 namespace memory {
 const int MALLOC_ALIGN = 64;
 
+#ifdef PADDLE_MOBILE_FPGA
+namespace fpga = paddle_mobile::fpga;
+
+void Copy(void *dst, const void *src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+void *Alloc(size_t size) { return fpga::fpga_malloc(size); }
+
+void Free(void *ptr) {
+  if (ptr) {
+    fpga::fpga_free(ptr);
+  }
+}
+
+#else
 void Copy(void *dst, const void *src, size_t num) {
   std::memcpy(dst, src, num);
 }
@@ -44,5 +64,7 @@ void Free(void *ptr) {
   }
 }
 
+#endif
+
 }  // namespace memory
 }  // namespace paddle_mobile
diff --git a/src/operators/batchnorm_op.cpp b/src/operators/batchnorm_op.cpp
index 672e990be44c11df0795b9c6f301803f8ad02285..f820908404ea637d9680c32d5c4b5568e191dd7e 100644
--- a/src/operators/batchnorm_op.cpp
+++ b/src/operators/batchnorm_op.cpp
@@ -26,16 +26,16 @@ void BatchNormOp::InferShape() const {
   auto x_dims = this->param_.InputX()->dims();
   this->param_.OutputY()->Resize(x_dims);
 }
-template class BatchNormOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(batch_norm);
 REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/batchnorm_op.h b/src/operators/batchnorm_op.h
index 9ee0b2dcf6b6ec46fcb08cac88d3df275d33f7d6..2b2795b64fddfbcd1000088dbab18e54a017b459 100644
--- a/src/operators/batchnorm_op.h
+++ b/src/operators/batchnorm_op.h
@@ -45,4 +45,13 @@ class BatchNormOp
 }  // namespace operators
 }  // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(batch_norm);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(batch_norm);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
diff --git a/src/operators/box_coder_op.cpp b/src/operators/box_coder_op.cpp
index 31891ed74266d599898dd7426eed5cd28f320ab6..9e57c9021dac1b6857752989727c1c86051e33f7 100644
--- a/src/operators/box_coder_op.cpp
+++ b/src/operators/box_coder_op.cpp
@@ -47,13 +47,12 @@ void BoxCoderOp::InferShape() const {
   this->param_.OutputBox()->Resize(framework::make_ddim(
       {input_targetbox_dims[0], input_priorbox_dims[0], 4}));
 }
-template class BoxCoderOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(box_coder);
 REGISTER_OPERATOR_CPU(box_coder, ops::BoxCoderOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
diff --git a/src/operators/box_coder_op.h b/src/operators/box_coder_op.h
index 33ff2358bc8285a026c217ed11c2250769395567..5a75cacaf27f20e69b5e427421bd3dd8f43e8556 100644
--- a/src/operators/box_coder_op.h
+++ b/src/operators/box_coder_op.h
@@ -51,4 +51,12 @@ class BoxCoderOp
 }  // namespace operators
 }  // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(box_coder);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
diff --git a/src/operators/concat_op.cpp b/src/operators/concat_op.cpp
index
f5a9c3d81ef34ac9ff643dd174741e083c879cbc..f767f3481c999a16da46e75e314e8ebcb54193fa 100644 --- a/src/operators/concat_op.cpp +++ b/src/operators/concat_op.cpp @@ -14,7 +14,9 @@ limitations under the License. */ #ifdef CONCAT_OP -#include "concat_op.h" +#include + +#include "operators/concat_op.h" namespace paddle_mobile { namespace operators { @@ -56,19 +58,19 @@ void ConcatOp::InferShape() const { this->param_.Out()->Resize(out_dims); } -template class ConcatOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(concat); REGISTER_OPERATOR_CPU(concat, ops::ConcatOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp); #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp); #endif #endif diff --git a/src/operators/concat_op.h b/src/operators/concat_op.h index 93612c6b1b6d1f6aa992773ef5cccc0c93f1b6e8..bad0015917c2a9d4016def26c8a332d076b39c99 100644 --- a/src/operators/concat_op.h +++ b/src/operators/concat_op.h @@ -46,4 +46,14 @@ class ConcatOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(concat); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(concat); +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(concat); +#endif + #endif diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp index 01d284a06ed33142a8d16cdc32f304c3d1a75e28..c4601995219b32db75f22c7c2ed959e18af85f36 100644 --- a/src/operators/conv_op.cpp +++ b/src/operators/conv_op.cpp @@ -48,22 +48,17 @@ void ConvOp::InferShape() const { this->param_.Output()->Resize(ddim); } -template class ConvOp; - } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(conv2d); REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(conv2d); REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp); #endif #ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(conv2d); REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp); #endif diff --git a/src/operators/conv_op.h b/src/operators/conv_op.h index f8e8952d47fd726c712c0f7817606d959095b65b..d36fa47f4a0b37c467eb2101e2e930fe54a0e28b 100644 --- a/src/operators/conv_op.h +++ b/src/operators/conv_op.h @@ -46,4 +46,14 @@ class ConvOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(conv2d); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(conv2d); +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(conv2d); +#endif + #endif diff --git a/src/common/openmp-fix.cpp b/src/operators/conv_transpose_op.cpp similarity index 51% rename from src/common/openmp-fix.cpp rename to src/operators/conv_transpose_op.cpp index 8c31ef45c68227c612155e826e664367a7917501..1e1d9e9c519732607b27aac7873b6a8eec93510b 100644 --- a/src/common/openmp-fix.cpp +++ b/src/operators/conv_transpose_op.cpp @@ -12,16 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_MOBILE_USE_OPENMP -/** - * android-ndk-r17 has a problem when linking with openmp. - * if paddle-mobile enables -fopenmp, but didn't use those omp_* functions, - * after linking another binary with libpaddle-mobile.so, the omp_get_thread_num - * will not work. see test/common/test_openmp.cc the detailed reason is still - * unclear, but this trick will work. 
a better solution is hacking the linker,
- * try some flags to make it link omp_* functions, but I didn't find out how to
- * make it work.
- */
-#include <omp.h>
-static int _ = omp_get_num_procs();
+#ifdef CONV_TRANSPOSE
+
+#include "operators/conv_transpose_op.h"
+
+namespace paddle_mobile {
+namespace operators {}
+}  // namespace paddle_mobile
+
 #endif
diff --git a/src/operators/conv_transpose_op.h b/src/operators/conv_transpose_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8176913b0d9535f7bd677439f314137e9d59ad72
--- /dev/null
+++ b/src/operators/conv_transpose_op.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONV_TRANSPOSE
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "framework/operator.h"
+#include "operators/kernel/conv_transpose_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class ConvOpTranspose : public framework::OperatorWithKernel<
+                            DeviceType, ConvTransposeParam,
+                            operators::ConvTransposeKernel<DeviceType, T>> {
+ public:
+  ConvOpTranspose(const std::string &type, const VariableNameMap &inputs,
+                  const VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs,
+                  std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, ConvTransposeParam,
+            operators::ConvTransposeKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  void InferShape() const {
+    auto input = this->param_.Input();
+    auto in_dims = input->dims();
+
+    auto filter = this->param_.Filter();
+    auto filter_dims = filter->dims();
+
+    std::vector<int> strides = this->param_.Strides();
+    std::vector<int> paddings = this->param_.Paddings();
+    std::vector<int> dilations = this->param_.Dilations();
+
+    int groups = this->param_.Groups();
+
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == 4 || in_dims.size() == 5,
+        "ConvTransposeOp input should be 4-D or 5-D tensor.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() == filter_dims.size(),
+        "ConvTransposeOp input dimension and filter dimension "
+        "should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims.size() - strides.size() == 2U,
+        "ConvTransposeOp input dimension and strides dimension should "
+        "be consistent.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(),
+                          "ConvTransposeOp paddings dimension and strides "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(),
+                          "ConvTransposeOp paddings dimension and dilations "
+                          "dimension should be the same.");
+    PADDLE_MOBILE_ENFORCE(
+        in_dims[1] == filter_dims[0],
+        "In ConvTransposeOp, the number of input channels should "
+        "be equal to the number of filter's channels.");
+
+    std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
+    for (size_t i = 0; i < strides.size(); ++i) {
+      auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
+      output_shape.push_back((in_dims[i + 2] - 1) * strides[i] -
+                             2 * paddings[i] + filter_extent);
+    }
+    this->param_.Output()->Resize(framework::make_ddim(output_shape));
+  }
+
+ private:
+};
+
+} //
namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/depthwise_conv_op.cpp b/src/operators/depthwise_conv_op.cpp index 46f2db30ba2fbff5839d6a737dda12fa6cd10b43..8d6b6a143c37537be6de1e60cc095f1052136e26 100644 --- a/src/operators/depthwise_conv_op.cpp +++ b/src/operators/depthwise_conv_op.cpp @@ -49,14 +49,11 @@ void DepthwiseConvOp::InferShape() const { this->param_.Output()->Resize(ddim); } -template class DepthwiseConvOp; - } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(depthwise_conv2d); REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/depthwise_conv_op.h b/src/operators/depthwise_conv_op.h index 75bcf44cb8790365e7f33719c481354c1a57c80a..9d7cbcfa2f2924db040cdc5f38ca6bb7ad8074b5 100644 --- a/src/operators/depthwise_conv_op.h +++ b/src/operators/depthwise_conv_op.h @@ -48,4 +48,12 @@ class DepthwiseConvOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(depthwise_conv2d); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/dropout_op.cpp b/src/operators/dropout_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a913ff017bfe776a2c2dfea5696e4c0f23683c46 --- /dev/null +++ b/src/operators/dropout_op.cpp @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef DROPOUT_OP +#include "operators/dropout_op.h" +namespace paddle_mobile { +namespace operators { + +template +void DropoutOp::InferShape() const { + auto input_dims = this->param_.InputX()->dims(); + this->param_.Out()->Resize(input_dims); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(dropout, ops::DropoutOp); +#endif + +#endif diff --git a/src/operators/dropout_op.h b/src/operators/dropout_op.h new file mode 100644 index 0000000000000000000000000000000000000000..89d658dd8c1e11576a2cc0ef9cceae3fcdf26477 --- /dev/null +++ b/src/operators/dropout_op.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
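*/

// [Hedged aside, not part of the original commit] Worked check of the shape
// rule in ConvOpTranspose::InferShape above, which inverts the forward conv
// rule:
//   out = (in - 1) * stride - 2 * pad + (dilation * (kernel - 1) + 1)
// e.g. in = 7, kernel = 3, stride = 2, pad = 1, dilation = 1:
//   filter_extent = 1 * (3 - 1) + 1 = 3
//   out = (7 - 1) * 2 - 2 * 1 + 3 = 13
// and the forward check (13 + 2 * 1 - 3) / 2 + 1 = 7 recovers the input size.

/* (aside ends; the Apache-2.0 header of dropout_op.h continues)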
*/ + +#ifdef DROPOUT_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/dropout_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using paddle_mobile::framework::Tensor; + +template +class DropoutOp + : public framework::OperatorWithKernel< + DeviceType, DropoutParam, operators::DropoutKernel> { + public: + DropoutOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const framework::AttributeMap attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} + + // using framework::OperatorWithKernel>; + void InferShape() const override; + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(dropout); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(dropout); +#endif + +#endif diff --git a/src/operators/elementwise_add_op.cpp b/src/operators/elementwise_add_op.cpp index 966bc9c1e77a4ae6e33bc830c06ba7593c7ba3e0..49885f783417d61c6348fc4563e7306036994f17 100644 --- a/src/operators/elementwise_add_op.cpp +++ b/src/operators/elementwise_add_op.cpp @@ -24,16 +24,16 @@ void ElementwiseAddOp::InferShape() const { auto x_dim = this->param_.InputX()->dims(); this->param_.Out()->Resize(x_dim); } -template class ElementwiseAddOp; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(elementwise_add); REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/elementwise_add_op.h b/src/operators/elementwise_add_op.h index 6cb80d06d0a4d66935c77a3c23a6264d0be53ecc..761a5d35459558d1ca5673757fae13147b7f6a6f 100644 --- a/src/operators/elementwise_add_op.h +++ b/src/operators/elementwise_add_op.h @@ -48,4 +48,13 @@ class ElementwiseAddOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(elementwise_add); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(elementwise_add); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp index a40eac098c7bef442befa1758b21904269cc22d5..77acb5db31e66d78bccd8dbef51832bda1a1bb60 100644 --- a/src/operators/feed_op.cpp +++ b/src/operators/feed_op.cpp @@ -14,8 +14,16 @@ limitations under the License. 
*/ #include "feed_op.h" namespace paddle_mobile { -namespace operators { - -template class FeedOp; -} +namespace operators {} } // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(feed, ops::FeedOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(feed, ops::FeedOp); +#endif diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 9079dbb0b3d83b2b28a046ae3d78025a24fc4958..4766d56d9ae0b86cc28c476a17547acfd53ab02b 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -29,8 +29,7 @@ class FeedOp : public framework::OperatorBase { std::shared_ptr scope) : framework::OperatorBase(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, *scope) {} - void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } + param_(inputs, outputs, attrs, scope.get()) {} void InferShape() const { auto out_dims = param_.Out()->dims(); @@ -38,19 +37,42 @@ class FeedOp : public framework::OperatorBase { param_.Out()->Resize(out_dims); } +#ifdef PADDLE_MOBILE_FPGA + void RunImpl() const { fpga::PerformBypass(param_.FpgaArgs()); } + void Init() { + const Tensor *input = param_.InputX(); + auto input_ptr = input->data(); + Tensor *output = param_.Out(); + auto output_ptr = output->mutable_data(); + fpga::BypassArgs args; + args.convert_type = fpga::DATA_FP32_TO_FP16; + args.layout_type = fpga::LAYOUT_CHW_TO_HWC; + args.image.address = (void *)input_ptr; + args.image.channels = input->dims()[1]; + args.image.height = input->dims()[2]; + args.image.width = input->dims()[3]; + args.output.address = output_ptr; + param_.SetFpgaArgs(args); + } + +#else + void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } + void Init() {} +#endif + protected: FeedParam param_; }; -namespace ops = paddle_mobile::operators; +} // namespace operators +} // namespace paddle_mobile + #ifdef PADDLE_MOBILE_CPU USE_OP_CPU(feed); -REGISTER_OPERATOR_CPU(feed, ops::FeedOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(feed); #endif #ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(feed); #endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/src/operators/fetch_op.cpp b/src/operators/fetch_op.cpp index 45d6afc07b597156a746b7cd6657c3b58f1b9950..30cddceaa45da91be5ea91d70f78503c404552c3 100644 --- a/src/operators/fetch_op.cpp +++ b/src/operators/fetch_op.cpp @@ -14,8 +14,16 @@ limitations under the License. 
*/ #include "fetch_op.h" namespace paddle_mobile { -namespace operators { - -template class FetchOp; -} +namespace operators {} } // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fetch, ops::FetchOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp); +#endif diff --git a/src/operators/fetch_op.h b/src/operators/fetch_op.h index a65338f7f4262de1b74d7a18525f6c3b9551243a..417637c80086b099395e93227991491309f656fe 100644 --- a/src/operators/fetch_op.h +++ b/src/operators/fetch_op.h @@ -33,6 +33,8 @@ class FetchOp : public framework::OperatorBase { param_(inputs, outputs, attrs, *scope) {} void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } + void Init() {} + void InferShape() const { auto x_dims = param_.InputX()->dims(); param_.Out()->Resize(x_dims); @@ -42,15 +44,15 @@ class FetchOp : public framework::OperatorBase { FetchParam param_; }; -namespace ops = paddle_mobile::operators; +} // namespace operators +} // namespace paddle_mobile + #ifdef PADDLE_MOBILE_CPU USE_OP_CPU(fetch); -REGISTER_OPERATOR_CPU(fetch, ops::FetchOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(fetch); #endif #ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fetch); #endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/src/operators/fusion_conv_add.cpp b/src/operators/fusion_conv_add.cpp index 2605414c892f89787701334f428621d9d8c2520f..cdd6a6db2bb11ebf8dce2aca85630aa8805adf3e 100644 --- a/src/operators/fusion_conv_add.cpp +++ b/src/operators/fusion_conv_add.cpp @@ -44,16 +44,16 @@ void FusionConvAddOp::InferShape() const { framework::DDim ddim = framework::make_ddim(output_shape); this->param_.Output()->Resize(ddim); } -template class FusionConvAddOp; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(conv_add); -REGISTER_OPERATOR_CPU(conv_add, ops::FusionConvAddOp); +REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/fusion_conv_add.h b/src/operators/fusion_conv_add.h index f0a3ea17d9a86e2c8638c164cfa2bf21d4fb727d..170df9ce33e4ab90297664fbc81d723e7c246f83 100644 --- a/src/operators/fusion_conv_add.h +++ b/src/operators/fusion_conv_add.h @@ -11,9 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#define FUSION_CONVADD_OP -#ifdef FUSION_CONVADD_OP +#ifdef FUSION_CONVADD_OP #pragma once #include @@ -37,13 +36,11 @@ class FusionConvAddMatcher : public framework::FusionOpMatcher { void FolderNodes( framework::Node *node, std::vector> *removed_nodes) { - vector> origin_descs = - node->OpDescs(node_.Depth()); node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes); + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes); } - std::string Type() { return G_OP_TYPE_CONV_ADD; } + std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD; } }; template @@ -68,15 +65,39 @@ class FusionConvAddOp : public framework::OperatorWithKernel< }; #ifdef PADDLE_MOBILE_CPU + +#ifndef CONV_ADD_REGISTER static framework::FusionOpRegistrar convadd_registrar( new FusionConvAddMatcher()); +#define CONV_ADD_REGISTER +#endif + #endif + #ifdef PADDLE_MOBILE_MALI_GPU + +#ifndef CONV_ADD_REGISTER +static framework::FusionOpRegistrar convadd_registrar( + new FusionConvAddMatcher()); +#define CONV_ADD_REGISTER + #endif + +#endif + #ifdef PADDLE_MOBILE_FPGA #endif } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(fusion_conv_add); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/fusion_conv_add_bn_op.cpp b/src/operators/fusion_conv_add_bn_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5b61bf5d390cc2904a3f40f5400a5a3eec9a2dd5 --- /dev/null +++ b/src/operators/fusion_conv_add_bn_op.cpp @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
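*/

// [Hedged aside, not part of the original commit] CONV_ADD_REGISTER above is
// a one-definition guard: several device sections may each want the same
// static FusionOpRegistrar, but only the first block the preprocessor reaches
// may emit it. The pattern in isolation:
//
//   #ifdef PADDLE_MOBILE_CPU
//   #ifndef CONV_ADD_REGISTER
//   static framework::FusionOpRegistrar convadd_registrar(
//       new FusionConvAddMatcher());
//   #define CONV_ADD_REGISTER
//   #endif
//   #endif
//
// Later device blocks repeat the same #ifndef, so exactly one registrar
// object (one registered fusion pass) survives per binary.

/* (aside ends; the Apache-2.0 header of fusion_conv_add_bn_op.cpp continues)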
*/ + +#ifdef FUSION_CONVADDBN_OP + +#include "operators/fusion_conv_add_bn_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template +void FusionConvAddBNOp::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector &strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_conv_add_bn, ops::FusionConvAddBNOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_add_bn, ops::FusionConvAddBNOp); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_bn_op.h b/src/operators/fusion_conv_add_bn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7a7f6b2bababd3f5d36d7b6faf60069567d45423 --- /dev/null +++ b/src/operators/fusion_conv_add_bn_op.h @@ -0,0 +1,115 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
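*/

// [Hedged aside, not part of the original commit; the helper's behavior is
// inferred from its call sites in the InferShape methods above.]
// math::ConvOutputSize is the standard convolution shape rule:
//   out = (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1
// e.g. a 224x224 input, 3x3 filter, pad 1, stride 2, dilation 1:
//   dkernel = 1 * (3 - 1) + 1 = 3
//   out     = (224 + 2 * 1 - 3) / 2 + 1 = 112
// which is why InferShape pushes exactly one extent per stride entry on top
// of the leading {batch, out_channels}.

/* (aside ends; the Apache-2.0 header of fusion_conv_add_bn_op.h continues)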
*/ + +#ifdef FUSION_CONVADDBN_OP + +#pragma once + +#include +#include +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "op_param.h" +#include "operators/kernel/conv_add_bn_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionConvAddBNMatcher : public framework::FusionOpMatcher { + public: + FusionConvAddBNMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_BATCHNORM); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, + {G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_BN; } +}; + +template +class FusionConvAddBNOp : public framework::OperatorWithKernel< + DeviceType, FusionConvAddBNParam, + operators::ConvAddBNKernel> { + public: + FusionConvAddBNOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionConvAddBNParam, + operators::ConvAddBNKernel>(type, inputs, outputs, + attrs, scope) {} + + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +#ifndef FUSION_CONV_ADD_BN_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_registrar( + new FusionConvAddBNMatcher()); +#define FUSION_CONV_ADD_BN_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU + +#ifndef FUSION_CONV_ADD_BN_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_registrar( + new FusionConvAddBNMatcher()); +#define FUSION_CONV_ADD_BN_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_FPGA + +#ifndef FUSION_CONV_ADD_BN_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_registrar( + new FusionConvAddBNMatcher()); +#define FUSION_CONV_ADD_BN_REGISTER +#endif + +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_bn); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_add_bn); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_bn_relu_op.cpp b/src/operators/fusion_conv_add_bn_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..793634eec392fabe6c7399127ec9cb3e187697bc --- /dev/null +++ b/src/operators/fusion_conv_add_bn_relu_op.cpp @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
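*/

// [Hedged aside, not part of the original commit] The matcher constructors
// above build the pattern to fuse by chaining framework::Node successors with
// an overloaded operator>. A self-contained stand-in for the chaining
// mechanics (the real Node also carries op descs and the attribute maps
// handed to Folder()):
//
//   struct DemoNode {
//     explicit DemoNode(std::string t) : type(std::move(t)) {}
//     DemoNode &operator>(std::shared_ptr<DemoNode> next) {
//       child = next;
//       return *next;  // returning the successor lets conv > add > bn chain
//     }
//     int Depth() const { return child ? 1 + child->Depth() : 1; }
//     std::string type;
//     std::shared_ptr<DemoNode> child;
//   };
//
// With it, conv > add > bn gives Depth() == 3, the node count that Folder()
// collapses into the single fused operator.

/* (aside ends; the Apache-2.0 header of fusion_conv_add_bn_relu_op.cpp continues)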
*/ + +#ifdef FUSION_CONVADDBNRELU_OP + +#include "operators/fusion_conv_add_bn_relu_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template +void FusionConvAddBNReluOp::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector &strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_bn_relu_op.h b/src/operators/fusion_conv_add_bn_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..54e7e58f8af4111edd0b86c85bb1cffc87f5cd22 --- /dev/null +++ b/src/operators/fusion_conv_add_bn_relu_op.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVADDBNRELU_OP + +#pragma once + +#include +#include +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "op_param.h" +#include "operators/kernel/conv_add_bn_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionConvAddBNReluMatcher : public framework::FusionOpMatcher { + public: + FusionConvAddBNReluMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_BATCHNORM) > + std::make_shared(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, + {G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; } +}; + +template +class FusionConvAddBNReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionConvAddBNReluParam, + operators::ConvAddBNReluKernel> { + public: + FusionConvAddBNReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionConvAddBNReluParam, + operators::ConvAddBNReluKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionConvAddBNReluParam, + operators::ConvAddBNReluKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( + new FusionConvAddBNReluMatcher()); +#define FUSION_CONV_ADD_BN_RELU_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU + +#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( + new FusionConvAddBNReluMatcher()); +#define FUSION_CONV_ADD_BN_RELU_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_FPGA + +#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( + new FusionConvAddBNReluMatcher()); +#define FUSION_CONV_ADD_BN_RELU_REGISTER +#endif + +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_bn_relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_add_bn_relu); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp index 694e46af1f8dec3513c5a6d2ff26e3676e9204e4..99b770a6c5e3bc89024e467631e129b914f0bcec 100644 --- a/src/operators/fusion_conv_add_relu_op.cpp +++ b/src/operators/fusion_conv_add_relu_op.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef CONVADDRELU_OP +#ifdef FUSION_CONVADDRELU_OP #include "fusion_conv_add_relu_op.h" #include "operators/math/conv_func.h" @@ -49,12 +49,12 @@ void FusionConvAddReluOp::InferShape() const { namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_add_relu); REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp); #endif #endif diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h index b87f1c4110de6c525e4544d5a350b2beaf98af95..e8a9498819cae330abbd4a007a6510d89f167114 100644 --- a/src/operators/fusion_conv_add_relu_op.h +++ b/src/operators/fusion_conv_add_relu_op.h @@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef CONVADDRELU_OP +#ifdef FUSION_CONVADDRELU_OP #pragma once +#include +#include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" #include "operators/kernel/conv_add_relu_kernel.h" @@ -36,7 +38,7 @@ class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher { framework::Node *node, std::vector> *removed_nodes) { node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes); + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes); } std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; } }; @@ -64,15 +66,36 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< }; #ifdef PADDLE_MOBILE_CPU -// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new -// FusionConvAddReluOpMatcher()); + +#ifndef CONV_ADD_RELU_REGISTER +#define CONV_ADD_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_relu_registrar( + new FusionConvAddReluOpMatcher()); +#endif + #endif #ifdef PADDLE_MOBILE_MALI_GPU #endif #ifdef PADDLE_MOBILE_FPGA + +#ifndef CONV_ADD_RELU_REGISTER +#define CONV_ADD_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_relu_registrar( + new FusionConvAddReluOpMatcher()); +#endif + #endif } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_add_relu); +#endif + #endif diff --git a/src/operators/fusion_conv_bn_relu_op.cpp b/src/operators/fusion_conv_bn_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..49fe9c933a5a9695f2c18bd0921c2d36063dc065 --- /dev/null +++ b/src/operators/fusion_conv_bn_relu_op.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVBNRELU_OP + +#include "operators/fusion_conv_bn_relu_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template +void FusionConvBNReluOp::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector &strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/fusion_conv_bn_relu_op.h b/src/operators/fusion_conv_bn_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4c2c1033ac0a4d6c8e3bc3f188a66884dd9e0642 --- /dev/null +++ b/src/operators/fusion_conv_bn_relu_op.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVBNRELU_OP + +#pragma once + +#include +#include +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/conv_bn_relu_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionConvBNReluMatcher : public framework::FusionOpMatcher { + public: + FusionConvBNReluMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared(G_OP_TYPE_BATCHNORM) > + std::make_shared(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_CONV_BN_RELU; } +}; + +template +class FusionConvBNReluOp : public framework::OperatorWithKernel< + DeviceType, FusionConvBNReluParam, + operators::ConvBNReluKernel> { + public: + FusionConvBNReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionConvBNReluParam, + operators::ConvBNReluKernel>(type, inputs, outputs, + attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionConvBNReluParam, + operators::ConvBNReluKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +#ifndef FUSION_CONV_BN_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar( + new FusionConvBNReluMatcher()); +#define FUSION_CONV_BN_RELU_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU + +#endif + +#ifdef PADDLE_MOBILE_FPGA +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_bn_relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/fusion_dwconv_bn_relu_op.cpp b/src/operators/fusion_dwconv_bn_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e55295830e19b5b39a5ae2501e30170ffb1a7854 --- /dev/null +++ b/src/operators/fusion_dwconv_bn_relu_op.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DWCONVBNRELU_OP + +#include "operators/fusion_dwconv_bn_relu_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template +void FusionDWConvBNReluOp::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector &strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/fusion_dwconv_bn_relu_op.h b/src/operators/fusion_dwconv_bn_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..6f9f03e4936e082de802ced385060fecb9cc27a9 --- /dev/null +++ b/src/operators/fusion_dwconv_bn_relu_op.h @@ -0,0 +1,109 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/src/operators/fusion_dwconv_bn_relu_op.h b/src/operators/fusion_dwconv_bn_relu_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f9f03e4936e082de802ced385060fecb9cc27a9
--- /dev/null
+++ b/src/operators/fusion_dwconv_bn_relu_op.h
@@ -0,0 +1,109 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DWCONVBNRELU_OP
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "op_param.h"
+#include "operators/kernel/dwconv_bn_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+class FusionDWConvBNReluMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionDWConvBNReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_DEPTHWISE_CONV);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
+        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(),
+                 {{G_OP_TYPE_BATCHNORM,
+                   {{"Scale", "Scale"},
+                    {"Mean", "Mean"},
+                    {"Bias", "Bias"},
+                    {"Variance", "Variance"}}}},
+                 removed_nodes);
+  }
+
+  std::string Type() { return G_OP_TYPE_FUSION_DWCONV_BN_RELU; }
+};
+
+template <typename DeviceType, typename T>
+class FusionDWConvBNReluOp
+    : public framework::OperatorWithKernel<
+          DeviceType, FusionDWConvBNReluParam,
+          operators::DWConvBNReluKernel<DeviceType, T>> {
+ public:
+  FusionDWConvBNReluOp(const string &type, const VariableNameMap &inputs,
+                       const VariableNameMap &outputs,
+                       const framework::AttributeMap &attrs,
+                       std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionDWConvBNReluParam,
+            operators::DWConvBNReluKernel<DeviceType, T>>(type, inputs,
+                                                          outputs, attrs,
+                                                          scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, FusionDWConvBNReluParam,
+      operators::DWConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+#ifdef PADDLE_MOBILE_CPU
+
+#ifndef FUSION_DWCONV_BN_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar(
+    new FusionDWConvBNReluMatcher());
+#define FUSION_DWCONV_BN_RELU_REGISTER
+#endif
+
+#endif
+
+#ifdef PADDLE_MOBILE_MALI_GPU
+
+#ifndef FUSION_DWCONV_BN_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar(
+    new FusionDWConvBNReluMatcher());
+#define FUSION_DWCONV_BN_RELU_REGISTER
+#endif
+
+#endif
+
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+} // namespace operators
+} // namespace paddle_mobile
+
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(fusion_dwconv_bn_relu);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
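
The #ifndef/#define pair wrapped around each FusionOpRegistrar above is an include-guard-style idiom: the header can be reached through several paths per backend, and each matcher must be registered exactly once per translation unit. A stripped-down illustration of the pattern (the real registrar lives in fusion_op_register.h; this stand-in is hypothetical):

    // Hypothetical stand-in: the constructor runs during static
    // initialization and records the matcher in a global registry.
    struct RegistrarSketch {
      explicit RegistrarSketch(const char *type) { /* register type */ }
    };

    #ifndef DWCONV_BN_RELU_REGISTER_SKETCH
    #define DWCONV_BN_RELU_REGISTER_SKETCH
    static RegistrarSketch dwconv_bn_relu_sketch("fusion_dwconv_bn_relu");
    #endif
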
diff --git a/src/operators/fusion_elementwise_add_relu_op.cpp b/src/operators/fusion_elementwise_add_relu_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fa2739ab4283c1fbb35e541ed2d40ea7a1904580
--- /dev/null
+++ b/src/operators/fusion_elementwise_add_relu_op.cpp
@@ -0,0 +1,45 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_ELEMENTWISEADDRELU_OP
+
+#include "fusion_elementwise_add_relu_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void FusionElementwiseAddReluOp<Dtype, T>::InferShape() const {
+  auto x_dim = this->param_.InputX()->dims();
+  this->param_.Out()->Resize(x_dim);
+}
+
+} // namespace operators
+} // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+// REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu,
+//                       ops::FusionElementwiseAddReluOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+// REGISTER_OPERATOR_MALI_GPU(fusion_elementwise_add_relu,
+//                            ops::FusionElementwiseAddReluOp);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_elementwise_add_relu,
+                       ops::FusionElementwiseAddReluOp);
+#endif
+
+#endif
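
InferShape here only propagates the X shape; the fused computation itself (only the FPGA kernel is registered for this op at this point) amounts to a single add-then-clamp pass. A scalar reference version, assuming equal-shaped inputs:

    #include <algorithm>

    // Reference semantics of elementwise_add fused with relu.
    void ElementwiseAddRelu(const float *x, const float *y, float *out, int n) {
      for (int i = 0; i < n; ++i) {
        out[i] = std::max(x[i] + y[i], 0.0f);
      }
    }
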
diff --git a/src/operators/fusion_elementwise_add_relu_op.h b/src/operators/fusion_elementwise_add_relu_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7e1f244732f9b4c463b6dd0f1ba81e7baf04bfd
--- /dev/null
+++ b/src/operators/fusion_elementwise_add_relu_op.h
@@ -0,0 +1,104 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_ELEMENTWISEADDRELU_OP
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/elementwise_add_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+class FusioneElementwiseAddReluMatcher : public framework::FusionOpMatcher {
+ public:
+  FusioneElementwiseAddReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(), {}, removed_nodes);
+  }
+
+  std::string Type() { return G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU; }
+};
+
+template <typename DeviceType, typename T>
+class FusionElementwiseAddReluOp
+    : public framework::OperatorWithKernel<
+          DeviceType, ElementwiseAddReluParam,
+          operators::ElementwiseAddReluKernel<DeviceType, T>> {
+ public:
+  FusionElementwiseAddReluOp(const string &type, const VariableNameMap &inputs,
+                             const VariableNameMap &outputs,
+                             const framework::AttributeMap &attrs,
+                             std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, ElementwiseAddReluParam,
+            operators::ElementwiseAddReluKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  void InferShape() const override;
+
+ protected:
+};
+
+#ifdef PADDLE_MOBILE_CPU
+/*
+#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_elementwise_relu_registrar(
+    new FusioneElementwiseAddReluMatcher());
+#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER
+#endif
+*/
+#endif
+
+#ifdef PADDLE_MOBILE_MALI_GPU
+/*
+#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_elementwise_relu_registrar(
+    new FusioneElementwiseAddReluMatcher());
+#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER
+#endif
+*/
+#endif
+
+#ifdef PADDLE_MOBILE_FPGA
+#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_elementwise_relu_registrar(
+    new FusioneElementwiseAddReluMatcher());
+#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER
+#endif
+#endif
+
+} // namespace operators
+} // namespace paddle_mobile
+
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(fusion_elementwise_add_relu);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(fusion_elementwise_add_relu);
+#endif
+
+#endif
diff --git a/src/operators/fusion_fc_op.cpp b/src/operators/fusion_fc_op.cpp
index 2e4e098fd08e7a765a9f54eb6ed6a4dc579c359f..9fa80fbf12d0fe300921418705b6900108c68faf 100644
--- a/src/operators/fusion_fc_op.cpp
+++ b/src/operators/fusion_fc_op.cpp
@@ -49,18 +49,19 @@ void FusionFcOp<Dtype, T>::InferShape() const {
   framework::DDim ddim = framework::make_ddim(output_dims);
   this->param_.Out()->Resize(ddim);
 }
-template class FusionFcOp<CPU, float>;
+
 } // namespace operators
 } // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(fc);
-REGISTER_OPERATOR_CPU(fc, ops::FusionFcOp);
+REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_fc, ops::FusionFcOp);
 #endif
 
 #endif
diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h
index 2035704bb60eb96bfb22fc4f277d30817efcf646..e6c7e9fdbd5f449eea004615fa31e49af0746086 100644
--- a/src/operators/fusion_fc_op.h
+++ b/src/operators/fusion_fc_op.h
@@ -38,7 +38,7 @@ class FusionFcMatcher : public framework::FusionOpMatcher {
       framework::Node *node,
       std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
     node->Folder(node_.Depth(), Type(),
-                 {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}, removed_nodes);
+                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Z"}}}}, removed_nodes);
   }
 
   std::string Type() { return G_OP_TYPE_FC; }
@@ -66,15 +66,41 @@ class FusionFcOp
 };
 
 #ifdef PADDLE_MOBILE_CPU
+
+#ifndef FUSION_FC_CPU_REGISTER
+#define FUSION_FC_CPU_REGISTER
 static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
 #endif
+
+#endif
+
 #ifdef PADDLE_MOBILE_MALI_GPU
-// static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
+
+#ifndef FUSION_FC_CPU_REGISTER
+#define FUSION_FC_CPU_REGISTER
+static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
+#endif
+
 #endif
+
 #ifdef PADDLE_MOBILE_FPGA
+#ifndef FUSION_FC_CPU_REGISTER
+#define FUSION_FC_CPU_REGISTER
+static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
+#endif
 #endif
 
 } // namespace operators
 } // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(fusion_fc);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(fusion_fc);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(fusion_fc);
+#endif
+
 #endif
diff --git a/src/operators/fusion_fc_relu_op.cpp b/src/operators/fusion_fc_relu_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..97568323a3c204da06546ffc6b4d9a2483e95848
--- /dev/null
+++ b/src/operators/fusion_fc_relu_op.cpp
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_FCRELU_OP
+
+#include "operators/fusion_fc_relu_op.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void FusionFcReluOp<Dtype, T>::InferShape() const {
+  auto x_dims = this->param_.InputX()->dims();
+  auto y_dims = this->param_.InputY()->dims();
+  int x_num_col_dims = this->param_.XNumColDims();
+  int y_num_col_dims = this->param_.YNumColDims();
+
+  assert(x_dims.size() > x_num_col_dims);
+  assert(y_dims.size() > y_num_col_dims);
+
+  /// (1,2,3,4) , x_num_col_dims = 2  -> (2,12)
+  auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
+  auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
+
+  assert(x_mat_dims[1] == y_mat_dims[0]);
+
+  std::vector<int64_t> output_dims;
+  output_dims.reserve(
+      static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
+
+  for (int i = 0; i < x_num_col_dims; ++i) {
+    output_dims.push_back(x_dims[i]);
+  }
+
+  for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
+    output_dims.push_back(y_dims[i]);
+  }
+
+  framework::DDim ddim = framework::make_ddim(output_dims);
+  this->param_.Out()->Resize(ddim);
+}
+
+} // namespace operators
+} // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(fusion_fc_relu, ops::FusionFcReluOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(fusion_fc_relu, ops::FusionFcReluOp);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(fusion_fc_relu, ops::FusionFcReluOp);
+#endif
+
+#endif
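
The shape algebra above hinges on flatten_to_2d, which the inline comment illustrates with (1,2,3,4) and x_num_col_dims = 2 -> (2,12): dimensions before the split index multiply into rows, the rest into columns. The same collapse in a self-contained form, assuming plain int64_t dims rather than framework::DDim:

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Collapse dims into (rows, cols) at num_col_dims.
    std::pair<int64_t, int64_t> FlattenTo2D(const std::vector<int64_t> &dims,
                                            int num_col_dims) {
      int64_t rows = 1, cols = 1;
      for (int i = 0; i < num_col_dims; ++i) rows *= dims[i];
      for (size_t i = num_col_dims; i < dims.size(); ++i) cols *= dims[i];
      return {rows, cols};
    }
    // FlattenTo2D({1, 2, 3, 4}, 2) == {2, 12}
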
diff --git a/src/operators/fusion_fc_relu_op.h b/src/operators/fusion_fc_relu_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a680695715b042152c8279510cdbf3100e84bb4
--- /dev/null
+++ b/src/operators/fusion_fc_relu_op.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_FCRELU_OP
+#pragma once
+#include <string>
+#include <vector>
+
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/fc_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+class FusionFcReluMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionFcReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_MUL);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
+        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(),
+                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Z"}}}}, removed_nodes);
+  }
+
+  std::string Type() { return G_OP_TYPE_FUSION_FC_RELU; }
+};
+
+template <typename DeviceType, typename T>
+class FusionFcReluOp : public framework::OperatorWithKernel<
+                           DeviceType, FusionFcReluParam,
+                           operators::FusionFcReluKernel<DeviceType, T>> {
+ public:
+  FusionFcReluOp(const string &type, const VariableNameMap &inputs,
+                 const VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs,
+                 std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionFcReluParam,
+            operators::FusionFcReluKernel<DeviceType, T>>(type, inputs,
+                                                          outputs, attrs,
+                                                          scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, FusionFcReluParam,
+      operators::FusionFcReluKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+#ifdef PADDLE_MOBILE_CPU
+
+#ifndef FUSION_FC_RELU_REGISTER
+#define FUSION_FC_RELU_REGISTER
+static framework::FusionOpRegistrar fc_relu_registrar(
+    new FusionFcReluMatcher());
+#endif
+
+#endif
+
+#ifdef PADDLE_MOBILE_MALI_GPU
+
+#ifndef FUSION_FC_RELU_REGISTER
+#define FUSION_FC_RELU_REGISTER
+static framework::FusionOpRegistrar fc_relu_registrar(
+    new FusionFcReluMatcher());
+#endif
+
+#endif
+
+#ifdef PADDLE_MOBILE_FPGA
+#ifndef FUSION_FC_RELU_REGISTER
+#define FUSION_FC_RELU_REGISTER
+static framework::FusionOpRegistrar fc_relu_registrar(
+    new FusionFcReluMatcher());
+#endif
+#endif
+
+} // namespace operators
+} // namespace paddle_mobile
+
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(fusion_fc_relu);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(fusion_fc_relu);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(fusion_fc_relu);
+#endif
+#endif // FUSION_FCRELU_OP
diff --git a/src/operators/im2sequence_op.cpp b/src/operators/im2sequence_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3c929af9cf0a8a1550f197ffdb42ee590cd43235
--- /dev/null
+++ b/src/operators/im2sequence_op.cpp
@@ -0,0 +1,62 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef IM2SEQUENCE_OP
+
+#include "operators/im2sequence_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+int Im2SequenceOutputSize(int input_size, int kernel, int padding_1,
+                          int padding_2, int stride) {
+  int output_size =
+      1 + (padding_1 + padding_2 + input_size - kernel + stride - 1) / stride;
+  return output_size;
+}
+
+template <typename Dtype, typename T>
+void Im2SequenceOp<Dtype, T>::InferShape() const {
+  auto in_x_dims = this->param_.Input()->dims();
+
+  const std::vector<int> &kernels = this->param_.Kernels();
+
+  const std::vector<int> &strides = this->param_.Strides();
+
+  std::vector<int> paddings = this->param_.Paddings();
+
+  std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+  for (size_t i = 0; i < strides.size(); ++i) {
+    output_shape.push_back(Im2SequenceOutputSize(in_x_dims[i + 2], kernels[i],
+                                                 paddings[i], paddings[i + 2],
+                                                 strides[i]));
+  }
+
+  framework::DDim ddim = framework::make_ddim(output_shape);
+  this->param_.Output()->Resize(ddim);
+}
+
+} // namespace operators
+} // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(im2sequence, ops::Im2SequenceOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/im2sequence_op.h b/src/operators/im2sequence_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0695da9308d33ca2b86a5e052210507beb9a82d3
--- /dev/null
+++ b/src/operators/im2sequence_op.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef IM2SEQUENCE_OP
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/im2sequence_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using namespace framework;
+
+template <typename DeviceType, typename T>
+class Im2SequenceOp : public framework::OperatorWithKernel<
+                          DeviceType, Im2SequenceParam,
+                          operators::Im2SequenceKernel<DeviceType, T>> {
+ public:
+  Im2SequenceOp(const std::string &type, const VariableNameMap &inputs,
+                const VariableNameMap &outputs,
+                const framework::AttributeMap &attrs,
+                std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, Im2SequenceParam,
+            operators::Im2SequenceKernel<DeviceType, T>>(type, inputs, outputs,
+                                                         attrs, scope) {}
+
+  // using framework::OperatorWithKernel<
+  //     DeviceType, Im2SequenceParam,
+  //     operators::Im2SequenceKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ private:
+};
+
+} // namespace operators
+} // namespace paddle_mobile
+
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(im2sequence);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
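
One thing worth noting about Im2SequenceOutputSize above: written as 1 + (p1 + p2 + in - k + s - 1) / s, it rounds the division up, while the helper in the ARM kernel later in this patch set uses the plain floor form (in + p1 + p2 - k) / s + 1. The two only disagree when the stride does not divide evenly, as this comparison shows:

    inline int OutCeil(int in, int k, int p1, int p2, int s) {
      return 1 + (p1 + p2 + in - k + s - 1) / s;  // InferShape form
    }
    inline int OutFloor(int in, int k, int p1, int p2, int s) {
      return (in + p1 + p2 - k) / s + 1;          // ARM kernel form
    }
    // in=8, k=3, p1=p2=0, s=2: OutCeil gives 4, OutFloor gives 3.
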
diff --git a/src/operators/kernel/arm/batchnorm_kernel.cpp b/src/operators/kernel/arm/batchnorm_kernel.cpp
index cb30fb41e936543737cd48717920c6a62fb19033..f78d1fdc95ac9e10619dbf32fdc84d01a370f315 100644
--- a/src/operators/kernel/arm/batchnorm_kernel.cpp
+++ b/src/operators/kernel/arm/batchnorm_kernel.cpp
@@ -14,224 +14,20 @@ limitations under the License.
 */
 
 #ifdef BATCHNORM_OP
 
-#pragma once
-
 #include "operators/kernel/batchnorm_kernel.h"
+#include "operators/kernel/central-arm-func/batchnorm_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
 
 template <>
-void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
-  const Tensor *input_x = param.InputX();
-  auto input_x_ptr = input_x->data<float>();
-  const auto &x_dims = input_x->dims();
-  const int N = x_dims[0];
-  const int C = x_dims[1];
-  const int H = x_dims[2];
-  const int W = x_dims[3];
-  const int stride0 = C * H * W;
-  const int stride1 = H * W;
-  const int stride2 = W;
-  Tensor *out = param.OutputY();
-  auto out_ptr = out->mutable_data<float>();
-  const float epsilon = param.Epsilon();
-  const Tensor *mean = param.InputMean();
-  const Tensor *variance = param.InputVariance();
-  const Tensor *scale = param.InputScale();
-  const Tensor *bias = param.InputBias();
-  auto mean_ptr = mean->data<float>();
-  auto variance_ptr = variance->data<float>();
-  auto scale_ptr = scale->data<float>();
-  auto bias_ptr = bias->data<float>();
-
-  //  Tensor inv_std;
-  //  auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
-
-  PADDLE_MOBILE_ENFORCE(C == variance->numel(),
-                        "C must equal to variance.numel()");
-
-  int HXW = H * W;
-  if (HXW > 32) {
-    int NXC = N * C;
-    float *inv_std_ptr = new float[NXC * 4];
-    float *volatile new_scale_ptr = new float[NXC * 4];
-    float *volatile new_bias_ptr = new float[NXC * 4];
-
-    /// std = (var + epsilon).sqrt();
-    /// inv_std = 1 / std;
-    for (int i = 0; i < C * 4; i += 4) {
-      int index = i / 4;
-      inv_std_ptr[i] =
-          1 / static_cast<float>(pow((variance_ptr[index] + epsilon), 0.5));
-      inv_std_ptr[i + 1] = inv_std_ptr[i];
-      inv_std_ptr[i + 2] = inv_std_ptr[i];
-      inv_std_ptr[i + 3] = inv_std_ptr[i];
-
-      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[index];
-      new_scale_ptr[i + 1] = new_scale_ptr[i];
-      new_scale_ptr[i + 2] = new_scale_ptr[i];
-      new_scale_ptr[i + 3] = new_scale_ptr[i];
-
-      new_bias_ptr[i] =
-          bias_ptr[index] - mean_ptr[index] * inv_std_ptr[i] * scale_ptr[index];
-
-      new_bias_ptr[i + 1] = new_bias_ptr[i];
-      new_bias_ptr[i + 2] = new_bias_ptr[i];
-      new_bias_ptr[i + 3] = new_bias_ptr[i];
-    }
-
-    for (int j = C * 4; j < NXC * 4; ++j) {
-      new_scale_ptr[j] = new_scale_ptr[j - C * 4];
-      new_bias_ptr[j] = new_bias_ptr[j - C * 4];
-    }
-
-    asm volatile(
-        "subs %[N], %[N], #1 \n\t"
-        "blt end_n_%= \n\t"
-        "loop_n_%=: \n\t"
-
-        "subs %[C], %[C], #1 \n\t"
-        "blt end_c_%= \n\t"
-        "loop_c_%=: \n\t"
-
-        "vld1.32 {q9}, [%[new_scale_ptr]]! \n\t"
-        "vld1.32 {q10}, [%[new_bias_ptr]]! \n\t"
-
-        "mov r6, %[HXW] \n\t"
-
-        "subs r6, r6, #32 \n\t"
-        "blt end_hw_%= \n\t"
-        "loop_hw_%=: \n\t"
-
-        "vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
-        "vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
-        "vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
-        "vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
-
-        "vmul.f32 q1, q1, q9 \n\t"
-        "vmul.f32 q2, q2, q9 \n\t"
-        "vmul.f32 q3, q3, q9 \n\t"
-        "vmul.f32 q4, q4, q9 \n\t"
-
-        "vmul.f32 q5, q5, q9 \n\t"
-        "vmul.f32 q6, q6, q9 \n\t"
-        "vmul.f32 q7, q7, q9 \n\t"
-        "vmul.f32 q8, q8, q9 \n\t"
-
-        "vadd.f32 q1, q1, q10 \n\t"
-        "vadd.f32 q2, q2, q10 \n\t"
-        "vadd.f32 q3, q3, q10 \n\t"
-        "vadd.f32 q4, q4, q10 \n\t"
-        "vadd.f32 q5, q5, q10 \n\t"
-        "vadd.f32 q6, q6, q10 \n\t"
-        "vadd.f32 q7, q7, q10 \n\t"
-        "vadd.f32 q8, q8, q10 \n\t"
-
-        "vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
-        "vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
-        "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
-        "vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
-
-        "subs r6, r6, #32 \n\t"
-        "bge loop_hw_%= \n\t"
-        "end_hw_%=: \n\t"
-
-        "cmp r6, #0 \n\t"
-        "bge end_remainder_%= \n\t"
-        "mov r5, #4 \n\t"
-        "mul r6, r6, r5 \n\t"
-        "add %[input_x_ptr], %[input_x_ptr], r6 \n\t"
-
-        "vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
-        "vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
-        "vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
-        "vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
-
-        "vmul.f32 q1, q1, q9 \n\t"
-        "vmul.f32 q2, q2, q9 \n\t"
-        "vmul.f32 q3, q3, q9 \n\t"
-        "vmul.f32 q4, q4, q9 \n\t"
-        "vmul.f32 q5, q5, q9 \n\t"
-        "vmul.f32 q6, q6, q9 \n\t"
-        "vmul.f32 q7, q7, q9 \n\t"
-        "vmul.f32 q8, q8, q9 \n\t"
-        "vadd.f32 q1, q1, q10 \n\t"
-        "vadd.f32 q2, q2, q10 \n\t"
-        "vadd.f32 q3, q3, q10 \n\t"
-        "vadd.f32 q4, q4, q10 \n\t"
-        "vadd.f32 q5, q5, q10 \n\t"
-        "vadd.f32 q6, q6, q10 \n\t"
-        "vadd.f32 q7, q7, q10 \n\t"
-        "vadd.f32 q8, q8, q10 \n\t"
-
-        "add %[out_ptr], %[out_ptr], r6 \n\t"
-        "vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
-        "vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
-        "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
-        "vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
-
-        "end_remainder_%=: \n\t"
-
-        "subs %[C], %[C], #1 \n\t"
-        "bge loop_c_%= \n\t"
-        "end_c_%=: \n\t"
-
-        "subs %[N], %[N], #1 \n\t"
-        "bge loop_n_%= \n\t"
-        "end_n_%=: \n\t"
-        :
-        : [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr),
-          [new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr),
-          [N] "r"(N), [C] "r"(C), [HXW] "r"(HXW)
-        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
-          "q10", "r5", "r6");
-
-    delete[] inv_std_ptr;
-    delete[] new_scale_ptr;
-    delete[] new_bias_ptr;
-
-  } else {
-    float *inv_std_ptr = new float[C];
-    for (int i = 0; i < C; i++) {
-      inv_std_ptr[i] =
-          1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
-    }
-
-    Tensor new_scale;
-    auto new_scale_ptr = new_scale.mutable_data<float>(make_ddim({C}));
-    Tensor new_bias;
-    auto new_bias_ptr = new_bias.mutable_data<float>(make_ddim({C}));
-
-    /// ((x - est_mean) * (inv_var) * scale + bias equal to
-    /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
-    for (int i = 0; i < C; i++) {
-      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
-      new_bias_ptr[i] =
-          bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
-      {
-        for (int n = 0; n < N; n++) {
-          for (int h = 0; h < H; h++) {
-            int tmp_index = n * stride0 + i * stride1 + h * stride2;
-            for (int w = 0; w < W; w++) {
-              int index = tmp_index + w;
-              out_ptr[index] =
-                  input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
-            }
-          }
-        }
-      }
-    }
+bool BatchNormKernel<CPU, float>::Init(BatchNormParam *param) {
+  return true;
+}
 
-    delete[] inv_std_ptr;
-    //      DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
-    //      DLOG << "input_x_ptr : " << input_x_ptr[102];
-    //      DLOG << "variance : " << variance_ptr[5];
-    //      DLOG << "inv_std_ptr : " << inv_std_ptr[5];
-    //      DLOG << "new_scale_ptr : " << new_scale_ptr[5];
-    //      DLOG << "new_bias_ptr : " << new_bias_ptr[5];
-    //      DLOG << "out_ptr : " << out_ptr[102];
-  }
+template <>
+void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
+  BatchnormCompute<float>(param);
 }
 
 } // namespace operators
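
The roughly two hundred deleted lines above hand-rolled NEON for the inference-time batch-norm transform; the patch moves that work behind BatchnormCompute in central-arm-func. What the assembly implements reduces to a per-channel multiply-add over an NCHW tensor. A scalar reference (illustrative; the real signature lives in batchnorm_arm_func.h):

    #include <cmath>

    // y = new_scale[c] * x + new_bias[c], with new_scale = scale / sqrt(var + eps)
    // and new_bias = bias - mean * new_scale, per channel c.
    void BatchNormNCHW(const float *x, float *y, int N, int C, int HW,
                       const float *scale, const float *bias, const float *mean,
                       const float *var, float eps) {
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          const float s = scale[c] / std::sqrt(var[c] + eps);
          const float b = bias[c] - mean[c] * s;
          const float *in = x + (n * C + c) * HW;
          float *out = y + (n * C + c) * HW;
          for (int i = 0; i < HW; ++i) out[i] = in[i] * s + b;
        }
      }
    }
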
diff --git a/src/operators/kernel/arm/box_coder_kernel.cpp b/src/operators/kernel/arm/box_coder_kernel.cpp
index 9654228911af77e751e4ef9d1b92fb92ae30591d..d2a479391fbbb416eea7d19ae64125cac4637ef1 100644
--- a/src/operators/kernel/arm/box_coder_kernel.cpp
+++ b/src/operators/kernel/arm/box_coder_kernel.cpp
@@ -15,124 +15,21 @@ limitations under the License.
 */
 #ifdef BOXCODER_OP
 
 #include "operators/kernel/box_coder_kernel.h"
+#include "operators/kernel/central-arm-func/box_coder_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
 
-template <typename T>
-void EncodeCenterSize(const framework::Tensor& target_box,
-                      const framework::Tensor& prior_box,
-                      const framework::Tensor& prior_box_var, T* output) {
-  int64_t row = target_box.dims()[0];
-  int64_t col = prior_box.dims()[0];
-  int64_t len = prior_box.dims()[1];
-  auto* target_box_data = target_box.data<T>();
-  auto* prior_box_data = prior_box.data<T>();
-  auto* prior_box_var_data = prior_box_var.data<T>();
-
-  for (int64_t i = 0; i < row; ++i) {
-    for (int64_t j = 0; j < col; ++j) {
-      T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len];
-      T prior_box_height =
-          prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
-      T prior_box_center_x =
-          (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
-      T prior_box_center_y =
-          (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
-
-      T target_box_center_x =
-          (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
-      T target_box_center_y =
-          (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
-      T target_box_width =
-          target_box_data[i * len + 2] - target_box_data[i * len];
-      T target_box_height =
-          target_box_data[i * len + 3] - target_box_data[i * len + 1];
-
-      size_t offset = i * col * len + j * len;
-      output[offset] = (target_box_center_x - prior_box_center_x) /
-                       prior_box_width / prior_box_var_data[j * len];
-      output[offset + 1] = (target_box_center_y - prior_box_center_y) /
-                           prior_box_height / prior_box_var_data[j * len + 1];
-      output[offset + 2] =
-          std::log(std::fabs(target_box_width / prior_box_width)) /
-          prior_box_var_data[j * len + 2];
-      output[offset + 3] =
-          std::log(std::fabs(target_box_height / prior_box_height)) /
-          prior_box_var_data[j * len + 3];
-    }
-  }
-}
-
-template <typename T>
-void DecodeCenterSize(const framework::Tensor& target_box,
-                      const framework::Tensor& prior_box,
-                      const framework::Tensor& prior_box_var, T* output) {
-  int64_t row = target_box.dims()[0];
-  int64_t col = prior_box.dims()[0];
-  int64_t len = prior_box.dims()[1];
-
-  auto* target_box_data = target_box.data<T>();
-  auto* prior_box_data = prior_box.data<T>();
-  auto* prior_box_var_data = prior_box_var.data<T>();
-
-  for (int64_t i = 0; i < row; ++i) {
-    for (int64_t j = 0; j < col; ++j) {
-      size_t offset = i * col * len + j * len;
-      T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len];
-      T prior_box_height =
-          prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
-      T prior_box_center_x =
-          (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
-      T prior_box_center_y =
-          (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
-
-      T target_box_center_x = prior_box_var_data[j * len] *
-                                  target_box_data[offset] * prior_box_width +
-                              prior_box_center_x;
-      T target_box_center_y = prior_box_var_data[j * len + 1] *
-                                  target_box_data[offset + 1] *
-                                  prior_box_height +
-                              prior_box_center_y;
-      T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
-                                    target_box_data[offset + 2]) *
-                           prior_box_width;
-      T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
-                                     target_box_data[offset + 3]) *
-                            prior_box_height;
-
-      output[offset] = target_box_center_x - target_box_width / 2;
-      output[offset + 1] = target_box_center_y - target_box_height / 2;
-      output[offset + 2] = target_box_center_x + target_box_width / 2;
-      output[offset + 3] = target_box_center_y + target_box_height / 2;
-    }
-  }
-}
+template <>
+bool BoxCoderKernel<CPU, float>::Init(BoxCoderParam *param) {
+  return true;
+}
 
 template <>
-void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam& param) const {
-  const auto* input_priorbox = param.InputPriorBox();
-  const auto* input_priorboxvar = param.InputPriorBoxVar();
-  const auto* input_targetbox = param.InputTargetBox();
-
-  const auto& code_type = param.CodeType();
-
-  auto row = input_targetbox->dims()[0];
-  auto col = input_priorbox->dims()[0];
-  auto len = input_priorbox->dims()[1];
-
-  Tensor* output_box = param.OutputBox();
-  auto* output_box_dataptr = output_box->mutable_data<float>({row, col, len});
-
-  if (code_type == "encode_center_size") {
-    EncodeCenterSize<float>(*input_targetbox, *input_priorbox,
-                            *input_priorboxvar, output_box_dataptr);
-  }
-  if (code_type == "decode_center_size") {
-    DecodeCenterSize<float>(*input_targetbox, *input_priorbox,
-                            *input_priorboxvar, output_box_dataptr);
-  }
+void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam &param) const {
+  BoxCoderCompute<float>(param);
 }
+
 } // namespace operators
 } // namespace paddle_mobile
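
The deleted Encode/DecodeCenterSize bodies are the standard SSD-style box transform: encoding expresses a target box as variance-scaled offsets from a prior box, decoding inverts it. The decode step for a single box, condensed from the removed template code (float-only, corner-format boxes):

    #include <cmath>

    // t: 4 encoded offsets, p: prior box (x1, y1, x2, y2), v: variances.
    void DecodeCenterSizeOne(const float t[4], const float p[4],
                             const float v[4], float out[4]) {
      const float pw = p[2] - p[0], ph = p[3] - p[1];
      const float pcx = (p[0] + p[2]) / 2, pcy = (p[1] + p[3]) / 2;
      const float cx = v[0] * t[0] * pw + pcx;
      const float cy = v[1] * t[1] * ph + pcy;
      const float w = std::exp(v[2] * t[2]) * pw;
      const float h = std::exp(v[3] * t[3]) * ph;
      out[0] = cx - w / 2;
      out[1] = cy - h / 2;
      out[2] = cx + w / 2;
      out[3] = cy + h / 2;
    }
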
diff --git a/src/operators/kernel/arm/concat_kernel.cpp b/src/operators/kernel/arm/concat_kernel.cpp
index 329677fb11e6ee2db74b5191586ac6157ede9697..b6810bf76946bfb8151f3001b76fcbaa5e99e5fc 100644
--- a/src/operators/kernel/arm/concat_kernel.cpp
+++ b/src/operators/kernel/arm/concat_kernel.cpp
@@ -15,72 +15,19 @@ limitations under the License. */
 #ifdef CONCAT_OP
 
 #include "operators/kernel/concat_kernel.h"
+#include "operators/kernel/central-arm-func/concat_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
 
-template <typename T>
-class ConcatFunctor {
- public:
-  void operator()(const std::vector<framework::Tensor> &input, const int axis,
-                  framework::Tensor *output) {
-    size_t num = input.size();
-    int rows = 1;
-    auto dim_0 = input[0].dims();
-    for (int i = 0; i < axis; ++i) {
-      rows *= dim_0[i];
-    }
-    int out_rows = rows, out_cols = 0;
-
-    std::vector<int64_t> input_cols(input.size());
-    for (int i = 0; i < num; ++i) {
-      int t_cols = input[i].numel() / rows;
-      out_cols += t_cols;
-      input_cols[i] = t_cols;
-    }
-
-    // computation
-    for (int k = 0; k < out_rows; ++k) {
-      T *dst_ptr = output->data<T>() + k * out_cols;
-      int col_idx = 0;
-      for (int j = 0; j < num; ++j) {
-        int col_len = input_cols[j];
-        const T *src_prt = input[j].data<T>() + k * col_len;
-        memory::Copy(dst_ptr + col_idx, src_prt, sizeof(T) * col_len);
-        col_idx += col_len;
-      }
-    }
-  }
-};
+template <>
+bool ConcatKernel<CPU, float>::Init(ConcatParam *param) {
+  return true;
+}
 
 template <>
 void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const {
-  auto inputs = param.Inputs();
-  auto *out = param.Out();
-  int64_t axis = param.Axis();
-  out->mutable_data<float>();
-
-  /// Sometimes direct copies will be faster, this maybe need deeply analysis.
-  if (axis == 0 && inputs.size() < 10) {
-    size_t output_offset = 0;
-    for (auto *in : inputs) {
-      auto in_stride = framework::stride_numel(in->dims());
-      auto out_stride = framework::stride_numel(out->dims());
-      auto dst = out->data<float>() + output_offset;
-      auto src = in->data<float>();
-      PADDLE_MOBILE_ENFORCE(
-          in_stride.size() == out_stride.size(),
-          "src and dst tensor should have the same dims size.");
-      memory::Copy(dst, src, sizeof(float) * in_stride[0]);
-      output_offset += in_stride[0];
-    }
-  } else {
-    std::vector<Tensor> inputs_concat(inputs.size());
-    for (int j = 0; j < inputs.size(); ++j) {
-      inputs_concat[j] = *inputs[j];
-    }
-    ConcatFunctor<float> concat_functor;
-    concat_functor(inputs_concat, static_cast<int>(axis), out);
-  }
+  ConcatCompute<float>(param);
 }
 
 } // namespace operators
diff --git a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dbf3745eb15cf56bba32dc8cbae50d242ce2da76
--- /dev/null
+++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDBNRELU_OP
+
+#include "operators/kernel/conv_add_bn_relu_kernel.h"
+#include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvAddBNReluKernel<CPU, float>::Init(FusionConvAddBNReluParam *param) {
+  const Tensor *mean = param->InputMean();
+  const Tensor *variance = param->InputVariance();
+  const Tensor *scale = param->InputScale();
+  const Tensor *bias = param->InputBias();
+  const float epsilon = param->Epsilon();
+
+  auto mean_ptr = mean->data<float>();
+  auto variance_ptr = variance->data<float>();
+  auto scale_ptr = scale->data<float>();
+  auto bias_ptr = bias->data<float>();
+
+  const int C = mean->numel();
+  float inv_std_ptr[C];
+  for (int i = 0; i < C; i++) {
+    inv_std_ptr[i] =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+  }
+  Tensor *new_scale = new Tensor();
+  Tensor *new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({C});
+  auto new_bias_ptr = new_bias->mutable_data<float>({C});
+  for (int i = 0; i < C; i++) {
+    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  return true;
+}
+
+template <>
+void ConvAddBNReluKernel<CPU, float>::Compute(
+    const FusionConvAddBNReluParam &param) const {
+  ConvAddBNReluCompute<float>(param);
+}
+template class ConvAddBNReluKernel<CPU, float>;
+
+} // namespace operators
+} // namespace paddle_mobile
+
+#endif
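
A portability note on the Init above (the conv_bn_relu and dwconv_bn_relu kernels below repeat the pattern): `float inv_std_ptr[C]` with a runtime C is a variable-length array, a GCC/Clang extension rather than standard C++. A heap-backed equivalent, sketched with std::vector:

    #include <cmath>
    #include <vector>

    // Same precomputation as Init, without the VLA extension.
    std::vector<float> InvStd(const float *variance, int C, float epsilon) {
      std::vector<float> inv_std(C);
      for (int i = 0; i < C; ++i) {
        inv_std[i] = 1.0f / std::sqrt(variance[i] + epsilon);
      }
      return inv_std;
    }
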
diff --git a/src/operators/kernel/arm/conv_add_kernel.cpp b/src/operators/kernel/arm/conv_add_kernel.cpp
index 1cb61b03dc1d011f45a802eb002824d4bb9f5352..88f839f611f1ed7f46c11a1b24feb6e29ff07ec7 100644
--- a/src/operators/kernel/arm/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_kernel.cpp
@@ -14,99 +14,21 @@ limitations under the License.
 */
 #ifdef FUSION_CONVADD_OP
 
 #include "operators/kernel/conv_add_kernel.h"
+#include "../central-arm-func/conv_add_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
 
 template <>
-void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam &param) const {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
-  int axis = param.Axis();
-  Tensor *output = param.Output();
-  math::expand_bias(bias, axis, output->dims());
-  output->ShareDataWith(bias);
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-
-  const int batch_size = static_cast<int>(input->dims()[0]);
-
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(1));
-    }
-  }
+bool ConvAddKernel<CPU, float>::Init(FusionConvAddParam *param) {
+  return true;
+}
 
+template <>
+void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam &param) const {
+  ConvAddCompute<float>(param);
 }
+
 template class ConvAddKernel<CPU, float>;
 
 } // namespace operators
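
The deleted ConvAdd body (and the conv, conv_add_relu and depthwise variants that follow) is the classic im2col-plus-GEMM lowering: every input patch is unfolded into a column, turning convolution into a matrix multiply per group. A minimal single-channel im2col, to make the unfolding concrete (no dilation; illustrative rather than the library's actual implementation):

    // col is (kH*kW) x (outH*outW); out-of-bounds taps read as zero padding.
    void Im2Col(const float *in, int H, int W, int kH, int kW, int stride,
                int pad, float *col) {
      const int outH = (H + 2 * pad - kH) / stride + 1;
      const int outW = (W + 2 * pad - kW) / stride + 1;
      for (int kh = 0; kh < kH; ++kh)
        for (int kw = 0; kw < kW; ++kw)
          for (int oh = 0; oh < outH; ++oh)
            for (int ow = 0; ow < outW; ++ow) {
              const int ih = oh * stride + kh - pad;
              const int iw = ow * stride + kw - pad;
              col[(kh * kW + kw) * outH * outW + oh * outW + ow] =
                  (ih >= 0 && ih < H && iw >= 0 && iw < W) ? in[ih * W + iw]
                                                           : 0.0f;
            }
    }
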
diff --git a/src/operators/kernel/arm/conv_add_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_relu_kernel.cpp
index 2be08d5379b7e5202f79d2829ba79cbee1b0a2c9..8414b7374dd0ed2b10784563dbac9c1565d66f4c 100644
--- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_relu_kernel.cpp
@@ -12,102 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef FUSION_CONVADD_RELU_OP
+#ifdef FUSION_CONVADDRELU_OP
 
 #include "operators/kernel/conv_add_relu_kernel.h"
+#include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
 
+template <>
+bool ConvAddReluKernel<CPU, float>::Init(FusionConvAddReluParam *param) {
+  return true;
+}
+
 template <>
 void ConvAddReluKernel<CPU, float>::Compute(
     const FusionConvAddReluParam &param) const {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor bias = *param.Bias();
-  int axis = param.Axis();
-  Tensor *output = param.Output();
-  math::expand_bias(bias, axis, output->dims());
-  output->ShareDataWith(bias);
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-
-  const int batch_size = static_cast<int>(input->dims()[0]);
-
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand =
-      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(1), true);
-    }
-  }
+  ConvAddReluCompute<float>(param);
 }
 template class ConvAddReluKernel<CPU, float>;
diff --git a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..23f06c1f0b8a0ed3f22ca9d23d24ae44c59f3618
--- /dev/null
+++ b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
@@ -0,0 +1,68 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVBNRELU_OP
+
+#include "operators/kernel/conv_bn_relu_kernel.h"
+#include "operators/kernel/central-arm-func/conv_bn_relu_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvBNReluKernel<CPU, float>::Init(FusionConvBNReluParam *param) {
+  const Tensor *mean = param->InputMean();
+  const Tensor *variance = param->InputVariance();
+  const Tensor *scale = param->InputScale();
+  const Tensor *bias = param->InputBias();
+  const float epsilon = param->Epsilon();
+
+  //  DLOG << "variance: " << *variance;
+
+  auto mean_ptr = mean->data<float>();
+  auto variance_ptr = variance->data<float>();
+  auto scale_ptr = scale->data<float>();
+  auto bias_ptr = bias->data<float>();
+
+  const int C = mean->numel();
+  float inv_std_ptr[C];
+  for (int i = 0; i < C; i++) {
+    inv_std_ptr[i] =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+  }
+  Tensor *new_scale = new Tensor();
+  Tensor *new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({C});
+  auto new_bias_ptr = new_bias->mutable_data<float>({C});
+  for (int i = 0; i < C; i++) {
+    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+  }
+
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  return true;
+}
+
+template <>
+void ConvBNReluKernel<CPU, float>::Compute(
+    const FusionConvBNReluParam &param) const {
+  ConvBNReluCompute<float>(param);
+}
+template class ConvBNReluKernel<CPU, float>;
+
+} // namespace operators
+} // namespace paddle_mobile
+
+#endif
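
Both conv-family kernels gate the im2col step on math::IsExpand. Its body is outside this patch, but the call sites imply the intent: a 1x1 filter with unit stride, zero padding and unit dilation can feed the GEMM directly, so no unfolding is needed. A plausible implementation under that reading:

    #include <cstdint>
    #include <vector>

    // True if im2col/vol2col is needed; false lets the kernel alias the
    // input slice as the column matrix.
    inline bool IsExpand(const std::vector<int64_t> &filter_dim,
                         const std::vector<int> &strides,
                         const std::vector<int> &paddings,
                         const std::vector<int> &dilations) {
      bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
      for (size_t j = 0; j < strides.size(); ++j) {
        filter_1 = filter_1 && (filter_dim[j + 2] == 1);
        strides_1 = strides_1 && (strides[j] == 1);
        padding_0 = padding_0 && (paddings[j] == 0);
        dilation_1 = dilation_1 && (dilations[j] == 1);
      }
      return !(filter_1 && strides_1 && padding_0 && dilation_1);
    }
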
diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp
index 1ec022ffab41fc41084220651d286b20ea43d7bb..ca8aeff0dd3db5fe7b625bdeb947b2927eb619ce 100644
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -15,94 +15,19 @@ limitations under the License.
 */
 #ifdef CONV_OP
 
 #include "operators/kernel/conv_kernel.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
 
 template <>
-void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-
-  const int batch_size = static_cast<int>(input->dims()[0]);
-
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
+bool ConvKernel<CPU, float>::Init(ConvParam *param) {
+  return true;
+}
 
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(0));
-    }
-  }
+template <>
+void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
+  ConvCompute<float>(param);
 }
 
 template class ConvKernel<CPU, float>;
diff --git a/src/operators/kernel/arm/conv_transpose_kernel.cpp b/src/operators/kernel/arm/conv_transpose_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d3c1aa96fd34207d401ef96058713d9b93098347
--- /dev/null
+++ b/src/operators/kernel/arm/conv_transpose_kernel.cpp
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONV_TRANSPOSE
+
+#include "operators/kernel/conv_transpose_kernel.h"
+#include "operators/kernel/central-arm-func/conv_transpose_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvTransposeKernel<CPU, float>::Init(ConvTransposeParam *param) {
+  return true;
+}
+
+template <>
+void ConvTransposeKernel<CPU, float>::Compute(
+    const ConvTransposeParam &param) const {
+  ConvTransposeCompute<float>(param);
+}
+
+template class ConvTransposeKernel<CPU, float>;
+
+} // namespace operators
+} // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp
index e6f27b772562789e07807b2b56c1f9d73bf373a9..6ede0e2bef2383df8aa0593a07297f2f6233acaf 100644
--- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp
+++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp
@@ -15,98 +15,19 @@ limitations under the License. */
 #ifdef DEPTHWISECONV_OP
 
 #include "operators/kernel/depthwise_conv_kernel.h"
-#include "operators/kernel/conv_kernel.h"
+#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
 
 template <>
-void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
-  LOG(kLOG_DEBUG) << param;
-
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-
-  //  DLOG << " compute end get Attrs " << strides[0];
-
-  const int batch_size = static_cast<int>(input->dims()[0]);
-
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-
-  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
+bool DepthwiseConvKernel<CPU, float>::Init(ConvParam *param) {
+  return true;
+}
 
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(0));
-    }
-  }
+template <>
+void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
+  DepthwiseConvCompute<float>(param);
 }
 
 template class DepthwiseConvKernel<CPU, float>;
diff --git a/src/operators/kernel/arm/dropout_kernel.cpp b/src/operators/kernel/arm/dropout_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..db942b018d7085ca3986533937328101afb08ff9
--- /dev/null
+++ b/src/operators/kernel/arm/dropout_kernel.cpp
@@ -0,0 +1,47 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef DROPOUT_OP
+
+#include "operators/kernel/dropout_kernel.h"
+#include <operators/math/transform.h>
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool DropoutKernel<CPU, float>::Init(DropoutParam *para) {
+  return true;
+}
+
+template <typename T>
+struct DropoutFunctor {
+  inline T operator()(T in) const { return in; }
+};
+
+template <>
+void DropoutKernel<CPU, float>::Compute(const DropoutParam &param) const {
+  const auto *input_x = param.InputX();
+  auto *input_x_ptr = input_x->data<float>();
+  auto *out = param.Out();
+  auto *out_ptr = out->mutable_data<float>();
+
+  DropoutFunctor<float> func_;
+  math::Transform trans;
+  trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_);
+}
+} // namespace operators
+} // namespace paddle_mobile
+
+#endif
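
DropoutFunctor above is the identity, which is correct at inference when the model was trained with the "upscale in train" convention (activations were already rescaled by the keep probability during training). A model exported under the older convention would instead need a scale at inference; for contrast, a hypothetical variant (not part of this patch):

    // "Downgrade in infer" dropout: scale activations by the keep probability.
    struct ScalingDropoutFunctor {
      explicit ScalingDropoutFunctor(float dropout_prob)
          : keep_(1.0f - dropout_prob) {}
      inline float operator()(float in) const { return in * keep_; }
      float keep_;
    };
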
diff --git a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ec08fcecb9fefaa247e0acbb8a085e752b8dba3
--- /dev/null
+++ b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DWCONVBNRELU_OP
+
+#include "operators/kernel/dwconv_bn_relu_kernel.h"
+#include "operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool DWConvBNReluKernel<CPU, float>::Init(FusionDWConvBNReluParam *param) {
+  const Tensor *mean = param->InputMean();
+  const Tensor *variance = param->InputVariance();
+  const Tensor *scale = param->InputScale();
+  const Tensor *bias = param->InputBias();
+  const float epsilon = param->Epsilon();
+
+  auto mean_ptr = mean->data<float>();
+  auto variance_ptr = variance->data<float>();
+  auto scale_ptr = scale->data<float>();
+  auto bias_ptr = bias->data<float>();
+
+  const int C = mean->numel();
+  float inv_std_ptr[C];
+  for (int i = 0; i < C; i++) {
+    inv_std_ptr[i] =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+  }
+  Tensor *new_scale = new Tensor();
+  Tensor *new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({C});
+  auto new_bias_ptr = new_bias->mutable_data<float>({C});
+  for (int i = 0; i < C; i++) {
+    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  return true;
+}
+
+template <>
+void DWConvBNReluKernel<CPU, float>::Compute(
+    const FusionDWConvBNReluParam &param) const {
+  DWConvBNReluCompute<float>(param);
+}
+template class DWConvBNReluKernel<CPU, float>;
+
+} // namespace operators
+} // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/arm/elementwise_add_kernel.cpp b/src/operators/kernel/arm/elementwise_add_kernel.cpp
index 02aabfe3ce0622df80c86906f45ab5cc688c7b12..fdab1c60a310480d8e59f3f84802001ea592433a 100644
--- a/src/operators/kernel/arm/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/arm/elementwise_add_kernel.cpp
@@ -14,32 +14,23 @@ limitations under the License. */
 
 #ifdef ELEMENTWISEADD_OP
 
-#pragma once
-
 #include "operators/kernel/elementwise_add_kernel.h"
+#include "operators/kernel/central-arm-func/elementwise_add_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
 
-template <typename T>
-struct AddFunctor {
-  inline T operator()(T a, T b) const { return a + b; }
-};
+template <>
+bool ElementwiseAddKernel<CPU, float>::Init(ElementwiseAddParam *param) {
+  return true;
+}
 
 template <>
 void ElementwiseAddKernel<CPU, float>::Compute(
     const ElementwiseAddParam &param) const {
-  const Tensor *input_x = param.InputX();
-  const Tensor *input_y = param.InputY();
-  Tensor *Out = param.Out();
-  Out->mutable_data<float>();
-  int axis = param.Axis();
-  ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
-                                                 AddFunctor<float>(), Out);
+  ElementwiseAddCompute<float>(param);
 }
 
-template class ElementwiseAddKernel<CPU, float>;
-
 } // namespace operators
 } // namespace paddle_mobile
*/ #ifdef FUSION_FC_OP -#pragma once - #include "operators/kernel/fusion_fc_kernel.h" +#include "operators/kernel/central-arm-func/fusion_fc_arm_func.h" namespace paddle_mobile { namespace operators { template <> -void FusionFcKernel::Compute(const FusionFcParam ¶m) const { - const Tensor *input_x = param.InputX(); - const Tensor *input_y = param.InputY(); - const Tensor *input_z = param.InputZ(); - auto *input_z_data = input_z->data(); - int axis = param.Axis(); - Tensor *out = param.Out(); - auto *out_data = out->mutable_data(); - const Tensor x_matrix = - input_x->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) - : *input_x; - const Tensor y_matrix = - input_y->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_y, param.YNumColDims()) - : *input_y; - auto out_dim = out->dims(); - if (out_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); - PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "inpu_z size must be 1"); - PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0], - " out_dim.size must be 2."); - axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis); - PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ") - - int64_t classes = input_z->numel(); - for (int i = 0; i < out_dim[0]; i++) { - memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes); - } +bool FusionFcKernel::Init(FusionFcParam *param) { + return true; +} - for (int i = 0; i < out->numel(); i++) { - DLOG << out_data[i]; - } - math::matmul(x_matrix, false, y_matrix, false, static_cast(1), - out, static_cast(1)); - PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); - // if (out_dim.size() != 2) { - // out->Resize(out_dim); - // } +template <> +void FusionFcKernel::Compute(const FusionFcParam ¶m) const { + FusionFcCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/im2sequence_kernel.cpp b/src/operators/kernel/arm/im2sequence_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..709fa30a23d4efba3531d9bc567c99f53875bc12 --- /dev/null +++ b/src/operators/kernel/arm/im2sequence_kernel.cpp @@ -0,0 +1,78 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
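Editor's note: the fused-FC implementation being moved out of the file above works in two steps: broadcast the bias row into every output row, then run the GEMM with alpha = beta = 1 so Y = X*W accumulates on top of the bias. A plain-loop sketch of the same strategy, with naive loops standing in for math::matmul and all sizes illustrative:

#include <cassert>
#include <vector>

int main() {
  const int M = 2, K = 3, N = 2;
  std::vector<float> X{1, 2, 3, 4, 5, 6};  // M x K
  std::vector<float> W{1, 0, 0, 1, 1, 1};  // K x N
  std::vector<float> bias{10, 20};         // N
  std::vector<float> Y(M * N);

  for (int i = 0; i < M; ++i)              // step 1: broadcast the bias row
    for (int j = 0; j < N; ++j) Y[i * N + j] = bias[j];

  for (int i = 0; i < M; ++i)              // step 2: GEMM with beta == 1
    for (int j = 0; j < N; ++j)
      for (int k = 0; k < K; ++k) Y[i * N + j] += X[i * K + k] * W[k * N + j];

  // row 0: [1,2,3] dotted with W's columns gives (4, 5), plus bias (10, 20)
  assert(Y[0] == 14 && Y[1] == 25);
}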
*/ + +#ifdef IM2SEQUENCE_OP + +#include "operators/kernel/im2sequence_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool Im2SequenceKernel::Init(Im2SequenceParam *para) { + return true; +} + +inline int Im2SeqOutputSize(int input_size, int filter_size, int padding_0, + int padding_1, int stride) { + const int output_size = + (input_size + padding_0 + padding_1 - filter_size) / stride + 1; + return output_size; +} + +template <> +void Im2SequenceKernel::Compute( + const Im2SequenceParam ¶m) const { + const Tensor *in_x = param.Input(); + Tensor *out = param.Output(); + out->mutable_data(); + + std::vector kernels = param.Kernels(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + + auto in_x_dim = in_x->dims(); + const int batch_size = static_cast(in_x_dim[0]); + const int img_channels = static_cast(in_x_dim[1]); + const int img_height = static_cast(in_x_dim[2]); + const int img_width = static_cast(in_x_dim[3]); + + int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); + int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], + paddings[3], strides[1]); + const std::vector dilations({1, 1}); + + // TODO: verify + auto out_dims = out->dims(); + out->Resize({batch_size, out->numel() / batch_size}); + + for (int i = 0; i < batch_size; i++) { + const Tensor src = + in_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + Tensor dst = out->Slice(i, i + 1).Resize( + {output_height, output_width, img_channels, kernels[0], kernels[1]}); + + math::Im2ColFunctor f; + f(src, dilations, strides, paddings, &dst); + } + out->Resize(out_dims); +} + +template class Im2SequenceKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/lrn_kernel.cpp b/src/operators/kernel/arm/lrn_kernel.cpp index 5ac4c67559ebe1603230e0d50895d0702c38cb77..0c20c5167adee5165067cc5ab4935df255751755 100644 --- a/src/operators/kernel/arm/lrn_kernel.cpp +++ b/src/operators/kernel/arm/lrn_kernel.cpp @@ -14,34 +14,21 @@ limitations under the License. */ #ifdef LRN_OP -#pragma once - #include "operators/kernel/lrn_kernel.h" +#include "operators/kernel/central-arm-func/lrn_arm_func.h" namespace paddle_mobile { namespace operators { template <> -void LrnKernel::Compute(const LrnParam ¶m) const { - const Tensor *input_x = param.InputX(); - auto x_dims = input_x->dims(); - Tensor *out = param.Out(); - out->mutable_data(); - /// data_format = NCHW - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; - - const int n = param.N(); - const float alpha = param.Alpha(); - const float beta = param.Beta(); - const float k = param.K(); - LRNFunctor lrnFunctor; - lrnFunctor(*input_x, out, N, C, H, W, n, k, alpha, beta); +bool LrnKernel::Init(LrnParam *param) { + return true; } -template class LrnKernel; +template <> +void LrnKernel::Compute(const LrnParam ¶m) const { + LrnCompute(param); +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/mul_kernel.cpp b/src/operators/kernel/arm/mul_kernel.cpp index 70bcac2461cdef535de8c9759ec10113e45b7ae2..ac5010ce5492ae1d99e59bfa761e22bb3aa5d1c9 100644 --- a/src/operators/kernel/arm/mul_kernel.cpp +++ b/src/operators/kernel/arm/mul_kernel.cpp @@ -14,39 +14,21 @@ limitations under the License. 
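Editor's note: Im2SeqOutputSize above is the standard sliding-window size formula, out = (in + pad_before + pad_after - filter) / stride + 1. A quick standalone check with illustrative shapes:

#include <cassert>

int Im2SeqOutputSize(int input_size, int filter_size, int padding_0,
                     int padding_1, int stride) {
  return (input_size + padding_0 + padding_1 - filter_size) / stride + 1;
}

int main() {
  // 32-wide input, 3-wide window, padding 1 on both sides, stride 1 -> 32
  assert(Im2SeqOutputSize(32, 3, 1, 1, 1) == 32);
  // same input with stride 2 -> 16
  assert(Im2SeqOutputSize(32, 3, 1, 1, 2) == 16);
}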
*/ #ifdef MUL_OP -#pragma once - #include "operators/kernel/mul_kernel.h" +#include "operators/kernel/central-arm-func/mul_arm_func.h" namespace paddle_mobile { namespace operators { template <> -void MulKernel::Compute(const MulParam ¶m) const { - const Tensor *input_x = param.InputX(); - const Tensor *input_y = param.InputY(); - Tensor *out = param.Out(); - out->mutable_data(); - const Tensor x_matrix = - input_x->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) - : *input_x; - const Tensor y_matrix = - input_y->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_y, param.YNumColDims()) - : *input_y; - auto out_dim = out->dims(); - if (out_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - math::matmul(x_matrix, false, y_matrix, false, static_cast(1), - out, static_cast(0)); - if (out_dim.size() != 2) { - out->Resize(out_dim); - } +bool MulKernel::Init(MulParam *param) { + return true; } -template class MulKernel; +template <> +void MulKernel::Compute(const MulParam ¶m) const { + MulCompute(param); +} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/multiclass_nms_kernel.cpp b/src/operators/kernel/arm/multiclass_nms_kernel.cpp index 39f55dab38031db14b617e48eedb236eacd1b714..9ed8f1731afe2bab723c66ea1e2e8c5042f6ce28 100644 --- a/src/operators/kernel/arm/multiclass_nms_kernel.cpp +++ b/src/operators/kernel/arm/multiclass_nms_kernel.cpp @@ -14,263 +14,21 @@ limitations under the License. */ #ifdef MULTICLASSNMS_OP -#pragma once - #include "operators/kernel/multiclass_nms_kernel.h" +#include "operators/kernel/central-arm-func/multiclass_nms_arm_func.h" namespace paddle_mobile { namespace operators { -constexpr int kOutputDim = 6; -constexpr int kBBoxSize = 4; - -template -bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - -template -static inline void GetMaxScoreIndex( - const std::vector& scores, const T threshold, int top_k, - std::vector>* sorted_indices) { - for (size_t i = 0; i < scores.size(); ++i) { - if (scores[i] > threshold) { - sorted_indices->push_back(std::make_pair(scores[i], i)); - } - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices->begin(), sorted_indices->end(), - SortScorePairDescend); - // Keep top_k scores if needed. - if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { - sorted_indices->resize(top_k); - } -} - -template -static inline T BBoxArea(const T* box, const bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - // If coordinate values are is invalid - // (e.g. xmax < xmin or ymax < ymin), return 0. - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { - // If coordinate values are not within range [0, 1]. 
- return (w + 1) * (h + 1); - } - } -} - -template -static inline T JaccardOverlap(const T* box1, const T* box2, - const bool normalized) { - if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || - box2[3] < box1[1]) { - return static_cast(0.); - } else { - const T inter_xmin = std::max(box1[0], box2[0]); - const T inter_ymin = std::max(box1[1], box2[1]); - const T inter_xmax = std::min(box1[2], box2[2]); - const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = inter_xmax - inter_xmin; - const T inter_h = inter_ymax - inter_ymin; - const T inter_area = inter_w * inter_h; - const T bbox1_area = BBoxArea(box1, normalized); - const T bbox2_area = BBoxArea(box2, normalized); - return inter_area / (bbox1_area + bbox2_area - inter_area); - } -} - -template -static inline void NMSFast(const Tensor& bbox, const Tensor& scores, - const T score_threshold, const T nms_threshold, - const T eta, const int64_t top_k, - std::vector* selected_indices) { - // The total boxes for each instance. - int64_t num_boxes = bbox.dims()[0]; - // 4: [xmin ymin xmax ymax] - int64_t box_size = bbox.dims()[1]; - - std::vector scores_data(num_boxes); - std::copy_n(scores.data(), num_boxes, scores_data.begin()); - std::vector> sorted_indices; - GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); - - selected_indices->clear(); - T adaptive_threshold = nms_threshold; - const T* bbox_data = bbox.data(); - - while (sorted_indices.size() != 0) { - const int idx = sorted_indices.front().second; - bool keep = true; - for (size_t k = 0; k < selected_indices->size(); ++k) { - if (keep) { - const int kept_idx = (*selected_indices)[k]; - T overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, true); - keep = overlap <= adaptive_threshold; - } else { - break; - } - } - if (keep) { - selected_indices->push_back(idx); - } - sorted_indices.erase(sorted_indices.begin()); - if (keep && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; - } - } -} - -template -void MultiClassNMS(const Tensor& scores, const Tensor& bboxes, - std::map>* indices, int* num_nmsed_out, - const int& background_label, const int& nms_top_k, - const int& keep_top_k, const T& nms_threshold, - const T& nms_eta, const T& score_threshold) { - int64_t class_num = scores.dims()[0]; - int64_t predict_dim = scores.dims()[1]; - int num_det = 0; - for (int64_t c = 0; c < class_num; ++c) { - if (c == background_label) continue; - Tensor score = scores.Slice(c, c + 1); - /// [c] is key - NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, - nms_top_k, &((*indices)[c])); - num_det += (*indices)[c].size(); - } - - *num_nmsed_out = num_det; - const T* scores_data = scores.data(); - if (keep_top_k > -1 && num_det > keep_top_k) { - std::vector>> score_index_pairs; - for (const auto& it : *indices) { - int label = it.first; - const T* sdata = scores_data + label * predict_dim; - const std::vector& label_indices = it.second; - for (size_t j = 0; j < label_indices.size(); ++j) { - int idx = label_indices[j]; - // PADDLE_ENFORCE_LT(idx, predict_dim); - score_index_pairs.push_back( - std::make_pair(sdata[idx], std::make_pair(label, idx))); - } - } - // Keep top k results per image. - std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), - SortScorePairDescend>); - score_index_pairs.resize(keep_top_k); - - // Store the new indices. 
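Editor's note: the NMS logic being relocated here (JaccardOverlap plus NMSFast) is a greedy keep/suppress loop over score-sorted boxes. A compact standalone sketch of the same idea, minus the adaptive eta threshold; Box, IoU, and GreedyNMS are illustrative names, and boxes follow the kernel's {xmin, ymin, xmax, ymax} normalized layout:

#include <algorithm>
#include <cassert>
#include <vector>

struct Box { float xmin, ymin, xmax, ymax; };

float IoU(const Box& a, const Box& b) {
  const float ix =
      std::max(0.f, std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin));
  const float iy =
      std::max(0.f, std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin));
  const float inter = ix * iy;
  const float area_a = (a.xmax - a.xmin) * (a.ymax - a.ymin);
  const float area_b = (b.xmax - b.xmin) * (b.ymax - b.ymin);
  return inter / (area_a + area_b - inter);
}

// `boxes` must already be sorted by descending score.
std::vector<int> GreedyNMS(const std::vector<Box>& boxes, float threshold) {
  std::vector<int> kept;
  for (int i = 0; i < static_cast<int>(boxes.size()); ++i) {
    bool keep = true;
    for (int k : kept)
      if (IoU(boxes[i], boxes[k]) > threshold) { keep = false; break; }
    if (keep) kept.push_back(i);
  }
  return kept;
}

int main() {
  std::vector<Box> boxes{{0.0f, 0.0f, 0.5f, 0.5f},    // best box, kept
                         {0.05f, 0.0f, 0.55f, 0.5f},  // heavy overlap, dropped
                         {0.6f, 0.6f, 0.9f, 0.9f}};   // disjoint, kept
  auto kept = GreedyNMS(boxes, 0.5f);
  assert(kept.size() == 2 && kept[0] == 0 && kept[1] == 2);
}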
- std::map> new_indices; - for (size_t j = 0; j < score_index_pairs.size(); ++j) { - int label = score_index_pairs[j].second.first; - int idx = score_index_pairs[j].second.second; - new_indices[label].push_back(idx); - } - new_indices.swap(*indices); - *num_nmsed_out = keep_top_k; - } -} - -template -void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, - const std::map>& selected_indices, - Tensor* outs) { - int predict_dim = scores.dims()[1]; - auto* scores_data = scores.data(); - auto* bboxes_data = bboxes.data(); - auto* odata = outs->data(); - - int count = 0; - for (const auto& it : selected_indices) { - /// one batch - int label = it.first; - const T* sdata = scores_data + label * predict_dim; - const std::vector& indices = it.second; - for (size_t j = 0; j < indices.size(); ++j) { - int idx = indices[j]; - const T* bdata = bboxes_data + idx * kBBoxSize; - odata[count * kOutputDim] = label; // label - odata[count * kOutputDim + 1] = sdata[idx]; // score - // xmin, ymin, xmax, ymax - std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T)); - count++; - } - } +template <> +bool MultiClassNMSKernel::Init(MultiClassNMSParam *param) { + return true; } template <> void MultiClassNMSKernel::Compute( - const MultiClassNMSParam& param) const { - const auto* input_bboxes = param.InputBBoxes(); - const auto& input_bboxes_dims = input_bboxes->dims(); - - const auto* input_scores = param.InputScores(); - const auto& input_scores_dims = input_scores->dims(); - - auto* outs = param.Out(); - auto background_label = param.BackGroundLabel(); - auto nms_top_k = param.NMSTopK(); - auto keep_top_k = param.KeepTopK(); - auto nms_threshold = param.NMSThreshold(); - auto nms_eta = param.NMSEta(); - auto score_threshold = param.ScoreThreshold(); - - int64_t batch_size = input_scores_dims[0]; - int64_t class_num = input_scores_dims[1]; - int64_t predict_dim = input_scores_dims[2]; - int64_t box_dim = input_bboxes_dims[2]; - - std::vector>> all_indices; - std::vector batch_starts = {0}; - for (int64_t i = 0; i < batch_size; ++i) { - Tensor ins_score = input_scores->Slice(i, i + 1); - ins_score.Resize({class_num, predict_dim}); - - Tensor ins_boxes = input_bboxes->Slice(i, i + 1); - ins_boxes.Resize({predict_dim, box_dim}); - - std::map> indices; - int num_nmsed_out = 0; - MultiClassNMS(ins_score, ins_boxes, &indices, &num_nmsed_out, - background_label, nms_top_k, keep_top_k, nms_threshold, - nms_eta, score_threshold); - all_indices.push_back(indices); - batch_starts.push_back(batch_starts.back() + num_nmsed_out); - } - - int num_kept = batch_starts.back(); - if (num_kept == 0) { - float* od = outs->mutable_data({1}); - od[0] = -1; - } else { - outs->mutable_data({num_kept, kOutputDim}); - for (int64_t i = 0; i < batch_size; ++i) { - Tensor ins_score = input_scores->Slice(i, i + 1); - ins_score.Resize({class_num, predict_dim}); - - Tensor ins_boxes = input_bboxes->Slice(i, i + 1); - ins_boxes.Resize({predict_dim, box_dim}); - - int64_t s = batch_starts[i]; - int64_t e = batch_starts[i + 1]; - if (e > s) { - Tensor out = outs->Slice(s, e); - MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out); - } - } - } - - // framework::LoD lod; - // lod.emplace_back(batch_starts); - // - // outs->set_lod(lod); + const MultiClassNMSParam ¶m) const { + MultiClassNMSCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/pool_kernel.cpp b/src/operators/kernel/arm/pool_kernel.cpp index 646f538d7a637b4b009b51b9305d607325a8e54e..be2189340f480bef80fd00a612cf32e71ea10a1c 
100644 --- a/src/operators/kernel/arm/pool_kernel.cpp +++ b/src/operators/kernel/arm/pool_kernel.cpp @@ -14,65 +14,19 @@ limitations under the License. */ #ifdef POOL_OP -#include -#include "common/log.h" - +#include "operators/kernel/pool_kernel.h" +#include "../central-arm-func/pool_arm_func.h" namespace paddle_mobile { namespace operators { -inline void PoolBasic(std::string pooling_type, std::vector ksize, - std::vector strides, std::vector paddings, - const Tensor *in_x, Tensor *out) { - if (pooling_type == "max") { - math::PoolFunctor, float> pool2d_forward; - math::MaxPool pool_process; - pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out); - - } else if (pooling_type == "avg") { - math::PoolFunctor, float> pool2d_forward; - math::AvgPool pool_process; - pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out); - } +template <> +bool PoolKernel::Init(PoolParam *param) { + return true; } template <> void PoolKernel::Compute(const PoolParam ¶m) const { - const Tensor *in_x = param.Input(); - Tensor *out = param.Output(); - std::string pooling_type = param.PoolingType(); - - std::vector ksize = param.Ksize(); - - std::vector strides = param.Strides(); - - std::vector paddings = param.Paddings(); - if (ksize.size() != 2) { - LOG(paddle_mobile::LogLevel::kLOG_ERROR) - << "Pool op only supports 2D and 3D input."; - } - - if (param.isGlobalPooling()) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x->dims()[i + 2]); - } - } else if (ksize[0] == 3 && ksize[0] == ksize[1]) { - if (pooling_type == "max") { - math::Pool3x3Max(strides, paddings, in_x, out); - } else if (pooling_type == "avg") { - math::Pool3x3Avg(strides, paddings, in_x, out); - } - - } else if (ksize[0] == 2 && ksize[0] == ksize[1]) { - if (pooling_type == "max") { - math::Pool2x2Max(strides, paddings, in_x, out); - } else if (pooling_type == "avg") { - math::Pool2x2Avg(strides, paddings, in_x, out); - } - - } else { - PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); - } + PoolCompute(param); } } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/prelu_kernel.cpp b/src/operators/kernel/arm/prelu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a83783a078f4ec680fbab238a2839226546f894c --- /dev/null +++ b/src/operators/kernel/arm/prelu_kernel.cpp @@ -0,0 +1,113 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PRELU_OP + +#include "operators/kernel/prelu_kernel.h" +#include + +namespace paddle_mobile { +namespace operators { + +template +struct PReluFunctor { + explicit PReluFunctor(float slope) { this->slope_ = slope; } + inline T operator()(T in) const { return in > 0 ? 
in : in * slope_; }
+
+  float slope_ = 0.0f;
+};
+
+/*
+ * @b Platform-specific implementation; param is passed in from the op layer.
+ */
+template <>
+void PReluKernel<CPU, float>::Compute(const PReluParam &param) const {
+  const auto *input_x = param.InputX();
+  auto *input_x_ptr = input_x->data<float>();
+  auto *out = param.Out();
+  auto *out_ptr = out->mutable_data<float>();
+
+  if (param.Slopes().size() == 1) {
+    PReluFunctor<float> func_(param.Slopes()[0]);
+    math::Transform trans;
+    trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_);
+  } else if (param.Slopes().size() > 1) {
+    const int dim_size = input_x->dims().size();
+    switch (dim_size) {
+      case 0:
+        break;
+      case 1: {
+        const int input_width = input_x->dims()[0];
+
+        #pragma omp parallel for
+        for (int w = 0; w < input_width; ++w) {
+          // element-wise PReLU with one slope per element
+          out_ptr[w] = input_x_ptr[w] > 0
+                           ? input_x_ptr[w]
+                           : input_x_ptr[w] * param.Slopes()[w];
+        }
+      } break;
+      case 2: {
+        const int input_height = input_x->dims()[0];
+        const int input_width = input_x->dims()[1];
+
+        math::Transform trans;
+        #pragma omp parallel for
+        for (int h = 0; h < input_height; ++h) {
+          PReluFunctor<float> func_(param.Slopes()[h]);
+          const float *ptr = input_x_ptr + h * input_width;
+          float *optr = out_ptr + h * input_width;
+          trans(ptr, ptr + input_width, optr, func_);
+        }
+      } break;
+      case 3: {
+        const int chan_size = input_x->dims()[0];
+        const int input_height = input_x->dims()[1];
+        const int input_width = input_x->dims()[2];
+
+        math::Transform trans;
+        #pragma omp parallel for
+        for (int c = 0; c < chan_size; ++c) {
+          PReluFunctor<float> func_(param.Slopes()[c]);
+          int size = input_height * input_width;
+          const float *ptr = input_x_ptr + c * size;
+          float *optr = out_ptr + c * size;
+          trans(ptr, ptr + size, optr, func_);
+        }
+      } break;
+      case 4:
+      default: {
+        const int batch_size = input_x->dims()[0];
+        const int chan_size = input_x->dims()[1];
+        const int input_height = input_x->dims()[2];
+        const int input_width = input_x->dims()[3];
+        math::Transform trans;
+
+        #pragma omp parallel for
+        for (int b = 0; b < batch_size; ++b) {
+          for (int c = 0; c < chan_size; ++c) {
+            PReluFunctor<float> func_(param.Slopes()[c]);
+            int size = input_height * input_width;
+            // offset of the (b, c) plane in NCHW layout
+            const float *ptr = input_x_ptr + (b * chan_size + c) * size;
+            float *optr = out_ptr + (b * chan_size + c) * size;
+            trans(ptr, ptr + size, optr, func_);
+          }
+        }
+      }  // case 4, default
+      break;
+    }
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/arm/prior_box_kernel.cpp b/src/operators/kernel/arm/prior_box_kernel.cpp
index e029c555d4d40745976be45b7a9c022eb62705c7..217d4b83cb1156a0e942c5ced5917546250e8bb1 100644
--- a/src/operators/kernel/arm/prior_box_kernel.cpp
+++ b/src/operators/kernel/arm/prior_box_kernel.cpp
@@ -14,133 +14,20 @@ limitations under the License.
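Editor's note: the PReLU rule applied by the functor above, in isolation: positive inputs pass through, negative inputs are scaled by the (per-channel) slope. A two-line standalone check:

#include <cassert>

float PRelu(float in, float slope) { return in > 0 ? in : in * slope; }

int main() {
  assert(PRelu(3.0f, 0.25f) == 3.0f);    // positive: unchanged
  assert(PRelu(-4.0f, 0.25f) == -1.0f);  // negative: scaled by the slope
}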
*/ #ifdef PRIORBOX_OP -#pragma once - #include "operators/kernel/prior_box_kernel.h" +#include "operators/kernel/central-arm-func/prior_box_arm_func.h" namespace paddle_mobile { namespace operators { -template -struct ClipFunctor { - inline T operator()(T in) const { - return std::min(std::max(in, 0.), 1.); - } -}; +template <> +bool PriorBoxKernel::Init(PriorBoxParam *param) { + return true; +} template <> void PriorBoxKernel::Compute(const PriorBoxParam ¶m) const { - const auto *input_ = param.Input(); - const auto &input_dims = input_->dims(); - - const auto *input_image = param.InputImage(); - const auto &input_image_dims = input_image->dims(); - - const auto &min_sizes = param.MinSizes(); - const auto &max_sizes = param.MaxSizes(); - const auto &variances = param.Variances(); - const auto &input_aspect_ratio = param.AspectRatios(); - const bool &flip = param.Flip(); - const bool &clip = param.Clip(); - const float &step_w = param.StepW(); - const float &step_h = param.StepH(); - const float &offset = param.Offset(); - - Tensor *output_boxes = param.OutputBoxes(); - auto output_boxes_dataptr = output_boxes->mutable_data(); - Tensor *output_variances = param.OutputVariances(); - auto output_variances_dataptr = output_variances->mutable_data(); - - std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); - - auto img_width = input_image_dims[3]; - auto img_height = input_image_dims[2]; - - auto feature_width = input_dims[3]; - auto feature_height = input_dims[2]; - - auto stride0 = output_boxes->dims()[1] * output_boxes->dims()[2] * - output_boxes->dims()[3]; - auto stride1 = output_boxes->dims()[2] * output_boxes->dims()[3]; - auto stride2 = output_boxes->dims()[3]; - - float step_width, step_height; - /// 300 / 19 - if (step_w == 0 || step_h == 0) { - step_width = static_cast(img_width) / feature_width; - step_height = static_cast(img_height) / feature_height; - } else { - step_width = step_w; - step_height = step_h; - } - - int num_priors = aspect_ratios.size() * min_sizes.size(); - if (!max_sizes.empty()) { - num_priors += max_sizes.size(); - } - - for (int h = 0; h < feature_height; ++h) { - for (int w = 0; w < feature_width; ++w) { - /// map origin image - float center_x = (w + offset) * step_width; - float center_y = (h + offset) * step_height; - float box_width, box_height; - int idx = 0; - for (size_t s = 0; s < min_sizes.size(); ++s) { - auto min_size = min_sizes[s]; - // priors with different aspect ratios - for (float ar : aspect_ratios) { - box_width = min_size * sqrt(ar) / 2.; - box_height = min_size / sqrt(ar) / 2.; - /// box_width/2 , / img_width 为了得到feature map 相对于 - /// 原图的归一化位置的比例。 - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] = - (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] = - (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] = - (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] = - (center_y + box_height) / img_height; - idx++; - } - if (!max_sizes.empty()) { - auto max_size = max_sizes[s]; - // square prior with size sqrt(minSize * maxSize) - box_width = box_height = sqrt(min_size * max_size) / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] = - (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] = - (center_y - box_height) / img_height; - 
output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] = - (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] = - (center_y + box_height) / img_height; - idx++; - } - } - } - } - if (clip) { - math::Transform trans; - ClipFunctor clip_func; - trans(output_boxes_dataptr, output_boxes_dataptr + output_boxes->numel(), - output_boxes_dataptr, clip_func); - } - - if ((variances.size() != 4)) { - LOG(kLOG_ERROR) << " variances.size() must be 4."; - } - - int64_t box_num = feature_height * feature_width * num_priors; - - for (int i = 0; i < box_num; i++) { - output_variances_dataptr[4 * i] = variances[0]; - output_variances_dataptr[4 * i + 1] = variances[1]; - output_variances_dataptr[4 * i + 2] = variances[2]; - output_variances_dataptr[4 * i + 3] = variances[3]; - } + PriorBoxCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/relu_kernel.cpp b/src/operators/kernel/arm/relu_kernel.cpp index 86bf53e5a1e5ecc285c9e9f20cb412d290d535d1..63259a0c303f5e186f9eb90b98f2a8685f8ba5ca 100644 --- a/src/operators/kernel/arm/relu_kernel.cpp +++ b/src/operators/kernel/arm/relu_kernel.cpp @@ -14,96 +14,22 @@ limitations under the License. */ #ifdef RELU_OP -#pragma once - #include "operators/kernel/relu_kernel.h" -#include +#include "operators/kernel/central-arm-func/relu_arm_func.h" namespace paddle_mobile { namespace operators { -template -struct ReluFunctor { - inline T operator()(T in) const { return in > 0 ? in : 0; } -}; +template <> +bool ReluKernel::Init(ReluParam *param) { + return true; +} -/* - * @b 特化到具体平台的实现, param 从 op 层传入 - * */ template <> void ReluKernel::Compute(const ReluParam ¶m) const { - const auto *input_x = param.InputX(); - auto *input_x_ptr = input_x->data(); - auto *out = param.Out(); - auto *out_ptr = out->mutable_data(); - - int numel = input_x->numel(); - // if (numel > 64) { - // asm volatile( - // "pld [%[input_x_ptr], #0] \n\t" - // "vmov.f32 q8, #0.0 \n\t" - // "subs %[num], %[num], #32 \n\t" - // "blt end_num_%= \n\t" - // "loop_num_%=: \n\t" - // "pld [%[input_x_ptr], #1024] \n\t" - // - // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" - // - // "vmax.f32 q0, q0, q8 \n\t" - // "vmax.f32 q1, q1, q8 \n\t" - // "vmax.f32 q2, q2, q8 \n\t" - // "vmax.f32 q3, q3, q8 \n\t" - // "vmax.f32 q4, q4, q8 \n\t" - // "vmax.f32 q5, q5, q8 \n\t" - // "vmax.f32 q6, q6, q8 \n\t" - // "vmax.f32 q7, q7, q8 \n\t" - // - // "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" - // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" - // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" - // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" - // - // "subs %[num], %[num], #32 \n\t" - // "bge loop_num_%= \n\t" - // "end_num_%=: \n\t" - // "cmp %[num], #0 \n\t" - // "bge end_%= \n\t" - // "mov r6, #4 \n\t" - // "mul r5, %[num], r6 \n\t" - // "add %[input_x_ptr], %[input_x_ptr], r5 \n\t" - // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" - // "vmax.f32 q0, q0, q8 \n\t" - // "vmax.f32 q1, q1, q8 \n\t" - // "vmax.f32 q2, q2, q8 \n\t" - // "vmax.f32 q3, q3, q8 \n\t" - // "vmax.f32 q4, q4, q8 \n\t" - // "vmax.f32 q5, q5, q8 \n\t" - // "vmax.f32 q6, q6, q8 \n\t" - // "vmax.f32 q7, q7, q8 \n\t" - // "add %[out_ptr], %[out_ptr], r5 \n\t" - // "vst1.32 {q0, q1}, [%[out_ptr]]! 
\n\t" - // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" - // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" - // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" - // "end_%=: \n\t" - // : - // : - // [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] - // "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - // "q7", "q8", "r5", - // "r6"); - // } else { - ReluFunctor func_; - math::Transform trans; - trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_); - // } + ReluCompute(param); } + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/reshape_kernel.cpp b/src/operators/kernel/arm/reshape_kernel.cpp index 3d40309e97145e1df70f2a4191ee571c4a05627a..5ae8e5e3f945d115215652ded58dc8571868fcd7 100644 --- a/src/operators/kernel/arm/reshape_kernel.cpp +++ b/src/operators/kernel/arm/reshape_kernel.cpp @@ -14,39 +14,20 @@ limitations under the License. */ #ifdef RESHAPE_OP -#pragma once - #include "operators/kernel/reshape_kernel.h" +#include "operators/kernel/central-arm-func/reshape_arm_func.h" namespace paddle_mobile { namespace operators { +template <> +bool ReshapeKernel::Init(ReshapeParam *param) { + return true; +} + template <> void ReshapeKernel::Compute(const ReshapeParam ¶m) const { - const auto *input_x = param.InputX(); - const auto &input_x_dims = input_x->dims(); - auto *out = param.Out(); - framework::DDim out_dims = out->dims(); - const auto *input_shape = param.InputShape(); - - if (input_shape) { - auto *shape_data = input_shape->data(); - framework::Tensor cpu_shape_tensor; - auto shape = - std::vector(shape_data, shape_data + input_shape->numel()); - out_dims = ValidateShape(shape, input_x->dims()); - } - - bool inplace = param.Inplace(); - out->Resize(out_dims); - if (!inplace) { - out->mutable_data(); - framework::TensorCopy(*input_x, out); - out->Resize(out_dims); - } else { - out->ShareDataWith(*input_x); - out->Resize(out_dims); - } + ReshapeCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/resize_kernel.cpp b/src/operators/kernel/arm/resize_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bef24f1c56f656d25261c9c837e5455244d1ad87 --- /dev/null +++ b/src/operators/kernel/arm/resize_kernel.cpp @@ -0,0 +1,124 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef RESIZE_OP + +#include "operators/kernel/resize_kernel.h" +#include + +namespace paddle_mobile { +namespace operators { +void BiLinearResizeTensor(const float* src, const int src_height, + const int src_width, float* dst, const int dst_height, + const int dst_width) { + const float scale_w = src_width / (float)dst_width; + const float scale_h = src_height / (float)dst_height; + float* dst_data = dst; + const float* src_data = src; + + for (int dst_h = 0; dst_h < dst_height; ++dst_h) { + float fh = dst_h * scale_h; + + int src_h = std::floor(fh); + + fh -= src_h; + const float w_h0 = std::abs((float)1.0 - fh); + const float w_h1 = std::abs(fh); + + const int dst_offset_1 = dst_h * dst_width; + const int src_offset_1 = src_h * src_width; + + float* dst_data_ptr = dst_data + dst_offset_1; + + for (int dst_w = 0; dst_w < dst_width; ++dst_w) { + float fw = dst_w * scale_w; + int src_w = std::floor(fw); + fw -= src_w; + const float w_w0 = std::abs((float)1.0 - fw); + const float w_w1 = std::abs(fw); + + float dst_value = 0; + + const int src_idx = src_offset_1 + src_w; + dst_value += (w_h0 * w_w0 * src_data[src_idx]); + int flag = 0; + if (src_w + 1 < src_width) { + dst_value += (w_h0 * w_w1 * src_data[src_idx + 1]); + ++flag; + } + if (src_h + 1 < src_height) { + dst_value += (w_h1 * w_w0 * src_data[src_idx + src_width]); + ++flag; + } + + if (flag > 1) { + dst_value += (w_h1 * w_w1 * src_data[src_idx + src_width + 1]); + // ++flag; + } + *(dst_data_ptr++) = dst_value; + } + } +} + +void ResizeTensor(const Tensor* src, const int src_n, const int src_c, + Tensor* dst, const int dst_n, const int dst_c) { + framework::DDim in_dims = src->dims(); + const int src_chans = in_dims[1]; + const int src_height = in_dims[2]; + const int src_width = in_dims[3]; + const int src_offset = (src_n * src_chans + src_c) * src_height * src_width; + + framework::DDim out_dims = dst->dims(); + const int dst_chans = out_dims[1]; + const int dst_height = out_dims[2]; + const int dst_width = out_dims[3]; + const int dst_offset = (dst_n * dst_chans + dst_c) * dst_height * dst_width; + + const auto* src_ptr = src->data(); + auto* dst_ptr = dst->data(); + const auto* src_data = &(src_ptr[src_offset]); + auto* dst_data = &(dst_ptr[dst_offset]); + BiLinearResizeTensor(src_data, src_height, src_width, dst_data, dst_height, + dst_width); +} + +void ResizeTensor(const Tensor* src, Tensor* dst) { + framework::DDim in_dims = src->dims(); + framework::DDim out_dims = dst->dims(); + PADDLE_MOBILE_ENFORCE(in_dims[0] == out_dims[0], + "src tensor batch num not equal to dst tensor"); + PADDLE_MOBILE_ENFORCE(in_dims[1] == out_dims[1], + "src tensor channel num not equal to dst tensor"); + for (int n = 0, batch_num = in_dims[0]; n < batch_num; ++n) { + for (int c = 0, chan_num = in_dims[1]; c < chan_num; ++c) { + ResizeTensor(src, n, c, dst, n, c); + } + } +} + +template <> +void ResizeKernel::Compute(const ResizeParam& param) const { + const auto* input_x = param.InputX(); + const auto& input_x_dims = input_x->dims(); + auto* out = param.Out(); + framework::DDim out_dims = CalOutputShape(param); + + out->Resize(out_dims); + ResizeTensor(input_x, out); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/scale_kernel.cpp b/src/operators/kernel/arm/scale_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c20e70896145958aa91de4f00de7ad2eeba1bb5c --- /dev/null +++ b/src/operators/kernel/arm/scale_kernel.cpp @@ -0,0 +1,146 @@ +/* Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SCALE_OP
+
+#include "operators/kernel/scale_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+/*
+ * @b Platform-specific implementation; param is passed in from the op layer.
+ */
+template <>
+void ScaleKernel<CPU, float>::Compute(const ScaleParam &param) const {
+  const auto *input_x = param.InputX();
+  auto *input_x_ptr = input_x->data<float>();
+  auto *out = param.Out();
+  auto *out_ptr = out->mutable_data<float>();
+
+  const vector<float> scales = param.Scales();
+  bool has_bias = param.HasBias();
+
+  const int dim_size = input_x->dims().size();
+  switch (dim_size) {
+    case 1: {
+      const int input_width = input_x->dims()[0];
+      if (has_bias) {
+        const vector<float> biases = param.Biases();
+        #pragma omp parallel for
+        for (int w = 0; w < input_width; w++) {
+          out_ptr[w] = input_x_ptr[w] * scales[w] + biases[w];
+        }
+      } else {
+        #pragma omp parallel for
+        for (int w = 0; w < input_width; w++) {
+          out_ptr[w] = input_x_ptr[w] * scales[w];
+        }
+      }
+    } break;
+    case 2: {
+      const int input_height = input_x->dims()[0];
+      const int input_width = input_x->dims()[1];
+
+      if (has_bias) {
+        const vector<float> biases = param.Biases();
+        #pragma omp parallel for
+        for (int h = 0; h < input_height; ++h) {
+          const float *iptr = input_x_ptr + h * input_width;
+          float *optr = out_ptr + h * input_width;
+          for (int w = 0; w < input_width; ++w) {
+            optr[w] = iptr[w] * scales[w] + biases[w];
+          }
+        }
+      } else {
+        #pragma omp parallel for
+        for (int h = 0; h < input_height; ++h) {
+          const float *iptr = input_x_ptr + h * input_width;
+          float *optr = out_ptr + h * input_width;
+          for (int w = 0; w < input_width; ++w) {
+            optr[w] = iptr[w] * scales[w];
+          }
+        }
+      }
+    } break;
+    case 3: {
+      const int chan_size = input_x->dims()[0];
+      const int input_height = input_x->dims()[1];
+      const int input_width = input_x->dims()[2];
+      int size = input_width * input_height;
+
+      if (has_bias) {
+        const vector<float> biases = param.Biases();
+
+        #pragma omp parallel for
+        for (int c = 0; c < chan_size; ++c) {
+          const float *iptr = input_x_ptr + c * size;
+          float *optr = out_ptr + c * size;
+          for (int i = 0; i < size; ++i) {
+            optr[i] = iptr[i] * scales[c] + biases[c];
+          }
+        }
+      } else {
+        #pragma omp parallel for
+        for (int c = 0; c < chan_size; ++c) {
+          const float *iptr = input_x_ptr + c * size;
+          float *optr = out_ptr + c * size;
+          for (int i = 0; i < size; ++i) {
+            optr[i] = iptr[i] * scales[c];
+          }
+        }
+      }
+    } break;
+
+    case 4: {
+      // NCHW layout: dims() = {batch, channel, height, width}
+      const int batch_size = input_x->dims()[0];
+      const int chan_size = input_x->dims()[1];
+      const int input_height = input_x->dims()[2];
+      const int input_width = input_x->dims()[3];
+      int size = input_width * input_height;
+
+      if (has_bias) {
+        const vector<float> biases = param.Biases();
+
+        #pragma omp parallel for
+        for (int b = 0; b < batch_size; ++b) {
+          for (int c = 0; c < chan_size; ++c) {
+            // offset of the (b, c) plane in NCHW layout
+            const float *iptr = input_x_ptr + (b * chan_size + c) * size;
+            float *optr = out_ptr + (b * chan_size + c) * size;
+            for (int i = 0; i < size; ++i) {
+              optr[i] = iptr[i] * scales[c] + biases[c];
+            }
+          }
+        }
+      } else {
+        #pragma omp parallel for
+        for (int b = 0; b < batch_size; ++b) {
+          for (int c = 0; c < chan_size; ++c) {
+            const float *iptr = input_x_ptr + (b * chan_size + c) * size;
+            float *optr = out_ptr + (b * chan_size + c) * size;
+            for (int i = 0; i < size; ++i) {
+              optr[i] = iptr[i] * scales[c];
+            }
+          }
+        }
+      }
+    } break;
+    default:
+      break;
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/arm/sigmoid_kernel.cpp b/src/operators/kernel/arm/sigmoid_kernel.cpp
index 20f275ff482d7073195d075c374e4a0969993714..9f5e6a2048d940ddc4592777a773c69d976033bd 100644
--- a/src/operators/kernel/arm/sigmoid_kernel.cpp
+++ b/src/operators/kernel/arm/sigmoid_kernel.cpp
@@ -15,69 +15,25 @@ limitations under the License. */
 
 #ifdef SIGMOID_OP
 
 #include "../sigmoid_kernel.h"
-#if __ARM_NEON
+#include "../central-arm-func/sigmoid_arm_func.h"
+#ifdef __ARM_NEON
 #include "../../math/math_func_neon.h"
 #endif
-
+#include <cmath>
 namespace paddle_mobile {
 namespace operators {
 
 using framework::DDim;
 using framework::Tensor;
 
-void sigmoid(const Tensor *X, Tensor *Y) {
-#if __ARM_NEON
-  const float *input = X->data<float>();
-  float *output = Y->mutable_data<float>();
-  const DDim &dDim = X->dims();
-  int axis_index = 1;
-  if (dDim.size() < 4) {
-    axis_index = 0;
-  }
-  DDim outer_ddim =
-      paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1);
-  DDim inner_ddim =
-      paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size());
-  int out_size = paddle_mobile::framework::product(outer_ddim);
-  int inner_size = paddle_mobile::framework::product(inner_ddim);
-
-  DLOG << "outsize=" << out_size;
-  DLOG << "innersize=" << inner_size;
-  #pragma omp parallel for
-  for (int i = 0; i < out_size; ++i) {
-    const float *input_outer_ptr = input + i * inner_size;
-    float *output_outer_ptr = output + i * inner_size;
-    int nn = inner_size >> 2;
-    int remain = inner_size - (nn << 2);
-    float32x4_t _one = vdupq_n_f32(1.f);
-    for (; nn > 0; nn--) {
-      float32x4_t data = vld1q_f32(input_outer_ptr);
-      data = vnegq_f32(data);
-      data = exp_ps(data);
-      data = vaddq_f32(data, _one);
-      float32x4_t out_data = vrecpeq_f32(data);
-      out_data = vmulq_f32(vrecpsq_f32(data, out_data), out_data);
-      vst1q_f32(output_outer_ptr, out_data);
-
-      input_outer_ptr += 4;
-      output_outer_ptr += 4;
-    }
-    for (; remain > 0; remain--) {
-      *output_outer_ptr = 1.f / (1.f + exp(-*input_outer_ptr));
-      output_outer_ptr++;
-      input_outer_ptr++;
-    }
-  }
-#endif
+template <>
+bool SigmoidKernel<CPU, float>::Init(SigmoidParam *param) {
+  return true;
 }
 
 template <>
 void SigmoidKernel<CPU, float>::Compute(const SigmoidParam &param) const {
-  const Tensor *in_x = param.InputX();
-  Tensor *out = param.Out();
-  auto x_dims = in_x->dims();
-  out->Resize(x_dims);
-  sigmoid(in_x, out);
+  SigmoidCompute<float>(param);
 }
 
 template class SigmoidKernel<CPU, float>;
diff --git a/src/operators/kernel/arm/slice_kernel.cpp b/src/operators/kernel/arm/slice_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..62efec9d2fb01568a108df8f3516085d81865bf7
--- /dev/null
+++ b/src/operators/kernel/arm/slice_kernel.cpp
@@ -0,0 +1,22 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
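Editor's note: the NEON sigmoid deleted above evaluates 1/(1 + exp(-x)), using vrecpeq_f32 plus one vrecpsq_f32 Newton-Raphson step to refine the reciprocal; its scalar tail is the exact form. A standalone scalar sketch of the same function:

#include <cassert>
#include <cmath>

float Sigmoid(float x) { return 1.f / (1.f + std::exp(-x)); }

int main() {
  assert(std::fabs(Sigmoid(0.f) - 0.5f) < 1e-6f);  // symmetric midpoint
  assert(Sigmoid(6.f) > 0.99f && Sigmoid(-6.f) < 0.01f);  // saturating tails
}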
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SLICE_OP + +#include "operators/kernel/slice_kernel.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/arm/softmax_kernel.cpp b/src/operators/kernel/arm/softmax_kernel.cpp index 542283242d09abfbad8830eb0b36136ed35a6ef6..3ce763be38678319cfc23be83180450e5d3b209c 100644 --- a/src/operators/kernel/arm/softmax_kernel.cpp +++ b/src/operators/kernel/arm/softmax_kernel.cpp @@ -15,17 +15,19 @@ limitations under the License. */ #ifdef SOFTMAX_OP #include "../softmax_kernel.h" -#include "../../math/softmax.h" +#include "../central-arm-func/softmax_arm_func.h" +#include "operators/math/softmax.h" namespace paddle_mobile { namespace operators { +template <> +bool SoftmaxKernel::Init(SoftmaxParam *param) { + return true; +} + template <> void SoftmaxKernel::Compute(const SoftmaxParam ¶m) const { - const Tensor *in_x = param.InputX(); - Tensor *out = param.Out(); - auto x_dims = in_x->dims(); - out->Resize(x_dims); - math::SoftmaxFuntor()(in_x, out); + SoftmaxCompute(param); } template class SoftmaxKernel; diff --git a/src/operators/kernel/arm/transpose_kernel.cpp b/src/operators/kernel/arm/transpose_kernel.cpp index 1b41968f40d036d55b98298a76564dcc12576571..c358edd76e93cee3f8be6086a70c34671c87d383 100644 --- a/src/operators/kernel/arm/transpose_kernel.cpp +++ b/src/operators/kernel/arm/transpose_kernel.cpp @@ -14,67 +14,19 @@ limitations under the License. 
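Editor's note: the SoftmaxCompute/math::SoftmaxFuntor path invoked above performs row-wise softmax. Written out with the usual max-subtraction trick for numerical stability, the computation looks like this standalone sketch (an illustrative helper, not the library code):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

std::vector<float> Softmax(const std::vector<float>& x) {
  const float m = *std::max_element(x.begin(), x.end());
  std::vector<float> y(x.size());
  float sum = 0.f;
  for (size_t i = 0; i < x.size(); ++i) sum += (y[i] = std::exp(x[i] - m));
  for (float& v : y) v /= sum;  // probabilities sum to 1
  return y;
}

int main() {
  auto y = Softmax({1.f, 2.f, 3.f});
  assert(std::fabs(y[0] + y[1] + y[2] - 1.f) < 1e-6f);
  assert(y[2] > y[1] && y[1] > y[0]);  // order-preserving
}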
*/ #ifdef TRANSPOSE_OP #include "operators/kernel/transpose_kernel.h" +#include "operators/kernel/central-arm-func/transpose_arm_func.h" + namespace paddle_mobile { namespace operators { -// vector pos; -// template -// void TransposeFunc(const int numel, const T* input, const vector axis, -// const vector old_strides, const vector -// new_strides, T* output) { -// for (int i = 0; i < numel; ++i) { -// int old_idx = 0; -// int idx = i; -// for (int j = 0; j < axis.size(); ++j) { -// int order = axis[j]; -// old_idx += (idx / new_strides[j]) * old_strides[order]; -// idx %= new_strides[j]; -// } -// output[i] = input[old_idx]; -// } -// } - template <> -void TransposeKernel::Compute(const TransposeParam& param) const { - const auto* input_x = param.InputX(); - const auto input_x_dims = input_x->dims(); - auto* out = param.Out(); - const auto axis = param.Axis(); - const auto* input_x_data = input_x->data(); - auto* out_data = out->mutable_data(); - - size_t ndim = axis.size(); - std::vector xdim(ndim); - std::vector xstride(ndim); - std::vector xout(ndim); - for (int i = 0; i < ndim; i++) { - int j = ndim - 1 - i; - xdim[j] = input_x_dims[axis[i]]; - xstride[j] = 1; - for (int k = axis[i] + 1; k < ndim; k++) { - xstride[j] *= input_x_dims[k]; - } - xout[j] = xstride[j] * xdim[j]; - } +bool TransposeKernel::Init(TransposeParam *param) { + return true; +} - auto numel = input_x->numel(); - size_t pind = 0; - std::vector ind(ndim); - for (int i = 0; i < numel; i++) { - out_data[i] = input_x_data[pind]; - ind[0]++; - pind += xstride[0]; - for (int j = 0; j < ndim - 1; j++) { - if (ind[j] == xdim[j]) { - ind[j + 1]++; - ind[j] = 0; - pind += xstride[j + 1]; - pind -= xout[j]; - } else { - break; - } - } - } +template <> +void TransposeKernel::Compute(const TransposeParam ¶m) const { + TransposeCompute(param); } } // namespace operators diff --git a/src/operators/kernel/batchnorm_kernel.h b/src/operators/kernel/batchnorm_kernel.h index 6c795b2d5e9e7e81fb25d4a1a6dd3ca13c04bd9b..367dd0996c0df5fba7c3570285cf5e2cfd3fac99 100644 --- a/src/operators/kernel/batchnorm_kernel.h +++ b/src/operators/kernel/batchnorm_kernel.h @@ -29,6 +29,7 @@ class BatchNormKernel : public framework::OpKernelBase { public: void Compute(const BatchNormParam ¶m) const; + bool Init(BatchNormParam *param); }; } // namespace operators diff --git a/src/operators/kernel/box_coder_kernel.h b/src/operators/kernel/box_coder_kernel.h index 1c612b373cd086fcd566fe69e71eb77e4d1a30b6..2ad63ecd90a07d955c3e239277ac1bd60f3510bb 100644 --- a/src/operators/kernel/box_coder_kernel.h +++ b/src/operators/kernel/box_coder_kernel.h @@ -30,6 +30,7 @@ class BoxCoderKernel : public framework::OpKernelBase { public: void Compute(const BoxCoderParam& param) const; + bool Init(BoxCoderParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/central-arm-func/batchnorm_arm_func.h b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..cc591035065e4cbbe71ff8f6bd6cbab9c6fe9e79 --- /dev/null +++ b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h @@ -0,0 +1,303 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
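Editor's note: the removed Compute above permutes an N-d tensor by walking per-axis indices like an odometer. The underlying index mapping is easiest to check in the 2-D case, out[j][i] = in[i][j]; a minimal sketch:

#include <cassert>
#include <vector>

std::vector<float> Transpose2D(const std::vector<float>& in, int rows,
                               int cols) {
  std::vector<float> out(in.size());
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j) out[j * rows + i] = in[i * cols + j];
  return out;
}

int main() {
  // 2x3 -> 3x2
  auto out = Transpose2D({1, 2, 3, 4, 5, 6}, 2, 3);
  assert((out == std::vector<float>{1, 4, 2, 5, 3, 6}));
}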
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef BATCHNORM_OP + +#pragma once + +#include +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void BatchnormCompute(const BatchNormParam ¶m) { + const Tensor *input_x = param.InputX(); + auto input_x_ptr = input_x->data(); + const auto &x_dims = input_x->dims(); + const int N = x_dims[0]; + const int C = x_dims[1]; + const int H = x_dims[2]; + const int W = x_dims[3]; + const int stride0 = C * H * W; + const int stride1 = H * W; + const int stride2 = W; + Tensor *out = param.OutputY(); + auto out_ptr = out->mutable_data(); + const float epsilon = param.Epsilon(); + const Tensor *mean = param.InputMean(); + const Tensor *variance = param.InputVariance(); + const Tensor *scale = param.InputScale(); + const Tensor *bias = param.InputBias(); + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + // Tensor inv_std; + // auto inv_std_ptr = inv_std.mutable_data(make_ddim({C})); + + PADDLE_MOBILE_ENFORCE(C == variance->numel(), + "C must equal to variance.numel()"); + + int HXW = H * W; + +#if __ARM_NEON +#if __aarch64__ + float *inv_std_ptr = new float[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + + Tensor new_scale; + auto new_scale_ptr = new_scale.mutable_data(framework::make_ddim({C})); + Tensor new_bias; + auto new_bias_ptr = new_bias.mutable_data(framework::make_ddim({C})); + + /// ((x - est_mean) * (inv_var) * scale + bias equal to + /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + { + for (int n = 0; n < N; n++) { + for (int h = 0; h < H; h++) { + int tmp_index = n * stride0 + i * stride1 + h * stride2; + for (int w = 0; w < W; w++) { + int index = tmp_index + w; + out_ptr[index] = + input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i]; + } + } + } + } + } + delete[] inv_std_ptr; +#else + + if (HXW > 32) { + int NXC = N * C; + float *inv_std_ptr = new float[NXC * 4]; + float *volatile new_scale_ptr = new float[NXC * 4]; + float *volatile new_bias_ptr = new float[NXC * 4]; + + /// std = (var + epsilon).sqrt(); + /// inv_std = 1 / std; + for (int i = 0; i < C * 4; i += 4) { + int index = i / 4; + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[index] + epsilon), 0.5)); + inv_std_ptr[i + 1] = inv_std_ptr[i]; + inv_std_ptr[i + 2] = inv_std_ptr[i]; + inv_std_ptr[i + 3] = inv_std_ptr[i]; + + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[index]; + new_scale_ptr[i + 1] = new_scale_ptr[i]; + new_scale_ptr[i + 2] = new_scale_ptr[i]; + new_scale_ptr[i + 3] = new_scale_ptr[i]; + + new_bias_ptr[i] = + bias_ptr[index] - mean_ptr[index] * inv_std_ptr[i] * scale_ptr[index]; + + new_bias_ptr[i + 1] = new_bias_ptr[i]; + new_bias_ptr[i + 2] = new_bias_ptr[i]; + new_bias_ptr[i + 3] = new_bias_ptr[i]; + } + + for (int j = C * 4; j < NXC * 4; ++j) { + new_scale_ptr[j] = new_scale_ptr[j - C * 4]; + new_bias_ptr[j] = 
new_bias_ptr[j - C * 4]; + } + + asm volatile( + "subs %[N], %[N], #1 \n\t" + "blt end_n_%= \n\t" + "loop_n_%=: \n\t" + + "subs %[C], %[C], #1 \n\t" + "blt end_c_%= \n\t" + "loop_c_%=: \n\t" + + "vld1.32 {q9}, [%[new_scale_ptr]]! \n\t" + "vld1.32 {q10}, [%[new_bias_ptr]]! \n\t" + + "mov r6, %[HXW] \n\t" + + "subs r6, r6, #32 \n\t" + "blt end_hw_%= \n\t" + "loop_hw_%=: \n\t" + + "vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t" + + "vmul.f32 q1, q1, q9 \n\t" + "vmul.f32 q2, q2, q9 \n\t" + "vmul.f32 q3, q3, q9 \n\t" + "vmul.f32 q4, q4, q9 \n\t" + + "vmul.f32 q5, q5, q9 \n\t" + "vmul.f32 q6, q6, q9 \n\t" + "vmul.f32 q7, q7, q9 \n\t" + "vmul.f32 q8, q8, q9 \n\t" + + "vadd.f32 q1, q1, q10 \n\t" + "vadd.f32 q2, q2, q10 \n\t" + "vadd.f32 q3, q3, q10 \n\t" + "vadd.f32 q4, q4, q10 \n\t" + "vadd.f32 q5, q5, q10 \n\t" + "vadd.f32 q6, q6, q10 \n\t" + "vadd.f32 q7, q7, q10 \n\t" + "vadd.f32 q8, q8, q10 \n\t" + + "vst1.32 {q1, q2}, [%[out_ptr]]! \n\t" + "vst1.32 {q3, q4}, [%[out_ptr]]! \n\t" + "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t" + "vst1.32 {q7, q8}, [%[out_ptr]]! \n\t" + + "subs r6, r6, #32 \n\t" + "bge loop_hw_%= \n\t" + "end_hw_%=: \n\t" + + "cmp r6, #0 \n\t" + "bge end_remainder_%= \n\t" + "mov r5, #4 \n\t" + "mul r6, r6, r5 \n\t" + "add %[input_x_ptr], %[input_x_ptr], r6 \n\t" + + "vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t" + + "vmul.f32 q1, q1, q9 \n\t" + "vmul.f32 q2, q2, q9 \n\t" + "vmul.f32 q3, q3, q9 \n\t" + "vmul.f32 q4, q4, q9 \n\t" + "vmul.f32 q5, q5, q9 \n\t" + "vmul.f32 q6, q6, q9 \n\t" + "vmul.f32 q7, q7, q9 \n\t" + "vmul.f32 q8, q8, q9 \n\t" + "vadd.f32 q1, q1, q10 \n\t" + "vadd.f32 q2, q2, q10 \n\t" + "vadd.f32 q3, q3, q10 \n\t" + "vadd.f32 q4, q4, q10 \n\t" + "vadd.f32 q5, q5, q10 \n\t" + "vadd.f32 q6, q6, q10 \n\t" + "vadd.f32 q7, q7, q10 \n\t" + "vadd.f32 q8, q8, q10 \n\t" + + "add %[out_ptr], %[out_ptr], r6 \n\t" + "vst1.32 {q1, q2}, [%[out_ptr]]! \n\t" + "vst1.32 {q3, q4}, [%[out_ptr]]! \n\t" + "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t" + "vst1.32 {q7, q8}, [%[out_ptr]]! 
\n\t" + + "end_remainder_%=: \n\t" + + "subs %[C], %[C], #1 \n\t" + "bge loop_c_%= \n\t" + "end_c_%=: \n\t" + + "subs %[N], %[N], #1 \n\t" + "bge loop_n_%= \n\t" + "end_n_%=: \n\t" + : + : [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr), + [new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr), + [N] "r"(N), [C] "r"(C), [HXW] "r"(HXW) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "r5", "r6"); + + delete[] inv_std_ptr; + delete[] new_scale_ptr; + delete[] new_bias_ptr; + + } else { + float *inv_std_ptr = new float[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + + Tensor new_scale; + auto new_scale_ptr = + new_scale.mutable_data(framework::make_ddim({C})); + Tensor new_bias; + auto new_bias_ptr = new_bias.mutable_data(framework::make_ddim({C})); + + /// ((x - est_mean) * (inv_var) * scale + bias equal to + /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = + bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + { + for (int n = 0; n < N; n++) { + for (int h = 0; h < H; h++) { + int tmp_index = n * stride0 + i * stride1 + h * stride2; + for (int w = 0; w < W; w++) { + int index = tmp_index + w; + out_ptr[index] = + input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i]; + } + } + } + } + } + + delete[] inv_std_ptr; + } +#endif +#else + float *inv_std_ptr = new float[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + + Tensor new_scale; + auto new_scale_ptr = new_scale.mutable_data(framework::make_ddim({C})); + Tensor new_bias; + auto new_bias_ptr = new_bias.mutable_data(framework::make_ddim({C})); + + /// ((x - est_mean) * (inv_var) * scale + bias equal to + /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + { + for (int n = 0; n < N; n++) { + for (int h = 0; h < H; h++) { + int tmp_index = n * stride0 + i * stride1 + h * stride2; + for (int w = 0; w < W; w++) { + int index = tmp_index + w; + out_ptr[index] = + input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i]; + } + } + } + } + } + delete[] inv_std_ptr; +#endif +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/box_coder_arm_func.h b/src/operators/kernel/central-arm-func/box_coder_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..63558dea7b8d64d0fba3daae14e10c565f8feb2d --- /dev/null +++ b/src/operators/kernel/central-arm-func/box_coder_arm_func.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef BOXCODER_OP +#pragma once + +#include +#include "framework/tensor.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void EncodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, T* output) { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = + (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; + T target_box_center_y = + (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; + T target_box_width = + target_box_data[i * len + 2] - target_box_data[i * len]; + T target_box_height = + target_box_data[i * len + 3] - target_box_data[i * len + 1]; + + size_t offset = i * col * len + j * len; + output[offset] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[j * len]; + output[offset + 1] = (target_box_center_y - prior_box_center_y) / + prior_box_height / prior_box_var_data[j * len + 1]; + output[offset + 2] = + std::log(std::fabs(target_box_width / prior_box_width)) / + prior_box_var_data[j * len + 2]; + output[offset + 3] = + std::log(std::fabs(target_box_height / prior_box_height)) / + prior_box_var_data[j * len + 3]; + } + } +} + +template +void DecodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, T* output) { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + size_t offset = i * col * len + j * len; + T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = prior_box_var_data[j * len] * + target_box_data[offset] * prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[j * len + 1] * + target_box_data[offset + 1] * + prior_box_height + + prior_box_center_y; + T target_box_width = std::exp(prior_box_var_data[j * len + 2] * + target_box_data[offset + 2]) * + prior_box_width; + T target_box_height = std::exp(prior_box_var_data[j * len + 3] * + target_box_data[offset + 3]) * + prior_box_height; + + output[offset] = target_box_center_x - target_box_width / 2; + output[offset + 1] = target_box_center_y - target_box_height / 2; + output[offset + 2] = target_box_center_x + target_box_width / 2; + output[offset + 3] = target_box_center_y + target_box_height / 2; + } + } +} + 
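+// A worked example of the center-size encoding above (the numbers are
+// illustrative only, not taken from the source): a prior box
+// (xmin, ymin, xmax, ymax) = (0, 0, 4, 2) with variance (0.1, 0.1, 0.2, 0.2)
+// has width 4, height 2 and center (2, 1); a target box (1, 0, 5, 2) has the
+// same size and center (3, 1). EncodeCenterSize then writes
+// ((3 - 2) / 4) / 0.1 = 2.5, ((1 - 1) / 2) / 0.1 = 0,
+// log(4 / 4) / 0.2 = 0 and log(2 / 2) / 0.2 = 0,
+// and DecodeCenterSize inverts exactly this mapping.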
+template <typename P> +void BoxCoderCompute(const BoxCoderParam& param) { + const auto* input_priorbox = param.InputPriorBox(); + const auto* input_priorboxvar = param.InputPriorBoxVar(); + const auto* input_targetbox = param.InputTargetBox(); + + const auto& code_type = param.CodeType(); + + auto row = input_targetbox->dims()[0]; + auto col = input_priorbox->dims()[0]; + auto len = input_priorbox->dims()[1]; + + framework::Tensor* output_box = param.OutputBox(); + auto* output_box_dataptr = output_box->mutable_data<float>({row, col, len}); + + if (code_type == "encode_center_size") { + EncodeCenterSize<float>(*input_targetbox, *input_priorbox, + *input_priorboxvar, output_box_dataptr); + } + if (code_type == "decode_center_size") { + DecodeCenterSize<float>(*input_targetbox, *input_priorbox, + *input_priorboxvar, output_box_dataptr); + } +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/concat_arm_func.h b/src/operators/kernel/central-arm-func/concat_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..e9926505b33b32ee83a16f882cc0f775797f154a --- /dev/null +++ b/src/operators/kernel/central-arm-func/concat_arm_func.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CONCAT_OP +#pragma once + +#include <vector> + +namespace paddle_mobile { +namespace operators { +template <typename T> +class ConcatFunctor { + public: + void operator()(const std::vector<framework::Tensor> &input, const int axis, + framework::Tensor *output) { + size_t num = input.size(); + int rows = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int out_rows = rows, out_cols = 0; + + std::vector<int64_t> input_cols(input.size()); + for (int i = 0; i < num; ++i) { + int t_cols = input[i].numel() / rows; + out_cols += t_cols; + input_cols[i] = t_cols; + } + + // computation + for (int k = 0; k < out_rows; ++k) { + T *dst_ptr = output->data<T>() + k * out_cols; + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + const T *src_ptr = input[j].data<T>() + k * col_len; + memory::Copy(dst_ptr + col_idx, src_ptr, sizeof(T) * col_len); + col_idx += col_len; + } + } + } +}; + +template <typename P> +void ConcatCompute(const ConcatParam &param) { + auto inputs = param.Inputs(); + auto *out = param.Out(); + int64_t axis = param.Axis(); + out->mutable_data<float>(); + + /// Sometimes direct copies will be faster; whether that holds in general may need deeper analysis.
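+/// The fast path below applies when concatenating along axis 0 with only a
+/// few inputs: each input is then one contiguous block of the output, so a
+/// single memory::Copy per input replaces ConcatFunctor's per-row
+/// interleaving loop (the inputs.size() < 10 cutoff is a heuristic).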
+ if (axis == 0 && inputs.size() < 10) { + size_t output_offset = 0; + for (auto *in : inputs) { + auto in_stride = framework::stride_numel(in->dims()); + auto out_stride = framework::stride_numel(out->dims()); + auto dst = out->data() + output_offset; + auto src = in->data(); + PADDLE_MOBILE_ENFORCE( + in_stride.size() == out_stride.size(), + "src and dst tensor should have the same dims size."); + memory::Copy(dst, src, sizeof(float) * in_stride[0]); + output_offset += in_stride[0]; + } + } else { + std::vector inputs_concat(inputs.size()); + for (int j = 0; j < inputs.size(); ++j) { + inputs_concat[j] = *inputs[j]; + } + ConcatFunctor concat_functor; + concat_functor(inputs_concat, static_cast(axis), out); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..c01a068fb9732b64da4097844736f7484fdfcab9 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h @@ -0,0 +1,144 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADD_OP +#pragma once + +#include +#include "operators/math/conv_func.h" +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +void ConvAddBasic(const FusionConvAddParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + int axis = param.Axis(); + Tensor *output = param.Output(); + float *biase_data = bias.data(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = 
{filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(1), false, biase_data); + } + } +} + +template +void ConvAddCompute(const FusionConvAddParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + param.Bias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConv3x3(param.Input(), param.Strides(), + // param.Paddings(), + // param.Filter(), param.Bias(), + // param.Output(), false); + + math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(), + *param.Bias(), true); + + } else { + ConvAddBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..2683b078889059f10b85c1e9fe74374342689418 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h @@ -0,0 +1,143 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVADDBNRELU_OP + +#pragma once + +#include +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +void ConvAddBNReluBasic(const FusionConvAddBNReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + Tensor *output = param.Output(); + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + math::matmulWithBn( + filter_slice, false, col_matrix, false, static_cast(1), + &out_slice, static_cast(0), true, &new_scale, &new_bias, g); + } + } +} +template +void ConvAddBNReluCompute(const FusionConvAddBNReluParam ¶m) { + Tensor Bias; + Bias.mutable_data({param.Groups()}); + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] 
== 3 && param.Strides()[0] == 1) { + math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + // param.Output(), param.NewScale(), + // param.NewBias(), 1); + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else { + ConvAddBNReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..6c619dd2a29ce140c783af0637f51153a1866791 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDRELU_OP + +#pragma once +#include +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void ConvAddReluCompute(const FusionConvAddReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + int axis = param.Axis(); + Tensor *output = param.Output(); + float *biase_data = bias.data(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = 
{filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(1), true, biase_data); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..33caded3afaaf125bac9108f2fafeda3d3c2049f --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_arm_func.h @@ -0,0 +1,136 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_OP + +#pragma once +#include +#include "operators/math/conv_func.h" +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +inline void ConvBasic(const ConvParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor *output = param.Output(); + output->mutable_data(); + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(0)); + } + } +} + +template +void ConvCompute(const ConvParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + nullptr, false); + } else if (param.Groups() == 
param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3) { + math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), + param.Filter(), nullptr, param.Output(), false); + } else { + ConvBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..9ec74b1d6ede34cf889c5999df1e902bb9ece4f3 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVBNRELU_OP + +#pragma once +#include +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" +namespace paddle_mobile { +namespace operators { +void ConvBNReluBasic(const FusionConvBNReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + + Tensor *output = param.Output(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor 
vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + math::matmulWithBn( + filter_slice, false, col_matrix, false, static_cast(1), + &out_slice, static_cast(0), true, &new_scale, &new_bias, g); + } + } +} + +template +void ConvBNReluCompute(const FusionConvBNReluParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + // param.Output(), param.NewScale(), + // param.NewBias(), 1); + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else { + ConvBNReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h b/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..8c120c60be79b51c2a2e4c8fbccf1d546871f839 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_TRANSPOSE + +#include <vector> + +#include "framework/ddim.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +#pragma once + +namespace paddle_mobile { +namespace operators { + +template <typename P> +void ConvTransposeCompute(const ConvTransposeParam &param) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor *output = param.Output(); + + auto strides = param.Strides(); + auto paddings = param.Paddings(); + auto dilations = param.Dilations(); + auto groups = param.Groups(); + + const int batch_size = input->dims()[0]; + + std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims()); + std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims()); + + size_t data_dim = filter_shape_vec.size() - 2; + + // 5 or 7 + std::vector<int64_t> col_shape_vec(1 + 2 * data_dim); + + // output c / groups + col_shape_vec[0] = output->dims()[1] / groups; + for (size_t i = 0; i < data_dim; ++i) { + // filter shape: filter h, filter w + col_shape_vec[i + 1] = filter_shape_vec[i + 2]; + // input shape: input h, input w + col_shape_vec[i + 1 + data_dim] = input_shape_vec[i + 2]; + } + + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + Tensor col; + col.mutable_data<P>(col_shape); + + Tensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + framework::DDim output_shape = + framework::slice_ddim(output->dims(), 1, output->dims().size()); + + framework::DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; + + // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w) + framework::DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]}; + filter.Resize(filter_matrix_shape); + + output->mutable_data<P>(); + + int in_step = static_cast<int>(input->dims()[1]) / groups; + int out_step = static_cast<int>(output->dims()[1]) / groups; + + math::Col2ImFunctor<math::ColFormat::kCFO, CPU, P> col2im; + math::Col2VolFunctor<CPU, P> col2vol; + + for (int i = 0; i < batch_size; ++i) { + Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); + Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); + + for (int g = 0; g < groups; ++g) { + Tensor in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step); + Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); + Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step); + + math::matmul(filter_slice, true, in_slice, false, static_cast<P>(1.0), + &col_matrix, static_cast<P>
(0.0)); + if (data_dim == 2U) { + col2im(col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &out_slice); + + } else if (data_dim == 3U) { + col2vol(col, dilations, strides, paddings, &out_slice); + } + } + } +} + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..60b09df597f218eefc7f95ba3f342ae0c51c7000 --- /dev/null +++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef DEPTHWISECONV_OP + +#pragma once +#include +#include +#include "operators/kernel/central-arm-func/conv_arm_func.h" + +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void DepthwiseConvCompute(const ConvParam ¶m) { + Tensor Bias; + Bias.mutable_data({param.Groups()}); + if (param.Groups() == param.Input()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + &Bias, false); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConv3x3(param.Input(), param.Strides(), + // param.Paddings(), + // param.Filter(), &Bias, param.Output(), false); + math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(), + Bias, false); + + } else { + ConvBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..2166fd2a97846dc10733e89a999407afc79e888b --- /dev/null +++ b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DWCONVBNRELU_OP + +#pragma once +#include +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" +namespace paddle_mobile { +namespace operators { +void DWConvBNReluBasic(const FusionDWConvBNReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + + Tensor *output = param.Output(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmulWithBn( + filter_slice, false, col_matrix, false, static_cast(1), + &out_slice, static_cast(0), true, &new_scale, &new_bias, g); + } + } +} +template +void DWConvBNReluCompute(const FusionDWConvBNReluParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + 
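+ // groups == input channels == output channels with a square 3x3 filter
+ // and stride 1 is exactly a depthwise convolution that the hand-tuned
+ // NEON kernel below handles, so the generic im2col + gemm path in
+ // DWConvBNReluBasic above is skipped.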
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + // param.Output(), param.NewScale(), + // param.NewBias(), 1); + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else { + DWConvBNReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..91150cfb8c206521fd628c972ab8b35400b5ff53 --- /dev/null +++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ELEMENTWISEADD_OP + +#pragma once +#include "operators/math/elementwise_op_function.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +struct AddFunctor { + inline T operator()(T a, T b) const { return a + b; } +}; + +template +void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + Tensor *Out = param.Out(); + Out->mutable_data(); + int axis = param.Axis(); + ElementwiseComputeEx, float>(input_x, input_y, axis, + AddFunctor(), Out); +} + +template class ElementwiseAddKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..4a689dfc18e3b8677faa61b5c90cb46321f3f4c3 --- /dev/null +++ b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_FC_OP + +#pragma once +#include "operators/math/math_function.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void FusionFcCompute(const FusionFcParam ¶m) { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + const Tensor *input_z = param.InputZ(); + auto *input_z_data = input_z->data(); + int axis = param.Axis(); + Tensor *out = param.Out(); + auto *out_data = out->mutable_data(); + float *bias_data = out->mutable_data(); + const Tensor x_matrix = + input_x->dims().size() > 2 + ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) + : *input_x; + const Tensor y_matrix = + input_y->dims().size() > 2 + ? framework::ReshapeToMatrix(*input_y, param.YNumColDims()) + : *input_y; + auto out_dim = out->dims(); + if (out_dim.size() != 2) { + out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); + PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "inpu_z size must be 1"); + PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0], + " out_dim.size must be 2."); + axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis); + PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. "); + + int64_t classes = input_z->numel(); + for (int i = 0; i < out_dim[0]; i++) { + memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes); + } + + // for (int i = 0; i < out->numel(); i++) { + // DLOG << out_data[i]; + // } + math::matmul(x_matrix, false, y_matrix, false, static_cast(1), + out, static_cast(1), false, bias_data); + PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); + // if (out_dim.size() != 2) { + // out->Resize(out_dim); + // } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/lrn_arm_func.h b/src/operators/kernel/central-arm-func/lrn_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..15c279d1605ed7348b766855497411fbe541e2f6 --- /dev/null +++ b/src/operators/kernel/central-arm-func/lrn_arm_func.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef LRN_OP + +#pragma once +#include "operators/op_param.h" +namespace paddle_mobile { +namespace operators { + +template <typename P> +void LrnCompute(const LrnParam &param) { + const Tensor *input_x = param.InputX(); + auto x_dims = input_x->dims(); + Tensor *out = param.Out(); + out->mutable_data<float>(); + /// data_format = NCHW + const int N = x_dims[0]; + const int C = x_dims[1]; + const int H = x_dims[2]; + const int W = x_dims[3]; + + const int n = param.N(); + const float alpha = param.Alpha(); + const float beta = param.Beta(); + const float k = param.K(); + LRNFunctor lrnFunctor; + lrnFunctor(*input_x, out, N, C, H, W, n, k, alpha, beta); +} + +template class LrnKernel<CPU, float>; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/mul_arm_func.h b/src/operators/kernel/central-arm-func/mul_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..d2da67afe1d2eb746971a2443bdb449eb2b66ec4 --- /dev/null +++ b/src/operators/kernel/central-arm-func/mul_arm_func.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef MUL_OP + +#pragma once + +namespace paddle_mobile { +namespace operators { + +// 1. If x and y are both 2-D, +// x = [[1,2], y = [[5,6], +// [3,4]] [7,8]] +// the result is an ordinary matrix product: out = +// [[1*5+2*7, 1*6+2*8], [3*5+4*7, 3*6+4*8]] +// +// 2. If x or y has more than two dimensions, e.g. x with shape (2,3,4) and y with shape (4,1,2): +// x = [[[1,2,3,4], +// [2,3,4,5], +// [3,4,5,6]], +// [[1,2,3,4], +// [2,3,4,5], +// [3,4,5,6]]] +// y = [[[1,2]], +// [[3,4]], +// [[5,6]], +// [[7,8]]] +// x_num_col_dims and y_num_col_dims are used to flatten x and y to 2-D. +// The model supplies x_num_col_dims = 2 and y_num_col_dims = 1; all index ranges below are half-open, [begin, end). +// (1) For x = (2,3,4), multiply the dims in index range [0, x_num_col_dims): 2*3 = 6, +// and the dims in [x_num_col_dims, xdim.size()): 4, +// then rewrite Tensor x's dims as (6,4). +// (2) For y = (4,1,2), multiply the dims in [0, y_num_col_dims): 4, +// and the dims in [y_num_col_dims, ydim.size()): 1*2 = 2, +// then rewrite Tensor y's dims as (4,2). +// The in-memory layout of x and y is not affected. +// x = [[1,2,3,4], y = [[1,2], +// [2,3,4,5], [3,4], +// [3,4,5,6], matrix product [5,6], +// [1,2,3,4], [7,8]] +// [2,3,4,5], +// [3,4,5,6]] +// The result is x (6 rows, 4 cols) times y (4 rows, 2 cols), multiplied as in case 1, giving out (6 rows, 2 cols). + +template <typename P> +void MulCompute(const MulParam &param) { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + Tensor *out = param.Out(); + out->mutable_data<float>(); + const Tensor x_matrix = + input_x->dims().size() > 2 + ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) + : *input_x; + const Tensor y_matrix = + input_y->dims().size() > 2 + ?
framework::ReshapeToMatrix(*input_y, param.YNumColDims()) + : *input_y; + auto out_dim = out->dims(); + if (out_dim.size() != 2) { + out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + math::matmul(x_matrix, false, y_matrix, false, static_cast(1), + out, static_cast(0)); + if (out_dim.size() != 2) { + out->Resize(out_dim); + } +} + +template class MulKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h b/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..ccc99c13c673fb42d151da6db3372848f5d8bd7c --- /dev/null +++ b/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h @@ -0,0 +1,285 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef MULTICLASSNMS_OP +#pragma once + +#include +#include +#include +#include +#include "framework/tensor.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +constexpr int kOutputDim = 6; +constexpr int kBBoxSize = 4; + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, const T threshold, int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
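+ // The +1 treats integer pixel coordinates as inclusive on both ends:
+ // a box spanning x = 0 to x = 2 covers 3 pixels.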
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T* box1, const T* box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline void NMSFast(const framework::Tensor& bbox, + const framework::Tensor& scores, + const T score_threshold, const T nms_threshold, + const T eta, const int64_t top_k, + std::vector* selected_indices) { + // The total boxes for each instance. + int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void MultiClassNMS(const framework::Tensor& scores, + const framework::Tensor& bboxes, + std::map>* indices, int* num_nmsed_out, + const int& background_label, const int& nms_top_k, + const int& keep_top_k, const T& nms_threshold, + const T& nms_eta, const T& score_threshold) { + int64_t class_num = scores.dims()[0]; + int64_t predict_dim = scores.dims()[1]; + int num_det = 0; + for (int64_t c = 0; c < class_num; ++c) { + if (c == background_label) continue; + framework::Tensor score = scores.Slice(c, c + 1); + /// [c] is key + NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, + nms_top_k, &((*indices)[c])); + num_det += (*indices)[c].size(); + } + + *num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > keep_top_k) { + std::vector>> score_index_pairs; + for (const auto& it : *indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + // PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. 
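+ // Rebuild the per-class index map from the kept (score, (label, index))
+ // pairs and swap it into *indices, so only the keep_top_k best survive.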
+    std::map<int, std::vector<int>> new_indices;
+    for (size_t j = 0; j < score_index_pairs.size(); ++j) {
+      int label = score_index_pairs[j].second.first;
+      int idx = score_index_pairs[j].second.second;
+      new_indices[label].push_back(idx);
+    }
+    new_indices.swap(*indices);
+    *num_nmsed_out = keep_top_k;
+  }
+}
+
+template <typename T>
+void MultiClassOutput(const framework::Tensor& scores,
+                      const framework::Tensor& bboxes,
+                      const std::map<int, std::vector<int>>& selected_indices,
+                      framework::Tensor* outs) {
+  int predict_dim = scores.dims()[1];
+  auto* scores_data = scores.data<T>();
+  auto* bboxes_data = bboxes.data<T>();
+  auto* odata = outs->data<T>();
+
+  int count = 0;
+  for (const auto& it : selected_indices) {
+    /// one batch
+    int label = it.first;
+    const T* sdata = scores_data + label * predict_dim;
+    const std::vector<int>& indices = it.second;
+    for (size_t j = 0; j < indices.size(); ++j) {
+      int idx = indices[j];
+      const T* bdata = bboxes_data + idx * kBBoxSize;
+      odata[count * kOutputDim] = label;           // label
+      odata[count * kOutputDim + 1] = sdata[idx];  // score
+      // xmin, ymin, xmax, ymax
+      std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
+      count++;
+    }
+  }
+}
+
+template <typename P>
+void MultiClassNMSCompute(const MultiClassNMSParam& param) {
+  const auto* input_bboxes = param.InputBBoxes();
+  const auto& input_bboxes_dims = input_bboxes->dims();
+
+  const auto* input_scores = param.InputScores();
+  const auto& input_scores_dims = input_scores->dims();
+
+  auto* outs = param.Out();
+  auto background_label = param.BackGroundLabel();
+  auto nms_top_k = param.NMSTopK();
+  auto keep_top_k = param.KeepTopK();
+  auto nms_threshold = param.NMSThreshold();
+  auto nms_eta = param.NMSEta();
+  auto score_threshold = param.ScoreThreshold();
+
+  int64_t batch_size = input_scores_dims[0];
+  int64_t class_num = input_scores_dims[1];
+  int64_t predict_dim = input_scores_dims[2];
+  int64_t box_dim = input_bboxes_dims[2];
+
+  std::vector<std::map<int, std::vector<int>>> all_indices;
+  std::vector<size_t> batch_starts = {0};
+  for (int64_t i = 0; i < batch_size; ++i) {
+    framework::Tensor ins_score = input_scores->Slice(i, i + 1);
+    ins_score.Resize({class_num, predict_dim});
+
+    framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1);
+    ins_boxes.Resize({predict_dim, box_dim});
+
+    std::map<int, std::vector<int>> indices;
+    int num_nmsed_out = 0;
+    MultiClassNMS(ins_score, ins_boxes, &indices, &num_nmsed_out,
+                  background_label, nms_top_k, keep_top_k, nms_threshold,
+                  nms_eta, score_threshold);
+    all_indices.push_back(indices);
+    batch_starts.push_back(batch_starts.back() + num_nmsed_out);
+  }
+
+  int num_kept = batch_starts.back();
+  if (num_kept == 0) {
+    float* od = outs->mutable_data<float>({1});
+    od[0] = -1;
+  } else {
+    outs->mutable_data<float>({num_kept, kOutputDim});
+    for (int64_t i = 0; i < batch_size; ++i) {
+      framework::Tensor ins_score = input_scores->Slice(i, i + 1);
+      ins_score.Resize({class_num, predict_dim});
+
+      framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1);
+      ins_boxes.Resize({predict_dim, box_dim});
+
+      int64_t s = batch_starts[i];
+      int64_t e = batch_starts[i + 1];
+      if (e > s) {
+        framework::Tensor out = outs->Slice(s, e);
+        MultiClassOutput<float>(ins_score, ins_boxes, all_indices[i], &out);
+      }
+    }
+  }
+
+  //  framework::LoD lod;
+  //  lod.emplace_back(batch_starts);
+  //
+  //  outs->set_lod(lod);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/pool_arm_func.h b/src/operators/kernel/central-arm-func/pool_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1beb82da1072d199217d0722eaae6fcb0123490
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/pool_arm_func.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef POOL_OP
+#pragma once
+
+#include <string>
+#include <vector>
+#include "operators/math/pooling.h"
+
+namespace paddle_mobile {
+namespace operators {
+using framework::Tensor;
+
+inline void PoolBasic(std::string pooling_type, std::vector<int> ksize,
+                      std::vector<int> strides, std::vector<int> paddings,
+                      const Tensor *in_x, Tensor *out) {
+  if (pooling_type == "max") {
+    math::PoolFunctor<CPU, math::MaxPool<float>, float> pool2d_forward;
+    math::MaxPool<float> pool_process;
+    pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
+
+  } else if (pooling_type == "avg") {
+    math::PoolFunctor<CPU, math::AvgPool<float>, float> pool2d_forward;
+    math::AvgPool<float> pool_process;
+    pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
+  }
+}
+
+template <typename P>
+void PoolCompute(const PoolParam &param) {
+  const Tensor *in_x = param.Input();
+  Tensor *out = param.Output();
+  std::string pooling_type = param.PoolingType();
+
+  std::vector<int> ksize = param.Ksize();
+
+  std::vector<int> strides = param.Strides();
+
+  std::vector<int> paddings = param.Paddings();
+  if (ksize.size() != 2) {
+    LOG(paddle_mobile::LogLevel::kLOG_ERROR)
+        << "Pool op only supports 2D pooling: ksize must have 2 elements.";
+  }
+
+  if (param.isGlobalPooling()) {
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      paddings[i] = 0;
+      ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+    }
+  }
+
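+  // Dispatch: the common 3x3 and 2x2 windows go to hand-tuned NEON kernels;
+  // everything else falls back to the generic PoolBasic functor.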
+  if (ksize[0] == 3 && ksize[0] == ksize[1]) {
+    if (pooling_type == "max") {
+      if (strides[0] == strides[1] && strides[0] == 1 &&
+          paddings[0] == paddings[1] && paddings[1] == 1) {
+        math::Pool3x3Maxs1p1(in_x, out);
+      } else {
+        math::Pool3x3Max(strides, paddings, in_x, out);
+      }
+    } else if (pooling_type == "avg") {
+      if (strides[0] == strides[1] && strides[0] == 1 &&
+          paddings[0] == paddings[1] && paddings[1] == 1) {
+        math::Pool3x3Avgs1p1(in_x, out);
+      } else {
+        math::Pool3x3Avg(strides, paddings, in_x, out);
+      }
+    }
+
+  } else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
+#if __ARM_NEON
+#if __aarch64__
+    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+#else
+    if (pooling_type == "max") {
+      math::Pool2x2Max(strides, paddings, in_x, out);
+    } else if (pooling_type == "avg") {
+      math::Pool2x2Avg(strides, paddings, in_x, out);
+    }
+#endif
+#else
+    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+#endif  // __ARM_NEON
+
+  } else {
+    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
diff --git a/src/operators/kernel/central-arm-func/prior_box_arm_func.h b/src/operators/kernel/central-arm-func/prior_box_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..892dceb9254ac423d3591a0fc9e9347bc375831b
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/prior_box_arm_func.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PRIORBOX_OP
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename T>
+struct ClipFunctor {
+  inline T operator()(T in) const {
+    return std::min<T>(std::max<T>(in, 0.), 1.);
+  }
+};
+
+template <typename P>
+void PriorBoxCompute(const PriorBoxParam &param) {
+  const auto *input_ = param.Input();
+  const auto &input_dims = input_->dims();
+
+  const auto *input_image = param.InputImage();
+  const auto &input_image_dims = input_image->dims();
+
+  const auto &min_sizes = param.MinSizes();
+  const auto &max_sizes = param.MaxSizes();
+  const auto &variances = param.Variances();
+  const auto &input_aspect_ratio = param.AspectRatios();
+  const bool &flip = param.Flip();
+  const bool &clip = param.Clip();
+  const float &step_w = param.StepW();
+  const float &step_h = param.StepH();
+  const float &offset = param.Offset();
+
+  Tensor *output_boxes = param.OutputBoxes();
+  auto output_boxes_dataptr = output_boxes->mutable_data<float>();
+  Tensor *output_variances = param.OutputVariances();
+  auto output_variances_dataptr = output_variances->mutable_data<float>();
+
+  std::vector<float> aspect_ratios;
+  ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
+
+  auto img_width = input_image_dims[3];
+  auto img_height = input_image_dims[2];
+
+  auto feature_width = input_dims[3];
+  auto feature_height = input_dims[2];
+
+  auto stride0 = output_boxes->dims()[1] * output_boxes->dims()[2] *
+                 output_boxes->dims()[3];
+  auto stride1 = output_boxes->dims()[2] * output_boxes->dims()[3];
+  auto stride2 = output_boxes->dims()[3];
+
+  float step_width, step_height;
+  /// e.g. 300 / 19
+  if (step_w == 0 || step_h == 0) {
+    step_width = static_cast<float>(img_width) / feature_width;
+    step_height = static_cast<float>(img_height) / feature_height;
+  } else {
+    step_width = step_w;
+    step_height = step_h;
+  }
+
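+  // Each feature-map cell emits one prior per (min_size, aspect_ratio) pair,
+  // plus one extra square prior per max_size.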
+  int num_priors = aspect_ratios.size() * min_sizes.size();
+  if (!max_sizes.empty()) {
+    num_priors += max_sizes.size();
+  }
+
+  for (int h = 0; h < feature_height; ++h) {
+    for (int w = 0; w < feature_width; ++w) {
+      /// map origin image
+      float center_x = (w + offset) * step_width;
+      float center_y = (h + offset) * step_height;
+      float box_width, box_height;
+      int idx = 0;
+      for (size_t s = 0; s < min_sizes.size(); ++s) {
+        auto min_size = min_sizes[s];
+        // priors with different aspect ratios
+        for (float ar : aspect_ratios) {
+          box_width = min_size * sqrt(ar) / 2.;
+          box_height = min_size / sqrt(ar) / 2.;
+          /// Halving box_width / box_height and dividing by img_width /
+          /// img_height converts the box corners into positions normalized
+          /// to the original image.
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] =
+              (center_x - box_width) / img_width;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] =
+              (center_y - box_height) / img_height;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] =
+              (center_x + box_width) / img_width;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] =
+              (center_y + box_height) / img_height;
+          idx++;
+        }
+        if (!max_sizes.empty()) {
+          auto max_size = max_sizes[s];
+          // square prior with size sqrt(minSize * maxSize)
+          box_width = box_height = sqrt(min_size * max_size) / 2.;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] =
+              (center_x - box_width) / img_width;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] =
+              (center_y - box_height) / img_height;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] =
+              (center_x + box_width) / img_width;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] =
+              (center_y + box_height) / img_height;
+          idx++;
+        }
+      }
+    }
+  }
+  if (clip) {
+    math::Transform trans;
+    ClipFunctor<float> clip_func;
+    trans(output_boxes_dataptr, output_boxes_dataptr + output_boxes->numel(),
+          output_boxes_dataptr, clip_func);
+  }
+
+  if (variances.size() != 4) {
+    LOG(kLOG_ERROR) << " variances.size() must be 4.";
+  }
+
+  int64_t box_num = feature_height * feature_width * num_priors;
+
+  for (int i = 0; i < box_num; i++) {
+    output_variances_dataptr[4 * i] = variances[0];
+    output_variances_dataptr[4 * i + 1] = variances[1];
+    output_variances_dataptr[4 * i + 2] = variances[2];
+    output_variances_dataptr[4 * i + 3] = variances[3];
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/relu_arm_func.h b/src/operators/kernel/central-arm-func/relu_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1002c1f10e9c2d72d764693384500c4793dda46
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/relu_arm_func.h
@@ -0,0 +1,109 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RELU_OP
+#pragma once
+
+#include <operators/math/transform.h>
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename T>
+struct ReluFunctor {
+  inline T operator()(T in) const { return in > 0 ? in : 0; }
+};
+
+/*
+ * @b Platform-specific implementation; param is passed in from the op layer.
+ * */
+template <typename P>
+void ReluCompute(const ReluParam &param) {
+  const auto *input_x = param.InputX();
+  auto *input_x_ptr = input_x->data<float>();
+  auto *out = param.Out();
+  auto *out_ptr = out->mutable_data<float>();
+
+  int numel = input_x->numel();
+  //  if (numel > 64) {
+  //    asm volatile(
+  //        "pld        [%[input_x_ptr], #0]          \n\t"
+  //        "vmov.f32   q8,    #0.0                   \n\t"
+  //        "subs       %[num], %[num], #32           \n\t"
+  //        "blt        end_num_%=                    \n\t"
+  //        "loop_num_%=:                             \n\t"
+  //        "pld        [%[input_x_ptr], #1024]       \n\t"
+  //
+  //        "vld1.32    {q0, q1}, [%[input_x_ptr]]!   \n\t"
+  //        "vld1.32    {q2, q3}, [%[input_x_ptr]]!   \n\t"
+  //        "vld1.32    {q4, q5}, [%[input_x_ptr]]!   \n\t"
+  //        "vld1.32    {q6, q7}, [%[input_x_ptr]]!   \n\t"
+  //
+  //        "vmax.f32   q0, q0, q8                    \n\t"
+  //        "vmax.f32   q1, q1, q8                    \n\t"
+  //        "vmax.f32   q2, q2, q8                    \n\t"
+  //        "vmax.f32   q3, q3, q8                    \n\t"
+  //        "vmax.f32   q4, q4, q8                    \n\t"
+  //        "vmax.f32   q5, q5, q8                    \n\t"
+  //        "vmax.f32   q6, q6, q8                    \n\t"
+  //        "vmax.f32   q7, q7, q8                    \n\t"
+  //
+  //        "vst1.32    {q0, q1}, [%[out_ptr]]!       \n\t"
+  //        "vst1.32    {q2, q3}, [%[out_ptr]]!       \n\t"
+  //        "vst1.32    {q4, q5}, [%[out_ptr]]!       \n\t"
+  //        "vst1.32    {q6, q7}, [%[out_ptr]]!       \n\t"
+  //
+  //        "subs       %[num], %[num], #32           \n\t"
+  //        "bge        loop_num_%=                   \n\t"
+  //        "end_num_%=:                              \n\t"
+  //        "cmp        %[num], #0                    \n\t"
+  //        "bge        end_%=                        \n\t"
+  //        "mov        r6,   #4                      \n\t"
+  //        "mul        r5,   %[num], r6              \n\t"
+  //        "add        %[input_x_ptr], %[input_x_ptr], r5 \n\t"
+  //        "vld1.32    {q0, q1}, [%[input_x_ptr]]!   \n\t"
+  //        "vld1.32    {q2, q3}, [%[input_x_ptr]]!   \n\t"
+  //        "vld1.32    {q4, q5}, [%[input_x_ptr]]!   \n\t"
+  //        "vld1.32    {q6, q7}, [%[input_x_ptr]]!   \n\t"
+  //        "vmax.f32   q0, q0, q8                    \n\t"
+  //        "vmax.f32   q1, q1, q8                    \n\t"
+  //        "vmax.f32   q2, q2, q8                    \n\t"
+  //        "vmax.f32   q3, q3, q8                    \n\t"
+  //        "vmax.f32   q4, q4, q8                    \n\t"
+  //        "vmax.f32   q5, q5, q8                    \n\t"
+  //        "vmax.f32   q6, q6, q8                    \n\t"
+  //        "vmax.f32   q7, q7, q8                    \n\t"
+  //        "add        %[out_ptr], %[out_ptr], r5    \n\t"
+  //        "vst1.32    {q0, q1}, [%[out_ptr]]!       \n\t"
+  //        "vst1.32    {q2, q3}, [%[out_ptr]]!       \n\t"
+  //        "vst1.32    {q4, q5}, [%[out_ptr]]!       \n\t"
+  //        "vst1.32    {q6, q7}, [%[out_ptr]]!       \n\t"
+  //        "end_%=:                                  \n\t"
+  //        :
+  //        : [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr),
+  //          [num] "r"(numel)
+  //        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+  //          "r5", "r6");
+  //  } else {
+  ReluFunctor<float> func_;
+  math::Transform trans;
+  trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
+  //  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/reshape_arm_func.h b/src/operators/kernel/central-arm-func/reshape_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa0a3a7c6b5a58938af1c5917c8c2fa0de3f9e90
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/reshape_arm_func.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE_OP
+#pragma once
+
+#include <vector>
+#include "operators/kernel/reshape_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename P>
+void ReshapeCompute(const ReshapeParam &param) {
+  const auto *input_x = param.InputX();
+  const auto &input_x_dims = input_x->dims();
+  auto *out = param.Out();
+  framework::DDim out_dims = out->dims();
+  const auto *input_shape = param.InputShape();
+
+  if (input_shape) {
+    auto *shape_data = input_shape->data<int>();
+    framework::Tensor cpu_shape_tensor;
+    auto shape =
+        std::vector<int>(shape_data, shape_data + input_shape->numel());
+    out_dims = ValidateShape(shape, input_x->dims());
+  }
+
+  bool inplace = param.Inplace();
+  out->Resize(out_dims);
+  if (!inplace) {
+    out->mutable_data<float>();
+    framework::TensorCopy(*input_x, out);
+    out->Resize(out_dims);
+  } else {
+    out->ShareDataWith(*input_x);
+    out->Resize(out_dims);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/sigmoid_arm_func.h b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..c612c4b092143ef8925f81a6d6fefe9cd9dff25b
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
@@ -0,0 +1,85 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SIGMOID_OP
+#pragma once
+
+#include <cmath>
+
+#include "operators/op_param.h"
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#include "operators/math/math_func_neon.h"
+#endif
+
+namespace paddle_mobile {
+namespace operators {
+using framework::DDim;
+void sigmoid(const Tensor *X, Tensor *Y) {
+#ifdef __ARM_NEON
+  const float *input = X->data<float>();
+  float *output = Y->mutable_data<float>();
+  const DDim &dDim = X->dims();
+  int axis_index = 1;
+  if (dDim.size() < 4) {
+    axis_index = 0;
+  }
+  DDim outer_ddim =
+      paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1);
+  DDim inner_ddim =
+      paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size());
+  int out_size = paddle_mobile::framework::product(outer_ddim);
+  int inner_size = paddle_mobile::framework::product(inner_ddim);
+
+  DLOG << "outsize=" << out_size;
+  DLOG << "innersize=" << inner_size;
+#pragma omp parallel for
+  for (int i = 0; i < out_size; ++i) {
+    const float *input_outer_ptr = input + i * inner_size;
+    float *output_outer_ptr = output + i * inner_size;
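+    // Vectorized sigmoid 1 / (1 + exp(-x)): vrecpeq_f32 gives a reciprocal
+    // estimate that one vrecpsq_f32 Newton-Raphson step then refines.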
+    int nn = inner_size >> 2;
+    int remain = inner_size - (nn << 2);
+    float32x4_t _one = vdupq_n_f32(1.f);
+    for (; nn > 0; nn--) {
+      float32x4_t data = vld1q_f32(input_outer_ptr);
+      data = vnegq_f32(data);
+      data = exp_ps(data);
+      data = vaddq_f32(data, _one);
+      float32x4_t out_data = vrecpeq_f32(data);
+      out_data = vmulq_f32(vrecpsq_f32(data, out_data), out_data);
+      vst1q_f32(output_outer_ptr, out_data);
+
+      input_outer_ptr += 4;
+      output_outer_ptr += 4;
+    }
+    for (; remain > 0; remain--) {
+      *output_outer_ptr = 1.f / (1.f + exp(-*input_outer_ptr));
+      output_outer_ptr++;
+      input_outer_ptr++;
+    }
+  }
+#else
+  // Note: no scalar fallback is implemented for non-NEON builds.
+#endif
+}
+
+template <typename P>
+void SigmoidCompute(const SigmoidParam &param) {
+  const Tensor *in_x = param.InputX();
+  Tensor *out = param.Out();
+  auto x_dims = in_x->dims();
+  out->Resize(x_dims);
+  sigmoid(in_x, out);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
diff --git a/src/operators/kernel/central-arm-func/softmax_arm_func.h b/src/operators/kernel/central-arm-func/softmax_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..896532109409d316346ba4a8d14aaa85a500b007
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/softmax_arm_func.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SOFTMAX_OP
+#pragma once
+#include "../../math/softmax.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename P>
+void SoftmaxCompute(const SoftmaxParam &param) {
+  const Tensor *in_x = param.InputX();
+  Tensor *out = param.Out();
+  auto x_dims = in_x->dims();
+  out->Resize(x_dims);
+  // (sic) SoftmaxFuntor is the functor's actual name in operators/math/softmax.h.
+  math::SoftmaxFuntor<CPU, float>()(in_x, out);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
diff --git a/src/operators/kernel/central-arm-func/transpose_arm_func.h b/src/operators/kernel/central-arm-func/transpose_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..43a16d0e85ff058764b5a711283e38e15eccae57
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/transpose_arm_func.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TRANSPOSE_OP
+#pragma once
+
+#include <vector>
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// vector<int> pos;
+// template <typename T>
+// void TransposeFunc(const int numel, const T* input, const vector<int> axis,
+//                    const vector<int> old_strides,
+//                    const vector<int> new_strides, T* output) {
+//   for (int i = 0; i < numel; ++i) {
+//     int old_idx = 0;
+//     int idx = i;
+//     for (int j = 0; j < axis.size(); ++j) {
+//       int order = axis[j];
+//       old_idx += (idx / new_strides[j]) * old_strides[order];
+//       idx %= new_strides[j];
+//     }
+//     output[i] = input[old_idx];
+//   }
+// }
+
+template <typename P>
+void TransposeCompute(const TransposeParam& param) {
+  const auto* input_x = param.InputX();
+  const auto input_x_dims = input_x->dims();
+  auto* out = param.Out();
+  const auto axis = param.Axis();
+  const auto* input_x_data = input_x->data<float>();
+  auto* out_data = out->mutable_data<float>();
+
+  size_t ndim = axis.size();
+  std::vector<int> xdim(ndim);
+  std::vector<int> xstride(ndim);
+  std::vector<int> xout(ndim);
+  for (int i = 0; i < ndim; i++) {
+    int j = ndim - 1 - i;
+    xdim[j] = input_x_dims[axis[i]];
+    xstride[j] = 1;
+    for (int k = axis[i] + 1; k < ndim; k++) {
+      xstride[j] *= input_x_dims[k];
+    }
+    xout[j] = xstride[j] * xdim[j];
+  }
+
+  auto numel = input_x->numel();
+  size_t pind = 0;
+  std::vector<int> ind(ndim);
+  // Walk the output linearly while `ind` counts odometer-style over the
+  // permuted dims; `pind` tracks the matching input offset via the strides.
+  for (int i = 0; i < numel; i++) {
+    out_data[i] = input_x_data[pind];
+    ind[0]++;
+    pind += xstride[0];
+    for (int j = 0; j < ndim - 1; j++) {
+      if (ind[j] == xdim[j]) {
+        ind[j + 1]++;
+        ind[j] = 0;
+        pind += xstride[j + 1];
+        pind -= xout[j];
+      } else {
+        break;
+      }
+    }
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/concat_kernel.h b/src/operators/kernel/concat_kernel.h
index 3b649974e8bb670b7ec81c61f185a2d8f9b24ad0..adba64391e3e79569030c95e2d2681a31187f03a 100644
--- a/src/operators/kernel/concat_kernel.h
+++ b/src/operators/kernel/concat_kernel.h
@@ -27,6 +27,7 @@ template <typename DeviceType, typename T>
 class ConcatKernel : public framework::OpKernelBase<DeviceType, ConcatParam> {
  public:
   void Compute(const ConcatParam &param) const;
+  bool Init(ConcatParam *param);
 };
 
 }  // namespace operators
diff --git
a/src/operators/kernel/conv_add_bn_kernel.h b/src/operators/kernel/conv_add_bn_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..cc11ef1d71f402f32b2da6490877626247884a44 --- /dev/null +++ b/src/operators/kernel/conv_add_bn_kernel.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef FUSION_CONVADDBN_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvAddBNKernel : public OpKernelBase { + public: + void Compute(const FusionConvAddBNParam ¶m) const; + bool Init(FusionConvAddBNParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_add_bn_relu_kernel.h b/src/operators/kernel/conv_add_bn_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..73aaf4c900393b9cbee4682fc67147d9ef0853fc --- /dev/null +++ b/src/operators/kernel/conv_add_bn_relu_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef FUSION_CONVADDBNRELU_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvAddBNReluKernel + : public OpKernelBase { + public: + void Compute(const FusionConvAddBNReluParam ¶m) const; + bool Init(FusionConvAddBNReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_add_kernel.h b/src/operators/kernel/conv_add_kernel.h index 63086ea25f4ba97295a2f243366c37b66dfed0b5..5a351f8afcf7b73fb6c56dff48c08d7b5204ca10 100644 --- a/src/operators/kernel/conv_add_kernel.h +++ b/src/operators/kernel/conv_add_kernel.h @@ -17,12 +17,14 @@ limitations under the License. 
*/ #pragma once #include -#if __ARM_NEON +#ifdef __ARM_NEON #include #endif +#include "common/common.h" #include "framework/ddim.h" #include "framework/operator.h" #include "operators/math/conv_func.h" +#include "operators/math/depthwise_conv_3x3.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" #include "operators/math/vol2col.h" @@ -38,6 +40,7 @@ template class ConvAddKernel : public OpKernelBase { public: void Compute(const FusionConvAddParam ¶m) const; + bool Init(FusionConvAddParam *param); }; } // namespace operators diff --git a/src/operators/kernel/conv_add_relu_kernel.h b/src/operators/kernel/conv_add_relu_kernel.h index ed2cc896f438220468c074b742ffa7e12779a63e..931313273d150fa1ad159e7069fbc3812d6e6657 100644 --- a/src/operators/kernel/conv_add_relu_kernel.h +++ b/src/operators/kernel/conv_add_relu_kernel.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef FUSION_CONVADD_RELU_OP +#ifdef FUSION_CONVADDRELU_OP #include #include "framework/ddim.h" @@ -36,6 +36,7 @@ class ConvAddReluKernel : public OpKernelBase { public: void Compute(const FusionConvAddReluParam ¶m) const; + bool Init(FusionConvAddReluParam *param); }; } // namespace operators diff --git a/src/operators/kernel/conv_bn_relu_kernel.h b/src/operators/kernel/conv_bn_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c9d4df5d8f597deebaf2b53491851b7ce03fc7aa --- /dev/null +++ b/src/operators/kernel/conv_bn_relu_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef FUSION_CONVBNRELU_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvBNReluKernel + : public OpKernelBase { + public: + void Compute(const FusionConvBNReluParam ¶m) const; + bool Init(FusionConvBNReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_kernel.h b/src/operators/kernel/conv_kernel.h index 06c0c2c55629d9762cffa0b2c5572050b95bc771..fedbee32a006f263fd3de25064496dad1a23177b 100644 --- a/src/operators/kernel/conv_kernel.h +++ b/src/operators/kernel/conv_kernel.h @@ -32,23 +32,9 @@ template class ConvKernel : public OpKernelBase { public: void Compute(const ConvParam ¶m) const; + bool Init(ConvParam *param); }; -inline bool IsExpand(const std::vector &filter_dim, - const std::vector &strides, - const std::vector &paddings, - const std::vector &dilations) { - bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; - for (size_t j = 0; j < strides.size(); ++j) { - filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); - strides_1 = strides_1 && (strides[j] == 1); - padding_0 = padding_0 && (paddings[j] == 0); - dilation_1 = dilation_1 && (dilations[j] == 1); - } - - return !(filter_1 && strides_1 && padding_0 && dilation_1); -} - } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/conv_transpose_kernel.h b/src/operators/kernel/conv_transpose_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bb9751d2462e0f338c8fe90d3de8020c133722e6 --- /dev/null +++ b/src/operators/kernel/conv_transpose_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_TRANSPOSE + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::OpKernelBase; + +template +class ConvTransposeKernel + : public OpKernelBase { + public: + void Compute(const ConvTransposeParam ¶m) const; + + bool Init(ConvTransposeParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif // PADDLE_MOBILE_DE_CONV_KERNEL_H diff --git a/src/operators/kernel/depthwise_conv_kernel.h b/src/operators/kernel/depthwise_conv_kernel.h index 1ef76a573e27ff09fe7842ad78e9fe6042a742a1..b74a58a649bd9fa27e941e2cd5ea50b30c0218cb 100644 --- a/src/operators/kernel/depthwise_conv_kernel.h +++ b/src/operators/kernel/depthwise_conv_kernel.h @@ -31,6 +31,7 @@ template class DepthwiseConvKernel : public OpKernelBase { public: void Compute(const ConvParam ¶m) const; + bool Init(ConvParam *param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/dropout_kernel.h b/src/operators/kernel/dropout_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3ef6b9dd62d88f012eba3456c676ac0d33bf9e52 --- /dev/null +++ b/src/operators/kernel/dropout_kernel.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef DROPOUT_OP + +#include "framework/operator.h" +#include "operators/op_param.h" + +#pragma once + +namespace paddle_mobile { +namespace operators { + +template +class DropoutKernel : public framework::OpKernelBase { + public: + void Compute(const DropoutParam& param) const; + bool Init(DropoutParam* para); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/dwconv_bn_relu_kernel.h b/src/operators/kernel/dwconv_bn_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..91478ae5ecba37472e7e30f774f2c515b6952eee --- /dev/null +++ b/src/operators/kernel/dwconv_bn_relu_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef FUSION_DWCONVBNRELU_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class DWConvBNReluKernel + : public OpKernelBase { + public: + void Compute(const FusionDWConvBNReluParam ¶m) const; + bool Init(FusionDWConvBNReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/elementwise_add_kernel.h b/src/operators/kernel/elementwise_add_kernel.h index 7a2f92120105b9f9539937e00c392c0eb77e3830..70334c1d3f788f60e974da74133823f82ab05765 100644 --- a/src/operators/kernel/elementwise_add_kernel.h +++ b/src/operators/kernel/elementwise_add_kernel.h @@ -30,6 +30,7 @@ class ElementwiseAddKernel : public framework::OpKernelBase { public: void Compute(const ElementwiseAddParam ¶m) const; + bool Init(ElementwiseAddParam *param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/elementwise_add_relu_kernel.h b/src/operators/kernel/elementwise_add_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..1b3ff25ec983365a7a94cff5b047eba3466fe932 --- /dev/null +++ b/src/operators/kernel/elementwise_add_relu_kernel.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_ELEMENTWISEADDRELU_OP + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using namespace framework; + +template +class ElementwiseAddReluKernel + : public framework::OpKernelBase { + public: + void Compute(const ElementwiseAddReluParam ¶m) const; + bool Init(ElementwiseAddReluParam *param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fc_relu_kernel.h b/src/operators/kernel/fc_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ceff36f8ef49ff996769802b1f39e52e955c45d0 --- /dev/null +++ b/src/operators/kernel/fc_relu_kernel.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#ifdef FUSION_FCRELU_OP
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/math/math_function.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class FusionFcReluKernel
+    : public framework::OpKernelBase<DeviceType, FusionFcReluParam> {
+ public:
+  void Compute(const FusionFcReluParam& param) const;
+  bool Init(FusionFcReluParam* param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6e04787a58bc437bf0738cf67072426f1cbaa57
--- /dev/null
+++ b/src/operators/kernel/fpga/concat_kernel.cpp
@@ -0,0 +1,55 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONCAT_OP
+
+#include "operators/kernel/concat_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConcatKernel<FPGA, float>::Init(ConcatParam *param) {
+  return true;
+}
+
+template <>
+void ConcatKernel<FPGA, float>::Compute(const ConcatParam &param) const {
+  auto inputs = param.Inputs();
+  auto *out = param.Out();
+  int64_t axis = param.Axis();
+  out->mutable_data<half>();
+
+  DDim out_dim = out->dims();
+  int pixels = out_dim[1] * out_dim[2];
+  auto out_channel = out_dim[3];
+
+  auto out_offset = 0;
+  for (int i = 0; i < inputs.size(); ++i) {
+    auto input = inputs[i];
+    auto channels = input->dims()[3];
+    auto src = input->data<half>();
+    // Interleave this input's channels into the packed NHWC output: for each
+    // pixel, copy `channels` values into the slot starting at out_offset.
+    for (int j = 0; j < pixels; ++j) {
+      auto dst = out->mutable_data<half>() + j * out_channel + out_offset;
+      memory::Copy(dst, src + j * channels, channels * sizeof(half));
+    }
+    out_offset += channels;
+  }
+}
+template class ConcatKernel<FPGA, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..095ae4a6d0c8d642aa1e8225bb69f27fb63091b0
--- /dev/null
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -0,0 +1,103 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#ifdef FUSION_CONVADDBN_OP + +#include "operators/kernel/conv_add_bn_kernel.h" +#include "fpga/api/fpga_api.h" +#include "fpga/fpga_quantilization.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { + bool relu_enabled = false; + const Tensor *input = param->Input(); + auto input_ptr = input->data(); + const Tensor *bias = param->Bias(); + auto bias_ptr = bias->data(); + Tensor *filter = param->Filter(); + + Tensor *out = param->Output(); + auto out_ptr = out->mutable_data(); + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(input->dims()[1] == bias->dims()[0] && + bias->dims()[0] == param->InputBias()->dims()[0], + "Image channel should be equal to bias number"); + + const int channel = input->dims()[1]; + float *bs_ptr = + reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); + Tensor *new_scale = new Tensor(); + Tensor *new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); + new_bias_ptr[i] = + bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i * 2] = new_scale_ptr[i]; + bs_ptr[i * 2 + 1] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + Tensor *quant_filter = fpga::quantify_filter(filter); + + // delete original filter? + filter = quant_filter; + + auto filter_ptr = filter->data(); + fpga::ConvArgs convArgs; + convArgs.relu_enabled = relu_enabled; + convArgs.filter_address = (void *)filter_ptr; + convArgs.filter_num = filter->dims()[0]; + convArgs.group_num = param->Groups(); + convArgs.sb_address = (void *)bs_ptr; + convArgs.kernel.stride_h = param->Strides()[0]; + convArgs.kernel.stride_w = param->Strides()[1]; + convArgs.kernel.height = filter->dims()[2]; + convArgs.kernel.width = filter->dims()[3]; + convArgs.image.address = (void *)input_ptr; + convArgs.image.channels = input->dims()[1]; + convArgs.image.height = input->dims()[2]; + convArgs.image.width = input->dims()[3]; + convArgs.image.pad_height = param->Paddings()[0]; + convArgs.image.pad_width = param->Paddings()[1]; + convArgs.image.scale_address = input->fpga_args().scale_pointer(); + convArgs.output.address = (void *)out_ptr; + convArgs.output.scale_address = out->fpga_args().scale_pointer(); + param->SetFpgaArgs(convArgs); + + return true; +} + +template <> +void ConvAddBNKernel::Compute( + const FusionConvAddBNParam ¶m) const { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} +template class ConvAddBNKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2f80ec9742cae356c3b9a2dcd0c79027da37e7b4 --- /dev/null +++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDBNRELU_OP + +#include "operators/kernel/conv_add_bn_relu_kernel.h" +#include "memory/t_malloc.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddBNReluKernel::Init(FusionConvAddBNReluParam *param) { + bool relu_enabled = true; + const Tensor *input = param->Input(); + auto input_ptr = input->data(); + const Tensor *bias = param->Bias(); + auto bias_ptr = bias->data(); + const Tensor *filter = param->Filter(); + auto filter_ptr = filter->data(); + Tensor *out = param->Output(); + auto out_ptr = out->mutable_data(); + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(input->dims()[1] == bias->dims()[0] && + bias->dims()[0] == param->InputBias()->dims()[0], + "Image channel should be equal to bias number"); + + const int channel = input->dims()[1]; + float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + Tensor *new_scale = new Tensor(); + Tensor *new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); + new_bias_ptr[i] = + bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i * 2] = new_scale_ptr[i]; + bs_ptr[i * 2 + 1] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + fpga::ConvArgs convArgs; + convArgs.relu_enabled = relu_enabled; + convArgs.filter_address = (void *)filter_ptr; + convArgs.filter_num = filter->dims()[0]; + convArgs.group_num = param->Groups(); + convArgs.sb_address = (void *)bs_ptr; + convArgs.kernel.stride_h = param->Strides()[0]; + convArgs.kernel.stride_w = param->Strides()[1]; + convArgs.kernel.height = filter->dims()[2]; + convArgs.kernel.width = filter->dims()[3]; + convArgs.image.address = (void *)input_ptr; + convArgs.image.channels = input->dims()[1]; + convArgs.image.height = input->dims()[2]; + convArgs.image.width = input->dims()[3]; + convArgs.image.pad_height = param->Paddings()[0]; + convArgs.image.pad_width = param->Paddings()[1]; + convArgs.image.scale_address = input->fpga_args().scale_pointer(); + convArgs.output.address = (void *)out_ptr; + convArgs.output.scale_address = out->fpga_args().scale_pointer(); + param->SetFpgaArgs(convArgs); + return true; +} + +template <> +void ConvAddBNReluKernel::Compute( + const FusionConvAddBNReluParam ¶m) const { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} +template class ConvAddBNReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a20f4e4837c6ceadf7f445b0b82d951ba3c1721b --- /dev/null +++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp @@ -0,0 
+1,78 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDRELU_OP + +#include "operators/kernel/conv_add_relu_kernel.h" +#include "common/enforce.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { + bool relu_enabled = true; + const Tensor *input = param->Input(); + auto input_ptr = input->data(); + const Tensor *bias = param->Bias(); + auto bias_ptr = bias->data(); + const Tensor *filter = param->Filter(); + auto filter_ptr = filter->data(); + Tensor *out = param->Output(); + auto out_ptr = out->mutable_data(); + + PADDLE_MOBILE_ENFORCE(input->dims()[1] == bias->dims()[0], + "Image channel should be equal to bias number"); + int channel = input->dims()[1]; + float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + for (int i = 0; i < channel; i++) { + bs_ptr[i * 2] = 1; + bs_ptr[i * 2 + 1] = bias_ptr[i]; + } + + fpga::ConvArgs convArgs; + convArgs.relu_enabled = relu_enabled; + convArgs.filter_address = (void *)filter_ptr; + convArgs.filter_num = filter->dims()[0]; + convArgs.group_num = param->Groups(); + convArgs.sb_address = (void *)bs_ptr; + convArgs.kernel.stride_h = param->Strides()[0]; + convArgs.kernel.stride_w = param->Strides()[1]; + convArgs.kernel.height = filter->dims()[2]; + convArgs.kernel.width = filter->dims()[3]; + convArgs.image.address = (void *)input_ptr; + convArgs.image.channels = input->dims()[1]; + convArgs.image.height = input->dims()[2]; + convArgs.image.width = input->dims()[3]; + + convArgs.image.pad_height = param->Paddings()[0]; + convArgs.image.pad_width = param->Paddings()[1]; + convArgs.image.scale_address = input->fpga_args().scale_pointer(); + convArgs.output.address = (void *)out_ptr; + convArgs.output.scale_address = out->fpga_args().scale_pointer(); + param->SetFpgaArgs(convArgs); + return true; +} + +template <> +void ConvAddReluKernel::Compute( + const FusionConvAddReluParam ¶m) const { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} +template class ConvAddReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/conv_kernel.cpp b/src/operators/kernel/fpga/conv_kernel.cpp index d2c08992a89f0837de318c876fbab2892ee34e89..91d0f393fcc1018bacd507c5f7975f7b3a2a56ca 100644 --- a/src/operators/kernel/fpga/conv_kernel.cpp +++ b/src/operators/kernel/fpga/conv_kernel.cpp @@ -15,12 +15,21 @@ limitations under the License. 
*/ #ifdef CONV_OP #include "operators/kernel/conv_kernel.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { template <> -void ConvKernel::Compute(const ConvParam ¶m) const {} +bool ConvKernel::Init(ConvParam *param) { + return true; +} + +template <> +void ConvKernel::Compute(const ConvParam ¶m) const { + // ConvCompute(param); +} + template class ConvKernel; } // namespace operators diff --git a/src/operators/kernel/fpga/dropout_kernel.cpp b/src/operators/kernel/fpga/dropout_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bb6ece969d83aba6bb7fe91a3688607df0be8a8b --- /dev/null +++ b/src/operators/kernel/fpga/dropout_kernel.cpp @@ -0,0 +1,40 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef DROPOUT_OP + +#include "operators/kernel/dropout_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DropoutKernel::Init(DropoutParam *param) { + param->Out()->ShareDataWith(*param->InputX()); + return true; +} + +template <> +void DropoutKernel::Compute(const DropoutParam ¶m) const { + // auto *input_x = param.InputX(); + // auto *out = param.Out(); + // auto input_x_ptr = input_x->data(); + // auto out_ptr = out->mutable_data(); + // out_ptr = const_cast(input_x_ptr); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..88a19beb41f67e5fc9336c8883c8ea75aaa939e0 --- /dev/null +++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_ELEMENTWISEADDRELU_OP + +#include "operators/kernel/elementwise_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseAddReluKernel::Init( + ElementwiseAddReluParam *param) { + bool relu_enabled = true; + const Tensor *input_x = param->InputX(); + const Tensor *input_y = param->InputY(); + Tensor *out = param->Out(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + auto out_ptr = out->mutable_data(); + + fpga::EWAddArgs ewaddArgs; + ewaddArgs.relu_enabled = relu_enabled; + ewaddArgs.const0 = 1; + ewaddArgs.const1 = 1; + ewaddArgs.image0.address = (void *)input_x_ptr; + ewaddArgs.image0.channels = input_x->dims()[1]; + ewaddArgs.image0.scale_address = + input_x->fpga_args().scale_pointer(); // ew has scale attribute?? + ewaddArgs.image0.height = input_x->dims()[2]; + ewaddArgs.image0.width = input_x->dims()[3]; + ewaddArgs.image0.pad_height = 0; + ewaddArgs.image0.pad_width = 0; + ewaddArgs.image1.address = (void *)input_y_ptr; + ewaddArgs.image1.channels = input_y->dims()[1]; + ewaddArgs.image1.scale_address = + input_y->fpga_args().scale_pointer(); // ew has scale attribute?? + ewaddArgs.image1.height = input_y->dims()[2]; + ewaddArgs.image1.width = input_y->dims()[3]; + ewaddArgs.image1.pad_height = 0; + ewaddArgs.image1.pad_width = 0; + ewaddArgs.output.scale_address = out->fpga_args().scale_pointer(); + ewaddArgs.output.address = (void *)out_ptr; + param->SetFpgaArgs(ewaddArgs); + return true; +} + +template <> +void ElementwiseAddReluKernel::Compute( + const ElementwiseAddReluParam ¶m) const { + fpga::ComputeFpgaEWAdd(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..21e334b12b70be1980d9417ed11161143106d1c6 --- /dev/null +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..21e334b12b70be1980d9417ed11161143106d1c6
--- /dev/null
+++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp
@@ -0,0 +1,75 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_FCRELU_OP
+#include "operators/kernel/fc_relu_kernel.h"
+#include "fpga/api/fpga_api.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam *param) {
+  bool relu_enabled = true;
+  const Tensor *input_x = param->InputX();
+  auto input_x_ptr = input_x->data();
+  const Tensor *input_y = param->InputY();
+  auto input_y_ptr = input_y->data();
+  const Tensor *input_z = param->InputZ();
+  auto input_z_ptr = input_z->data();
+  Tensor *out = param->Out();
+  auto out_ptr = out->mutable_data();
+
+  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0],
+                        "Image channel should be equal to weight number");
+  int channel = input_x->dims()[1];
+  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i * 2] = 1;
+    bs_ptr[i * 2 + 1] = input_z_ptr[i];
+  }
+
+  fpga::ConvArgs convArgs;
+  convArgs.relu_enabled = relu_enabled;
+  convArgs.filter_address = (void *)input_y_ptr;
+  convArgs.filter_num = out->dims()[1];
+  convArgs.group_num = 1;
+  convArgs.sb_address = (void *)bs_ptr;
+  convArgs.kernel.stride_w = 1;
+  convArgs.kernel.stride_h = 1;
+  convArgs.kernel.height = input_x->dims()[2];
+  convArgs.kernel.width = input_x->dims()[3];
+  convArgs.image.address = (void *)input_x_ptr;
+  convArgs.image.channels = input_x->dims()[1];
+  convArgs.image.height = input_x->dims()[2];
+  convArgs.image.width = input_x->dims()[3];
+  convArgs.image.pad_height = 0;
+  convArgs.image.pad_width = 0;
+  convArgs.image.scale_address =
+      input_x->fpga_args().scale_pointer();  // TODO: confirm the FC input carries a scale attribute
+  convArgs.output.address = (void *)out_ptr;
+  convArgs.output.scale_address =
+      out->fpga_args().scale_pointer();  // TODO: confirm the FC output carries a scale attribute
+  param->SetFpgaArgs(convArgs);
+
+  return true;
+}
+template <>
+void FusionFcReluKernel<FPGA, float>::Compute(
+    const FusionFcReluParam &param) const {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+
+} // namespace operators
+} // namespace paddle_mobile
+#endif
diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..505b8768565dc4003152c3493b558448f9d73d04
--- /dev/null
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
@@ -0,0 +1,74 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_FC_OP
+
+#include "operators/kernel/fusion_fc_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool FusionFcKernel<FPGA, float>::Init(FusionFcParam *param) {
+  bool relu_enabled = false;
+  const Tensor *input_x = param->InputX();
+  auto input_x_ptr = input_x->data();
+  const Tensor *input_y = param->InputY();
+  auto input_y_ptr = input_y->data();
+  const Tensor *input_z = param->InputZ();
+  auto input_z_ptr = input_z->data();
+  Tensor *out = param->Out();
+  auto out_ptr = out->mutable_data();
+
+  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0],
+                        "Image channel should be equal to weight number");
+  int channel = input_x->dims()[1];
+  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i * 2] = 1;
+    bs_ptr[i * 2 + 1] = input_z_ptr[i];
+  }
+
+  fpga::ConvArgs convArgs;
+  convArgs.relu_enabled = relu_enabled;
+  convArgs.filter_address = (void *)input_y_ptr;
+  convArgs.filter_num = out->dims()[1];
+  convArgs.group_num = 1;
+  convArgs.sb_address = (void *)bs_ptr;
+  convArgs.kernel.stride_w = 1;
+  convArgs.kernel.stride_h = 1;
+  convArgs.kernel.height = input_x->dims()[2];
+  convArgs.kernel.width = input_x->dims()[3];
+  convArgs.image.address = (void *)input_x_ptr;
+  convArgs.image.channels = input_x->dims()[1];
+  convArgs.image.height = input_x->dims()[2];
+  convArgs.image.width = input_x->dims()[3];
+  convArgs.image.pad_height = 0;
+  convArgs.image.pad_width = 0;
+  convArgs.image.scale_address =
+      input_x->fpga_args().scale_pointer();  // TODO: confirm the FC input carries a scale attribute
+  convArgs.output.address = (void *)out_ptr;
+  convArgs.output.scale_address =
+      out->fpga_args().scale_pointer();  // TODO: confirm the FC output carries a scale attribute
+  param->SetFpgaArgs(convArgs);
+  return true;
+}
+
+template <>
+void FusionFcKernel<FPGA, float>::Compute(const FusionFcParam &param) const {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+} // namespace operators
+} // namespace paddle_mobile
+
+#endif
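Both fully-connected kernels above drive the same FPGA convolution engine: the weight matrix becomes a filter whose kernel size equals the input's full H x W, and scale/bias values are interleaved per channel into bs_ptr as (scale, bias) pairs with the scale fixed to 1. A compact sketch of that interleaving, assuming only the layout shown in Init:

#include <cassert>
#include <vector>

// Sketch of the (scale, bias) interleaving the FPGA FC kernels build:
// bs[2*i] = scale (fixed to 1), bs[2*i + 1] = bias for channel i.
std::vector<float> InterleaveScaleBias(const std::vector<float> &bias) {
  std::vector<float> bs(2 * bias.size());
  for (size_t i = 0; i < bias.size(); ++i) {
    bs[2 * i] = 1.0f;         // per-channel scale
    bs[2 * i + 1] = bias[i];  // per-channel bias
  }
  return bs;
}

int main() {
  auto bs = InterleaveScaleBias({0.5f, -1.0f, 2.0f});
  assert(bs.size() == 6 && bs[0] == 1.0f && bs[1] == 0.5f);
  return 0;
}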
diff --git a/src/operators/kernel/fpga/pool_kernel.cpp b/src/operators/kernel/fpga/pool_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a7ff022c3b8616847c48a71bf94e4018cedcad2e
--- /dev/null
+++ b/src/operators/kernel/fpga/pool_kernel.cpp
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef POOL_OP
+
+#include "operators/kernel/pool_kernel.h"
+
+class PoolingArgs;
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool PoolKernel<FPGA, float>::Init(PoolParam *param) {
+  const Tensor *input = param->Input();
+  auto input_ptr = input->data();
+  Tensor *output = param->Output();
+  auto output_ptr = output->mutable_data();
+  std::vector<int> ksize = param->Ksize();
+  std::vector<int> strides = param->Strides();
+  std::vector<int> paddings = param->Paddings();
+
+  fpga::PoolingArgs poolArgs;
+  poolArgs.image.address = (void *)input_ptr;
+  poolArgs.image.channels = input->dims()[1];
+  poolArgs.image.height = input->dims()[2];
+  poolArgs.image.width = input->dims()[3];
+  poolArgs.image.pad_height = paddings[0];
+  poolArgs.image.pad_width = paddings[1];
+  poolArgs.output.address = output_ptr;
+  poolArgs.kernel.height = ksize[0];
+  poolArgs.kernel.width = ksize[1];
+  poolArgs.kernel.stride_h = strides[0];
+  poolArgs.kernel.stride_w = strides[1];
+  param->SetFpgaArgs(poolArgs);
+  return true;
+}
+
+template <>
+void PoolKernel<FPGA, float>::Compute(const PoolParam &param) const {
+#ifdef PADDLE_MOBILE_FPGA
+  fpga::ComputeFpgaPool(param.FpgaArgs());
+#endif
+}
+} // namespace operators
+} // namespace paddle_mobile
+
+#endif

diff --git a/src/operators/kernel/fusion_fc_kernel.h b/src/operators/kernel/fusion_fc_kernel.h
index cbb9721c7fc810cbb9feb56160dc71660f3f4489..0e31134ba5a18405a5855db1e85b3885608c4071 100644
--- a/src/operators/kernel/fusion_fc_kernel.h
+++ b/src/operators/kernel/fusion_fc_kernel.h
@@ -28,6 +28,7 @@ class FusionFcKernel : public framework::OpKernelBase<DeviceType, FusionFcParam> {
  public:
   void Compute(const FusionFcParam& param) const;
+  bool Init(FusionFcParam* param);
 };
 } // namespace operators
 } // namespace paddle_mobile

diff --git a/src/operators/kernel/im2sequence_kernel.h b/src/operators/kernel/im2sequence_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa798fd6af5592a062de207714dc9fee2afb93df
--- /dev/null
+++ b/src/operators/kernel/im2sequence_kernel.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef IM2SEQUENCE_OP
+
+#include "framework/operator.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
+
+#pragma once
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class Im2SequenceKernel
+    : public framework::OpKernelBase<DeviceType, Im2SequenceParam> {
+ public:
+  void Compute(const Im2SequenceParam& param) const;
+  bool Init(Im2SequenceParam* para);
+};
+} // namespace operators
+} // namespace paddle_mobile
+
+#endif
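These header changes all introduce the same two-phase kernel contract: Init(param) runs once to validate inputs and pre-pack device arguments, and Compute(param) then runs per inference. A minimal sketch of that contract, using a hypothetical KernelBase rather than the project's framework::OpKernelBase:

#include <cstdio>

// Hypothetical two-phase kernel contract mirroring the Init()/Compute()
// pair added to the kernel headers in this PR.
template <typename Param>
class KernelBase {
 public:
  virtual bool Init(Param *param) = 0;                  // one-time argument packing
  virtual void Compute(const Param &param) const = 0;   // per-run execution
  virtual ~KernelBase() = default;
};

struct DemoParam { int value = 0; };

class DemoKernel : public KernelBase<DemoParam> {
 public:
  bool Init(DemoParam *param) override { param->value = 42; return true; }
  void Compute(const DemoParam &param) const override {
    std::printf("compute with %d\n", param.value);
  }
};

int main() {
  DemoParam p;
  DemoKernel k;
  if (k.Init(&p)) k.Compute(p);  // Init once, then Compute per run
  return 0;
}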
diff --git a/src/operators/kernel/lrn_kernel.h b/src/operators/kernel/lrn_kernel.h
index d92d15e5e9d22c431799e924ef1b5ba0de840004..ee19d6e40ee0b5b66f62ce6535370a81c28950af 100644
--- a/src/operators/kernel/lrn_kernel.h
+++ b/src/operators/kernel/lrn_kernel.h
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #ifdef LRN_OP
-
-#pragma once
+#ifdef _OPENMP
+#include <omp.h>
+#endif

 #include "framework/operator.h"
 #include "operators/op_param.h"
+#include <cmath>
+
 #ifdef __ARM_NEON
 #include "arm_neon.h"
 #include "operators/math/math_func_neon.h"
@@ -46,6 +49,7 @@ struct LRNFunctor {
     std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0);
     for (int a = 0; a < N; a++) {
+#pragma omp parallel for
       for (int b = 0; b < C; b++) {
         for (int index = start; index < end; index++) {
           int channel = b + index;
@@ -169,6 +173,7 @@ template <typename DeviceType, typename T>
 class LrnKernel : public framework::OpKernelBase<DeviceType, LrnParam> {
  public:
   void Compute(const LrnParam &param) const;
+  bool Init(LrnParam *param);
 };
 } // namespace operators
 } // namespace paddle_mobile

diff --git a/src/operators/kernel/mali/ACL_Android b/src/operators/kernel/mali/ACL_Android
new file mode 160000
index 0000000000000000000000000000000000000000..591027fcffea084100c756e48356e0f8a48e35e5
--- /dev/null
+++ b/src/operators/kernel/mali/ACL_Android
@@ -0,0 +1 @@
+Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5

diff --git a/src/operators/kernel/mali/acl_operator.cc b/src/operators/kernel/mali/acl_operator.cc
new file mode 100755
index 0000000000000000000000000000000000000000..562d2fe1c46aa7a30b6418c7a3fcb21daafffa0f
--- /dev/null
+++ b/src/operators/kernel/mali/acl_operator.cc
@@ -0,0 +1,220 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#if USE_ACL == 1 +#include "acl_operator.h" +unsigned int bypass_acl_class_layer = + (0 | FLAGS_ENABLE_ACL_CONCAT | + /*0xffffffff |*/ /*FLAGS_ENABLE_ACL_FC |*/ /*FLAGS_ENABLE_ACL_LRN + |*/ + 0); + +int enable_schedule = 0; + +#ifdef USE_PROFILING + +#include "arm_neon.h" + +unsigned int acl_log_flags = + (0 | MASK_LOG_APP_TIME | /*MASK_LOG_ALLOCATE | */ /*MASK_LOG_ALLOCATE | */ + /*MASK_LOG_RUN | */ /*MASK_LOG_CONFIG | */ /*MASK_LOG_COPY | */ + MASK_LOG_ABSVAL | MASK_LOG_BNLL | MASK_LOG_CONV | MASK_LOG_FC | + MASK_LOG_LRN | MASK_LOG_POOLING | MASK_LOG_RELU | MASK_LOG_SIGMOID | + MASK_LOG_SOFTMAX | MASK_LOG_TANH | MASK_LOG_LC | MASK_LOG_BN | + MASK_LOG_CONCAT | 0); +#include /* printf */ +#include /* getenv */ +#endif // USE_PROFILING + +static bool force_enable_gpu = false; +bool AclEnableSchedule(int enable) { + enable_schedule = enable; + if (enable) { + force_enable_gpu = true; + } + return true; +} +int isScheduleEnable() { return enable_schedule; } + +namespace paddle_mobile { +namespace operators { +namespace acl { + +bool ACLOperator::init_gpu_env = true; +#ifdef USE_OPENCL +bool ACLOperator::support_opencl_ = false; +bool opencl_is_available() { return arm_compute::opencl_is_available(); } +#elif defined(USE_OPENGLES) +bool ACLOperator::support_opengles_ = false; +#endif +ACLOperator::ACLOperator(bool is_gpu) + : operator_state_(operator_not_init), + force_bypass_acl_path_(false), + target_hint_(TargetHint::DONT_CARE), + convolution_method_hint_(ConvolutionMethodHint::GEMM), + _group(1), + name_(""), + input_idx_(0), + output_idx_(0), + is_gpu_(is_gpu) { + const char* pBypassACL; + if (init_gpu_env) { +#ifdef USE_OPENCL + try { + if (opencl_is_available()) { + arm_compute::CLScheduler::get().default_init(); + support_opencl_ = true; + } + } catch (std::exception& e) { + support_opencl_ = false; + } +#elif defined(USE_OPENGLES) + try { + arm_compute::GCScheduler::get().default_init(); + support_opengles_ = true; + } catch (std::exception& e) { + support_opengles_ = false; + } +#endif + init_gpu_env = false; + } + if (force_enable_gpu) is_gpu_ = true; + pBypassACL = getenv("BYPASSACL"); + if (pBypassACL) { + unsigned int bacl; + sscanf(pBypassACL, "%i", &bacl); + if (bacl != bypass_acl_class_layer) { + bypass_acl_class_layer = bacl; + printf("BYPASSACL<%s>\n", pBypassACL); + printf("BYPASSACL: %x\n", bypass_acl_class_layer); + } + } + +#ifdef USE_PROFILING + const char* pLogACL; + pLogACL = getenv("LOGACL"); + if (pLogACL) { + unsigned int alf; + sscanf(pLogACL, "%i", &alf); + if (alf != acl_log_flags) { + acl_log_flags = alf; + printf("LOGACL<%s>\n", pLogACL); + printf("LOGACL: %x\n", acl_log_flags); + } + } +#endif // USE_PROFILING + const char* pEnableSchedule; + pEnableSchedule = getenv("ENABLESCHEDULE"); + if (pEnableSchedule) { + int bshedule; + sscanf(pEnableSchedule, "%i", &bshedule); + if (bshedule != enable_schedule) { + enable_schedule = bshedule; + printf("ENABLESCHEDULE<%s>\n", pEnableSchedule); + printf("ENABLESCHEDULE: %x\n", enable_schedule); + } + if (enable_schedule) { + AclEnableSchedule(1); + } + } +} +ACLOperator::~ACLOperator() {} + +bool ACLOperator::new_tensor(std::unique_ptr& tensor, + arm_compute::TensorShape& shape, void* mem, + bool commit) { + auto acl_tensor = + new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32)); + acl_tensor->set_target(getTargetHint()); + acl_tensor->bindmem(mem); + if (commit) acl_tensor->commit(); + tensor = (std::unique_ptr)std::move(acl_tensor); + return true; +} +bool 
ACLOperator::new_tensor(std::unique_ptr& tensor, + std::unique_ptr& parent, + arm_compute::TensorShape& shape, + arm_compute::Coordinates& coord) { + auto acl_tensor = new ACLSubTensor(parent, shape, coord); + acl_tensor->set_target(getTargetHint()); + tensor = (std::unique_ptr)std::move(acl_tensor); + return true; +} + +void ACLTensor::commit(TensorType type) { + settensortype(type); + if (mem_) { + if (!allocate_) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_ALLOCATE_INFO); +#endif // USE_PROFILING + allocate(); + allocate_ = true; + } + if (type_ != tensor_output) { + tensor_copy(mem_); + } + mem_ = nullptr; + } +} + +int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor, void* mem, + bool toTensor) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_COPY_INFO); +#endif // USE_PROFILING + arm_compute::Window window; + // Iterate through the rows (not each element) + window.use_tensor_dimensions(tensor->info()->tensor_shape(), + /* first_dimension =*/arm_compute::Window::DimY); + + int width = tensor->info()->tensor_shape()[0]; + int height = tensor->info()->tensor_shape()[1]; + int deepth = tensor->info()->tensor_shape()[2]; + map(); + // Create an iterator: + arm_compute::Iterator it(tensor, window); + // Except it works for an arbitrary number of dimensions + if (toTensor) { // mem->tensor + arm_compute::execute_window_loop( + window, + [&](const arm_compute::Coordinates& id) { + memcpy(it.ptr(), + ((char*)mem) + + ((id[3] * (width * height * deepth) + + id.z() * (width * height) + id.y() * width + id.x()) * + tensor->info()->element_size()), + width * tensor->info()->element_size()); + }, + it); + } else { // tensor-->mem + arm_compute::execute_window_loop( + window, + [&](const arm_compute::Coordinates& id) { + memcpy(((char*)mem) + ((id[3] * (width * height * deepth) + + id.z() * (width * height) + id.y() * width) * + tensor->info()->element_size()), + it.ptr(), width * tensor->info()->element_size()); + }, + it); + } + unmap(); + + return 0; +} + +} // namespace acl +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/mali/acl_operator.h b/src/operators/kernel/mali/acl_operator.h new file mode 100755 index 0000000000000000000000000000000000000000..bf8200d486f91998c79540177ab1b26596a3e9dc --- /dev/null +++ b/src/operators/kernel/mali/acl_operator.h @@ -0,0 +1,1145 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef ACL_OPERATOR_H_ +#define ACL_OPERATOR_H_ +#include +#include + +#if USE_ACL == 1 +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h" +#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" +#include "arm_compute/runtime/Tensor.h" + +#ifdef PADDLE_MOBILE_MALI_GPU +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLActivationLayer.h" +#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h" +#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" +#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" +#endif + +#ifdef USE_OPENGLES +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h" +#endif + +#include "acl_tensor.h" +#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001 +#define FLAGS_ENABLE_ACL_BNLL 0x00000002 +#define FLAGS_ENABLE_ACL_CONV 0x00000004 +#define FLAGS_ENABLE_ACL_FC 0x00000008 +#define FLAGS_ENABLE_ACL_LRN 0x00000010 +#define FLAGS_ENABLE_ACL_POOLING 0x00000020 +#define FLAGS_ENABLE_ACL_RELU 0x00000040 +#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080 +#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100 +#define FLAGS_ENABLE_ACL_TANH 0x00000200 +#define FLAGS_ENABLE_ACL_LC 0x00000400 +#define FLAGS_ENABLE_ACL_BN 0x00000800 +#define FLAGS_ENABLE_ACL_CONCAT 0x00001000 +extern unsigned int bypass_acl_class_layer; + +#ifdef USE_PROFILING +#include +#define NANO_SEC_CONV 1000000 + +#define MASK_LOG_APP_TIME 0x00000001 +#define MASK_LOG_ALLOCATE 0x00000002 +#define MASK_LOG_RUN 0x00000004 +#define MASK_LOG_CONFIG 0x00000008 +#define MASK_LOG_COPY 0x00000010 +#define MASK_LOG_ABSVAL 0x00000020 +#define MASK_LOG_BNLL 0x00000040 +#define MASK_LOG_CONV 0x00000080 +#define MASK_LOG_FC 0x00000100 +#define 
MASK_LOG_LRN 0x00000200 +#define MASK_LOG_POOLING 0x00000400 +#define MASK_LOG_RELU 0x00000800 +#define MASK_LOG_SIGMOID 0x00001000 +#define MASK_LOG_SOFTMAX 0x00002000 +#define MASK_LOG_TANH 0x00004000 +#define MASK_LOG_LC 0x00008000 +#define MASK_LOG_BN 0x00010000 +#define MASK_LOG_CONCAT 0x00020000 +#define APP_TIME_INFO MASK_LOG_APP_TIME, "time: \t" +#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE, "allocate: \t\t" +#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t" +#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t" +#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t" +#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t" +#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t" +#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t" +#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t" +#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t" +#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_SIGMOID_INFO \ + MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_SOFTMAX_INFO \ + MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_TANH_INFO \ + MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_BN_INFO \ + MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_CONCAT_INFO \ + MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +extern unsigned int acl_log_flags; + +class logtime_util { + public: + logtime_util() { mask = 0; } + logtime_util(int mask_, const char *information_) { + setlogtime_info(mask_, information_); + } + void setlogtime_info(int mask_, const char *information_) { + mask = mask_; + if (acl_log_flags & mask) { + strncpy(information, information_, 255); + gettimeofday(&tv[0], NULL); + } + } + ~logtime_util() { + if (acl_log_flags & mask) { + int time[2]; + gettimeofday(&tv[1], NULL); + time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec; + time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec; + printf("%s %.6lf\n", information, + (((double)time[1] - time[0]) / NANO_SEC_CONV)); + } + } + void log_time(bool start) { + if (acl_log_flags & mask) { + if (start) { + gettimeofday(&tv[0], NULL); + } else { + int time[2]; + gettimeofday(&tv[1], NULL); + time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec; + time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec; + printf("%s %.6lf\n", information, + (((double)time[1] - time[0]) / NANO_SEC_CONV)); + } + } + } + + private: + struct timeval tv[2]; + int mask; + char information[256]; +}; + +#endif // USE_PROFILING + +namespace paddle_mobile { +namespace operators { +namespace acl { + +class AclParameters { + public: + AclParameters() { + dilated = false; + dim = 2; + num_group = 1; + } + int batch; + int in_depth; + int in_rows; + int in_cols; + + int out_depth; + int out_rows; + int out_cols; + int out_num; + + int filter_rows; + int filter_cols; + + int stride_rows; + int stride_cols; + + int pad_rows; + int pad_cols; + + int dilation_rows; + int dilation_cols; + + int num_group; + bool dilated; + int dim; + int epsilon; + + int nsize; + float alpha; + float beta; + float knorm; + + void *input_data; + void *output_data; + void *weight_data; + void *biases_data; + void *mean_data; + void *var_data; + + std::string pool_type; + std::string act_type; + std::string 
data_layout; + + bool is_global_pool; + bool is_channel_concat; + bool is_bypass; + + std::vector in_tensor; +}; + +enum TensorType { + tensor_input, + tensor_output, + tensor_weights, + tensor_biases, + tensor_mean, + tensor_var, + tensor_beta, + tensor_gamma, + tensor_concat, + tensor_data, +}; +enum OperatorState { + operator_not_init, + operator_init_done, + operator_reinit, +}; +enum OperateType { + operate_type_pooling, + operate_type_activation, + operate_type_lrn, + operate_type_conv, + operate_type_lc, + operate_type_fc, + operate_type_bn, + operate_type_softmax, + operate_type_concat, +}; + +class BaseACLTensor { + public: + BaseACLTensor() : type_(tensor_input), allocate_(false) {} + virtual ~BaseACLTensor() {} + virtual void bindmem(void *mem) { mem_ = mem; } + virtual void settensortype(TensorType type) { type_ = type; } + virtual void map(bool blocking = true) {} + virtual void unmap() {} + virtual void commit(TensorType type = tensor_data) {} + int tensor_copy(arm_compute::ITensor *tensor, void *mem, + bool toTensor = true); + + protected: + void *mem_; + TensorType type_; + bool allocate_; +}; +class ACLTensor : public BaseACLTensor, public Tensor { + public: + explicit ACLTensor(arm_compute::TensorInfo &&info) : Tensor(info) {} + virtual void map(bool blocking = true) { + if (!allocate_) { + Tensor::allocate(); + allocate_ = true; + } + Tensor::map(blocking); + } + virtual int tensor_copy(void *mem, bool toTensor = true) { + auto acl_tensor = this; + arm_compute::ITensor *tensor = acl_tensor->tensor(); + BaseACLTensor::tensor_copy(tensor, mem, toTensor); + return 0; + } + virtual void unmap() { Tensor::unmap(); } + virtual void commit(TensorType type = tensor_data); +}; +class ACLSubTensor : public BaseACLTensor, public SubTensor { + public: + ACLSubTensor(std::unique_ptr &parent, + arm_compute::TensorShape &shape, arm_compute::Coordinates &coord) + : SubTensor(parent.get(), shape, coord) {} + virtual int tensor_copy(void *mem, bool toTensor = true) { return 0; } +}; + +template +class TensorPair { + public: + TensorPair() {} + ~TensorPair() {} + TensorType type; + std::unique_ptr tensor; +}; +template +std::unique_ptr &tensor_item( + std::vector>> &pool, TensorType type, + int idx) { + int count = 0; + for (auto &item : pool) { + if (item.get()->type == type) { + ++count; + } + if (item.get()->type == type && idx == count - 1) { + return item.get()->tensor; + } + } + pool.push_back((std::unique_ptr>)std::move(new TensorPair)); + auto item = pool[pool.size() - 1].get(); + item->type = type; + item->tensor = NULL; + return item->tensor; +} +class ACLOperator { + public: + virtual void commit() { + for (auto &item : tensor_pool_) { + if (item.get()->tensor) item.get()->tensor->commit(item.get()->type); + } + } + inline void run() { + commit(); +#ifdef USE_PROFILING + logtime_util log_time(ACL_RUN_INFO); +#endif // USE_PROFILING + for (auto &c : funcs_) { + c->run(); + } + } + + inline std::vector> &funcs() { + return funcs_; + } + inline std::unique_ptr &sinput(int idx = 0) { + return tensor_item(subtensor_pool_, tensor_input, idx); + } + inline std::unique_ptr &soutput(int idx = 0) { + return tensor_item(subtensor_pool_, tensor_output, idx); + } + inline std::unique_ptr &sweights(int idx = 0) { + return tensor_item(subtensor_pool_, tensor_weights, idx); + } + inline std::unique_ptr &sbiases(int idx = 0) { + return tensor_item(subtensor_pool_, tensor_biases, idx); + } + inline std::unique_ptr &cinput(int idx = 0) { + return tensor_item(tensor_pool_, tensor_concat, idx); + } 
+ inline std::unique_ptr &input(int idx = 0) { + return tensor_item(tensor_pool_, tensor_input, idx); + } + inline std::unique_ptr &output(int idx = 0) { + return tensor_item(tensor_pool_, tensor_output, idx); + } + inline std::unique_ptr &weights(int idx = 0) { + return tensor_item(tensor_pool_, tensor_weights, idx); + } + inline std::unique_ptr &biases(int idx = 0) { + return tensor_item(tensor_pool_, tensor_biases, idx); + } + inline std::unique_ptr &mean(int idx = 0) { + return tensor_item(tensor_pool_, tensor_mean, idx); + } + inline std::unique_ptr &var(int idx = 0) { + return tensor_item(tensor_pool_, tensor_var, idx); + } + inline std::unique_ptr &beta(int idx = 0) { + return tensor_item(tensor_pool_, tensor_beta, idx); + } + inline std::unique_ptr &gamma(int idx = 0) { + return tensor_item(tensor_pool_, tensor_gamma, idx); + } + inline std::unique_ptr &tensor(TensorType type) { + switch (type) { + case tensor_biases: + return biases(); + break; + case tensor_weights: + return weights(); + break; + case tensor_output: + return output(); + break; + default: + case tensor_input: + return input(); + break; + } + return input(); + } + + explicit ACLOperator(bool is_gpu = false); + virtual ~ACLOperator(); + inline TargetHint getTargetHint() { +#ifdef USE_OPENCL + if (target_hint_ == TargetHint::DONT_CARE) { + if (is_gpu_) { + return TargetHint::OPENCL; + } + return TargetHint::NEON; + } + return target_hint_; +#elif defined(USE_OPENGLES) + if (target_hint_ == TargetHint::DONT_CARE) { + if (is_gpu_) { + return TargetHint::OPENGLES; + } + return TargetHint::NEON; + } + return target_hint_; +#else + return TargetHint::NEON; +#endif + } + inline void setTargetHint(TargetHint hint) { target_hint_ = hint; } + inline ConvolutionMethodHint &getConvMethod() { + return convolution_method_hint_; + } + inline void setConvMethod() { + convolution_method_hint_ = ConvolutionMethodHint::DIRECT; + } + inline bool tensor_mem(std::unique_ptr &tensor, void *mem) { + tensor->bindmem(mem); + return true; + } + inline bool tensor_mem(void *mem, std::unique_ptr &tensor) { + tensor->tensor_copy(mem, false); + return true; + } + bool new_tensor(std::unique_ptr &tensor, + arm_compute::TensorShape &shape, void *mem = nullptr, + bool commit = false); + bool new_tensor(std::unique_ptr &tensor, + std::unique_ptr &parent, + arm_compute::TensorShape &shape, + arm_compute::Coordinates &coord); + inline int &group() { return _group; } + inline void set_operator_property(OperateType type, const char *name) { + name_ = name; + type_ = type; + } + inline void acl_run(void *input_data, void *output_data) { + if (input_data) tensor_mem(input(), input_data); + run(); + tensor_mem(output_data, output()); + } + inline int &input_idx() { return input_idx_; } + inline int &output_idx() { return output_idx_; } + + protected: + inline bool isGPUMode() { +#ifdef USE_OPENCL + if (!support_opencl_) return false; + return getTargetHint() == TargetHint::OPENCL; +#elif defined(USE_OPENGLES) + if (!support_opengles_) return false; + return getTargetHint() == TargetHint::OPENGLES; +#endif + return false; + } + inline OperatorState &opstate() { return operator_state_; } + inline bool is_operator_init_done(arm_compute::TensorShape shape, + TensorType type = tensor_input) { + checkreshape(shape, type); + return operator_state_ == operator_init_done; + } + inline void set_operator_init_done() { + opstate() = operator_init_done; + set_bypass_state(false); + } + inline void set_bypass_state(bool state = false) { + force_bypass_acl_path_ = state; 
+ } + inline OperatorState checkreshape(arm_compute::TensorShape shape, + TensorType type = tensor_input) { + opstate() = reshape(shape, type); + if (opstate() == operator_reinit) { + freeres(); + } + return opstate(); + } + inline OperatorState reshape(arm_compute::TensorShape &shape, + TensorType type) { + arm_compute::TensorShape _shape; + std::unique_ptr &acl_tensor = tensor(type); + if (!acl_tensor.get()) return operator_not_init; + _shape = acl_tensor->info().tensor_shape(); + if (_shape.total_size() == shape.total_size() && _shape[0] == shape[0] && + _shape[1] == shape[1]) { + return operator_init_done; + } + return operator_reinit; + } + inline void freeres() { + tensor_pool_.clear(); + subtensor_pool_.clear(); + funcs_.clear(); + } + inline const char *&name() { return name_; } + inline void set_in_out_index(int indata_idx, int outdata_idx) { + input_idx() = indata_idx; + output_idx() = outdata_idx; + } + + protected: + std::vector>> tensor_pool_; + std::vector>> subtensor_pool_; + std::vector> funcs_; + OperatorState operator_state_; + bool force_bypass_acl_path_; + TargetHint target_hint_; + ConvolutionMethodHint convolution_method_hint_; + static bool support_opengles_; + static bool support_opencl_; + static bool init_gpu_env; + int _group; + const char *name_; + OperateType type_; + int input_idx_, output_idx_; + bool is_gpu_; +}; + +int isScheduleEnable(); + +template +std::unique_ptr instantiate_function( + arm_compute::ITensor *input, arm_compute::ITensor *output) { + auto op = cpp14::make_unique(); + op->configure(dynamic_cast(input), + dynamic_cast(output)); + + return std::move(op); +} + +template +std::unique_ptr instantiate( + arm_compute::ITensor *input, arm_compute::ITensor *output) { + return instantiate_function(input, output); +} + +template +std::unique_ptr instantiate_op_func( + std::unique_ptr &input, std::unique_ptr &output, + TargetHint &hint) { + std::unique_ptr func; + func = instantiate(input->tensor(), output->tensor()); + return func; +} + +template +std::unique_ptr instantiate_function( + VectorTensor inputs, arm_compute::ITensor *output) { + auto op = cpp14::make_unique(); + op->configure(inputs, dynamic_cast(output)); + + return std::move(op); +} + +template +std::unique_ptr instantiate( + VectorTensor inputs, arm_compute::ITensor *output) { + return instantiate_function(inputs, + output); +} + +template +std::unique_ptr instantiate_op_func_lists( + ACLOperator *&acl_op, std::unique_ptr &output, int num, + TargetHint &hint) { + std::unique_ptr func; + static std::vector tensors; + tensors.clear(); + for (int i = 0; i < num; ++i) { + tensors.push_back( + dynamic_cast(acl_op->cinput(i).get()->tensor())); + } + func = instantiate>( + tensors, output->tensor()); + return func; +} + +template +std::unique_ptr instantiate_function( + arm_compute::ITensor *input, arm_compute::ITensor *output, + const OperatorInfo &info) { + auto op = cpp14::make_unique(); + op->configure(dynamic_cast(input), + dynamic_cast(output), info); + + return std::move(op); +} + +template +std::unique_ptr instantiate( + arm_compute::ITensor *input, arm_compute::ITensor *output, + const OperatorInfo &info) { + return instantiate_function( + input, output, info); +} + +template +std::unique_ptr instantiate_op_func( + std::unique_ptr &input, std::unique_ptr &output, + const OperatorInfo &info, TargetHint &hint) { + std::unique_ptr func; + func = instantiate(input->tensor(), + output->tensor(), info); + return func; +} + +template +std::unique_ptr instantiate_function( + 
arm_compute::ITensor *input, arm_compute::ITensor *weights, + arm_compute::ITensor *biases, arm_compute::ITensor *output, + const OperatorInfo &info) { + auto op = cpp14::make_unique(); + op->configure(dynamic_cast(input), + dynamic_cast(weights), + dynamic_cast(biases), + dynamic_cast(output), info); + return std::move(op); +} + +template +std::unique_ptr instantiate( + arm_compute::ITensor *input, arm_compute::ITensor *weights, + arm_compute::ITensor *biases, arm_compute::ITensor *output, + const OperatorInfo &info) { + return instantiate_function( + input, weights, biases, output, info); +} + +template +std::unique_ptr instantiate_op_func( + std::unique_ptr &input, std::unique_ptr &weights, + std::unique_ptr &biases, std::unique_ptr &output, + const OperatorInfo &info, TargetHint &hint) { + std::unique_ptr func; + arm_compute::ITensor *biases_tensor = NULL; + + if (biases.get()) { + biases_tensor = biases->tensor(); + } + func = instantiate( + input->tensor(), weights->tensor(), biases_tensor, output->tensor(), + info); + return func; +} + +template +std::unique_ptr instantiate_function( + arm_compute::ITensor *input, arm_compute::ITensor *output, + arm_compute::ITensor *mean, arm_compute::ITensor *var, + arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype &eps) { + auto op = cpp14::make_unique(); + op->configure( + dynamic_cast(input), dynamic_cast(output), + dynamic_cast(mean), dynamic_cast(var), + dynamic_cast(beta), dynamic_cast(gamma), eps); + + return std::move(op); +} + +template +std::unique_ptr instantiate( + arm_compute::ITensor *input, arm_compute::ITensor *output, + arm_compute::ITensor *mean, arm_compute::ITensor *var, + arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype eps) { + return instantiate_function( + input, output, mean, var, beta, gamma, eps); +} + +template +std::unique_ptr instantiate_op_func( + std::unique_ptr &input, std::unique_ptr &output, + std::unique_ptr &mean, std::unique_ptr &var, + std::unique_ptr &beta, std::unique_ptr &gamma, + Dtype eps, TargetHint hint) { + std::unique_ptr func; + func = instantiate( + input->tensor(), output->tensor(), mean->tensor(), var->tensor(), + beta->tensor(), gamma->tensor(), eps); + return func; +} + +template +bool instantiate_op_pooling( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func(input, output, info, + hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func(input, output, info, + hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func(input, output, info, + hint)); + } + return true; +} +template +bool instantiate_op_activation( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back(instantiate_op_func( + input, output, info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back(instantiate_op_func( + input, output, info, hint)); + return true; + } +#endif + { + func.push_back(instantiate_op_func( + input, output, info, hint)); + } + return true; +} +template +bool instantiate_op_lrn( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint 
hint, const OperatorInfo &info) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back(instantiate_op_func( + input, output, info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back(instantiate_op_func( + input, output, info, hint)); + return true; + } +#endif + { + func.push_back(instantiate_op_func( + input, output, info, hint)); + } + return true; +} +template +bool instantiate_op_conv( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { + std::unique_ptr &weights = acl_op->weights(); + std::unique_ptr &biases = acl_op->biases(); + ConvolutionMethodHint &conv_method = acl_op->getConvMethod(); + bool has_biases = biases.get() ? true : false; + int &groups = acl_op->group(); + arm_compute::TensorShape input_shape = input->info().tensor_shape(); + arm_compute::TensorShape weights_shape = weights->info().tensor_shape(); + arm_compute::TensorShape biases_shape; + if (has_biases) { + biases_shape = biases->info().tensor_shape(); + } + arm_compute::TensorShape output_shape = output->info().tensor_shape(); + + if (groups == 1) { + if (conv_method == ConvolutionMethodHint::GEMM) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back(instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back(instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + return true; + } +#endif + { + func.push_back(instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + } + } else { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + } + } + return true; + } + + // Calculate sub-tensor splits + const int input_split = input_shape.z() / groups; + const int output_split = output_shape.z() / groups; + const int weights_split = weights_shape[3] / groups; + const int biases_split = biases_shape.x() / groups; + + // Calculate sub-tensor shapes + input_shape.set(2, input_split); + output_shape.set(2, output_split); + weights_shape.set(3, weights_split); + biases_shape.set(0, biases_split); + + for (auto i = 0; i < groups; ++i) { + // Calculate sub-tensors starting coordinates + arm_compute::Coordinates input_coord(0, 0, input_split * i); + arm_compute::Coordinates output_coord(0, 0, output_split * i); + arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i); + arm_compute::Coordinates biases_coord(biases_split * i); + + // Create sub-tensors for input, output, weights and bias + acl_op->new_tensor(acl_op->sinput(i), acl_op->input(), input_shape, + input_coord); + acl_op->new_tensor(acl_op->soutput(i), acl_op->output(), output_shape, + output_coord); + acl_op->new_tensor(acl_op->sweights(i), acl_op->weights(), weights_shape, + weights_coord); + if (has_biases) { 
+ acl_op->new_tensor(acl_op->sbiases(i), acl_op->biases(), biases_shape, + biases_coord); + } + + bool use_opencl = false; + if (conv_method == ConvolutionMethodHint::GEMM) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + use_opencl = true; + func.push_back( + instantiate_op_func( + acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), + acl_op->soutput(i), info, hint)); + } +#endif + if (!use_opencl) { + func.push_back( + instantiate_op_func( + acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), + acl_op->soutput(i), info, hint)); + } + } else { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + use_opencl = true; + func.push_back( + instantiate_op_func( + acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), + acl_op->soutput(i), info, hint)); + } +#endif + if (!use_opencl) { + func.push_back( + instantiate_op_func( + acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), + acl_op->soutput(i), info, hint)); + } + } + } + return true; +} +template +bool instantiate_op_lc( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { + std::unique_ptr &weights = acl_op->weights(); + std::unique_ptr &biases = acl_op->biases(); +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func( + input, weights, biases, output, info, hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func( + input, weights, biases, output, info, hint)); + } + return true; +} +template +bool instantiate_op_fc( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { + std::unique_ptr &weights = acl_op->weights(); + std::unique_ptr &biases = acl_op->biases(); +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back(instantiate_op_func( + input, weights, biases, output, info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back(instantiate_op_func( + input, weights, biases, output, info, hint)); + return true; + } +#endif + { + func.push_back(instantiate_op_func( + input, weights, biases, output, info, hint)); + } + return true; +} +template +bool instantiate_op_bn( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, Dtype eps) { + std::unique_ptr &mean = acl_op->mean(); + std::unique_ptr &var = acl_op->var(); + std::unique_ptr &beta = acl_op->beta(); + std::unique_ptr &gamma = acl_op->gamma(); +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func(input, output, mean, var, + beta, gamma, eps, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func(input, output, mean, var, + beta, gamma, eps, hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func(input, output, mean, var, + beta, gamma, eps, hint)); + } + return true; +} +inline bool instantiate_op_softmax( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, void *data) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func(input, output, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func(input, output, hint)); + return true; + } +#endif + { + 
func.push_back( + instantiate_op_func( + input, output, hint)); + } + return true; +} +inline bool instantiate_op_concat( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, int num) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func_lists(acl_op, output, num, + hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func_lists(acl_op, output, num, + hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func_lists(acl_op, output, num, + hint)); + } + return true; +} +template +void *InputdataPtr(ACLOperator *op, + const std::vector &input_data, + Dtype type, int index = -1) { + if (index == -1) index = 0; + return (void *)(input_data[index]->mutable_data()); +} + +template +void acl_run(ACLOperator *op, + const std::vector &in_data, void *out_data, + Dtype type, bool multi_input_run = true) { + for (int i = 0; i < in_data.size(); ++i) { + op->tensor_mem(op->cinput(i), InputdataPtr(op, in_data, type, i)); + } + op->acl_run(NULL, out_data); +} +} // namespace acl +} // namespace operators +} // namespace paddle_mobile + +#ifdef USE_PROFILING +#define acl_configure(opname, acl_op, args...) \ + { \ + set_operator_property(acl::operate_type_##opname, #opname); \ + logtime_util log_time(ACL_CONFIG_INFO); \ + instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \ + acl_op->output(), acl_op->getTargetHint(), args); \ + } +#else +#define acl_configure(opname, acl_op, args...) \ + { \ + set_operator_property(acl::operate_type_##opname, #opname); \ + instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \ + acl_op->output(), acl_op->getTargetHint(), args); \ + } +#endif + +#define ACLOp_Ptr(a) dynamic_cast(a) + +#endif // USE_ACL + +#endif // ACL_OPERATOR_H_ diff --git a/src/operators/kernel/mali/acl_tensor.cc b/src/operators/kernel/mali/acl_tensor.cc new file mode 100755 index 0000000000000000000000000000000000000000..97a6add20a7ca1b9a6b4f9c9a7e6d1ba1f4e2e0a --- /dev/null +++ b/src/operators/kernel/mali/acl_tensor.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "acl_tensor.h" + +namespace paddle_mobile { +namespace operators { +namespace acl { + +#ifdef USE_ACL +template +std::unique_ptr initialise_tensor( + arm_compute::TensorInfo &info) { + auto tensor = cpp14::make_unique(); + tensor->allocator()->init(info); + return std::move(tensor); +} + +template +void tensor_allocate(arm_compute::ITensor &tensor) { + auto itensor = dynamic_cast(&tensor); + itensor->allocator()->allocate(); +} + +Tensor::Tensor(arm_compute::TensorInfo &info) noexcept + : _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr) {} + +Tensor::Tensor(Tensor &&src) noexcept + : _target(src._target), + _info(std::move(src._info)), + _tensor(std::move(src._tensor)) {} + +arm_compute::ITensor *Tensor::set_target(TargetHint target) { + switch (target) { +#ifdef USE_OPENCL + case TargetHint::OPENCL: + _tensor = initialise_tensor(_info); + break; +#elif defined(USE_OPENGLES) + case TargetHint::OPENGLES: + _tensor = initialise_tensor(_info); + break; +#endif + case TargetHint::NEON: + _tensor = initialise_tensor(_info); + break; + default: + break; + } + _target = target; + return _tensor.get(); +} + +void Tensor::allocate() { + switch (_target) { +#ifdef USE_OPENCL + case TargetHint::OPENCL: + tensor_allocate(*_tensor); + break; +#elif defined(USE_OPENGLES) + case TargetHint::OPENGLES: + tensor_allocate(*_tensor); + break; +#endif + case TargetHint::NEON: + tensor_allocate(*_tensor); + break; + default: + break; + } +} +void Tensor::map(bool blocking) { +#ifdef USE_OPENCL + if (_target == TargetHint::OPENCL) + dynamic_cast(tensor())->map(blocking); +#elif defined(USE_OPENGLES) + if (_target == TargetHint::OPENGLES) + dynamic_cast(tensor())->map(blocking); +#endif +} +void Tensor::unmap() { +#ifdef USE_OPENCL + if (_target == TargetHint::OPENCL) + dynamic_cast(tensor())->unmap(); +#elif defined(USE_OPENGLES) + if (_target == TargetHint::OPENGLES) + dynamic_cast(tensor())->unmap(); +#endif +} + +template +std::unique_ptr initialise_subtensor( + arm_compute::ITensor *parent, arm_compute::TensorShape shape, + arm_compute::Coordinates coords) { + auto ptensor = dynamic_cast(parent); + auto subtensor = cpp14::make_unique(ptensor, shape, coords); + return std::move(subtensor); +} +SubTensor::SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape, + arm_compute::Coordinates &coords) noexcept + : _target(TargetHint::DONT_CARE), + _tensor_shape(tensor_shape), + _coords(coords), + _parent(nullptr), + _subtensor(nullptr) { + _parent = parent->tensor(); + _target = parent->target(); + + instantiate_subtensor(); +} +arm_compute::ITensor *SubTensor::set_target(TargetHint target) { + return (target == _target) ? 
_subtensor.get() : nullptr; +} + +arm_compute::ITensor *SubTensor::tensor() { return _subtensor.get(); } + +const arm_compute::ITensor *SubTensor::tensor() const { + return _subtensor.get(); +} + +TargetHint SubTensor::target() const { return _target; } + +void SubTensor::allocate() { + // NOP for sub-tensors +} + +void SubTensor::instantiate_subtensor() { + switch (_target) { +#ifdef USE_OPENCL + case TargetHint::OPENCL: + _subtensor = initialise_subtensor( + _parent, _tensor_shape, _coords); + break; +#endif + default: + case TargetHint::NEON: + _subtensor = + initialise_subtensor( + _parent, _tensor_shape, _coords); + break; + } +} + +#endif + +} // namespace acl +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/mali/acl_tensor.h b/src/operators/kernel/mali/acl_tensor.h new file mode 100755 index 0000000000000000000000000000000000000000..1d4f59371e355ddd2e89a709eec0b5451c1c3502 --- /dev/null +++ b/src/operators/kernel/mali/acl_tensor.h @@ -0,0 +1,128 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef ACL_TENSOR_H_ +#define ACL_TENSOR_H_ + +#ifdef USE_ACL +#ifdef USE_OPENCL +#include "arm_compute/runtime/CL/CLSubTensor.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#elif defined(USE_OPENGLES) +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#endif +#include "arm_compute/runtime/SubTensor.h" +#include "arm_compute/runtime/Tensor.h" + +#include + +namespace paddle_mobile { +namespace operators { +namespace acl { +enum class TargetHint { + DONT_CARE, + OPENCL, + OPENGLES, + NEON, +}; + +enum class ConvolutionMethodHint { + GEMM, + DIRECT, +}; +namespace cpp14 { +template +struct _Unique_if { + typedef std::unique_ptr _Single_object; +}; + +template +struct _Unique_if { + typedef std::unique_ptr _Unknown_bound; +}; + +template +struct _Unique_if { + typedef void _Known_bound; +}; + +template +typename _Unique_if::_Single_object make_unique(Args &&... 
args) { + return std::unique_ptr(new T(std::forward(args)...)); +} + +template +typename _Unique_if::_Unknown_bound make_unique(size_t n) { + typedef typename std::remove_extent::type U; + return std::unique_ptr(new U[n]()); +} + +template +typename _Unique_if::_Known_bound make_unique(Args &&...); +} // namespace cpp14 + +class Tensor { + public: + explicit Tensor(arm_compute::TensorInfo &info) noexcept; + virtual ~Tensor() {} + Tensor(Tensor &&src) noexcept; + void set_info(arm_compute::TensorInfo &&info) { _info = info; } + arm_compute::ITensor *set_target(TargetHint target); + const arm_compute::TensorInfo &info() const { return _info; } + arm_compute::ITensor *tensor() { return _tensor.get(); } + void allocate(); + void init() {} + TargetHint target() const { return _target; } + virtual void map(bool blocking = true); + virtual void unmap(); + + private: + TargetHint _target; + arm_compute::TensorInfo _info; + std::unique_ptr _tensor; +}; + +class SubTensor { + public: + SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape, + arm_compute::Coordinates &coords) noexcept; + ~SubTensor() {} + arm_compute::ITensor *tensor(); + const arm_compute::ITensor *tensor() const; + TargetHint target() const; + void allocate(); + arm_compute::ITensor *set_target(TargetHint target); + + private: + /** Instantiates a sub-tensor */ + void instantiate_subtensor(); + + private: + /**< Target that this tensor is pinned on */ + TargetHint _target; + /**< SubTensor shape */ + arm_compute::TensorShape _tensor_shape; + /**< SubTensor Coordinates */ + arm_compute::Coordinates _coords; + /**< Parent tensor */ + arm_compute::ITensor *_parent; + /**< SubTensor */ + std::unique_ptr _subtensor; +}; + +} // namespace acl +} // namespace operators +} // namespace paddle_mobile +#endif +#endif // ACL_TENSOR_H_ diff --git a/src/operators/kernel/mali/batchnorm_kernel.cpp b/src/operators/kernel/mali/batchnorm_kernel.cpp old mode 100644 new mode 100755 index 5ad6d6f015c9d8ec095f8269642dd72f4d0a56a1..ad648d615cd8f9134b212d484d7174c95e027551 --- a/src/operators/kernel/mali/batchnorm_kernel.cpp +++ b/src/operators/kernel/mali/batchnorm_kernel.cpp @@ -14,18 +14,151 @@ limitations under the License. 
*/

 #ifdef BATCHNORM_OP

-#pragma once
-
 #include "operators/kernel/batchnorm_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"

 namespace paddle_mobile {
 namespace operators {

+template <typename DeviceType, typename T>
+class AclBatchNormOp : public acl::ACLOperator {
+ public:
+  AclBatchNormOp() {
+    this->force_bypass_acl_path_ = bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN;
+  }
+  ~AclBatchNormOp() = default;
+  AclBatchNormOp(const AclBatchNormOp&) = delete;
+  AclBatchNormOp& operator=(const AclBatchNormOp&) = delete;
+  AclBatchNormOp(AclBatchNormOp&&) = delete;
+  AclBatchNormOp& operator=(AclBatchNormOp&&) = delete;
+
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const BatchNormParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth, args.batch);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.out_num);
+
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+
+    arm_compute::TensorShape mean_shape(args.in_depth);
+    arm_compute::TensorShape var_shape = mean_shape;
+    arm_compute::TensorShape beta_shape = mean_shape;
+    arm_compute::TensorShape gamma_shape = mean_shape;
+
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+
+    new_tensor(mean(), mean_shape, args.mean_data);
+    new_tensor(var(), var_shape, args.var_data);
+    new_tensor(beta(), beta_shape, args.biases_data);
+    new_tensor(gamma(), gamma_shape, args.weight_data);
+
+    acl_configure(bn, this, args.epsilon);
+  }
+
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const BatchNormParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    InitAclLayer(param);
+    // bypass ACL when it is disabled for this layer
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+
+    return bypass_acl;
+  }
+
+ private:
+  void AclParametersByContext(const BatchNormParam& param) {
+    const Tensor* in_x = param.InputX();
+    Tensor* out = param.OutputY();
+    const Tensor* scale = param.InputScale();
+    const Tensor* bias = param.InputBias();
+    const Tensor* saved_mean = param.InputMean();
+    const Tensor* saved_variance = param.InputVariance();
+
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    const T* weight_data = scale->data<T>();
+    const T* bias_data = bias->data<T>();
+    const T* mean_data = saved_mean->data<T>();
+    const T* var_data = saved_variance->data<T>();
+
+    float epsilon = param.Epsilon();
+
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    // args.weight_data = (void*)weight_data;
+    // args.biases_data = (void*)bias_data;
+    args.mean_data = (void*)mean_data;
+    args.var_data = (void*)var_data;
+    args.epsilon = epsilon;
+
+    args.dim = in_x->dims().size();
+
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.in_rows = in_x->dims()[2];
+    args.in_cols = in_x->dims()[3];
+
+    args.out_num = out->dims()[0];
+    args.out_depth = out->dims()[1];
+    args.out_rows = out->dims()[2];
+    args.out_cols = out->dims()[3];
+
+    args.weight_data = (void*)weight_data;
+    args.biases_data = (void*)bias_data;
+
+    // std::cout
+    //     << "Out C: " << args.out_depth
+    //     << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+  }
acl::AclParameters args; +}; + +template <> +bool BatchNormKernel::Init(BatchNormParam* param) { + AclBatchNormOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclBatchNormOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } + return true; +} + template <> void BatchNormKernel::Compute( - const BatchNormParam ¶m) const {} + const BatchNormParam& param) const { + std::cout << "init acl" << std::endl; + AclBatchNormOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + acl::AclParameters& args = acl_op->getargs(); + acl_op->RunAcl(args.input_data, args.output_data); +} +template class BatchNormKernel; } // namespace operators } // namespace paddle_mobile #endif +#endif diff --git a/src/operators/kernel/mali/concat_kernel.cpp b/src/operators/kernel/mali/concat_kernel.cpp new file mode 100755 index 0000000000000000000000000000000000000000..aaa586b6d977bfca96e596261ec090637cf87207 --- /dev/null +++ b/src/operators/kernel/mali/concat_kernel.cpp @@ -0,0 +1,136 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CONCAT_OP + +#include "operators/kernel/concat_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclConcatOp : public acl::ACLOperator { + public: + AclConcatOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT; + } + ~AclConcatOp() = default; + AclConcatOp(const AclConcatOp&) = delete; + AclConcatOp& operator=(const AclConcatOp&) = delete; + AclConcatOp(AclConcatOp&&) = delete; + AclConcatOp& operator=(AclConcatOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + + void InitAclLayer(const ConcatParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + const std::vector* input_data = &args.in_tensor; + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth, args.batch); + + if (is_operator_init_done(output_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + T type; + + for (int i = 0; i < input_data->size(); i++) { + int in_batch = (*input_data)[i]->dims()[0]; + int in_channels = (*input_data)[i]->dims()[1]; + int in_width = (*input_data)[i]->dims()[2]; + int in_height = (*input_data)[i]->dims()[3]; + arm_compute::TensorShape in_shape(in_width, in_height, in_channels); + + new_tensor(cinput(i), in_shape, + acl::InputdataPtr(this, args.in_tensor, type, i)); + } + + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + acl_configure(concat, this, input_data->size()); + } + + void RunAcl(const std::vector& input, void* output) { + T type; + acl::acl_run(this, input, output, type); + } + bool Bypass_acl(const ConcatParam& param) { + bool bypass_acl = 
false; + AclParametersByContext(param); + InitAclLayer(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_ || !args.is_channel_concat) { + bypass_acl = true; + } + return bypass_acl; + } + + private: + void AclParametersByContext(const ConcatParam& param) { + auto inputs = param.Inputs(); + auto* output = param.Out(); + int64_t axis = param.Axis(); + + T* output_data = output->mutable_data(); + + args.is_channel_concat = (axis == 1); + args.in_tensor = inputs; + args.output_data = (void*)output_data; + + args.batch = output->dims()[0]; + args.out_depth = output->dims()[1]; + args.out_rows = output->dims()[2]; + args.out_cols = output->dims()[3]; + } + acl::AclParameters args; +}; + +template <> +bool ConcatKernel::Init(ConcatParam* param) { + AclConcatOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclConcatOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } + return true; +} + +template <> +void ConcatKernel::Compute(const ConcatParam& param) const { + std::cout << "init acl" << std::endl; + AclConcatOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + acl::AclParameters& args = acl_op->getargs(); + acl_op->RunAcl(args.in_tensor, args.output_data); +} + +template class ConcatKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/conv_add_kernel.cpp b/src/operators/kernel/mali/conv_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..318db016d12f36981c07627139bcc49d07162d52 --- /dev/null +++ b/src/operators/kernel/mali/conv_add_kernel.cpp @@ -0,0 +1,232 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVADD_OP + +#include "operators/kernel/conv_add_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclConvAddOp : public acl::ACLOperator { + public: + AclConvAddOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV; + } + ~AclConvAddOp() = default; + AclConvAddOp(const AclConvAddOp&) = delete; + AclConvAddOp& operator=(const AclConvAddOp&) = delete; + AclConvAddOp(AclConvAddOp&&) = delete; + AclConvAddOp& operator=(AclConvAddOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const FusionConvAddParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, + args.in_depth, args.batch); + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth, args.out_num); + arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows, + args.in_depth / args.num_group, + args.out_depth); + arm_compute::TensorShape biases_shape(args.out_depth); + arm_compute::PadStrideInfo conv_info( + args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows, + arm_compute::DimensionRoundingType::FLOOR); + + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + // check_direct_conv(); + group() = args.num_group; + //[kernel_x, kernel_y, IFM, OFM] + new_tensor(weights(), weights_shape, args.weight_data); + //[OFM] + if (args.biases_data) { + new_tensor(biases(), biases_shape, args.biases_data); + } + + //[width, height, IFM] + new_tensor(input(), input_shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + acl_configure(conv, this, conv_info); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const FusionConvAddParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + InitAclLayer(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_ || args.num_group >= 5) { + bypass_acl = true; + } + if (args.dim > 2) { + bypass_acl = true; + } + if (args.dilated) { + bypass_acl = true; + } + return bypass_acl; + } + + private: + void check_direct_conv() { + bool use_direct_conv = false; + const char* pDirectConv; + pDirectConv = getenv("DIRECTCONV"); + if (pDirectConv) { + unsigned int bdirectconv; + sscanf(pDirectConv, "%i", &bdirectconv); + if (bdirectconv != use_direct_conv) { + use_direct_conv = bdirectconv; + printf("DIRECTCONV<%s>\n", pDirectConv); + printf("DIRECTCONV: %x\n", use_direct_conv); + } + } + int pad_data[2], kernel[2]; + pad_data[1] = args.pad_rows; + pad_data[0] = args.pad_cols; + kernel[1] = args.filter_rows; + kernel[0] = args.filter_cols; + if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 && + pad_data[0] == 0 && pad_data[1] == 0) || + (kernel[0] == 3 && kernel[1] == 3 && + pad_data[0] <= 1 && pad_data[1] <= 1))) { + setConvMethod(); // NEDirectConvolutionLayer only for 1x1 and 3x3 + } + } + + void AclParametersByContext(const FusionConvAddParam& param) { + const Tensor* input = param.Input(); + Tensor filter = *param.Filter(); + Tensor* output = param.Output(); + Tensor* bias; + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector 
dilations = param.Dilations(); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(); + const T* weight_data = filter.data(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + args.weight_data = (void*)weight_data; + args.biases_data = nullptr; + + try { + bias = param.Bias(); + } catch (const std::exception& e) { + } + if (bias) { + const T* biases_data = bias->data(); + args.biases_data = (void*)biases_data; + } + + args.num_group = groups; + + args.dilation_rows = dilations[0]; + args.dilation_cols = dilations[1]; + if (dilations[0] != 1 || dilations[1] != 1) { + args.dilated = true; + } + + // NCHW + // std::cout << "In dims: " << (input->dims()).size() << std::endl; + args.batch = input->dims()[0]; + args.in_depth = input->dims()[1]; + args.in_rows = input->dims()[2]; + args.in_cols = input->dims()[3]; + // std::cout <<"In N: " << args.batch << " C: " << args.in_depth + // << " H: " << args.in_rows << " W: " << args.in_cols << "\n"; + // NCHW + // std::cout << "Out dims: " << (output->dims()).size() << std::endl; + args.out_num = output->dims()[0]; + args.out_depth = output->dims()[1]; + args.out_rows = output->dims()[2]; + args.out_cols = output->dims()[3]; + // std::cout <<"Out N: " << static_cast(output->dims()[0]) + // << " C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + // MCHW = OIHW + args.filter_rows = filter.dims()[2]; + args.filter_cols = filter.dims()[3]; + // std::cout <<"Filter O: " << static_cast(filter.dims()[0]) + // << " I: " << static_cast(filter.dims()[1]) + // << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n"; + + // strides(h_stride, w_stride) + args.stride_rows = strides[0]; + args.stride_cols = strides[1]; + // std::cout <<"Stride H: " << args.stride_rows << " W: " << + // args.stride_cols << "\n"; + + // paddings(h_pad, w_pad) + args.pad_rows = paddings[0]; + args.pad_cols = paddings[1]; + // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols << + // "\n"; + } + acl::AclParameters args; +}; + +template <> +bool ConvAddKernel::Init(FusionConvAddParam* param) { + AclConvAddOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclConvAddOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } + return true; +} + +template <> +void ConvAddKernel::Compute( + const FusionConvAddParam& param) const { + std::cout << "init acl" << std::endl; + AclConvAddOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + acl::AclParameters& args = acl_op->getargs(); + + acl_op->RunAcl(args.input_data, args.output_data); +} + +template class ConvAddKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/conv_kernel.cpp b/src/operators/kernel/mali/conv_kernel.cpp old mode 100644 new mode 100755 index c6bb6306b714fe511695e1b47a5e63c611eca465..c548977ebaa34fabc1b1fe54d6db9690bcb424f1 --- a/src/operators/kernel/mali/conv_kernel.cpp +++ b/src/operators/kernel/mali/conv_kernel.cpp @@ -15,15 +15,211 @@ limitations under the License. 
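The conv kernel whose hunk follows repeats the contract shared by every ACL-backed kernel in this patch: Init() builds an acl::ACLOperator subclass once and caches it on the kernel, Bypass_acl() vetoes the GPU path for shapes ACL handles poorly (many groups, dilation, dim > 2), and Compute() merely replays the already-configured operator. A compilable schematic of that lifecycle, with hypothetical names (FakeAclOp, KernelInit) standing in for the real ACL types:

    // FakeAclOp stands in for an acl::ACLOperator subclass.
    struct FakeAclOp {
      bool bypass = false;                 // true => fall back to the CPU kernel
      void configure(bool bad_shape) {     // stands in for InitAclLayer()
        bypass = bad_shape;                // e.g. groups >= 5 or dilated conv
      }
      void run(const float *, float *) {}  // stands in for acl_run()
    };

    // Mirrors Kernel::Init(): construct once, cache, report usability.
    bool KernelInit(FakeAclOp **cached, bool bad_shape) {
      if (*cached == nullptr) *cached = new FakeAclOp();
      (*cached)->configure(bad_shape);
      return !(*cached)->bypass;
    }

    int main() {
      FakeAclOp *op = nullptr;             // cached across Compute() calls
      if (KernelInit(&op, /*bad_shape=*/false)) op->run(nullptr, nullptr);
      delete op;
      return 0;
    }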
*/ #ifdef CONV_OP #include "operators/kernel/conv_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { +template +class AclConvOp : public acl::ACLOperator { + public: + AclConvOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV; + } + ~AclConvOp() = default; + AclConvOp(const AclConvOp&) = delete; + AclConvOp& operator=(const AclConvOp&) = delete; + AclConvOp(AclConvOp&&) = delete; + AclConvOp& operator=(AclConvOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const ConvParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, + args.in_depth, args.batch); + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth, args.out_num); + arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows, + args.in_depth / args.num_group, + args.out_depth); + // arm_compute::TensorShape biases_shape(args.out_depth); + arm_compute::PadStrideInfo conv_info( + args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows, + arm_compute::DimensionRoundingType::FLOOR); + + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + check_direct_conv(); + //[kernel_x, kernel_y, IFM, OFM] + new_tensor(weights(), weights_shape, args.weight_data); + //[OFM] + // if (args.biases_data) { + // new_tensor(biases(),biases_shape,args.biases_data); + //} + + group() = args.num_group; + + //[width, height, IFM] + new_tensor(input(), input_shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + acl_configure(conv, this, conv_info); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const ConvParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + InitAclLayer(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_ || args.num_group >= 5) { + bypass_acl = true; + } + if (args.dim > 2) { + bypass_acl = true; + } + if (args.dilated) { + bypass_acl = true; + } + return bypass_acl; + } + + private: + void check_direct_conv() { + bool use_direct_conv = false; + const char* pDirectConv; + pDirectConv = getenv("DIRECTCONV"); + if (pDirectConv) { + unsigned int bdirectconv; + sscanf(pDirectConv, "%i", &bdirectconv); + if (bdirectconv != use_direct_conv) { + use_direct_conv = bdirectconv; + printf("DIRECTCONV<%s>\n", pDirectConv); + printf("DIRECTCONV: %x\n", use_direct_conv); + } + } + int pad_data[2], kernel[2]; + pad_data[1] = args.pad_rows; + pad_data[0] = args.pad_cols; + kernel[1] = args.filter_rows; + kernel[0] = args.filter_cols; + if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 && + pad_data[0] == 0 && pad_data[1] == 0) || + (kernel[0] == 3 && kernel[1] == 3 && + pad_data[0] <= 1 && pad_data[1] <= 1))) { + setConvMethod(); // NEDirectConvolutionLayer only for 1x1 and 3x3 + } + } + + void AclParametersByContext(const ConvParam& param) { + const Tensor* input = param.Input(); + Tensor filter = *param.Filter(); + Tensor* output = param.Output(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const T* input_data = input->data(); + T* output_data = 
output->mutable_data(); + const T* weight_data = filter.data(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + args.weight_data = (void*)weight_data; + args.biases_data = nullptr; + + // try { + // bias = context.Input("Bias"); + // } catch (const std::exception& e) { + // } + // if (bias) { + // const T* biases_data = bias->data(); + // args.biases_data = (void*)biases_data; + // } + + args.num_group = groups; + + args.dilation_rows = dilations[0]; + args.dilation_cols = dilations[1]; + if (dilations[0] != 1 || dilations[1] != 1) { + args.dilated = true; + } + + // NCHW + // std::cout << "In dims: " << (input->dims()).size() << std::endl; + args.batch = input->dims()[0]; + args.in_depth = input->dims()[1]; + args.in_rows = input->dims()[2]; + args.in_cols = input->dims()[3]; + std::cout << "In N: " << args.batch << " C: " << args.in_depth + << " H: " << args.in_rows << " W: " << args.in_cols << "\n"; + // NCHW + // std::cout << "Out dims: " << (output->dims()).size() << std::endl; + args.out_num = output->dims()[0]; + args.out_depth = output->dims()[1]; + args.out_rows = output->dims()[2]; + args.out_cols = output->dims()[3]; + // std::cout <<"Out N: " << static_cast(output->dims()[0]) + // << " C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + // MCHW = OIHW + args.filter_rows = filter.dims()[2]; + args.filter_cols = filter.dims()[3]; + // std::cout <<"Filter O: " << static_cast(filter.dims()[0]) + // << " I: " << static_cast(filter.dims()[1]) + // << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n"; + + // strides(h_stride, w_stride) + args.stride_rows = strides[0]; + args.stride_cols = strides[1]; + // std::cout <<"Stride H: " << args.stride_rows << " W: " << + // args.stride_cols << "\n"; + + // paddings(h_pad, w_pad) + args.pad_rows = paddings[0]; + args.pad_cols = paddings[1]; + // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols << + // "\n"; + } + acl::AclParameters args; +}; + template <> -void ConvKernel::Compute(const ConvParam ¶m) const { - // ArmConvImplement imp; - // imp.Compute(param); - param.Output()->mutable_data()[0] = 100.0; +bool ConvKernel::Init(ConvParam* param) { + AclConvOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclConvOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } + return true; +} + +template <> +void ConvKernel::Compute(const ConvParam& param) const { + std::cout << "init acl" << std::endl; + AclConvOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + acl::AclParameters& args = acl_op->getargs(); + acl_op->RunAcl(args.input_data, args.output_data); } template class ConvKernel; @@ -31,3 +227,4 @@ template class ConvKernel; } // namespace paddle_mobile #endif +#endif diff --git a/src/operators/kernel/mali/elementwise_add_kernel.cpp b/src/operators/kernel/mali/elementwise_add_kernel.cpp new file mode 100755 index 0000000000000000000000000000000000000000..9748bbbb5454f10ad9ea83e37d599fb1c6cdb53e --- /dev/null +++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp @@ -0,0 +1,51 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
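The elementwise-add kernel introduced below never touches ACL: it reuses the generic ElementwiseComputeEx path with an AddFunctor, broadcasting the second operand along `axis` of the first. A scalar sketch of that broadcast rule for the common NCHW, axis == 1 case — elementwise_add_axis1 is illustrative, not the project's function:

    #include <cstddef>

    // x has shape [n, c, h*w]; y has shape [c] and is broadcast over n and h*w.
    void elementwise_add_axis1(const float *x, const float *y, float *out,
                               size_t n, size_t c, size_t hw) {
      for (size_t i = 0; i < n; ++i)
        for (size_t j = 0; j < c; ++j)
          for (size_t k = 0; k < hw; ++k) {
            const size_t idx = (i * c + j) * hw + k;
            out[idx] = x[idx] + y[j];
          }
    }

    int main() {
      const float x[4] = {1, 2, 3, 4}, y[2] = {10, 20};  // n=1, c=2, hw=2
      float out[4];
      elementwise_add_axis1(x, y, out, 1, 2, 2);          // {11, 12, 23, 24}
      return 0;
    }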
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ELEMENTWISEADD_OP + +#pragma once + +#include "operators/kernel/elementwise_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <typename T> +struct AddFunctor { + inline T operator()(T a, T b) const { return a + b; } +}; + +template <> +bool ElementwiseAddKernel<GPU_MALI, float>::Init(ElementwiseAddParam *param) { + return true; +} + +template <> +void ElementwiseAddKernel<GPU_MALI, float>::Compute( + const ElementwiseAddParam &param) const { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + Tensor *Out = param.Out(); + Out->mutable_data<float>(); + int axis = param.Axis(); + ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis, + AddFunctor<float>(), Out); +} + +template class ElementwiseAddKernel<GPU_MALI, float>; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp new file mode 100755 index 0000000000000000000000000000000000000000..44a7ce2af62a1d27aff8181f6742bebda1d6d066 --- /dev/null +++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_FC_OP + +#include "operators/kernel/fusion_fc_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FusionFcKernel<GPU_MALI, float>::Init(FusionFcParam *param) { + return true; +} + +template <> +void FusionFcKernel<GPU_MALI, float>::Compute( + const FusionFcParam &param) const { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + const Tensor *input_z = param.InputZ(); + auto *input_z_data = input_z->data<float>(); + int axis = param.Axis(); + Tensor *out = param.Out(); + auto *out_data = out->mutable_data<float>(); + const Tensor x_matrix = + input_x->dims().size() > 2 + ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) + : *input_x; + const Tensor y_matrix = + input_y->dims().size() > 2 + ? framework::ReshapeToMatrix(*input_y, param.YNumColDims()) + : *input_y; + auto out_dim = out->dims(); + if (out_dim.size() != 2) { + out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); + PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "input_z size must be 1"); + PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0], + " out_dim[1] must equal input_z dims[0]."); + axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis); + PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ") + + int64_t classes = input_z->numel(); + for (int i = 0; i < out_dim[0]; i++) { + memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes); + } + + for (int i = 0; i < out->numel(); i++) { + DLOG << out_data[i]; + } + math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1), + out, static_cast<float>(1)); + PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); + // if (out_dim.size() != 2) { + // out->Resize(out_dim); + // } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/lrn_kernel.cpp b/src/operators/kernel/mali/lrn_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4fb5fca8695dccc45c7169d8572618965b3d84a3 --- /dev/null +++ b/src/operators/kernel/mali/lrn_kernel.cpp @@ -0,0 +1,157 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef LRN_OP + +#pragma once + +#include "operators/kernel/lrn_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/kernel/central-arm-func/lrn_arm_func.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <typename DeviceType, typename T> +class AclLrnOp : public acl::ACLOperator { + public: + AclLrnOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN; + } + ~AclLrnOp() = default; + AclLrnOp(const AclLrnOp&) = delete; + AclLrnOp& operator=(const AclLrnOp&) = delete; + AclLrnOp(AclLrnOp&&) = delete; + AclLrnOp& operator=(AclLrnOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const LrnParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape shape(args.in_cols, args.in_rows, args.in_depth); + + if (is_operator_init_done(shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + arm_compute::NormalizationLayerInfo norm_info( + arm_compute::NormType::CROSS_MAP, args.nsize, args.alpha, args.beta, + args.knorm); + + //[width, height, IFM] + new_tensor(input(), shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), shape, args.output_data); + + acl_configure(lrn, this, norm_info); + } + + void Set_bypass(bool bypass) { args.is_bypass = bypass; } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const LrnParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + InitAclLayer(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_) { + bypass_acl = true; + } + + return bypass_acl; + } + + private: + void AclParametersByContext(const LrnParam& param) { + const Tensor* in_x = param.InputX(); + Tensor* out = param.Out(); + + int n = param.N(); + T alpha = param.Alpha(); + T beta = param.Beta(); + T k = param.K(); + + const T* input_data = in_x->data<T>(); + T* output_data = out->mutable_data<T>(); + + args.input_data = (void*)input_data; + args.output_data
= (void*)output_data; + + args.nsize = n; + args.alpha = alpha; + args.beta = beta; + args.knorm = k; + + // NCHW + args.batch = in_x->dims()[0]; + args.in_depth = in_x->dims()[1]; + args.in_rows = in_x->dims()[2]; + args.in_cols = in_x->dims()[3]; + // std::cout + // << "Out C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + } + acl::AclParameters args; +}; + +template <> +bool LrnKernel::Init(LrnParam* param) { + AclLrnOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclLrnOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + if (acl_op->Bypass_acl(*param)) { + acl_op->Set_bypass(true); + std::cout << "init acl failed" << std::endl; + return true; + } + return true; +} + +template <> +void LrnKernel::Compute(const LrnParam& param) const { + std::cout << "init acl" << std::endl; + AclLrnOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + acl::AclParameters& args = acl_op->getargs(); + if (args.is_bypass) { + std::cout << "bypass op" << std::endl; + LrnCompute(param); + return; + } + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + for (int n = 0; n < args.batch; ++n) { + acl_op->RunAcl((void*)input_data, (void*)output_data); + input_data += args.in_depth * args.in_cols * args.in_rows; + output_data += args.in_depth * args.in_cols * args.in_rows; + } +} + +template class LrnKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/mul_kernel.cpp b/src/operators/kernel/mali/mul_kernel.cpp new file mode 100755 index 0000000000000000000000000000000000000000..3a9ec4ebb319d9e521240ad987a49549c22c1ff2 --- /dev/null +++ b/src/operators/kernel/mali/mul_kernel.cpp @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef MUL_OP + +#pragma once + +#include "operators/kernel/mul_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool MulKernel::Init(MulParam *param) { + return true; +} + +template <> +void MulKernel::Compute(const MulParam ¶m) const { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + Tensor *out = param.Out(); + out->mutable_data(); + const Tensor x_matrix = + input_x->dims().size() > 2 + ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) + : *input_x; + const Tensor y_matrix = + input_y->dims().size() > 2 + ? 
framework::ReshapeToMatrix(*input_y, param.YNumColDims()) + : *input_y; + auto out_dim = out->dims(); + if (out_dim.size() != 2) { + out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + math::matmul(x_matrix, false, y_matrix, false, static_cast(1), + out, static_cast(0)); + if (out_dim.size() != 2) { + out->Resize(out_dim); + } +} + +template class MulKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/pool_kernel.cpp b/src/operators/kernel/mali/pool_kernel.cpp new file mode 100755 index 0000000000000000000000000000000000000000..1f49391341d69a0690352c69c9c208550f8e1c24 --- /dev/null +++ b/src/operators/kernel/mali/pool_kernel.cpp @@ -0,0 +1,220 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef POOL_OP + +#pragma once + +#include "operators/kernel/pool_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclPoolOp : public acl::ACLOperator { + public: + AclPoolOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING; + } + ~AclPoolOp() = default; + AclPoolOp(const AclPoolOp&) = delete; + AclPoolOp& operator=(const AclPoolOp&) = delete; + AclPoolOp(AclPoolOp&&) = delete; + AclPoolOp& operator=(AclPoolOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const PoolParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, + args.in_depth); + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth); + // arm_compute::TensorShape weights_shape( + // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth); + // arm_compute::TensorShape biases_shape(args.out_depth); + arm_compute::PoolingLayerInfo pool_info; + + if (args.pool_type == "max") { + pool_info = arm_compute::PoolingLayerInfo( + arm_compute::PoolingType::MAX, args.filter_rows, + arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows, + args.pad_cols, args.pad_rows, + arm_compute::DimensionRoundingType::CEIL)); + } else { + pool_info = arm_compute::PoolingLayerInfo( + arm_compute::PoolingType::AVG, args.filter_rows, + arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows, + args.pad_cols, args.pad_rows, + arm_compute::DimensionRoundingType::CEIL)); + } + + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + //[width, height, IFM] + new_tensor(input(), input_shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + acl_configure(pooling, this, pool_info); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const PoolParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + 
InitAclLayer(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_) { + bypass_acl = true; + } + if (args.pool_type != "max" && args.pool_type != "avg") { + bypass_acl = true; + } + if (args.filter_rows != args.filter_cols) { + bypass_acl = true; + } + // if (args.filter_rows!=2 && args.filter_rows!=3) { + // bypass_acl = true; + // } + return bypass_acl; + } + + private: + void AclParametersByContext(const PoolParam& param) { + const Tensor* in_x = param.Input(); + Tensor* out = param.Output(); + std::string pooling_type = param.PoolingType(); + + std::vector ksize = param.Ksize(); + + std::vector strides = param.Strides(); + + std::vector paddings = param.Paddings(); + + bool is_global_pooling = param.isGlobalPooling(); + + const T* input_data = in_x->data(); + T* output_data = out->mutable_data(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + + args.is_global_pool = is_global_pooling; + args.pool_type = pooling_type; + + args.filter_rows = ksize[0]; + args.filter_cols = ksize[1]; + args.dim = ksize.size(); + + // NCHW + args.batch = in_x->dims()[0]; + args.in_depth = in_x->dims()[1]; + args.in_rows = in_x->dims()[2]; + args.in_cols = in_x->dims()[3]; + // std::cout <<"In N: " << args.batch << " C: " << args.in_depth + // << " H: " << args.in_rows << " W: " << args.in_cols << "\n"; + // NCHW + // std::cout <<"Out N: " << static_cast(output->dims()[0]) + // << " C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + // MCHW = OIHW + // std::cout <<"Filter O: " << static_cast(filter->dims()[0]) + // << " I: " << static_cast(filter->dims()[1]) + // << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n"; + + // strides(h_stride, w_stride) + args.stride_rows = strides[0]; + args.stride_cols = strides[1]; + // std::cout <<"PoolingType: " << args.pool_type << "\n"; + // std::cout <<"Stride H: " << args.stride_rows << " W: " << + // args.stride_cols << "\n"; + + // paddings(h_pad, w_pad) + args.pad_rows = paddings[0]; + args.pad_cols = paddings[1]; + // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols << + // "\n"; + + args.out_depth = args.in_depth; + // args.out_rows = out->dims()[2]; + // args.out_cols = out->dims()[3]; + args.out_rows = static_cast(ceil(static_cast(args.in_rows + + 2 * args.pad_rows - + args.filter_rows) / + args.stride_rows)) + + 1; + args.out_cols = static_cast(ceil(static_cast(args.in_cols + + 2 * args.pad_cols - + args.filter_cols) / + args.stride_cols)) + + 1; + + if (is_global_pooling) { + args.filter_rows = args.in_rows; + args.filter_cols = args.in_cols; + args.pad_rows = 0; + args.pad_cols = 0; + } + } + acl::AclParameters args; +}; + +template <> +bool PoolKernel::Init(PoolParam* param) { + AclPoolOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclPoolOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } + return true; +} + +template <> +void PoolKernel::Compute(const PoolParam& param) const { + std::cout << "init acl" << std::endl; + AclPoolOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + acl::AclParameters& args = acl_op->getargs(); + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + for (int n = 0; n < args.batch; ++n) { + 
acl_op->RunAcl((void*)input_data, (void*)output_data); + input_data += args.in_depth * args.in_cols * args.in_rows; + output_data += args.in_depth * args.out_cols * args.out_rows; + } +} + +template class PoolKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/relu_kernel.cpp b/src/operators/kernel/mali/relu_kernel.cpp new file mode 100755 index 0000000000000000000000000000000000000000..1a8c0f88543199e7a863cc44b5b0a6be3bc6212d --- /dev/null +++ b/src/operators/kernel/mali/relu_kernel.cpp @@ -0,0 +1,134 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RELU_OP + +#pragma once + +#include "operators/kernel/relu_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclReluOp : public acl::ACLOperator { + public: + AclReluOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU; + } + ~AclReluOp() = default; + AclReluOp(const AclReluOp&) = delete; + AclReluOp& operator=(const AclReluOp&) = delete; + AclReluOp(AclReluOp&&) = delete; + AclReluOp& operator=(AclReluOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const ReluParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, + args.in_depth, args.batch); + arm_compute::TensorShape output_shape(args.in_cols, args.in_rows, + args.in_depth, args.out_num); + // arm_compute::TensorShape weights_shape( + // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth); + // arm_compute::TensorShape biases_shape(args.out_depth); + arm_compute::ActivationLayerInfo::ActivationFunction type; + type = arm_compute::ActivationLayerInfo::ActivationFunction::RELU; + + arm_compute::ActivationLayerInfo act_info(type); + + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + //[width, height, IFM] + new_tensor(input(), input_shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + acl_configure(activation, this, act_info); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const ReluParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + InitAclLayer(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_) { + bypass_acl = true; + } + return bypass_acl; + } + + private: + void AclParametersByContext(const ReluParam& param) { + const auto* input_x = param.InputX(); + auto* out = param.Out(); + + const T* input_data = input_x->data(); + T* output_data = out->mutable_data(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + + args.batch = input_x->dims()[0]; + args.in_depth 
= input_x->dims()[1]; + args.in_rows = input_x->dims()[2]; + args.in_cols = input_x->dims()[3]; + args.out_num = out->dims()[0]; + } + acl::AclParameters args; +}; + +template <> +bool ReluKernel::Init(ReluParam* param) { + AclReluOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclReluOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } + return true; +} + +template <> +void ReluKernel::Compute(const ReluParam& param) const { + std::cout << "init acl" << std::endl; + AclReluOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + acl::AclParameters& args = acl_op->getargs(); + acl_op->RunAcl(args.input_data, args.output_data); +} + +template class ReluKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/reshape_kernel.cpp b/src/operators/kernel/mali/reshape_kernel.cpp new file mode 100755 index 0000000000000000000000000000000000000000..57837a677033590e92a307bd69a77c076c5ba805 --- /dev/null +++ b/src/operators/kernel/mali/reshape_kernel.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RESHAPE_OP + +#pragma once + +#include "operators/kernel/reshape_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ReshapeKernel::Init(ReshapeParam *param) { + return true; +} + +template <> +void ReshapeKernel::Compute(const ReshapeParam ¶m) const { + const auto *input_x = param.InputX(); + const auto &input_x_dims = input_x->dims(); + auto *out = param.Out(); + framework::DDim out_dims = out->dims(); + const auto *input_shape = param.InputShape(); + + if (input_shape) { + auto *shape_data = input_shape->data(); + framework::Tensor cpu_shape_tensor; + auto shape = + std::vector(shape_data, shape_data + input_shape->numel()); + out_dims = ValidateShape(shape, input_x->dims()); + } + + bool inplace = param.Inplace(); + out->Resize(out_dims); + if (!inplace) { + out->mutable_data(); + framework::TensorCopy(*input_x, out); + out->Resize(out_dims); + } else { + out->ShareDataWith(*input_x); + out->Resize(out_dims); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/softmax_kernel.cpp b/src/operators/kernel/mali/softmax_kernel.cpp new file mode 100755 index 0000000000000000000000000000000000000000..37d2f2b6b1656602e5acfecd3ac79733f570844d --- /dev/null +++ b/src/operators/kernel/mali/softmax_kernel.cpp @@ -0,0 +1,138 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
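The softmax kernel introduced below configures ACL on a flattened two-dimensional [in_depth, batch] view, and Compute() replays the operator once per row, advancing both pointers by in_depth. For reference, each row undergoes the usual max-subtracted softmax; a minimal standalone sketch (softmax_row is an illustrative name, not the project's function):

    #include <algorithm>
    #include <cmath>

    // Numerically stable softmax over one row of `depth` logits.
    void softmax_row(const float *in, float *out, int depth) {
      const float mx = *std::max_element(in, in + depth);  // avoid exp overflow
      float sum = 0.f;
      for (int i = 0; i < depth; ++i) sum += (out[i] = std::exp(in[i] - mx));
      for (int i = 0; i < depth; ++i) out[i] /= sum;
    }

    int main() {
      const float in[3] = {1.f, 2.f, 3.f};
      float out[3];
      softmax_row(in, out, 3);  // entries are positive and sum to 1
      return 0;
    }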
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SOFTMAX_OP + +#pragma once + +#include "operators/kernel/softmax_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclSoftmaxOp : public acl::ACLOperator { + public: + AclSoftmaxOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX; + } + ~AclSoftmaxOp() = default; + AclSoftmaxOp(const AclSoftmaxOp&) = delete; + AclSoftmaxOp& operator=(const AclSoftmaxOp&) = delete; + AclSoftmaxOp(AclSoftmaxOp&&) = delete; + AclSoftmaxOp& operator=(AclSoftmaxOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const SoftmaxParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape shape(args.in_depth, args.batch); + + if (is_operator_init_done(shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + //[width, height, IFM] + new_tensor(input(), shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), shape, args.output_data); + + acl_configure(softmax, this, NULL); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const SoftmaxParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + InitAclLayer(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_) { + bypass_acl = true; + } + + return bypass_acl; + } + + private: + void AclParametersByContext(const SoftmaxParam& param) { + const framework::Tensor* in_x = param.InputX(); + framework::Tensor* out = param.Out(); + auto x_dims = in_x->dims(); + out->Resize(x_dims); + + const T* input_data = in_x->data(); + T* output_data = out->data(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + + // NCHW + args.batch = in_x->dims()[0]; + args.in_depth = in_x->dims()[1]; + + args.out_num = out->dims()[0]; + + // std::cout + // << "Out C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + } + acl::AclParameters args; +}; + +template <> +bool SoftmaxKernel::Init(SoftmaxParam* param) { + AclSoftmaxOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclSoftmaxOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } + return true; +} + +template <> +void SoftmaxKernel::Compute(const SoftmaxParam& param) const { + std::cout << "init acl" << std::endl; + AclSoftmaxOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + acl::AclParameters& args = acl_op->getargs(); + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + + for (int n = 0; n < args.out_num; ++n) { + acl_op->RunAcl((void*)input_data, (void*)output_data); + input_data += args.in_depth; + output_data += args.in_depth; + } +} + +template class SoftmaxKernel; +} // 
namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mul_kernel.h b/src/operators/kernel/mul_kernel.h index 4ca1df1af188b4e9b95644d0796a7968f873f6f4..f7dcb738b38448fe38eb60dcbbd4a2abda7a858a 100644 --- a/src/operators/kernel/mul_kernel.h +++ b/src/operators/kernel/mul_kernel.h @@ -29,6 +29,7 @@ template class MulKernel : public framework::OpKernelBase { public: void Compute(const MulParam ¶m) const; + bool Init(MulParam *param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/multiclass_nms_kernel.h b/src/operators/kernel/multiclass_nms_kernel.h index 82bafe2685423f8014d95b8fc875554567d2094a..9bd00b874a1140373decca582f793febf0e941ec 100644 --- a/src/operators/kernel/multiclass_nms_kernel.h +++ b/src/operators/kernel/multiclass_nms_kernel.h @@ -28,6 +28,7 @@ class MultiClassNMSKernel : public framework::OpKernelBase { public: void Compute(const MultiClassNMSParam& param) const; + bool Init(MultiClassNMSParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/pool_kernel.h b/src/operators/kernel/pool_kernel.h index 2a7b0ec48edeb922d6701e6ce4a9b6a514bc58f7..fd9faa3d5a508084924e080f5c5ed7e7b454b5f2 100644 --- a/src/operators/kernel/pool_kernel.h +++ b/src/operators/kernel/pool_kernel.h @@ -17,7 +17,6 @@ limitations under the License. */ #pragma once #include "framework/operator.h" -#include "operators/math/pooling.h" #include "operators/op_param.h" namespace paddle_mobile { @@ -28,6 +27,7 @@ template class PoolKernel : public OpKernelBase { public: void Compute(const PoolParam ¶m) const override; + bool Init(PoolParam *param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/prelu_kernel.h b/src/operators/kernel/prelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..15696174377f04ad9a62366e03ded1f2cdcdee9e --- /dev/null +++ b/src/operators/kernel/prelu_kernel.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "framework/operator.h" +#include "operators/op_param.h" + +#pragma once + +namespace paddle_mobile { +namespace operators { + +template +class PReluKernel : public framework::OpKernelBase { + public: + void Compute(const PReluParam& param) const; +}; +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/prior_box_kernel.h b/src/operators/kernel/prior_box_kernel.h index 3e7c72a736ea56beb6cede1d5892675d6721163f..d169a01d7f45f7dbdcc02be0e1e71690b8550af8 100644 --- a/src/operators/kernel/prior_box_kernel.h +++ b/src/operators/kernel/prior_box_kernel.h @@ -16,8 +16,9 @@ limitations under the License. 
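The header hunks in this stretch all make the same change: each kernel class gains a `bool Init(Param *)` declaration next to its existing Compute(), giving backends a fallible one-time setup phase that runs before the first Compute() and can reject the kernel (the Mali kernels use it to construct and cache their ACL operators). Schematically — KernelSketch is a hypothetical stand-in, not the framework's real OpKernelBase:

    // Two-phase kernel interface: fallible setup, then a const hot path.
    template <typename P>
    class KernelSketch {
     public:
      virtual bool Init(P *param) = 0;                 // may allocate, may veto
      virtual void Compute(const P &param) const = 0;  // assumes Init succeeded
      virtual ~KernelSketch() = default;
    };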
*/ #pragma once +#include +#include #include - #include "framework/operator.h" #include "operators/math/transform.h" #include "operators/op_param.h" @@ -54,6 +55,7 @@ class PriorBoxKernel : public framework::OpKernelBase { public: void Compute(const PriorBoxParam& param) const; + bool Init(PriorBoxParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/relu_kernel.h b/src/operators/kernel/relu_kernel.h index 793268f35a78255f853c85d1af0d2ef0d3d328e5..64016656b20b0fdb08f1342f7853e2e727a6bb81 100644 --- a/src/operators/kernel/relu_kernel.h +++ b/src/operators/kernel/relu_kernel.h @@ -27,6 +27,7 @@ template class ReluKernel : public framework::OpKernelBase { public: void Compute(const ReluParam& param) const; + bool Init(ReluParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/reshape_kernel.h b/src/operators/kernel/reshape_kernel.h index 6b153e5fe3eba73f548fd1fc0ab9f95a5b390bf1..47eba531b9f36d83d44588d9cdfb162519c24180 100644 --- a/src/operators/kernel/reshape_kernel.h +++ b/src/operators/kernel/reshape_kernel.h @@ -71,6 +71,7 @@ template class ReshapeKernel : public framework::OpKernelBase { public: void Compute(const ReshapeParam& param) const; + bool Init(ReshapeParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/resize_kernel.h b/src/operators/kernel/resize_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f1b9e64fc00863e83caa1c44606cdb3ec9f44817 --- /dev/null +++ b/src/operators/kernel/resize_kernel.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef RESIZE_OP + +#pragma once + +#include <vector> +#include "framework/operator.h" + +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +inline framework::DDim CalOutputShape(const ResizeParam &param) { + const auto *input_x = param.InputX(); + const auto &input_x_dims = input_x->dims(); + auto *out = param.Out(); + framework::DDim out_dims = out->dims(); + const auto *input_shape = param.InputShape(); + + if (input_shape) { + auto *shape_data = input_shape->data<int>(); + framework::Tensor cpu_shape_tensor; + auto shape = + std::vector<int>(shape_data, shape_data + input_shape->numel()); + const int in_batch_size = input_x->dims()[0]; + const int in_chan_size = input_x->dims()[1]; + const int in_height = input_x->dims()[2]; + const int in_width = input_x->dims()[3]; + + int out_height = 0; + int out_width = 0; + bool is_pyramid_test = param.IsPyramidTest(); + if (is_pyramid_test == false) { + out_height = param.Height(); + out_width = param.Width(); + PADDLE_MOBILE_ENFORCE(out_height > 0, "output height is required"); + PADDLE_MOBILE_ENFORCE(out_width > 0, "output width is required"); + + } else { + float out_height_scale = param.OutHeightScale(); + float out_width_scale = param.OutWidthScale(); + PADDLE_MOBILE_ENFORCE(out_height_scale > 0, + "output height scale is required"); + PADDLE_MOBILE_ENFORCE(out_width_scale > 0, + "output width scale is required"); + + out_height = int(out_height_scale * in_height); + out_width = int(out_width_scale * in_width); + } + + out_dims = framework::make_ddim( + {in_batch_size, in_chan_size, out_height, out_width}); + } + return out_dims; +} + +template <typename DeviceType, typename T> +class ResizeKernel : public framework::OpKernelBase<DeviceType, ResizeParam> { + public: + void Compute(const ResizeParam &param) const; +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/scale_kernel.h b/src/operators/kernel/scale_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..98ac71d0bbad86f595171ad7ac5b2a1cdf5908fa --- /dev/null +++ b/src/operators/kernel/scale_kernel.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "framework/operator.h" +#include "operators/op_param.h" + +#pragma once + +namespace paddle_mobile { +namespace operators { + +template +class ScaleKernel : public framework::OpKernelBase { + public: + void Compute(const ScaleParam& param) const; +}; +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/sigmoid_kernel.h b/src/operators/kernel/sigmoid_kernel.h index e901f02096c764537f268f628ccdc379f3a503e1..fc3eb5e1bf158c541b2f00d9e57ddd4699344006 100644 --- a/src/operators/kernel/sigmoid_kernel.h +++ b/src/operators/kernel/sigmoid_kernel.h @@ -26,6 +26,7 @@ template class SigmoidKernel : public OpKernelBase { public: void Compute(const SigmoidParam& param) const override; + bool Init(SigmoidParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/slice_kernel.h b/src/operators/kernel/slice_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..fd3b8dc767076c5244509f6015c42bee87df100b --- /dev/null +++ b/src/operators/kernel/slice_kernel.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "framework/operator.h" +#include "operators/op_param.h" + +#pragma once + +namespace paddle_mobile { +namespace operators { + +template +class SliceKernel : public framework::OpKernelBase { + public: + void Compute(const SliceParam& param) const {} +}; +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/softmax_kernel.h b/src/operators/kernel/softmax_kernel.h index 2b2d753cf666a6eb58f70f2f43afbbefb3953d8b..a500d9c81cce96b0f1db6d45981ad9aa02ea7c0b 100644 --- a/src/operators/kernel/softmax_kernel.h +++ b/src/operators/kernel/softmax_kernel.h @@ -23,12 +23,11 @@ namespace paddle_mobile { namespace operators { using framework::OpKernelBase; -void simoid(Tensor *X, Tensor *Y); - template class SoftmaxKernel : public OpKernelBase { public: void Compute(const SoftmaxParam ¶m) const override; + bool Init(SoftmaxParam *param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/transpose_kernel.h b/src/operators/kernel/transpose_kernel.h index 82d73ac82cd28edbd5b6fc349748293fd00fcf45..f1a21ebbb28c2acdb905ce9f09c28f0d47e17294 100644 --- a/src/operators/kernel/transpose_kernel.h +++ b/src/operators/kernel/transpose_kernel.h @@ -29,6 +29,7 @@ class TransposeKernel : public framework::OpKernelBase { public: void Compute(const TransposeParam& param) const; + bool Init(TransposeParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/lrn_op.cpp b/src/operators/lrn_op.cpp index 2533ab19a5084513a991082f148d546cb0059657..dde9123edf3568020f933bb7375be99e40f2367b 100644 --- a/src/operators/lrn_op.cpp +++ b/src/operators/lrn_op.cpp @@ -24,16 +24,16 @@ void LrnOp::InferShape() const { auto x_dims = this->param_.InputX()->dims(); this->param_.Out()->Resize(x_dims); } -template class LrnOp; + } // namespace operators } // 
diff --git a/src/operators/lrn_op.cpp b/src/operators/lrn_op.cpp
index 2533ab19a5084513a991082f148d546cb0059657..dde9123edf3568020f933bb7375be99e40f2367b 100644
--- a/src/operators/lrn_op.cpp
+++ b/src/operators/lrn_op.cpp
@@ -24,16 +24,16 @@ void LrnOp<Dtype, T>::InferShape() const {
   auto x_dims = this->param_.InputX()->dims();
   this->param_.Out()->Resize(x_dims);
 }
-template class LrnOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(lrn);
 REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/lrn_op.h b/src/operators/lrn_op.h
index d67b9f6be741581918b09d19a8a8b26c28ceed1c..0d756a14f4d935fd59ac2bfc7c811c674b1587fe 100644
--- a/src/operators/lrn_op.h
+++ b/src/operators/lrn_op.h
@@ -46,4 +46,13 @@ class LrnOp : public framework::OperatorWithKernel<
 }  // namespace operators
 }  // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(lrn);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(lrn);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
diff --git a/src/operators/math/conv_func.h b/src/operators/math/conv_func.h
index 3d23f6c8a24be7f52e1b322e07addb47ccd8b056..d9e2da0db5c50e0b0f9b11d5584bfce8b75777cd 100644
--- a/src/operators/math/conv_func.h
+++ b/src/operators/math/conv_func.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#if __ARM_NEON
+#ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
 
@@ -49,7 +49,7 @@ inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
   auto new_ptr = bias.mutable_data<float>();
   int axis_size = dDim[axis];
 
-#if __ARM_NEON
+#ifdef __ARM_NEON
   for (int i = 0; i < outer_size; ++i) {
     int inner_num = inner_size >> 4;
     int remain = inner_size - (inner_num << 4);
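One small but deliberate change in the conv_func.h hunks: testing __ARM_NEON with #ifdef instead of #if. Both happen to work today, because the macro expands to 1 when NEON is available, but #if silently evaluates an undefined macro as 0 and is diagnosed under -Wundef, while #ifdef only asks whether the macro exists. A tiny sketch of the same guard pattern, compilable on any target:

// #if defined(...) / #ifdef is well-formed even when the macro is absent;
// a bare #if __ARM_NEON would warn under -Wundef on non-ARM builds.
#include <cstdio>

#if defined(__ARM_NEON)
static const char *kPath = "NEON path";
#else
static const char *kPath = "portable path";
#endif

int main() {
  std::printf("%s\n", kPath);  // prints which branch was compiled in
  return 0;
}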
diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c93278a661f72152debcef7066bdd751bccc5b4e
--- /dev/null
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -0,0 +1,1884 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/math/depthwise_conv_3x3.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#include <vector>
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
+                      vector<int> paddings, const Tensor *filter, Tensor *bias,
+                      Tensor *output, bool if_bias) {
+  const int batch_size = input->dims()[0];
+
+  const int input_height = input->dims()[2];
+
+  const int input_width = input->dims()[3];
+
+  const int output_channels = output->dims()[1];
+
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  const int _kernel_size = 3;
+  const int stride_height = strides[0];
+  const int stride_width = strides[1];
+  const int padding_height = paddings[0];
+  const int padding_width = paddings[1];
+  const float zero = 0;
+  const int input_channel_stride = input_height * input_width;
+  const int output_channel_stride = output_height * output_width;
+  const int filter_channel_stride = 9;
+
+  const float *input_data = input->data<float>();
+  const float *filter_data = filter->data<float>();
+  if (if_bias) {
+    math::expand_bias(*bias, 1, output->dims());
+    output->ShareDataWith(*bias);
+  }
+  float *output_data = output->mutable_data<float>();
+
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+  const int filter_batch_stride = output_channels * output_channel_stride;
+  const float *pos1, *pos2, *pos3, *filter1, *filter2, *filter3, *output_ptr;
+  int hstart, wstart, hend, wend;
+  float result;
+  for (int i = 0; i < batch_size; ++i) {
+    for (int c = 0; c < output_channels; ++c) {
+      filter1 = filter_data;
+      filter2 = filter1 + 3;
+      filter3 = filter2 + 3;
+
+      for (int ph = 0; ph < output_height; ph++) {
+        for (int pw = 0; pw < output_width; pw++) {
+          hstart = ph * stride_height - padding_height;
+          wstart = pw * stride_width - padding_width;
+          hend = min(hstart + _kernel_size, input_height + padding_height);
+          wend = min(wstart + _kernel_size, input_width + padding_width);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          hend = min(hend, input_height);
+          wend = min(wend, input_width);
+          pos1 = input_data + hstart * input_width + wstart;
+          pos2 = input_data + (hstart + 1) * input_width + wstart;
+          pos3 = input_data + (hstart + 2) * input_width + wstart;
+          output_ptr = output_data + ph * output_width + pw;
+
+          if (hend - hstart != 3 || wend - wstart != 3) {
+            // Border output: gather the valid taps into a zero-padded 3x3
+            // patch, then take the dot product in scalar code.
+            result = 0;
+            float fake_input[9] = {0};
+            if (hstart == 0 && wstart == 0) {
+              // top-left corner
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j >= 3 - hend && k >= 3 - wend) {
+                    fake_input[3 * j + k] =
+                        input_data[(j - (3 - hend)) * input_width + k -
+                                   (3 - wend)];
+                  }
+                }
+              }
+            } else if (hstart == 0 && wend == input_width) {
+              // top-right corner
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j >= 3 - hend && k <= input_width - wstart - 1) {
+                    fake_input[3 * j + k] =
+                        input_data[(j - (3 - hend)) * input_width + k + wstart];
+                  }
+                }
+              }
+
+            } else if (hend == input_height && wstart == 0) {
+              // bottom-left corner
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j <= input_height - 1 - hstart && k >= 3 - wend) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width + k - (3 - wend)];
+                  }
+                }
+              }
+            } else if (hend == input_height && wend == input_width) {
+              // bottom-right corner
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j <= input_height - hstart - 1 &&
+                      k <= input_width - wstart - 1) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width + k + wstart];
+                  }
+                }
+              }
+            } else if (hstart == 0) {
+              // top edge
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j >= 3 - hend) {
+                    fake_input[3 * j + k] =
+                        input_data[(j - (3 - hend)) * input_width + k + wstart];
+                  }
+                }
+              }
+
+            } else if (hend == input_height) {
+              // bottom edge
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j <= input_height - hstart - 1) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width + k + wstart];
+                  }
+                }
+              }
+
+            } else if (wstart == 0) {
+              // left edge
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (k >= 3 - wend) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width +
+                                   (k - (3 - wend))];
+                  }
+                }
+              }
+
+            } else if (wend == input_width) {
+              // right edge
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (k <= input_width - wstart - 1) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width + k + wstart];
+                  }
+                }
+              }
+            }
+            for (int l = 0; l < 9; ++l) {
+              result += fake_input[l] * filter1[l];
+            }
+            if (if_bias) {
+              output_data[ph * output_width + pw] += result;
+            } else {
+              output_data[ph * output_width + pw] = result;
+            }
+
+          } else {
+#if __ARM_NEON
+#if __aarch64__
+            const float32x4_t data1 = vld1q_f32(pos1);
+            const float32x4_t data2 = vld1q_f32(pos2);
+            const float32x4_t data3 = vld1q_f32(pos3);
+
+            const float32x4_t v_filter1 = vld1q_f32(filter1);
+            const float32x4_t v_filter2 = vld1q_f32(filter2);
+            const float32x4_t v_filter3 = vld1q_f32(filter3);
+            float32x4_t mula = vmulq_f32(data1, v_filter1);
+            mula = vmlaq_f32(mula, data2, v_filter2);
+            mula = vmlaq_f32(mula, data3, v_filter3);
+            // Zero the unused fourth lane, then pairwise-add twice to get
+            // the horizontal sum of the nine products.
+            float32x2_t res = vpadd_f32(
+                vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
+            res = vpadd_f32(res, res);
+            if (if_bias) {
+              output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
+            } else {
+              output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
+            }
+#else
+            asm volatile(
+
+                "vld1.32  {q1}, [%[pos1]]        \n\t"
+                "vld1.32  {q4}, [%[filter1]]     \n\t"
+                "vmov.f32 q0,    #0.0            \n\t"
+
+                "vld1.32  {q2}, [%[pos2]]        \n\t"
+                "vld1.32  {q5}, [%[filter2]]     \n\t"
+                "vmla.f32 q0, q1, q4             \n\t"
+
+                "vld1.32  {q3}, [%[pos3]]        \n\t"
+                "vld1.32  {q6}, [%[filter3]]     \n\t"
+
+                "vmla.f32 q0, q2, q5             \n\t"
+                "vmla.f32 q0, q3, q6             \n\t"
+
+                "vmov.f32 d1[1], %[zero]         \n\t"
+
+                "vadd.f32 d4, d0, d1             \n\t"
+                "vadd.f32 s10, s8, s9            \n\t"
+                "vst1.32 {d5[0]},[%[output_ptr]] \n\t"
+                :
+                : [input_data] "r"(input_data), [pos1] "r"(pos1),
+                  [pos2] "r"(pos2), [pos3] "r"(pos3), [filter1] "r"(filter1),
+                  [filter2] "r"(filter2), [filter3] "r"(filter3),
+                  [output_ptr] "r"(output_ptr), [zero] "r"(zero)
+                : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
+#endif  // __aarch64__
+#else
+
+#endif  // __ARM_NEON
+          }
+        }
+      }
+      input_data += input_channel_stride;
+      output_data += output_channel_stride;
+      filter_data += filter_channel_stride;
+    }
+    input_data += input_batch_stride;
+    output_data += output_batch_stride;
+  }
+}
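The rest of this file is several hand-scheduled variants of the same operation. As a reading aid, here is a plain scalar reference for one channel of a zero-padded depthwise 3x3 convolution. It is an editorial sketch only (DepthwiseConv3x3Ref and its signature are not part of this patch), but it computes exactly what every fast path below must reproduce, which makes it useful as a test oracle when touching the NEON code:

#include <cstdio>
#include <vector>

// Scalar reference: one channel of a depthwise 3x3 convolution with zero
// padding. in: h*w floats, k: 9 filter taps (row-major), out: oh*ow floats.
void DepthwiseConv3x3Ref(const std::vector<float> &in, int h, int w,
                         const float k[9], int stride, int pad,
                         std::vector<float> *out, int oh, int ow) {
  for (int oy = 0; oy < oh; ++oy) {
    for (int ox = 0; ox < ow; ++ox) {
      float acc = 0.f;
      for (int ky = 0; ky < 3; ++ky) {
        for (int kx = 0; kx < 3; ++kx) {
          int iy = oy * stride - pad + ky;
          int ix = ox * stride - pad + kx;
          if (iy >= 0 && iy < h && ix >= 0 && ix < w) {
            acc += in[iy * w + ix] * k[3 * ky + kx];  // taps outside are zero
          }
        }
      }
      (*out)[oy * ow + ox] = acc;
    }
  }
}

int main() {
  const int h = 4, w = 4, stride = 1, pad = 1;
  const int oh = (h + 2 * pad - 3) / stride + 1;  // = 4
  const int ow = (w + 2 * pad - 3) / stride + 1;  // = 4
  std::vector<float> in(h * w, 1.0f), out(oh * ow, 0.0f);
  const float k[9] = {0, 0, 0, 0, 1, 0, 0, 0, 0};  // identity (center tap)
  DepthwiseConv3x3Ref(in, h, w, k, stride, pad, &out, oh, ow);
  std::printf("out[0]=%.1f (expect 1.0)\n", out[0]);
  return 0;
}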
+
+void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
+                          Tensor *output, Tensor *bias, bool if_bias) {
+#if __ARM_NEON
+  const float *input_data = input->data<float>();
+  const float *filter_data = filter->data<float>();
+  float *output_data = output->data<float>();
+  const float *bias_data;
+  if (if_bias) {
+    bias_data = bias->data<float>();
+  }
+
+  const int h = static_cast<int>(input->dims()[2]);
+  const int w = static_cast<int>(input->dims()[3]);
+  const int l = h;
+
+  const int batch_size = static_cast<int>(input->dims()[0]);
+  const int c =
static_cast(input->dims()[1]); + const int hxw = h * w; + float32x4_t vbias = vdupq_n_f32(0.0); + for (int b = 0; b < batch_size; ++b) { + const float *filter_data_tmp = filter_data; + + for (int j = 0; j < c; ++j) { + if (if_bias) { + vbias = vdupq_n_f32(bias_data[j]); + } + + int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0 + float w00 = filter_data_tmp[0]; + float w01 = filter_data_tmp[1]; + float w02 = filter_data_tmp[2]; + float w10 = filter_data_tmp[3]; + float w11 = filter_data_tmp[4]; + float w12 = filter_data_tmp[5]; + float w20 = filter_data_tmp[6]; + float w21 = filter_data_tmp[7]; + float w22 = filter_data_tmp[8]; + + output_data[0] = w11 * input_data[0] + w12 * input_data[1] + + w21 * input_data[l] + w22 * input_data[l + 1]; + output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] + + w20 * input_data[2 * l - 2] + + w21 * input_data[2 * l - 1]; + output_data[(l - 1) * l] = + w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] + + w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; + output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + + w01 * input_data[(l - 2) * (l + 1) + 1] + + w10 * input_data[l * l - 2] + + w11 * input_data[l * l - 1]; + if (if_bias) { + output_data[0] += bias_data[j]; + output_data[l - 1] += bias_data[j]; + output_data[(l - 1) * l] += bias_data[j]; + output_data[l * l - 1] += bias_data[j]; + } + + for (int i = 1; i < l - 1; ++i) { + output_data[i * l] = + w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] + + w11 * input_data[i * l] + w12 * input_data[i * l + 1] + + w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1]; + + output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + + w01 * input_data[i * l + l - 1 - l] + + w10 * input_data[i * l + l - 1 - 1] + + w11 * input_data[i * l + l - 1] + + w20 * input_data[i * l + l - 1 + l - 1] + + w21 * input_data[i * l + l - 1 + l]; + if (if_bias) { + output_data[i * l] += bias_data[j]; + output_data[i * l + l - 1] += bias_data[j]; + } + } + + // top 1 row and bottom 1 row + const float *input_tmp = input_data; + + float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, + tmp3, tmp4, tmp5, out0; + in0 = vld1q_f32(input_tmp); + in2 = vld1q_f32(input_tmp + l); + const float *input_tmp_end = input_tmp + (l - 2) * l; + in4 = vld1q_f32(input_tmp_end); + in6 = vld1q_f32(input_tmp_end + l); + int c_mid = l_mid; + auto output_ptr = output_data + 1; + for (; c_mid > 3; c_mid -= 4) { + in1 = vld1q_f32(input_tmp + 4); + in3 = vld1q_f32(input_tmp + l + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vaddq_f32(out0, vbias); + + vst1q_f32(output_ptr, out0); + + in5 = vld1q_f32(input_tmp_end + 4); + in7 = vld1q_f32(input_tmp_end + l + 4); + + tmp0 = vextq_f32(in4, in5, 1); + tmp1 = vextq_f32(in4, in5, 2); + tmp2 = vextq_f32(in6, in7, 1); + tmp3 = vextq_f32(in6, in7, 2); + + out0 = vmulq_n_f32(in4, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vaddq_f32(out0, vbias); + + vst1q_f32(output_ptr + (l - 1) * l, out0); + + // can optimize to each 8 
stride. + input_tmp += 4; + input_tmp_end += 4; + output_ptr += 4; + in0 = in1; + in2 = in3; + in4 = in5; + in6 = in7; + } + + // top right pad + float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vaddq_f32(out0, vbias); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + + // bottom right pad + float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]); + float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]); + + tmp0 = vextq_f32(in4, pad2, 1); + tmp1 = vextq_f32(in4, pad2, 2); + tmp2 = vextq_f32(in6, pad3, 1); + tmp3 = vextq_f32(in6, pad3, 2); + + out0 = vmulq_n_f32(in4, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vaddq_f32(out0, vbias); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2); + } + } + // mid + + for (int i = 0; i < l - 2; ++i) { + auto output_ptr = output_data + (i + 1) * l + 1; + input_tmp = input_data + i * l; + auto in0_tmp = vld1q_f32(input_tmp); + auto in2_tmp = vld1q_f32(input_tmp + l); + auto in4_tmp = vld1q_f32(input_tmp + l + l); + c_mid = l_mid; + for (; c_mid > 3; c_mid -= 4) { + auto in1_tmp = vld1q_f32(input_tmp + 4); + auto in3_tmp = vld1q_f32(input_tmp + l + 4); + auto in5_tmp = vld1q_f32(input_tmp + l + l + 4); + + tmp0 = vextq_f32(in0_tmp, in1_tmp, 1); + tmp1 = vextq_f32(in0_tmp, in1_tmp, 2); + tmp2 = vextq_f32(in2_tmp, in3_tmp, 1); + tmp3 = vextq_f32(in2_tmp, in3_tmp, 2); + tmp4 = vextq_f32(in4_tmp, in5_tmp, 1); + tmp5 = vextq_f32(in4_tmp, in5_tmp, 2); + + out0 = vmulq_n_f32(in0_tmp, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2_tmp, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4_tmp, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vaddq_f32(out0, vbias); + + vst1q_f32(output_ptr, out0); + + output_ptr += 4; + input_tmp += 4; + in0_tmp = in1_tmp; + in2_tmp = in3_tmp; + in4_tmp = in5_tmp; + } + + float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]); + float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]); + + tmp0 = vextq_f32(in0_tmp, pad0, 1); + tmp1 = vextq_f32(in0_tmp, pad0, 2); + tmp2 = vextq_f32(in2_tmp, pad1, 1); + tmp3 = vextq_f32(in2_tmp, pad1, 2); + tmp4 = vextq_f32(in4_tmp, pad2, 1); + tmp5 = vextq_f32(in4_tmp, pad2, 2); + + out0 = vmulq_n_f32(in0_tmp, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2_tmp, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + 
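+          // tmp0..tmp5 come from vextq_f32: each is one of the three input
+          // rows shifted left by one or two lanes, so this nine-step
+          // multiply-accumulate chain applies the full 3x3 stencil to four
+          // adjacent outputs per iteration.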
out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4_tmp, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vaddq_f32(out0, vbias); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + } + output_data += hxw; + input_data += hxw; + filter_data_tmp += 9; + } + } +#endif +} + +void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu) { +#if __ARM_NEON + const float *input_data = input->data(); + const float *filter_data = filter->data(); + float *output_data = output->data(); + const float *newscale_data = new_scale->data(); + const float *newbias_data = new_bias->data(); + + const int batch_size = static_cast(input->dims()[0]); + const int input_channel = static_cast(input->dims()[1]); + + const int input_height = static_cast(input->dims()[2]); + const int input_width = static_cast(input->dims()[3]); + const int output_height = static_cast(output->dims()[2]); + const int output_width = static_cast(output->dims()[3]); + + const int hxw = input_height * input_width; + + const int l = input_height; + float32x4_t vnewbias = vdupq_n_f32(0.0); + float32x4_t vnewscale = vdupq_n_f32(1.0); + float32x4_t vzero = vdupq_n_f32(0); + + for (int b = 0; b < batch_size; b++) { + filter_data = filter->data(); + for (int c = 0; c < input_channel; c++) { + vnewbias = vdupq_n_f32(newbias_data[c]); + vnewscale = vdupq_n_f32(newscale_data[c]); + + float w00 = filter_data[0]; + float w01 = filter_data[1]; + float w02 = filter_data[2]; + float w10 = filter_data[3]; + float w11 = filter_data[4]; + float w12 = filter_data[5]; + float w20 = filter_data[6]; + float w21 = filter_data[7]; + float w22 = filter_data[8]; + + output_data[0] = w11 * input_data[0] + w12 * input_data[1] + + w21 * input_data[l] + w22 * input_data[l + 1]; + output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] + + w20 * input_data[2 * l - 2] + + w21 * input_data[2 * l - 1]; + output_data[(l - 1) * l] = + w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] + + w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; + output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + + w01 * input_data[(l - 2) * (l + 1) + 1] + + w10 * input_data[l * l - 2] + + w11 * input_data[l * l - 1]; + output_data[0] = output_data[0] * newscale_data[c] + newbias_data[c]; + output_data[l - 1] = + output_data[l - 1] * newscale_data[c] + newbias_data[c]; + output_data[(l - 1) * l] = + output_data[(l - 1) * l] * newscale_data[c] + newbias_data[c]; + output_data[l * l - 1] = + output_data[l * l - 1] * newscale_data[c] + newbias_data[c]; + + if (if_relu) { + output_data[0] = output_data[0] < 0 ? 0 : output_data[0]; + output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1]; + output_data[(l - 1) * l] = + output_data[(l - 1) * l] < 0 ? 0 : output_data[(l - 1) * l]; + output_data[l * l - 1] = + output_data[l * l - 1] < 0 ? 
0 : output_data[l * l - 1]; + } + for (int i = 1; i < l - 1; ++i) { + output_data[i * l] = + w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] + + w11 * input_data[i * l] + w12 * input_data[i * l + 1] + + w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1]; + + output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + + w01 * input_data[i * l + l - 1 - l] + + w10 * input_data[i * l + l - 1 - 1] + + w11 * input_data[i * l + l - 1] + + w20 * input_data[i * l + l - 1 + l - 1] + + w21 * input_data[i * l + l - 1 + l]; + output_data[i * l] = + output_data[i * l] * newscale_data[c] + newbias_data[c]; + output_data[i * l + l - 1] = + output_data[i * l + l - 1] * newscale_data[c] + newbias_data[c]; + + if (if_relu) { + output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i * l]; + output_data[i * l + l - 1] = + output_data[i * l + l - 1] < 0 ? 0 : output_data[i * l + l - 1]; + } + } + + int m; + for (m = 1; m < output_width - 4; m += 4) { + float *output_ptr = output_data + m; + float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; + in0 = vld1q_f32(input_data + m - 1); + in1 = vld1q_f32(input_data + m + 3); + in2 = vld1q_f32(input_data + input_width + m - 1); + in3 = vld1q_f32(input_data + input_width + m + 3); + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + vst1q_f32(output_ptr, out0); + } + + for (m = 1; (m + 3) < output_width - 1; m += 4) { + } + for (int j = m; j < output_width - 1; j++) { + output_data[j] = input_data[j - 1] * w10 + input_data[j] * w11 + + input_data[j + 1] * w12 + + input_data[input_width + j - 1] * w20 + + input_data[input_width + j] * w21 + + input_data[input_width + j + 1] * w22; + output_data[j] = output_data[j] * newscale_data[c] + newbias_data[c]; + + if (if_relu) { + output_data[j] = output_data[j] < 0 ? 
0 : output_data[j]; + } + } + + for (m = 1; m < output_width - 4; m += 4) { + float *output_ptr = + output_data + (output_height - 1) * output_width + m; + + float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; + in0 = vld1q_f32(input_data + (output_height - 2) * input_width + m - 1); + in1 = vld1q_f32(input_data + (output_height - 2) * input_width + m + 3); + in2 = vld1q_f32(input_data + (output_height - 1) * input_width + m - 1); + in3 = vld1q_f32(input_data + (output_height - 1) * input_width + m + 3); + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + out0 = vmulq_n_f32(in0, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + vst1q_f32(output_ptr, out0); + } + for (m = 1; (m + 3) < output_width - 1; m = m + 4) { + } + for (int j = m; j < output_width - 1; j++) { + output_data[(output_height - 1) * input_width + j] = + input_data[(output_height - 2) * input_width + j - 1] * w00 + + input_data[(output_height - 2) * input_width + j] * w01 + + input_data[(output_height - 2) * input_width + j + 1] * w02 + + input_data[(output_height - 1) * input_width + j - 1] * w10 + + input_data[(output_height - 1) * input_width + j] * w11 + + input_data[(output_height - 1) * input_width + j + 1] * w12; + output_data[(output_height - 1) * output_width + j] = + output_data[(output_height - 1) * output_width + j] * + newscale_data[c] + + newbias_data[c]; + + if (if_relu) { + output_data[(output_height - 1) * output_width + j] = + output_data[(output_height - 1) * output_width + j] < 0 + ? 
0 + : output_data[(output_height - 1) * output_width + j]; + } + } +#pragma omp parallel for + for (int i = 1; i < output_height - 1; i++) { + for (int m = 1; (m + 3) < output_width - 1; m = m + 4) { + float *output_ptr = output_data + i * output_width + m; + float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3, + tmp4, tmp5, out0; + in0 = vld1q_f32(input_data + (i - 1) * input_width + m - 1); + in1 = vld1q_f32(input_data + (i - 1) * input_width + m + 3); + in2 = vld1q_f32(input_data + i * input_width + m - 1); + in3 = vld1q_f32(input_data + i * input_width + m + 3); + in4 = vld1q_f32(input_data + (i + 1) * input_width + m - 1); + in5 = vld1q_f32(input_data + (i + 1) * input_width + m + 3); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + tmp4 = vextq_f32(in4, in5, 1); + tmp5 = vextq_f32(in4, in5, 2); + + out0 = vmulq_n_f32(in0, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + vst1q_f32(output_ptr, out0); + } + int m; + for (m = 1; (m + 3) < output_width - 1; m = m + 4) { + } + + for (int j = m; j < output_width - 1; j++) { + output_data[i * output_width + j] = + input_data[(i - 1) * input_width + j - 1] * w00 + + input_data[(i - 1) * input_width + j] * w01 + + input_data[(i - 1) * input_width + j + 1] * w02 + + input_data[(i)*input_width + j - 1] * w10 + + input_data[(i)*input_width + j] * w11 + + input_data[(i)*input_width + j + 1] * w12 + + input_data[(i + 1) * input_width + j - 1] * w20 + + input_data[(i + 1) * input_width + j] * w21 + + input_data[(i + 1) * input_width + j + 1] * w22; + output_data[i * output_width + j] = + newscale_data[c] * output_data[i * output_width + j] + + newbias_data[c]; + if (if_relu) { + output_data[i * output_width + j] = + output_data[i * output_width + j] < 0 + ? 
0
+                    : output_data[i * output_width + j];
+          }
+        }
+      }
+
+      input_data = input_data + hxw;
+      output_data = output_data + hxw;
+      filter_data = filter_data + 9;
+    }
+  }
+
+#endif
+}
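The fused ...AddBNRelu... kernels in this file take per-channel new_scale / new_bias coefficients and compute y = new_scale * conv(x) + new_bias, optionally clamped by ReLU. A hedged sketch of the usual way such coefficients are derived, folding inference-time batch-norm statistics into a single multiply-add per channel (FoldBatchNorm and its signature are illustrative, not this patch's API):

#include <cmath>
#include <cstdio>
#include <vector>

// Folds y = gamma * (x - mean) / sqrt(var + eps) + beta into
// y = new_scale * x + new_bias, one coefficient pair per channel.
void FoldBatchNorm(const std::vector<float> &gamma,
                   const std::vector<float> &beta,
                   const std::vector<float> &mean,
                   const std::vector<float> &var, float epsilon,
                   std::vector<float> *new_scale,
                   std::vector<float> *new_bias) {
  for (size_t c = 0; c < gamma.size(); ++c) {
    float inv_std = 1.0f / std::sqrt(var[c] + epsilon);
    (*new_scale)[c] = gamma[c] * inv_std;
    (*new_bias)[c] = beta[c] - gamma[c] * mean[c] * inv_std;
  }
}

int main() {
  std::vector<float> gamma{1.f}, beta{0.5f}, mean{2.f}, var{4.f};
  std::vector<float> s(1), b(1);
  FoldBatchNorm(gamma, beta, mean, var, 1e-5f, &s, &b);
  // s[0] ~= 0.5 and b[0] ~= -0.5, i.e. y = 0.5 * x - 0.5
  std::printf("new_scale=%.3f new_bias=%.3f\n", s[0], b[0]);
  return 0;
}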
+
+void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
+                                   Tensor *output, const Tensor *new_scale,
+                                   const Tensor *new_bias, bool if_relu) {
+#if __ARM_NEON
+
+  const int batch_size = input->dims()[0];
+
+  const int input_height = input->dims()[2];
+
+  const int input_width = input->dims()[3];
+
+  const int output_channels = output->dims()[1];
+
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  const int _kernel_size = 3;
+  const int stride_height = 2;
+  const int stride_width = 2;
+  const int padding_height = 1;
+  const int padding_width = 1;
+  const float zero = 0;
+  const int input_channel_stride = input_height * input_width;
+  const int output_channel_stride = output_height * output_width;
+  const int filter_channel_stride = 9;
+  const float *newscale_data = new_scale->data<float>();
+  const float *newbias_data = new_bias->data<float>();
+
+  const float *input_data = input->data<float>();
+  const float *filter_data = filter->data<float>();
+
+  float *output_data = output->mutable_data<float>();
+
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+  const int filter_batch_stride = output_channels * output_channel_stride;
+  const float *pos1, *pos2, *pos3, *filter1, *filter2, *filter3, *output_ptr;
+  int hstart, wstart, hend, wend;
+  float result;
+  for (int i = 0; i < batch_size; ++i) {
+    for (int c = 0; c < output_channels; ++c) {
+      filter1 = filter_data;
+      filter2 = filter1 + 3;
+      filter3 = filter2 + 3;
+
+      for (int ph = 0; ph < output_height; ph++) {
+        for (int pw = 0; pw < output_width; pw++) {
+          hstart = ph * stride_height - padding_height;
+          wstart = pw * stride_width - padding_width;
+          hend = min(hstart + _kernel_size, input_height + padding_height);
+          wend = min(wstart + _kernel_size, input_width + padding_width);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          hend = min(hend, input_height);
+          wend = min(wend, input_width);
+          pos1 = input_data + hstart * input_width + wstart;
+          pos2 = input_data + (hstart + 1) * input_width + wstart;
+          pos3 = input_data + (hstart + 2) * input_width + wstart;
+          output_ptr = output_data + ph * output_width + pw;
+
+          if (hend - hstart != 3 || wend - wstart != 3) {
+            result = 0;
+            float fake_input[9] = {0};
+            if (hstart == 0 && wstart == 0) {
+              // top-left corner
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j >= 3 - hend && k >= 3 - wend) {
+                    fake_input[3 * j + k] =
+                        input_data[(j - (3 - hend)) * input_width + k -
+                                   (3 - wend)];
+                  }
+                }
+              }
+            } else if (hstart == 0 && wend == input_width) {
+              // top-right corner
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j >= 3 - hend && k <= input_width - wstart - 1) {
+                    fake_input[3 * j + k] =
+                        input_data[(j - (3 - hend)) * input_width + k + wstart];
+                  }
+                }
+              }
+
+            } else if (hend == input_height && wstart == 0) {
+              // bottom-left corner
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j <= input_height - 1 - hstart && k >= 3 - wend) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width + k - (3 - wend)];
+                  }
+                }
+              }
+            } else if (hend == input_height && wend == input_width) {
+              // bottom-right corner
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j <= input_height - hstart - 1 &&
+                      k <= input_width - wstart - 1) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width + k + wstart];
+                  }
+                }
+              }
+            } else if (hstart == 0) {
+              // top edge
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j >= 3 - hend) {
+                    fake_input[3 * j + k] =
+                        input_data[(j - (3 - hend)) * input_width + k + wstart];
+                  }
+                }
+              }
+
+            } else if (hend == input_height) {
+              // bottom edge
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j <= input_height - hstart - 1) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width + k + wstart];
+                  }
+                }
+              }
+
+            } else if (wstart == 0) {
+              // left edge
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (k >= 3 - wend) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width +
+                                   (k - (3 - wend))];
+                  }
+                }
+              }
+
+            } else if (wend == input_width) {
+              // right edge
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (k <= input_width - wstart - 1) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width + k + wstart];
+                  }
+                }
+              }
+            }
+            for (int l = 0; l < 9; ++l) {
+              result += fake_input[l] * filter1[l];
+            }
+            output_data[ph * output_width + pw] =
+                newscale_data[c] * result + newbias_data[c];
+
+            if (if_relu) {
+              output_data[ph * output_width + pw] =
+                  output_data[ph * output_width + pw] <
0 + ? 0 + : output_data[ph * output_width + pw]; + } + } else { + const float32x4_t data1 = vld1q_f32(pos1); + const float32x4_t data2 = vld1q_f32(pos2); + const float32x4_t data3 = vld1q_f32(pos3); + + const float32x4_t v_filter1 = vld1q_f32(filter1); + const float32x4_t v_filter2 = vld1q_f32(filter2); + const float32x4_t v_filter3 = vld1q_f32(filter3); + float32x4_t mula = vmulq_f32(data1, v_filter1); + mula = vmlaq_f32(mula, data2, v_filter2); + mula = vmlaq_f32(mula, data3, v_filter3); + float32x2_t res = vpadd_f32( + vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula)); + res = vpadd_f32(res, res); + output_data[ph * output_width + pw] = + vget_lane_f32(res, 0) * newscale_data[c] + newbias_data[c]; + + if (if_relu) { + output_data[ph * output_width + pw] = + output_data[ph * output_width + pw] < 0 + ? 0 + : output_data[ph * output_width + pw]; + } + } + } + } + input_data += input_channel_stride; + output_data += output_channel_stride; + filter_data += filter_channel_stride; + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +#endif +} + +void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, + Tensor *output, Tensor bias, bool if_bias) { +#if __ARM_NEON + const float *input_data = input->data(); + const float *filter_data = filter->data(); + float *output_data = output->data(); + const float *bias_data = bias.data(); + + const int in_h = static_cast(input->dims()[2]); + const int in_w = static_cast(input->dims()[3]); + const int out_h = static_cast(output->dims()[2]); + const int out_w = static_cast(output->dims()[3]); + const int out_l = out_h; + const int in_l = in_h; + const int inhxw = in_h * in_w; + const int outhxw = out_h * out_w; + const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 
1 : 0; + const int batch_size = static_cast(input->dims()[0]); + const int c = static_cast(input->dims()[1]); + const float *input_row_ptr; + float *output_row_ptr; + + const int w_times = (out_w - 2) / 3; + + float32x4_t vbias = vdupq_n_f32(0.0); + + float32x4x2_t input_buff_mid{}, input_buff_bottom[w_times + 1]; + float32x4_t elewise_res0, elewise_res1, elewise_res2, res3; + int out2in_mid; + float32x4_t zero = vdupq_n_f32(0.0); + for (int b = batch_size; b > 0; --b) { + const float *filter_data_tmp = filter_data; + for (int j = 0; j < c; ++j) { + auto output_data_tmp = output_data + j * out_h * out_w; + auto input_data_tmp = input_data + j * in_h * in_w; + auto input_const = input_data_tmp; + + if (if_bias) { + vbias = vdupq_n_f32(bias_data[j]); + } + + float w00 = filter_data_tmp[0]; + float w01 = filter_data_tmp[1]; + float w02 = filter_data_tmp[2]; + float w10 = filter_data_tmp[3]; + float w11 = filter_data_tmp[4]; + float w12 = filter_data_tmp[5]; + float w20 = filter_data_tmp[6]; + float w21 = filter_data_tmp[7]; + float w22 = filter_data_tmp[8]; + + int h_mid = 0; + + for (; h_mid < out_h - 1; h_mid++) { + input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w; + output_row_ptr = output_data_tmp + 1 + h_mid * out_w; + + for (int w4 = 0; w4 < w_times + 1; w4++) { + if (h_mid == 0) { + elewise_res1 = zero; + elewise_res0 = zero; + elewise_res2 = zero; + } else { + elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01); + elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00); + elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02); + } + input_buff_mid = vld2q_f32(input_row_ptr); + input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w); + + elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11); + elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); + elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); + + elewise_res1 = + vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); + elewise_res0 = + vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20); + elewise_res2 = + vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22); + + res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), + vaddq_f32(elewise_res0, elewise_res1)); + res3 = vaddq_f32(res3, vbias); + vst1q_f32(output_row_ptr, res3); + + input_row_ptr += 6; + output_row_ptr += 3; + } + } + clock(); + + input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w; + output_row_ptr = output_data_tmp + 1 + h_mid * out_w; + + for (int w4 = 0; w4 < w_times + 1; w4++) { + elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01); + elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00); + elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02); + + input_buff_mid = vld2q_f32(input_row_ptr); + input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w); + + elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11); + elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); + elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); + + if (!if_pad) { + elewise_res1 = + vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); + elewise_res0 = + vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20); + elewise_res2 = + vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22); + } + res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), + vaddq_f32(elewise_res0, elewise_res1)); + res3 = vaddq_f32(res3, vbias); + + if ((w4 != w_times)) { + vst1q_f32(output_row_ptr, res3); + } else { + if (out_l - 2 - 
w_times * 3 == 1) { + vst1q_lane_f32(output_row_ptr, res3, 0); + } else if (out_l - 2 - w_times * 3 == 2) { + vst1q_lane_f32(output_row_ptr, res3, 0); + vst1q_lane_f32(output_row_ptr + 1, res3, 1); + } + } + input_row_ptr += 6; + output_row_ptr += 3; + } + + output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 + + input_const[in_l] * w21 + + input_const[in_l + 1] * w22; + + out2in_mid = (out_l - 1) * 2; + output_data_tmp[out_l - 1] = + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + (1 - if_pad) * (w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + + out2in_mid = (out_l - 1) * 2 * in_w; + + output_data_tmp[out_l * (out_l - 1)] = + w01 * input_const[out2in_mid - in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + + (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]); + out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2; + + output_data_tmp[out_l * out_l - 1] = + w00 * input_const[out2in_mid - in_w - 1] + + w01 * input_const[out2in_mid - in_w] + + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + if (if_bias) { + output_data_tmp[0] += bias_data[j]; + output_data_tmp[out_l - 1] += bias_data[j]; + output_data_tmp[out_l * (out_l - 1)] += bias_data[j]; + output_data_tmp[out_l * out_l - 1] += bias_data[j]; + } + for (int i = 1; i < out_h - 1; i++) { + out2in_mid = i * 2 * in_w; + output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w11 * input_const[out2in_mid] + + w12 * input_const[out2in_mid + 1] + + w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]; + + out2in_mid = i * 2 * in_w + (out_l - 1) * 2; + output_data_tmp[i * out_l + out_l - 1] = + w00 * input_const[out2in_mid - in_w - 1] + + w01 * input_const[out2in_mid - in_w] + + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + if (if_bias) { + output_data_tmp[i * out_l] += bias_data[j]; + output_data_tmp[i * out_l + out_l - 1] += bias_data[j]; + } + } + filter_data_tmp += 9; + } + input_data += inhxw * c; + output_data += outhxw * c; + } +#endif +} + +void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu) { +#if __ARM_NEON +#ifdef _OPENMP + const float *input_data = input->data(); + const float *filter_data = filter->data(); + float *output_data = output->data(); + const float *newscale_data = new_scale->data(); + const float *newbias_data = new_bias->data(); + + const int batch_size = static_cast(input->dims()[0]); + const int input_channel = static_cast(input->dims()[1]); + + const int input_height = static_cast(input->dims()[2]); + const int input_width = static_cast(input->dims()[3]); + const int output_height = static_cast(output->dims()[2]); + const int output_width = 
static_cast(output->dims()[3]); + const int inhxw = input_height * input_width; + const int outhxw = output_height * output_width; + + float32x4_t vnewbias = vdupq_n_f32(0.0); + float32x4_t vnewscale = vdupq_n_f32(1.0); + float32x4_t zero = vdupq_n_f32(0.0); + for (int b = 0; b < batch_size; b++) { + filter_data = filter->data(); + for (int c = 0; c < input_channel; c++) { + vnewbias = vdupq_n_f32(newbias_data[c]); + vnewscale = vdupq_n_f32(newscale_data[c]); + + float w00 = filter_data[0]; + float w01 = filter_data[1]; + float w02 = filter_data[2]; + float w10 = filter_data[3]; + float w11 = filter_data[4]; + float w12 = filter_data[5]; + float w20 = filter_data[6]; + float w21 = filter_data[7]; + float w22 = filter_data[8]; + + int m; + for (m = 1; m < output_width - 2; m = m + 3) { + float *output_ptr = output_data + m; + float32x4x2_t input_buff_mid{}, input_buff_bottom{}; + float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; + input_buff_mid = vld2q_f32(input_data + (2 * m - 1)); + input_buff_bottom = vld2q_f32(input_data + input_width + (2 * m - 1)); + + in0 = input_buff_mid.val[0]; + tmp0 = input_buff_mid.val[1]; + tmp1 = vextq_f32(in0, zero, 1); + + in2 = input_buff_bottom.val[0]; + tmp2 = input_buff_bottom.val[1]; + tmp3 = vextq_f32(in2, zero, 1); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, zero); + } + vst1q_f32(output_ptr, out0); + } + for (m = 1; m < output_width - 2; m += 3) { + } + for (int j = m; j < output_width; j++) { + output_data[j] = input_data[2 * j - 1] * w10 + input_data[2 * j] * w11 + + input_data[2 * j + 1] * w12 + + input_data[2 * j - 1 + input_width] * w20 + + input_data[2 * j + input_width] * w21 + + input_data[2 * j + 1 + input_width] * w22; + output_data[j] = newscale_data[c] * output_data[j] + newbias_data[c]; + if (if_relu) { + output_data[j] = output_data[j] < 0 ? 
0 : output_data[j]; + } + } + +#pragma omp parallel for + + for (int i = 1; i < output_height; i += 1) { + for (int m = 1; m < output_width - 2; m += 3) { + float *output_ptr = output_data + i * output_width + m; + float32x4x2_t input_buff_top{}, input_buff_mid{}, input_buff_bottom{}; + float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3, + tmp4, tmp5, out0; + input_buff_top = + vld2q_f32(input_data + (2 * i - 1) * input_width + (2 * m - 1)); + input_buff_mid = + vld2q_f32(input_data + (2 * i) * input_width + (2 * m - 1)); + input_buff_bottom = + vld2q_f32(input_data + (2 * i + 1) * input_width + (2 * m - 1)); + + in0 = input_buff_top.val[0]; + tmp0 = input_buff_top.val[1]; + tmp1 = vextq_f32(in0, zero, 1); + + in2 = input_buff_mid.val[0]; + tmp2 = input_buff_mid.val[1]; + tmp3 = vextq_f32(in2, zero, 1); + + in4 = input_buff_bottom.val[0]; + tmp4 = input_buff_bottom.val[1]; + tmp5 = vextq_f32(in4, zero, 1); + + out0 = vmulq_n_f32(in0, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, zero); + } + vst1q_f32(output_ptr, out0); + } + int m; + for (m = 1; m < output_width - 2; m += 3) { + } + for (int j = m; j < output_width; j++) { + output_data[i * output_width + j] = + input_data[(2 * i - 1) * input_width + 2 * j - 1] * w00 + + input_data[(2 * i - 1) * input_width + 2 * j] * w01 + + input_data[(2 * i - 1) * input_width + 2 * j + 1] * w02 + + input_data[(2 * i) * input_width + 2 * j - 1] * w10 + + input_data[(2 * i) * input_width + 2 * j] * w11 + + input_data[(2 * i) * input_width + 2 * j + 1] * w12 + + input_data[(2 * i + 1) * input_width + 2 * j - 1] * w20 + + input_data[(2 * i + 1) * input_width + 2 * j] * w21 + + input_data[(2 * i + 1) * input_width + 2 * j + 1] * w22; + output_data[i * output_width + j] = + newscale_data[c] * output_data[i * output_width + j] + + newbias_data[c]; + if (if_relu) { + output_data[i * output_width + j] = + output_data[i * output_width + j] < 0 + ? 0 + : output_data[i * output_width + j]; + } + } + } + output_data[0] = input_data[0] * w11 + input_data[1] * w12 + + input_data[input_height] * w21 + + input_data[input_height + 1] * w22; + + output_data[0] = newscale_data[c] * output_data[0] + newbias_data[c]; + if (if_relu) { + output_data[0] = output_data[0] < 0 ? 0 : output_data[0]; + } + for (int i = 1; i < output_height; i++) { + output_data[i * output_width] = + input_data[(2 * i - 1) * input_width] * w01 + + input_data[(2 * i - 1) * input_width + 1] * w02 + + input_data[(2 * i) * input_width] * w11 + + input_data[(2 * i) * input_width + 1] * w12 + + input_data[(2 * i + 1) * input_width] * w21 + + input_data[(2 * i + 1) * input_width + 1] * w22; + + output_data[i * output_width] = + newscale_data[c] * output_data[i * output_width] + newbias_data[c]; + if (if_relu) { + output_data[i * output_width] = output_data[i * output_width] < 0 + ? 
0 + : output_data[i * output_width]; + } + } + + input_data = input_data + inhxw; + output_data = output_data + outhxw; + filter_data = filter_data + 9; + } + } + +#else + + const float *input_data = input->data(); + const float *filter_data = filter->data(); + float *output_data = output->data(); + const float *newscale_data = new_scale->data(); + const float *newbias_data = new_bias->data(); + + float32x4_t vnewbias = vdupq_n_f32(0.0); + float32x4_t vnewscale = vdupq_n_f32(1.0); + + const int in_h = static_cast(input->dims()[2]); + const int in_w = static_cast(input->dims()[3]); + const int out_h = static_cast(output->dims()[2]); + const int out_w = static_cast(output->dims()[3]); + const int out_l = out_h; + const int in_l = in_h; + const int inhxw = in_h * in_w; + const int outhxw = out_h * out_w; + const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0; + const int batch_size = static_cast(input->dims()[0]); + const int c = static_cast(input->dims()[1]); + const float *input_row_ptr; + float *output_row_ptr; + + const int w_times = (out_w - 2) / 3; + + float32x4x2_t input_buff_mid{}, input_buff_bottom[w_times + 1]; + float32x4_t elewise_res0, elewise_res1, elewise_res2, res3; + int out2in_mid; + float32x4_t zero = vdupq_n_f32(0.0); + for (int b = batch_size; b > 0; --b) { + const float *filter_data_tmp = filter_data; + for (int j = 0; j < c; ++j) { + auto output_data_tmp = output_data + j * out_h * out_w; + auto input_data_tmp = input_data + j * in_h * in_w; + auto input_const = input_data_tmp; + + vnewbias = vdupq_n_f32(newbias_data[j]); + vnewscale = vdupq_n_f32(newscale_data[j]); + + float w00 = filter_data_tmp[0]; + float w01 = filter_data_tmp[1]; + float w02 = filter_data_tmp[2]; + float w10 = filter_data_tmp[3]; + float w11 = filter_data_tmp[4]; + float w12 = filter_data_tmp[5]; + float w20 = filter_data_tmp[6]; + float w21 = filter_data_tmp[7]; + float w22 = filter_data_tmp[8]; + + int h_mid = 0; + + for (; h_mid < out_h - 1; h_mid++) { + input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w; + output_row_ptr = output_data_tmp + 1 + h_mid * out_w; + + for (int w4 = 0; w4 < w_times + 1; w4++) { + if (h_mid == 0) { + elewise_res1 = zero; + elewise_res0 = zero; + elewise_res2 = zero; + } else { + elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01); + elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00); + elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02); + } + input_buff_mid = vld2q_f32(input_row_ptr); + input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w); + + elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11); + elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); + elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); + + elewise_res1 = + vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); + elewise_res0 = + vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20); + elewise_res2 = + vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22); + + res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), + vaddq_f32(elewise_res0, elewise_res1)); + res3 = vmlaq_f32(vnewbias, vnewscale, res3); + + if (if_relu) { + res3 = vmaxq_f32(res3, zero); + } + vst1q_f32(output_row_ptr, res3); + + input_row_ptr += 6; + output_row_ptr += 3; + } + } + clock(); + + input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w; + output_row_ptr = output_data_tmp + 1 + h_mid * out_w; + + for (int w4 = 0; w4 < w_times + 1; w4++) { + elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01); + elewise_res0 = 
vmulq_n_f32(input_buff_bottom[w4].val[0], w00); + elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02); + + input_buff_mid = vld2q_f32(input_row_ptr); + input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w); + + elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11); + elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); + elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); + + if (!if_pad) { + elewise_res1 = + vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); + elewise_res0 = + vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20); + elewise_res2 = + vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22); + } + res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), + vaddq_f32(elewise_res0, elewise_res1)); + res3 = vmlaq_f32(vnewbias, vnewscale, res3); + + if (if_relu) { + res3 = vmaxq_f32(res3, zero); + } + if ((w4 != w_times)) { + vst1q_f32(output_row_ptr, res3); + } else { + if (out_l - 2 - w_times * 3 == 1) { + vst1q_lane_f32(output_row_ptr, res3, 0); + } else if (out_l - 2 - w_times * 3 == 2) { + vst1q_lane_f32(output_row_ptr, res3, 0); + vst1q_lane_f32(output_row_ptr + 1, res3, 1); + } + } + input_row_ptr += 6; + output_row_ptr += 3; + } + + output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 + + input_const[in_l] * w21 + + input_const[in_l + 1] * w22; + + out2in_mid = (out_l - 1) * 2; + output_data_tmp[out_l - 1] = + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + (1 - if_pad) * (w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + + out2in_mid = (out_l - 1) * 2 * in_w; + + output_data_tmp[out_l * (out_l - 1)] = + w01 * input_const[out2in_mid - in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + + (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]); + out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2; + + output_data_tmp[out_l * out_l - 1] = + w00 * input_const[out2in_mid - in_w - 1] + + w01 * input_const[out2in_mid - in_w] + + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + output_data_tmp[0] = + output_data_tmp[0] * newscale_data[j] + newbias_data[j]; + output_data_tmp[out_l - 1] = + output_data_tmp[out_l - 1] * newscale_data[j] + newbias_data[j]; + output_data_tmp[out_l * (out_l - 1)] = + output_data_tmp[out_l * (out_l - 1)] * newscale_data[j] + + newbias_data[j]; + output_data_tmp[out_l * out_l - 1] = + output_data_tmp[out_l * out_l - 1] * newscale_data[j] + + newbias_data[j]; + if (if_relu) { + output_data_tmp[0] = output_data_tmp[0] < 0 ? 0 : output_data_tmp[0]; + output_data_tmp[out_l - 1] = + output_data_tmp[out_l - 1] < 0 ? 0 : output_data_tmp[out_l - 1]; + output_data_tmp[out_l * (out_l - 1)] = + output_data_tmp[out_l * (out_l - 1)] < 0 + ? 0 + : output_data_tmp[out_l * (out_l - 1)]; + output_data_tmp[out_l * out_l - 1] = + output_data_tmp[out_l * out_l - 1] < 0 + ? 
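/* Editor's note: since out_w - 2 is rarely a multiple of 3, the final w4
   iteration above may own fewer than a full vector of valid outputs, so it
   stores lanes one at a time instead of issuing a vst1q_f32 that would overrun
   the row. The same remainder pattern in isolation (illustrative):

   #include <arm_neon.h>
   void store_partial(float *dst, float32x4_t v, int remaining) {
     if (remaining >= 1) vst1q_lane_f32(dst, v, 0);
     if (remaining >= 2) vst1q_lane_f32(dst + 1, v, 1);
     if (remaining >= 3) vst1q_lane_f32(dst + 2, v, 2);
   }

   The write-back helpers in gemm.cpp further down use the identical trick for
   their _nc1 column remainders.
*/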
0 + : output_data_tmp[out_l * out_l - 1]; + } + for (int i = 1; i < out_h - 1; i++) { + out2in_mid = i * 2 * in_w; + output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w11 * input_const[out2in_mid] + + w12 * input_const[out2in_mid + 1] + + w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]; + + out2in_mid = i * 2 * in_w + (out_l - 1) * 2; + output_data_tmp[i * out_l + out_l - 1] = + w00 * input_const[out2in_mid - in_w - 1] + + w01 * input_const[out2in_mid - in_w] + + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + output_data_tmp[i * out_l] = + output_data_tmp[i * out_l] * newscale_data[j] + newbias_data[j]; + output_data_tmp[i * out_l + out_l - 1] = + output_data_tmp[i * out_l + out_l - 1] * newscale_data[j] + + newbias_data[j]; + if (if_relu) { + output_data_tmp[i * out_l] = + output_data_tmp[i * out_l] < 0 ? 0 : output_data_tmp[i * out_l]; + output_data_tmp[i * out_l + out_l - 1] = + output_data_tmp[i * out_l + out_l - 1] < 0 + ? 0 + : output_data_tmp[i * out_l + out_l - 1]; + } + } + filter_data_tmp += 9; + } + input_data += inhxw * c; + output_data += outhxw * c; + } +#endif +#endif +} + +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/depthwise_conv_3x3.h b/src/operators/math/depthwise_conv_3x3.h new file mode 100644 index 0000000000000000000000000000000000000000..60e979648f871e640924a3373c625c311c3dd067 --- /dev/null +++ b/src/operators/math/depthwise_conv_3x3.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include <algorithm> +#include <vector> +#include "framework/tensor.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { +namespace math { +using framework::Tensor; +using std::max; +using std::min; +using std::vector; + +void DepthwiseConv3x3(const Tensor *input, vector<int> strides, + vector<int> paddings, const Tensor *filter, Tensor *bias, + Tensor *output, bool if_bias); +void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, + Tensor *output, Tensor *bias, bool if_bias); +void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu); +void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter, + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu); +void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, + Tensor *output, Tensor bias, bool if_bias); +void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu); +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index da3dacb58a72d779d2ccd1224bbf4eab12dfbb91..3730cf350a1399e5f3c1473fd1ce8d7b1d13b1b6 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -15,23 +15,40 @@ limitations under the License. */ #include "operators/math/gemm.h" #include "common/log.h" #include "memory/t_malloc.h" -#ifndef X86 +#if __ARM_NEON #include <arm_neon.h> #endif +#ifdef _OPENMP +#include <omp.h> +#endif namespace paddle_mobile { namespace operators { namespace math { -alignas(64) float packedA[MC * KC]; -alignas(64) float packedB[KC * NC]; -alignas(64) float ab[MR * NR]; +int MC = 0; +int KC = 0; +int NC = 0; + +float *packedA; +float *packedB; +float *packedC; +float *zero; + +typedef void (*FnPack)(int, int, int, const float *, int, float *); +typedef void (*FnAddDot)(int, const float *, const float *, float *, int); + +FnPack procPackA; +FnPack procPackB; +FnAddDot procAddDot; + +/* // Copy blocks of matrix A into contiguous memory (ColMajor) -void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, +void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, float *buffer) { int i, j; const float *Aij; - for (i = 0; i < m - paddingM; i += MR) { - for (int j = 0; j < k; ++j) { + for (i = 0; i < m - m_tail; i += MR) { + for (j = 0; j < k; ++j) { Aij = &A(i, j); *buffer++ = *Aij; *buffer++ = *(Aij + 1); @@ -39,42 +56,13 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, *buffer++ = *(Aij + 3); } } - if (paddingM != 0) { + if (m_tail != 0) { for (j = 0; j < k; ++j) { - Aij = &A(m - paddingM, j); - for (i = 0; i < paddingM; ++i) { + Aij = &A(m - m_tail, j); + for (i = 0; i < m_tail; ++i) { *buffer++ = *(Aij + i); } - for (i = paddingM; i < MR; ++i) { - *buffer++ = 0; - } - } - } -} - -// Copy blocks of matrix A into contiguous memory (RowMajor) -void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda, - float *buffer) { - int i, j; - const float *Ai, *Ai1, *Ai2, *Ai3; - for (i = 0; i < m - paddingM; i += MR) { - Ai = &A(i, 0); - Ai1 = &A(i + 1, 0); - Ai2 = &A(i + 2, 0); - Ai3 = &A(i + 3, 0); - for (int j = 0; j < k; ++j) { - *buffer++ = *Ai++; - *buffer++ = *Ai1++; - *buffer++ = *Ai2++; - *buffer++ = *Ai3++; - } - } - if (paddingM != 0) { - for (j = 0; j < k; ++j) { - for (i = m - paddingM; i < m; ++i) { - *buffer++ = A(i, j); - } - for (i = m; i < m + (MR - 
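/* Editor's note: MC/KC/NC change here from compile-time array bounds into
   runtime globals, so the packed-panel sizes can be chosen per problem rather
   than fixed at build time. One plausible way such blocking parameters get
   sized -- the cache sizes and formulas below are assumptions for
   illustration, not taken from this patch:

   void pick_blocking(int m, int n, int k) {
     const int l1 = 32 * 1024;                    // assumed L1 data cache
     const int l2 = 512 * 1024;                   // assumed L2 cache
     KC = l1 / ((MR + NR) * (int)sizeof(float));  // one A-strip + one B-strip in L1
     MC = l2 / (2 * KC * (int)sizeof(float));     // packed A block lives in L2
     if (KC > k) KC = k;
     if (MC > m) MC = m;
     NC = n;                                      // stream B panels
   }
*/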
paddingM); ++i) { + for (i = m_tail; i < MR; ++i) { *buffer++ = 0; } } @@ -82,11 +70,11 @@ void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda, } // Copy blocks of matrix B into contiguous memory (ColMajor) -void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, +void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, float *buffer) { int i, j; const float *Bj, *Bj1, *Bj2, *Bj3; - for (j = 0; j < n - paddingN; j += NR) { + for (j = 0; j < n - n_tail; j += NR) { Bj = &B(0, j); Bj1 = &B(0, j + 1); Bj2 = &B(0, j + 2); @@ -98,128 +86,631 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, *buffer++ = *Bj3++; } } - if (paddingN != 0) { + if (n_tail != 0) { for (i = 0; i < k; ++i) { - for (int j = n - paddingN; j < n; ++j) { + for (int j = n - n_tail; j < n; ++j) { *buffer++ = B(i, j); } - for (int j = n; j < n + (NR - paddingN); ++j) { + for (int j = n; j < n + (NR - n_tail); ++j) { *buffer++ = 0; } } } } +*/ -// Copy blocks of matrix B into contiguous memory (RowMajor) -void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb, - float *buffer) { - int i, j; - const float *Bij; - for (j = 0; j < n - paddingN; j += NR) { - for (i = 0; i < k; ++i) { - Bij = &B(i, j); - asm volatile( - "vld1.32 {q0}, [%[Bij]] \n\t" - "vst1.32 {q0}, [%[buffer]]! \n\t" - : [buffer] "+r"(buffer) - : [Bij] "r"(Bij) - : "memory", "q0"); +// Copy blocks of matrix A into contiguous memory (RowMajor) +void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { + const float *a0, *a1, *a2, *a3; + for (int i = 0; i < m - m_tail; i += MR) { + a0 = A + i * lda; + a1 = A + (i + 1) * lda; + a2 = A + (i + 2) * lda; + a3 = A + (i + 3) * lda; + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; } } - if (paddingN != 0) { - for (i = 0; i < k; ++i) { - Bij = &B(i, n - paddingN); - for (int j = n - paddingN; j < n; ++j) { - *buffer++ = *Bij++; - } - for (int j = n; j < n + (NR - paddingN); ++j) { - *buffer++ = 0; - } + + if (m_tail != 0) { + a0 = &A(m - m_tail, 0); + a1 = a0 + lda; + a2 = a0 + 2 * lda; + a3 = a0 + 3 * lda; + switch (m_tail) { + case 1: + a1 = zero; + case 2: + a2 = zero; + case 3: + a3 = zero; + break; + default: + break; + } + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; } } } -// Blocked matrix multiplication -void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - int first_time) { - int Buff_A_M = m; - int Buff_B_N = n; +void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { + const int i_length = m - m_tail; + for (int i = 0; i < i_length; i += MR) { + const float *a0 = A + i * lda; + const float *a1 = A + (i + 1) * lda; + const float *a2 = A + (i + 2) * lda; + const float *a3 = A + (i + 3) * lda; + const float *a4 = A + (i + 4) * lda; + const float *a5 = A + (i + 5) * lda; + float *local_buffer = buffer + i * k; + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + } + } + if (m_tail != 0) { + const float *a0 = &A(i_length, 0); + const float *a1 = a0 + lda; + const float *a2 = a0 + 2 * lda; + const float *a3 = a0 + 3 * lda; + const float *a4 = a0 + 4 * lda; + const float *a5 = a0 + 5 * lda; + float *local_buffer = buffer + i_length * k; + switch (m_tail) { + case 1: + a1 = zero; + case 2: + a2 = zero; + case 3: + a3 = zero; + case 4: + a4 = 
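/* Editor's note: the case labels above fall through deliberately -- for
   m_tail == 1 every row pointer from a1 upward is redirected to the shared
   `zero` buffer, so the copy loop that follows zero-pads the short panel with
   no per-element branching. Reduced to its core (illustrative):

   static const float zeros[4] = {0.f, 0.f, 0.f, 0.f};
   void redirect_tail(const float **a1, const float **a2, const float **a3,
                      int m_tail) {
     switch (m_tail) {
       case 1: *a1 = zeros;  // fall through: rows 1..3 all read zeros
       case 2: *a2 = zeros;  // fall through
       case 3: *a3 = zeros; break;
       default: break;
     }
   }
*/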
zero; + case 5: + a5 = zero; + break; + default: + break; + } + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + } + } +} - int _mc = m % MR; - int _nc = n % NR; +void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { + const int i_length = m - m_tail; +#pragma omp parallel for + for (int i = 0; i < i_length; i += MR) { + const float *a0 = A + i * lda; + const float *a1 = A + (i + 1) * lda; + const float *a2 = A + (i + 2) * lda; + const float *a3 = A + (i + 3) * lda; + const float *a4 = A + (i + 4) * lda; + const float *a5 = A + (i + 5) * lda; + float *local_buffer = buffer + i * k; + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + } + } + if (m_tail != 0) { + const float *a0 = &A(i_length, 0); + const float *a1 = a0 + lda; + const float *a2 = a0 + 2 * lda; + const float *a3 = a0 + 3 * lda; + const float *a4 = a0 + 4 * lda; + const float *a5 = a0 + 5 * lda; + float *local_buffer = buffer + i_length * k; + switch (m_tail) { + case 1: + a1 = zero; + case 2: + a2 = zero; + case 3: + a3 = zero; + case 4: + a4 = zero; + case 5: + a5 = zero; + break; + default: + break; + } + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + } + } +} - if (_mc != 0) { - Buff_A_M = m + (MR - _mc); +void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { + const int i_length = m - m_tail; + for (int i = 0; i < i_length; i += MR) { + const float *a0 = A + i * lda; + const float *a1 = A + (i + 1) * lda; + const float *a2 = A + (i + 2) * lda; + const float *a3 = A + (i + 3) * lda; + const float *a4 = A + (i + 4) * lda; + const float *a5 = A + (i + 5) * lda; + const float *a6 = A + (i + 6) * lda; + const float *a7 = A + (i + 7) * lda; + float *local_buffer = buffer + i * k; + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + *local_buffer++ = *a6++; + *local_buffer++ = *a7++; + } + } + if (m_tail != 0) { + const float *a0 = &A(i_length, 0); + const float *a1 = a0 + lda; + const float *a2 = a0 + 2 * lda; + const float *a3 = a0 + 3 * lda; + const float *a4 = a0 + 4 * lda; + const float *a5 = a0 + 5 * lda; + const float *a6 = a0 + 6 * lda; + const float *a7 = a0 + 7 * lda; + float *local_buffer = buffer + i_length * k; + switch (m_tail) { + case 1: + a1 = zero; + case 2: + a2 = zero; + case 3: + a3 = zero; + case 4: + a4 = zero; + case 5: + a5 = zero; + case 6: + a6 = zero; + case 7: + a7 = zero; + break; + default: + break; + } + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + *local_buffer++ = *a6++; + *local_buffer++ = *a7++; + } } +} - if (_nc != 0) { - Buff_B_N = n + (NR - _nc); +void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { + const int i_length = m - m_tail; +#pragma omp parallel for + for (int i = 0; i < i_length; i += MR) { + const float *a0 = A + i * lda; + const float *a1 = A + (i 
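/* Editor's note: the _omp packing variants can run their outer loop in
   parallel only because each iteration writes through
   local_buffer = buffer + i * k -- a disjoint destination slice per MR-row
   block -- instead of bumping a single shared output pointer. The idiom in
   isolation (illustrative; assumes MR == 6 and m % 6 == 0 for brevity):

   void pack_parallel(const float *A, float *buffer, int m, int k, int lda) {
   #pragma omp parallel for
     for (int i = 0; i < m; i += 6) {
       float *dst = buffer + i * k;  // thread-private slice, no data race
       for (int j = 0; j < k; ++j) {
         for (int r = 0; r < 6; ++r) {
           *dst++ = A[(i + r) * lda + j];
         }
       }
     }
   }
*/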
+ 1) * lda; + const float *a2 = A + (i + 2) * lda; + const float *a3 = A + (i + 3) * lda; + const float *a4 = A + (i + 4) * lda; + const float *a5 = A + (i + 5) * lda; + const float *a6 = A + (i + 6) * lda; + const float *a7 = A + (i + 7) * lda; + float *local_buffer = buffer + i * k; + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + *local_buffer++ = *a6++; + *local_buffer++ = *a7++; + } + } + if (m_tail != 0) { + const float *a0 = &A(i_length, 0); + const float *a1 = a0 + lda; + const float *a2 = a0 + 2 * lda; + const float *a3 = a0 + 3 * lda; + const float *a4 = a0 + 4 * lda; + const float *a5 = a0 + 5 * lda; + const float *a6 = a0 + 6 * lda; + const float *a7 = a0 + 7 * lda; + float *local_buffer = buffer + i_length * k; + switch (m_tail) { + case 1: + a1 = zero; + case 2: + a2 = zero; + case 3: + a3 = zero; + case 4: + a4 = zero; + case 5: + a5 = zero; + case 6: + a6 = zero; + case 7: + a7 = zero; + break; + default: + break; + } + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + *local_buffer++ = *a6++; + *local_buffer++ = *a7++; + } } +} - if (first_time) { - PackMatrixB_(k, n, _nc, B, ldb, packedB); +// 将B矩阵分块复制到连续内存(RowMajor) +void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const int j_length = n - n_tail; + for (int j = 0; j < j_length; j += NR) { + float *local_buffer = buffer + j * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j); +#if __ARM_NEON +#if __aarch64__ + asm volatile( + "prfm pldl1keep, [%[b0]] \n\t" + "ld1 {v0.4s, v1.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s}, [%[local_buffer]], #32 \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1"); +#else + asm volatile( + "pld [%[b0]] \n\t" + "vld1.32 {q0, q1}, [%[b0]] \n\t" + "vst1.32 {q0, q1}, [%[local_buffer]]! \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "q0", "q1"); +#endif // __aarch64__ +#else + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; +#endif // __ARM_NEON + } + } + if (n_tail != 0) { + float *local_buffer = buffer + j_length * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j_length); + for (int j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; + } + for (int j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; + } + } } - PackMatrixA_(m, k, _mc, A, lda, packedA); +} - int i, j, mc, nc; +void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const int j_length = n - n_tail; +#pragma omp parallel for + for (int j = 0; j < j_length; j += NR) { + float *local_buffer = buffer + j * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j); +#if __ARM_NEON +#if __aarch64__ + asm volatile( + "prfm pldl1keep, [%[b0]] \n\t" + "ld1 {v0.4s, v1.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s}, [%[local_buffer]], #32 \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1"); +#else + asm volatile( + "pld [%[b0]] \n\t" + "vld1.32 {q0, q1}, [%[b0]] \n\t" + "vst1.32 {q0, q1}, [%[local_buffer]]! 
\n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "q0", "q1"); +#endif // __aarch64__ +#else + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; +#endif // __ARM_NEON + } + } + if (n_tail != 0) { + float *local_buffer = buffer + j_length * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j_length); + for (int j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; + } + for (int j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; + } + } + } +} - // B 取 4 列, 打包预热 - for (j = 0; j < Buff_B_N; j += NR) { - nc = (n - j) < NR ? _nc : NR; - // A 取 4 行,打包预热 - for (i = 0; i < Buff_A_M; i += MR) { - mc = (m - i) < MR ? _mc : MR; - AddDot4x4(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta, - &C(i, j), ldc, mc, nc); +#if __aarch64__ +void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const int j_length = n - n_tail; + for (int j = 0; j < j_length; j += NR) { + float *local_buffer = buffer + j * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j); + asm volatile( + "prfm pldl2keep, [%[b0], #64] \n\t" + "ld1 {v0.4s, v1.4s, v2.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s, v2.4s}, [%[local_buffer]], #48 \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1", "v2"); + } + } + if (n_tail != 0) { + float *local_buffer = buffer + j_length * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j_length); + for (int j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; + } + for (int j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; + } } } } -// 分块矩阵乘法 -void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - int first_time, bool relu = false) { - int Buff_A_M = m; - int Buff_B_N = n; +void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const int j_length = n - n_tail; +#pragma omp parallel for + for (int j = 0; j < j_length; j += NR) { + float *local_buffer = buffer + j * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j); + asm volatile( + "prfm pldl2keep, [%[b0], #64] \n\t" + "ld1 {v0.4s, v1.4s, v2.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s, v2.4s}, [%[local_buffer]], #48 \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1", "v2"); + } + } + if (n_tail != 0) { + float *local_buffer = buffer + j_length * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j_length); + for (int j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; + } + for (int j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; + } + } + } +} - int _mc = m % MR; - int _nc = n % NR; +void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const int j_length = n - n_tail; + for (int j = 0; j < n - n_tail; j += NR) { + float *local_buffer = buffer + j * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j); + asm volatile( + "prfm pldl2keep, [%[b0], #64] \n\t" + "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[local_buffer]], #64 \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1", "v2", "v3"); + } + } + if (n_tail != 0) { + float *local_buffer = buffer + j_length * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j_length); + 
for (int j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; + } + for (int j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; + } + } + } +} - if (_mc != 0) { - Buff_A_M = m + (MR - _mc); +void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const int j_length = n - n_tail; +#pragma omp parallel for + for (int j = 0; j < n - n_tail; j += NR) { + float *local_buffer = buffer + j * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j); + asm volatile( + "prfm pldl2keep, [%[b0], #64] \n\t" + "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[local_buffer]], #64 \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1", "v2", "v3"); + } + } + if (n_tail != 0) { + float *local_buffer = buffer + j_length * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j_length); + for (int j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; + } + for (int j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; + } + } } +} +#endif // __aarch64__ - if (_nc != 0) { - Buff_B_N = n + (NR - _nc); +// 分块矩阵乘法 +void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, + float beta, float *c, float *C, int ldc, bool relu) { +#pragma omp parallel for + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { +#if __aarch64__ + // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#else + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif + } } - float packedA[MC * KC]; - static float packedB[KC * NC]; + if (alpha != 1) { + WriteWithAlphaBeta(mc, nc, c, C, ldc); + return; + } + if (beta == 0) { + WriteBasic(mc, nc, c, C, ldc); + return; + } + if (beta == 1 && !relu) { + WriteWithAdd(mc, nc, c, C, ldc); + return; + } + if (beta == 1 && relu) { + WriteWithAddRelu(mc, nc, c, C, ldc); + return; + } +} - if (first_time) { - PackMatrixB_(k, n, _nc, B, ldb, packedB); +// 分块矩阵乘法 +void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, + int ldc, bool relu, float *bias) { +#pragma omp parallel for + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { +#if __aarch64__ + // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#else + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif + } } - PackMatrixA_(m, k, _mc, A, lda, packedA); - int i, j, mc, nc; + if (alpha != 1) { + WriteWithAlphaBeta(mc, nc, c, C, ldc); + return; + } + if (beta == 0) { + WriteBasic(mc, nc, c, C, ldc); + return; + } + if (beta == 1 && !relu) { + WriteWithAddV1(mc, nc, c, C, ldc, bias); + return; + } + if (beta == 1 && relu) { + WriteWithAddReluV1(mc, nc, c, C, ldc, bias); + return; + } +} - // B 取 4 列, 打包预热 - for (j = 0; j < Buff_B_N; j += NR) { - nc = (n - j) < NR ? _nc : NR; - // A 取 4 行,打包预热 - for (i = 0; i < Buff_A_M; i += MR) { - mc = (m - i) < MR ? 
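/* Editor's note: InnerKernel above always accumulates into the packed scratch
   buffer c and defers alpha/beta/relu handling to a post-pass, so the hot
   AddDot loop stays branch-free. The dispatch contract, restated as a sketch
   (this mirrors the branches in the patch; a beta other than 0 or 1 with
   alpha == 1 currently writes nothing):

   void write_back(int mc, int nc, float alpha, float beta, bool relu,
                   float *c, float *C, int ldc) {
     if (alpha != 1) { WriteWithAlphaBeta(mc, nc, c, C, ldc); return; }
     if (beta == 0) { WriteBasic(mc, nc, c, C, ldc); return; }
     if (beta == 1 && !relu) { WriteWithAdd(mc, nc, c, C, ldc); return; }
     if (beta == 1 && relu) { WriteWithAddRelu(mc, nc, c, C, ldc); return; }
   }

   Worth noting: WriteWithAlphaBeta is an empty stub in this patch, so an
   alpha != 1 call produces no output at all.
*/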
_mc : MR; - AddDot4x4_relu(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta, - &C(i, j), ldc, mc, nc, relu); +// 分块矩阵乘法 +void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, int ldc, + bool relu, float *new_scale, float *new_bias) { +#pragma omp parallel for + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { +#if __aarch64__ + // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#else + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif } } + + if (relu) { + WriteWithBnRelu(mc, nc, c, C, ldc, new_scale, new_bias); + } else { + WriteWithBn(mc, nc, c, C, ldc, new_scale, new_bias); + } } -// 计算一个更小的 4 * 4 的 C 矩阵分块 -#if defined(IOS) -void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc) { +#if __ARM_NEON +#if __aarch64__ + +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { // init C float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0); @@ -246,99 +737,360 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, a += MR; b += NR; } - float32x4x4_t cv = {cv0, cv1, cv2, cv3}; - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (j == 0) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0); - } else if (j == 1) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1); - } else if (j == 2) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2); - } else if (j == 3) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); - } - } - } + + vst1q_f32(c, cv0); + vst1q_f32(c + ldc, cv1); + vst1q_f32(c + 2 * ldc, cv2); + vst1q_f32(c + 3 * ldc, cv3); + // float32x4x4_t cv = {cv0, cv1, cv2, cv3}; } -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu = false) { +void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { // init C float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0); float32x4_t cv2 = vdupq_n_f32(0.0); float32x4_t cv3 = vdupq_n_f32(0.0); + float32x4_t cv4 = vdupq_n_f32(0.0); + float32x4_t cv5 = vdupq_n_f32(0.0); + float32x4_t cv6 = vdupq_n_f32(0.0); + float32x4_t cv7 = vdupq_n_f32(0.0); float32x4_t av; - float32x4_t bv; + float32x4_t bv0; + float32x4_t bv1; float32x2_t av01; float32x2_t av23; for (int p = 0; p < k; p += 1) { av = vld1q_f32(a); - bv = vld1q_f32(b); + bv0 = vld1q_f32(b); + bv1 = vld1q_f32(b + 4); av01 = vget_low_f32(av); - cv0 = vmlaq_lane_f32(cv0, bv, av01, 0); - cv1 = vmlaq_lane_f32(cv1, bv, av01, 1); + cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0); + cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0); + cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1); + cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1); av23 = vget_high_f32(av); - cv2 = vmlaq_lane_f32(cv2, bv, av23, 0); - cv3 = vmlaq_lane_f32(cv3, bv, av23, 1); + cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0); + cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0); + cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1); + cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1); a += MR; b += NR; } - float32x4x4_t cv = {cv0, cv1, cv2, cv3}; - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - 
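/* Editor's note: the aarch64 AddDot4x4 above is a chain of rank-1 updates:
   each k step broadcasts one lane of the packed A column across a row of the
   4x4 accumulator with vmlaq_lane_f32. One such step written out
   (illustrative):

   #include <arm_neon.h>
   void rank1_step(const float *a, const float *b, float32x4_t acc[4]) {
     float32x4_t av = vld1q_f32(a);   // a[0..3]: one packed column of A
     float32x4_t bv = vld1q_f32(b);   // b[0..3]: one packed row of B
     float32x2_t lo = vget_low_f32(av);
     float32x2_t hi = vget_high_f32(av);
     acc[0] = vmlaq_lane_f32(acc[0], bv, lo, 0);  // row 0 += a[0] * b
     acc[1] = vmlaq_lane_f32(acc[1], bv, lo, 1);  // row 1 += a[1] * b
     acc[2] = vmlaq_lane_f32(acc[2], bv, hi, 0);  // row 2 += a[2] * b
     acc[3] = vmlaq_lane_f32(acc[3], bv, hi, 1);  // row 3 += a[3] * b
   }
*/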
C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; + + vst1q_f32(c, cv0); + vst1q_f32(c + 4, cv1); + vst1q_f32(c + ldc, cv2); + vst1q_f32(c + ldc + 4, cv3); + vst1q_f32(c + 2 * ldc, cv4); + vst1q_f32(c + 2 * ldc + 4, cv5); + vst1q_f32(c + 3 * ldc, cv6); + vst1q_f32(c + 3 * ldc + 4, cv7); +} + +// 分块矩阵乘法结果回写 +// C = A * B +void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; } - if (j == 0) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0); - } else if (j == 1) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1); - } else if (j == 2) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2); - } else if (j == 3) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; } - if (C(i, j) < 0) { - C(i, j) = 0; + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); } } } } -#elif defined(ARMV7) -void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc) { - int kc1 = k / 4, kc2 = k % 4; - int bytes_ldc = 4 * ldc; - int flag_alpha = (alpha == 1.0) ? 1 : 2; - int flag_beta; - if (beta == 0.0) { - flag_beta = 0; - } else if (beta == 1.0) { - flag_beta = 1; - } else { - flag_beta = 2; +// C = alpha * A * B + beta * C +void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} + +// C = A * B + C +void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } + } +} +// C = A * B + bias +void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t biasv; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + biasv = vld1q_dup_f32(bias + i); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + +// C = A * B + C, relu(C) +void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + for (int j = 0; j < nc1; 
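/* Editor's note: WriteWithAddV1 above splats one bias value per row of C with
   vld1q_dup_f32; in the GEMM-as-convolution mapping each row of C is one
   output channel, so this is the per-channel bias add of a fused conv + bias.
   In isolation (illustrative):

   #include <arm_neon.h>
   void add_row_bias(float *row, const float *bias, int i, int n4) {
     float32x4_t bv = vld1q_dup_f32(bias + i);  // broadcast bias[i] to 4 lanes
     for (int j = 0; j < n4; ++j) {
       float32x4_t cv = vld1q_f32(row + 4 * j);
       vst1q_f32(row + 4 * j, vaddq_f32(cv, bv));
     }
   }
*/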
++j) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } + } +} + +// C = A * B + bias, relu(C) +void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, + float *bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t biasv; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + biasv = vld1q_dup_f32(bias + i); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + +// C = A * B, batchnorm(C) +void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, + float *new_bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + float32x4_t bias; + float32x2_t scale; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + bias = vld1q_dup_f32(new_bias); + scale = vld1_dup_f32(new_scale); + new_bias++; + new_scale++; + float scale0 = vget_lane_f32(scale, 0); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + +// C = A * B, batchnorm(C), relu(C) +void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t bias; + float32x2_t scale; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + bias = vld1q_dup_f32(new_bias); + scale = vld1_dup_f32(new_scale); + new_bias++; + new_scale++; + float scale0 = vget_lane_f32(scale, 0); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } } +} + +#else + +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 4; + int kc2 = k % 4; + int step = 4 * ldc; asm volatile( - "pld [%[a]] \n\t" - "pld 
[%[b]] \n\t" + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" "vmov.f32 q10, #0.0 \n\t" "vmov.f32 q11, #0.0 \n\t" "vmov.f32 q12, #0.0 \n\t" @@ -347,20 +1099,10 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, "subs %[kc1], %[kc1], #1 \n\t" "blt end_kc1_%= \n\t" "loop_kc1_%=: \n\t" - "pld [%[a], #64] \n\t" - "pld [%[b], #64] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" + "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" "vmla.f32 q10, q2, d0[0] \n\t" "vmla.f32 q11, q2, d0[1] \n\t" "vmla.f32 q12, q2, d1[0] \n\t" @@ -369,448 +1111,272 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, "vmla.f32 q11, q3, d2[1] \n\t" "vmla.f32 q12, q3, d3[0] \n\t" "vmla.f32 q13, q3, d3[1] \n\t" + "vld1.32 {q4, q5}, [%[a_ptr]]! \n\t" + "vld1.32 {q6, q7}, [%[b_ptr]]! \n\t" + "vmla.f32 q10, q6, d8[0] \n\t" + "vmla.f32 q11, q6, d8[1] \n\t" + "vmla.f32 q12, q6, d9[0] \n\t" + "vmla.f32 q13, q6, d9[1] \n\t" + "vmla.f32 q10, q7, d10[0] \n\t" + "vmla.f32 q11, q7, d10[1] \n\t" + "vmla.f32 q12, q7, d11[0] \n\t" + "vmla.f32 q13, q7, d11[1] \n\t" "subs %[kc1], %[kc1], #1 \n\t" "bge loop_kc1_%= \n\t" "end_kc1_%=: \n\t" "subs %[kc2], %[kc2], #1 \n\t" "blt end_kc2_%= \n\t" - "vld1.32 {q0}, [%[a]]! \n\t" - "vld1.32 {q1}, [%[b]]! \n\t" + "loop_kc2_%=: \n\t" + "vld1.32 {q0}, [%[a_ptr]]! \n\t" + "vld1.32 {q1}, [%[b_ptr]]! 
\n\t" "vmla.f32 q10, q1, d0[0] \n\t" "vmla.f32 q11, q1, d0[1] \n\t" "vmla.f32 q12, q1, d1[0] \n\t" "vmla.f32 q13, q1, d1[1] \n\t" + "subs %[kc2], %[kc2], #1 \n\t" + "bge loop_kc2_%= \n\t" "end_kc2_%=: \n\t" - "cmp %[mc], #4 \n\t" - "bne temp_%= \n\t" - "cmp %[nc], #4 \n\t" - "bne temp_%= \n\t" - - "vmov.f32 d8[0], %[alpha] \n\t" - "vmov.f32 d8[1], %[beta] \n\t" - - "cmp %[flag_alpha], #1 \n\t" - "bne alpha_%= \n\t" - - "alpha_%=: \n\t" - "vmul.f32 q10, q10, d8[0] \n\t" - "vmul.f32 q11, q11, d8[0] \n\t" - "vmul.f32 q12, q12, d8[0] \n\t" - "vmul.f32 q13, q13, d8[0] \n\t" - - "beta_%=: \n\t" - "cmp %[flag_beta], #0 \n\t" - "beq memory_%= \n\t" - - "mov r4, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" - "vld1.32 {q0}, [r4], r6 \n\t" - "vld1.32 {q1}, [r4], r6 \n\t" - "vld1.32 {q2}, [r4], r6 \n\t" - "vld1.32 {q3}, [r4] \n\t" - "cmp %[flag_beta], #1 \n\t" - "beq beta_eq1_%= \n\t" - "bne beta_ne1_%= \n\t" - - "beta_eq1_%=: \n\t" - "vadd.f32 q10, q10, q0 \n\t" - "vadd.f32 q11, q11, q1 \n\t" - "vadd.f32 q12, q12, q2 \n\t" - "vadd.f32 q13, q13, q3 \n\t" - "b memory_%= \n\t" - - "beta_ne1_%=: \n\t" - "vmla.f32 q10, q0, d8[1] \n\t" - "vmla.f32 q11, q1, d8[1] \n\t" - "vmla.f32 q12, q2, d8[1] \n\t" - "vmla.f32 q13, q3, d8[1] \n\t" - - "memory_%=: \n\t" - "mov r5, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" "vst1.32 {q10}, [r5], r6 \n\t" "vst1.32 {q11}, [r5], r6 \n\t" "vst1.32 {q12}, [r5], r6 \n\t" "vst1.32 {q13}, [r5] \n\t" - "b end_%= \n\t" - - "temp_%=: \n\t" - "vst1.32 {q10, q11}, [%[ab]]!\n\t" - "vst1.32 {q12, q13}, [%[ab]] \n\t" - "end_%=: \n\t" : - : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1), - [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha), - [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc), - [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta) - : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13"); - - if (mc != MR || nc != NR) { - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - if (alpha != 1.0) { - C(i, j) = alpha * ab[i * MR + j]; - } else { - C(i, j) = ab[i * MR + j]; - } - } else { - if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * ab[i * MR + j]; - } else { - C(i, j) += ab[i * MR + j]; - } - } - } - } - } + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q10", "q11", "q12", "q13"); } -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu = false) { - int kc1 = k / 4, kc2 = k % 4; - int bytes_ldc = 4 * ldc; - int flag_alpha = (alpha == 1.0) ? 1 : 2; - int flag_beta; - if (beta == 0.0) { - flag_beta = 0; - } else if (beta == 1.0) { - flag_beta = 1; - } else { - flag_beta = 2; - } - asm volatile( - "pld [%[a]] \n\t" - "pld [%[b]] \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, #0.0 \n\t" +/* +void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu) { + float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); - "subs %[kc1], %[kc1], #1 \n\t" - "blt end_kc1_%= \n\t" - "loop_kc1_%=: \n\t" - "pld [%[a], #64] \n\t" - "pld [%[b], #64] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! 
\n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "subs %[kc1], %[kc1], #1 \n\t" - "bge loop_kc1_%= \n\t" - "end_kc1_%=: \n\t" + const float *a0, *b0, *b1, *b2, *b3; + float *c0, *C0; - "subs %[kc2], %[kc2], #1 \n\t" - "blt end_kc2_%= \n\t" - "vld1.32 {q0}, [%[a]]! \n\t" - "vld1.32 {q1}, [%[b]]! \n\t" - "vmla.f32 q10, q1, d0[0] \n\t" - "vmla.f32 q11, q1, d0[1] \n\t" - "vmla.f32 q12, q1, d1[0] \n\t" - "vmla.f32 q13, q1, d1[1] \n\t" - "end_kc2_%=: \n\t" + int volatile kc1 = k / 4; + int volatile kc2 = k % 4; + int volatile nc1 = n / 16; + int _nc1 = n % 16; + int volatile nc2 = _nc1 / 4; + int volatile nc3 = _nc1 % 4; + for (int i = 0; i < kc1; i++) { + a0 = A + i * 4; + b0 = B + i * 4 * ldb; + b1 = b0 + ldb; + b2 = b1 + ldb; + b3 = b2 + ldb; + c0 = bufferC; + asm volatile( + "pld [%[a0], #16] \n\t" + "vld1.32 {q0}, [%[a0]] \n\t" - "cmp %[mc], #4 \n\t" - "bne temp_%= \n\t" - "cmp %[nc], #4 \n\t" - "bne temp_%= \n\t" - - "vmov.f32 d8[0], %[alpha] \n\t" - "vmov.f32 d8[1], %[beta] \n\t" - - "cmp %[flag_alpha], #1 \n\t" - "bne alpha_%= \n\t" - - "alpha_%=: \n\t" - "vmul.f32 q10, q10, d8[0] \n\t" - "vmul.f32 q11, q11, d8[0] \n\t" - "vmul.f32 q12, q12, d8[0] \n\t" - "vmul.f32 q13, q13, d8[0] \n\t" - - "beta_%=: \n\t" - "cmp %[flag_beta], #0 \n\t" - "beq memory_%= \n\t" - - "mov r4, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" - "vld1.32 {q0}, [r4], r6 \n\t" - "vld1.32 {q1}, [r4], r6 \n\t" - "vld1.32 {q2}, [r4], r6 \n\t" - "vld1.32 {q3}, [r4] \n\t" - "cmp %[flag_beta], #1 \n\t" - "beq beta_eq1_%= \n\t" - "bne beta_ne1_%= \n\t" - - "beta_eq1_%=: \n\t" - "vadd.f32 q10, q10, q0 \n\t" - "vadd.f32 q11, q11, q1 \n\t" - "vadd.f32 q12, q12, q2 \n\t" - "vadd.f32 q13, q13, q3 \n\t" - "b memory_%= \n\t" - - "beta_ne1_%=: \n\t" - "vmla.f32 q10, q0, d8[1] \n\t" - "vmla.f32 q11, q1, d8[1] \n\t" - "vmla.f32 q12, q2, d8[1] \n\t" - "vmla.f32 q13, q3, d8[1] \n\t" - - "memory_%=: \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vmax.f32 q12, q12, q14 \n\t" - "vmax.f32 q13, q13, q14 \n\t" - "mov r5, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" - "vst1.32 {q10}, [r5], r6 \n\t" - "vst1.32 {q11}, [r5], r6 \n\t" - "vst1.32 {q12}, [r5], r6 \n\t" - "vst1.32 {q13}, [r5] \n\t" - "b end_%= \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" - "temp_%=: \n\t" - "vst1.32 {q10, q11}, [%[ab]]!\n\t" - "vst1.32 {q12, q13}, [%[ab]] \n\t" - "end_%=: \n\t" - : - : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1), - [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha), - [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc), - [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta) - : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13"); - - if (mc != MR || nc != NR) { - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - if (alpha != 1.0) { - C(i, j) = alpha * ab[i * MR + j]; - } else { - C(i, j) = ab[i * MR + j]; - } - } else { - if (beta != 1.0) { - C(i, j) *= beta; - } 
- if (alpha != 1.0) { - C(i, j) += alpha * ab[i * MR + j]; - } else { - C(i, j) += ab[i * MR + j]; - } - } - if (relu) { - if (C(i, j) < 0) { - C(i, j) = 0; - } - } - } - } - } -} + "cmp %[i], #0 \n\t" + "beq i_eq0_%= \n\t" + "bne i_ne0_%= \n\t" -#else -void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc) { - float c[16] = {0}; - float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3; + "i_eq0_%=: \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "b gemm_nc1_%= \n\t" - for (int p = 0; p < k; p += 1) { - reg_b0 = *b++; - reg_b1 = *b++; - reg_b2 = *b++; - reg_b3 = *b++; + "i_ne0_%=: \n\t" + "pld [%[c0], #64] \n\t" + "vld1.32 {q10, q11}, [%[c0]]! \n\t" + "vld1.32 {q12, q13}, [%[c0]] \n\t" + "sub %[c0], %[c0], #32 \n\t" - reg_a0 = *a++; - reg_a1 = *a++; - reg_a2 = *a++; - reg_a3 = *a++; + "gemm_nc1_%=: \n\t" + "pld [%[b0], #64] \n\t" + "vld1.32 {q2, q3}, [%[b0]]! \n\t" + "vld1.32 {q4, q5}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q3, d0[0] \n\t" + "vmla.f32 q12, q4, d0[0] \n\t" + "vmla.f32 q13, q5, d0[0] \n\t" - // first row - c[0] += reg_a0 * reg_b0; - c[1] += reg_a0 * reg_b1; - c[2] += reg_a0 * reg_b2; - c[3] += reg_a0 * reg_b3; + "pld [%[b1], #64] \n\t" + "vld1.32 {q2, q3}, [%[b1]]! \n\t" + "vld1.32 {q4, q5}, [%[b1]]! \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q4, d0[1] \n\t" + "vmla.f32 q13, q5, d0[1] \n\t" - // second row - c[4] += reg_a1 * reg_b0; - c[5] += reg_a1 * reg_b1; - c[6] += reg_a1 * reg_b2; - c[7] += reg_a1 * reg_b3; + "pld [%[b2], #64] \n\t" + "vld1.32 {q2, q3}, [%[b2]]! \n\t" + "vld1.32 {q4, q5}, [%[b2]]! \n\t" + "vmla.f32 q10, q2, d1[0] \n\t" + "vmla.f32 q11, q3, d1[0] \n\t" + "vmla.f32 q12, q4, d1[0] \n\t" + "vmla.f32 q13, q5, d1[0] \n\t" - // third row - c[8] += reg_a2 * reg_b0; - c[9] += reg_a2 * reg_b1; - c[10] += reg_a2 * reg_b2; - c[11] += reg_a2 * reg_b3; + "pld [%[b3], #64] \n\t" + "vld1.32 {q2, q3}, [%[b3]]! \n\t" + "vld1.32 {q4, q5}, [%[b3]]! \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q4, d1[1] \n\t" + "vmla.f32 q13, q5, d1[1] \n\t" - // fourth row - c[12] += reg_a3 * reg_b0; - c[13] += reg_a3 * reg_b1; - c[14] += reg_a3 * reg_b2; - c[15] += reg_a3 * reg_b3; - } - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * c[i * MR + j]; + "vst1.32 {q10, q11}, [%[c0]]! \n\t" + "vst1.32 {q12, q13}, [%[c0]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "cmp %[i], #0 \n\t" + "beq ii_eq0_%= \n\t" + "bne ii_ne0_%= \n\t" + + "ii_eq0_%=: \n\t" + "vmov.f32 q10, #0.0 \n\t" + "b gemm_nc2_%= \n\t" + + "ii_ne0_%=: \n\t" + "pld [%[c0], #16] \n\t" + "vld1.32 {q10}, [%[c0]] \n\t" + + "gemm_nc2_%=: \n\t" + "pld [%[b0], #16] \n\t" + "vld1.32 {q2}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + + "pld [%[b1], #16] \n\t" + "vld1.32 {q3}, [%[b1]]! \n\t" + "vmla.f32 q10, q3, d0[1] \n\t" + + "pld [%[b2], #16] \n\t" + "vld1.32 {q4}, [%[b2]]! \n\t" + "vmla.f32 q10, q4, d1[0] \n\t" + + "pld [%[b3], #16] \n\t" + "vld1.32 {q5}, [%[b3]]! \n\t" + "vmla.f32 q10, q5, d1[1] \n\t" + + "vst1.32 {q10}, [%[c0]]! 
\n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), + [c0] "+r"(c0) + : [a0] "r"(a0), [i] "r"(i), [nc1] "r"(nc1), [nc2] "r"(nc2) + : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); + + for (int j = 0; j < nc3; j++) { + if (i == 0) { + *c0 = (*a0) * (*b0++); } else { - C(i, j) += c[i * MR + j]; + *c0 += (*a0) * (*b0++); } + *c0 += (*(a0 + 1)) * (*b1++); + *c0 += (*(a0 + 2)) * (*b2++); + *c0 += (*(a0 + 3)) * (*b3++); + c0++; } } -} -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu) { - float c[16] = {0}; - float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3; + for (int i = 0; i < kc2; ++i) { + a0 = A + 4 * kc1 + i; + b0 = B + (4 * kc1 + i) * ldb; + c0 = bufferC; + asm volatile( + "pld [%[a0], #16] \n\t" + "vld1.32 {d0}, [%[a0]] \n\t" - for (int p = 0; p < k; p += 1) { - reg_b0 = *b++; - reg_b1 = *b++; - reg_b2 = *b++; - reg_b3 = *b++; + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" - reg_a0 = *a++; - reg_a1 = *a++; - reg_a2 = *a++; - reg_a3 = *a++; + "pld [%[c0], #64] \n\t" + "vld1.32 {q10, q11}, [%[c0]]! \n\t" + "vld1.32 {q12, q13}, [%[c0]] \n\t" + "sub %[c0], %[c0], #32 \n\t" - // first row - c[0] += reg_a0 * reg_b0; - c[1] += reg_a0 * reg_b1; - c[2] += reg_a0 * reg_b2; - c[3] += reg_a0 * reg_b3; + "gemm_nc1_%=: \n\t" + "pld [%[b0], #64] \n\t" + "vld1.32 {q2, q3}, [%[b0]]! \n\t" + "vld1.32 {q4, q5}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q3, d0[0] \n\t" + "vmla.f32 q12, q4, d0[0] \n\t" + "vmla.f32 q13, q5, d0[0] \n\t" - // second row - c[4] += reg_a1 * reg_b0; - c[5] += reg_a1 * reg_b1; - c[6] += reg_a1 * reg_b2; - c[7] += reg_a1 * reg_b3; + "vst1.32 {q10, q11}, [%[c0]]! \n\t" + "vst1.32 {q12, q13}, [%[c0]]! \n\t" - // third row - c[8] += reg_a2 * reg_b0; - c[9] += reg_a2 * reg_b1; - c[10] += reg_a2 * reg_b2; - c[11] += reg_a2 * reg_b3; + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" - // fourth row - c[12] += reg_a3 * reg_b0; - c[13] += reg_a3 * reg_b1; - c[14] += reg_a3 * reg_b2; - c[15] += reg_a3 * reg_b3; - } - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * c[i * MR + j]; - } else { - C(i, j) += c[i * MR + j]; - } - if (relu) { - if (C(i, j) < 0) { - C(i, j) = 0; - } - } + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "pld [%[c0], #16] \n\t" + "vld1.32 {q10}, [%[c0]] \n\t" + + "gemm_nc2_%=: \n\t" + "vld1.32 {q2}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + + "vst1.32 {q10}, [%[c0]]! 
\n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), + [c0] "+r"(c0) + : [a0] "r"(a0), [nc1] "r"(nc1), [nc2] "r"(nc2) + : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); + + for (int j = 0; j < nc3; j++) { + *c0 += (*a0) * (*b0++); + c0++; } } -} - -#endif -// 32位 float 矩阵乘法 -void sgemm(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc) { - int i, j, p, mc, nc, kc; - float beta_; - if (m == 1) { - VectorKernel(1, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + if (alpha != 1) { + VecWriteWithAlphaBeta(n, bufferC, C, ldc); return; } - for (j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); - for (p = 0; p < k; p += KC) { - kc = s_min(k - p, KC); - for (i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - if (p != 0) { - beta_ = 1.0; - } else { - beta_ = beta; - } - InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_, - &C(i, j), ldc, i == 0); - } - } + if (beta == 0) { + VecWriteBasic(n, bufferC, C, ldc); + return; } -} - -void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc) { - int i, j, p, mc, nc, kc; - float beta_; - for (j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); - for (p = 0; p < k; p += KC) { - kc = s_min(k - p, KC); - for (i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - if (p != 0) { - beta_ = 1.0; - } else { - beta_ = beta; - } - - if (p + KC >= k) { - InnerKernel_relu(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, - beta_, &C(i, j), ldc, i == 0, true); - } else { - InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_, - &C(i, j), ldc, i == 0); - } - } - } + if (beta == 1 && !relu) { + VecWriteWithAdd(n, bufferC, C, ldc); + return; + } + if (beta == 1 && relu) { + VecWriteWithAddRelu(n, bufferC, C, ldc); + return; } } -void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc) { +void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, + int lda, const float *B, int ldb, float beta, float *C, + int ldc, bool relu, float *new_scale, float *new_bias) { float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); const float *a0, *b0, *b1, *b2, *b3; @@ -1012,17 +1578,1692 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, } } - c0 = bufferC; - C0 = C; - for (int i = 0; i < n; i++) { - if (beta == 1.0) { - *C0++ += *c0++; - } else { - *C0++ = *c0++; + if (relu) { + VecWriteWithBnRelu(n, bufferC, C, ldc, new_scale, new_bias); + } else { + VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias); + } +} +*/ + +void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 4; + int kc2 = k % 4; + int step = 4 * ldc; + asm volatile( + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" + + "vmov.f32 q8, #0.0 \n\t" + "vmov.f32 q9, #0.0 \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "vmov.f32 q14, #0.0 \n\t" + "vmov.f32 q15, #0.0 \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" + + "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[b_ptr]]! 
\n\t" + + "vmla.f32 q8, q2, d0[0] \n\t" + "vmla.f32 q9, q3, d0[0] \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q3, d1[0] \n\t" + "vmla.f32 q14, q2, d1[1] \n\t" + "vmla.f32 q15, q3, d1[1] \n\t" + + "vmla.f32 q8, q4, d2[0] \n\t" + "vmla.f32 q9, q5, d2[0] \n\t" + "vmla.f32 q10, q4, d2[1] \n\t" + "vmla.f32 q11, q5, d2[1] \n\t" + "vmla.f32 q12, q4, d3[0] \n\t" + "vmla.f32 q13, q5, d3[0] \n\t" + "vmla.f32 q14, q4, d3[1] \n\t" + "vmla.f32 q15, q5, d3[1] \n\t" + + "pld [%[b_ptr], #64] \n\t" + + "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[b_ptr]]! \n\t" + + "vmla.f32 q8, q2, d0[0] \n\t" + "vmla.f32 q9, q3, d0[0] \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q3, d1[0] \n\t" + "vmla.f32 q14, q2, d1[1] \n\t" + "vmla.f32 q15, q3, d1[1] \n\t" + + "vmla.f32 q8, q4, d2[0] \n\t" + "vmla.f32 q9, q5, d2[0] \n\t" + "vmla.f32 q10, q4, d2[1] \n\t" + "vmla.f32 q11, q5, d2[1] \n\t" + "vmla.f32 q12, q4, d3[0] \n\t" + "vmla.f32 q13, q5, d3[0] \n\t" + "vmla.f32 q14, q4, d3[1] \n\t" + "vmla.f32 q15, q5, d3[1] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "blt end_kc2_%= \n\t" + "loop_kc2_%=: \n\t" + "vld1.32 {q0}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + "vmla.f32 q8, q2, d0[0] \n\t" + "vmla.f32 q9, q3, d0[0] \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q3, d1[0] \n\t" + "vmla.f32 q14, q2, d1[1] \n\t" + "vmla.f32 q15, q3, d1[1] \n\t" + "subs %[kc2], %[kc2], #1 \n\t" + "bge loop_kc2_%= \n\t" + "end_kc2_%=: \n\t" + + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" + "vst1.32 {q8, q9}, [r5], r6 \n\t" + "vst1.32 {q10, q11}, [r5], r6 \n\t" + "vst1.32 {q12, q13}, [r5], r6 \n\t" + "vst1.32 {q14, q15}, [r5] \n\t" + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// C = A * B +void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int step = 4 * ldc; + int step1 = 4 * (NC - 16 * nc1); + int volatile m = mc; + + float *volatile c_ptr, *volatile C_ptr; + float *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" + "vst1.32 {q0, q1}, [r6]! \n\t" + + "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" + "vst1.32 {q2, q3}, [r6]! 
\n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3"); + } + + if (_nc1 != 0) { + for (int i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int j = 0; j < _nc1; j++) { + *C0++ = *c0++; + } + } + } +} + +// C = alpha * A * B + beta * C +void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} + +// C = A * B + C +void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int step = 4 * ldc; + int step1 = 4 * (NC - 16 * nc1); + int volatile m = mc; + + float *volatile c_ptr, *volatile C_ptr; + float *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [r6] \n\t" + "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [r6]! \n\t" + + "vld1.32 {q4, q5}, [r6] \n\t" + "vld1.32 {q6, q7}, [%[c_ptr]]! \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [r6]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q10", "q11", "q12", "q13"); + } + + if (_nc1 != 0) { + for (int i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int j = 0; j < _nc1; j++) { + *C0++ += *c0++; + } } } } +// C = A * B + bias +void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t biasv; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + biasv = vld1q_dup_f32(bias + i); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + +// C = A * B + C, relu(C) +void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int step = 4 * ldc; + int step1 = 4 * (NC - 16 * nc1); + int volatile m = mc; + + float *volatile c_ptr, *volatile C_ptr; + float *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, 
q1}, [r6] \n\t" + "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [r6]! \n\t" + + "vld1.32 {q4, q5}, [r6] \n\t" + "vld1.32 {q6, q7}, [%[c_ptr]]! \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [r6]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q10", "q11", "q12", "q13"); + } + + if (_nc1 != 0) { + for (int i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int j = 0; j < _nc1; j++) { + *C0 += *c0; + if (*C0 < 0) { + *C0 = 0; + } + C0++; + c0++; + } + } + } +} + +// C = A * B + bias, relu(C) +void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, + float *bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t biasv; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + biasv = vld1q_dup_f32(bias + i); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + +// C = A * B, batchnorm(C) +void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, + float *bias) { + if (nc < 4) { + for (int i = 0; i < mc; ++i) { + for (int j = 0; j < nc; ++j) { + *C = (*c) * (*scale) + (*bias); + C++; + c++; + } + C += (ldc - nc); + c += (NC - nc); + scale++; + bias++; + } + return; + } + + int volatile nc1 = nc / 16; + int _nc1 = nc % 16; + int volatile nc2 = _nc1 / 4; + int volatile nc3 = 16 - 4 * (_nc1 % 4); + int volatile step = 4 * (ldc - nc); + int volatile step1 = 4 * (NC - nc); + + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r5, %[nc1] \n\t" + "mov r6, %[nc2] \n\t" + "vld1.32 {d0}, [%[scale]] \n\t" + "vld1.32 {d1}, [%[bias]] \n\t" + "vdup.32 q1, d0[0] \n\t" + "vdup.32 q2, d1[0] \n\t" + + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q3, q4}, [%[c]]! \n\t" + "vmul.f32 q10, q3, q1 \n\t" + "vmul.f32 q11, q4, q1 \n\t" + "vadd.f32 q10, q10, q2 \n\t" + "vadd.f32 q11, q11, q2 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q5, q6}, [%[c]]! \n\t" + "vmul.f32 q12, q5, q1 \n\t" + "vmul.f32 q13, q6, q1 \n\t" + "vadd.f32 q12, q12, q2 \n\t" + "vadd.f32 q13, q13, q2 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs r6, r6, #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q7}, [%[c]]! \n\t" + "vmul.f32 q10, q7, q1 \n\t" + "vadd.f32 q10, q10, q2 \n\t" + "vst1.32 {q10}, [%[C]]! 
\n\t" + + "subs r6, r6, #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q8}, [%[c]]! \n\t" + "vmul.f32 q11, q8, q1 \n\t" + "vadd.f32 q11, q11, q2 \n\t" + "vst1.32 {q11}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + "add %[scale], %[scale], #4 \n\t" + "add %[bias], %[bias], #4 \n\t" + "add %[c], %[c], %[step1] \n\t" + "add %[C], %[C], %[step] \n\t" + + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), + [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), + [scale] "r"(scale), [bias] "r"(bias) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q10", "q11", "q12", "q13"); +} + +// C = A * B, batchnorm(C), relu(C) +void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, + float *bias) { + if (nc < 4) { + for (int i = 0; i < mc; ++i) { + for (int j = 0; j < nc; ++j) { + *C = (*c) * (*scale) + (*bias); + if (*C < 0) { + *C = 0; + } + C++; + c++; + } + C += (ldc - nc); + c += (NC - nc); + scale++; + bias++; + } + return; + } + + int nc1 = nc / 16; + int _nc1 = nc % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + int step = 4 * (ldc - nc); + int step1 = 4 * (NC - nc); + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r5, %[nc1] \n\t" + "mov r6, %[nc2] \n\t" + "vld1.32 {d0}, [%[scale]] \n\t" + "vld1.32 {d1}, [%[bias]] \n\t" + "vdup.32 q1, d0[0] \n\t" + "vdup.32 q2, d1[0] \n\t" + + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q3, q4}, [%[c]]! \n\t" + "vmul.f32 q10, q3, q1 \n\t" + "vmul.f32 q11, q4, q1 \n\t" + "vadd.f32 q10, q10, q2 \n\t" + "vadd.f32 q11, q11, q2 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q5, q6}, [%[c]]! \n\t" + "vmul.f32 q12, q5, q1 \n\t" + "vmul.f32 q13, q6, q1 \n\t" + "vadd.f32 q12, q12, q2 \n\t" + "vadd.f32 q13, q13, q2 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs r6, r6, #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q7}, [%[c]]! \n\t" + "vmul.f32 q10, q7, q1 \n\t" + "vadd.f32 q10, q10, q2 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs r6, r6, #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q8}, [%[c]]! \n\t" + "vmul.f32 q11, q8, q1 \n\t" + "vadd.f32 q11, q11, q2 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q11}, [%[C]]! 
\n\t" + "end_nc3_%=: \n\t" + + "add %[scale], %[scale], #4 \n\t" + "add %[bias], %[bias], #4 \n\t" + "add %[c], %[c], %[step1] \n\t" + "add %[C], %[C], %[step] \n\t" + + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), + [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), + [scale] "r"(scale), [bias] "r"(bias) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q10", "q11", "q12", "q13", "q14"); +} + + /* + // C = A * B + void VecWriteBasic(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vst1.32 {q0, q1}, [%[C]]! \n\t" + + "vld1.32 {q2, q3}, [%[c]]! \n\t" + "vst1.32 {q2, q3}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q4}, [%[c]]! \n\t" + "vst1.32 {q4}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + "sub %[c], %[c], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + "vld1.32 {q5}, [%[c]]! \n\t" + "vst1.32 {q5}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5"); + } + + // C = alpha * A * B + beta * C + void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} + + // C = A * B + C + void VecWriteWithAdd(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[C]] \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[C]] \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + : [C] "+r"(C), [c] "+r"(c) + : [nc1] "r"(nc1) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", + "q11", "q12", "q13"); + + if (_nc1 != 0) { + for (int j = 0; j < _nc1; j++) { + *C++ += *c++; + } + } + } + + // C = A * B + C, relu(C) + void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[C]] \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[C]] \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! 
\n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + : [C] "+r"(C), [c] "+r"(c) + : [nc1] "r"(nc1) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", + "q11", "q12", "q13"); + + if (_nc1 != 0) { + for (int j = 0; j < _nc1; j++) { + *C += *c; + if (*C < 0) { + *C = 0; + } + C++; + c++; + } + } + } + + // C = A * B, batchnorm(C) + void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! \n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] + "r"(nc3), [scale] "r"(scale), [bias] "r"(bias) : "memory", "q0", "q1", "q2", + "q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13"); + } + + // C = A * B, batchnorm(C), relu(C) + void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! \n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! 
\n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] + "r"(nc3), [scale] "r"(scale), [bias] "r"(bias) : "memory", "q0", "q1", "q2", + "q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13", "q14"); + } + */ + +#endif // __aarch64__ +#else + +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { + float *c0, *c1, *c2, *c3; + c0 = c; + c1 = c + ldc; + c2 = c + 2 * ldc; + c3 = c + 3 * ldc; + for (int p = 0; p < k; p += 1) { + // first row + c0[0] += a[0] * b[0]; + c0[1] += a[0] * b[1]; + c0[2] += a[0] * b[2]; + c0[3] += a[0] * b[3]; + + // second row + c1[0] += a[1] * b[0]; + c1[1] += a[1] * b[1]; + c1[2] += a[1] * b[2]; + c1[3] += a[1] * b[3]; + + // third row + c2[0] += a[2] * b[0]; + c2[1] += a[2] * b[1]; + c2[2] += a[2] * b[2]; + c2[3] += a[2] * b[3]; + + // fourth row + c3[0] += a[3] * b[0]; + c3[1] += a[3] * b[1]; + c3[2] += a[3] * b[2]; + c3[3] += a[3] * b[3]; + + a += 4; + b += 4; + } +} + +void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {} + +void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {} + +void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} + +void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {} + +void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {} + +void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {} + +void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, + float *bias) {} + +void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, + float *new_bias) {} + +void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias) {} + +#endif // __ARM_NEON + +// 32位 float 矩阵乘法 +void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, bool relu, + float *bias) { + // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) + // L2 cache is 0.5~4 Mib (Contex-A72 cluster) + int L1 = 32 * 1024; + int L2 = 0.5 * 1024 * 1024; + + KC = k; + MC = L1 / (KC * sizeof(float)); + NC = L2 / (KC * sizeof(float)); + + // make sure MC is multiple of MR, and NC is multiple of NR + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + MR - 1) / MR * MR; + // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; + + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + NR - 1) / NR * NR; + // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; + + packedA = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); + packedB = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); + packedC = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + + for (int l = 0; l < KC; ++l) { + zero[l] = 0; + } + + int mc, nc; + for (int j = 0; j < n; j += NC) { + nc = s_min(n - j, NC); +#if __aarch64__ 
+    // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#else
+    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#endif
+    for (int i = 0; i < m; i += MC) {
+      mc = s_min(m - i, MC);
+#if __aarch64__
+      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#else
+      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#endif
+      InnerKernelWithBias(mc, nc, alpha, packedA, packedB, beta, packedC,
+                          &C(i, j), ldc, relu, bias + i);
+    }
+  }
+
+  paddle_mobile::memory::Free(packedA);
+  paddle_mobile::memory::Free(packedB);
+  paddle_mobile::memory::Free(packedC);
+  paddle_mobile::memory::Free(zero);
+}
+
+void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+                 const float *B, int ldb, float beta, float *C, int ldc,
+                 bool relu, float *new_scale, float *new_bias) {
+  // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
+  // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
+  int L1 = 32 * 1024;
+  int L2 = 0.5 * 1024 * 1024;
+
+  KC = k;
+  MC = L1 / (KC * sizeof(float));
+  NC = L2 / (KC * sizeof(float));
+
+  // make sure MC is a multiple of MR, and NC is a multiple of NR
+  int mblock_num = (m + MC - 1) / MC;
+  MC = (m + mblock_num - 1) / mblock_num;
+  MC = (MC + MR - 1) / MR * MR;
+  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
+
+  int nblock_num = (n + NC - 1) / NC;
+  NC = (n + nblock_num - 1) / nblock_num;
+  NC = (NC + NR - 1) / NR * NR;
+  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
+
+  packedA = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+  packedB = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+  packedC = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
+  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+
+  for (int l = 0; l < KC; ++l) {
+    zero[l] = 0;
+  }
+
+  int mc, nc;
+  for (int j = 0; j < n; j += NC) {
+    nc = s_min(n - j, NC);
+#if __aarch64__
+    // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#else
+    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#endif
+    for (int i = 0; i < m; i += MC) {
+      mc = s_min(m - i, MC);
+#if __aarch64__
+      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#else
+      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#endif
+      InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
+                        &C(i, j), ldc, relu, new_scale + i, new_bias + i);
+    }
+  }
+
+  paddle_mobile::memory::Free(packedA);
+  paddle_mobile::memory::Free(packedB);
+  paddle_mobile::memory::Free(packedC);
+  paddle_mobile::memory::Free(zero);
+}
+
+// 32-bit float matrix multiplication
+void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
+               const float *B, int ldb, float beta, float *C, int ldc,
+               bool relu, float *bias) {
+#ifdef _OPENMP
+  int max_threads = omp_get_max_threads();
+#else
+  int max_threads = 1;
+#endif
+
+  int L1 = 32 * 1024;
+  KC = k;
+  if (m > n) {
+    // block over A
+    MC = L1 / (KC * sizeof(float));
+    int mblock_num = (m + MC - 1) / MC;
+    MC = (m + mblock_num - 1) / mblock_num;
+    MC = (MC + MR - 1) / MR * MR;
+    // pad B up to a multiple of NR
+    NC = (n + NR - 1) / NR * NR;
+
+#if __aarch64__
+    procPackA = PackMatrixA_6r;
+    procPackB = PackMatrixB_omp_16c;
+    procAddDot = AddDot6x16;
+#else
+    procPackA = PackMatrixA_6r;
+    procPackB = PackMatrixB_omp_8c;
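+    // When m > n the parallel loop below runs over row blocks of A: B is
+    // packed once up front (read-only and shared across threads), while each
+    // thread packs its own A slice and accumulates through a private packedC
+    // buffer (max_threads copies of both are allocated below).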
+    procAddDot = AddDot6x8;
+#endif
+
+    packedB = static_cast<float *>(
+        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+    procPackB(KC, NC, NC % NR, B, ldb, packedB);
+    packedA = static_cast<float *>(
+        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
+  } else {
+    // block over B
+    NC = L1 / (KC * sizeof(float));
+    int nblock_num = (n + NC - 1) / NC;
+    NC = (n + nblock_num - 1) / nblock_num;
+    NC = (NC + NR - 1) / NR * NR;
+    // pad A up to a multiple of MR
+    MC = (m + MR - 1) / MR * MR;
+
+#if __aarch64__
+    procPackA = PackMatrixA_omp_6r;
+    procPackB = PackMatrixB_16c;
+    procAddDot = AddDot6x16;
+#else
+    procPackA = PackMatrixA_omp_6r;
+    procPackB = PackMatrixB_8c;
+    procAddDot = AddDot6x8;
+#endif
+
+    packedA = static_cast<float *>(
+        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+    procPackA(MC, KC, MC % MR, A, lda, packedA);
+    packedB = static_cast<float *>(
+        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
+  }
+  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
+  packedC = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));
+
+  if (m > n) {
+#pragma omp parallel for
+    for (int i = 0; i < m; i += MC) {
+#ifdef _OPENMP
+      int local_threads = omp_get_thread_num();
+#else
+      int local_threads = 0;
+#endif
+
+      int mc;
+      mc = s_min(m - i, MC);
+      float *local_A = packedA + MC * KC * local_threads;
+      float *local_C = packedC + MC * NC * local_threads;
+      procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
+      InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
+                          &C(i, 0), ldc, relu, bias + i);
+    }
+  } else {
+#pragma omp parallel for
+    for (int j = 0; j < n; j += NC) {
+#ifdef _OPENMP
+      int local_threads = omp_get_thread_num();
+#else
+      int local_threads = 0;
+#endif
+
+      int nc;
+      nc = s_min(n - j, NC);
+      float *local_B = packedB + KC * NC * local_threads;
+      float *local_C = packedC + MC * NC * local_threads;
+      procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
+      InnerKernelWithBias(m, nc, alpha, packedA, local_B, beta, local_C,
+                          &C(0, j), ldc, relu, bias);
+    }
+  }
+
+  paddle_mobile::memory::Free(packedA);
+  paddle_mobile::memory::Free(packedB);
+  paddle_mobile::memory::Free(packedC);
+  paddle_mobile::memory::Free(zero);
+}
+
+void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
+                     const float *B, int ldb, float beta, float *C, int ldc,
+                     bool relu, float *new_scale, float *new_bias) {
+#ifdef _OPENMP
+  int max_threads = omp_get_max_threads();
+#else
+  int max_threads = 1;
+#endif
+
+  int L1 = 32 * 1024;
+  KC = k;
+  if (m > n) {
+    // block over A
+    MC = L1 / (KC * sizeof(float));
+    int mblock_num = (m + MC - 1) / MC;
+    MC = (m + mblock_num - 1) / mblock_num;
+    MC = (MC + MR - 1) / MR * MR;
+    // pad B up to a multiple of NR
+    NC = (n + NR - 1) / NR * NR;
+
+#if __aarch64__
+    procPackA = PackMatrixA_6r;
+    procPackB = PackMatrixB_omp_16c;
+    procAddDot = AddDot6x16;
+#else
+    procPackA = PackMatrixA_6r;
+    procPackB = PackMatrixB_omp_8c;
+    procAddDot = AddDot6x8;
+#endif
+
+    packedB = static_cast<float *>(
+        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+    procPackB(KC, NC, NC % NR, B, ldb, packedB);
+    packedA = static_cast<float *>(
+        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
+  } else {
+    // block over B
+    NC = L1 / (KC * sizeof(float));
+    int nblock_num = (n + NC - 1) / NC;
+    NC = (n + nblock_num - 1) / nblock_num;
+    NC = (NC + NR - 1) / NR * NR;
+    // pad A up to a multiple of MR
+    MC = (m + MR - 1) / MR * MR;
+
+#if __aarch64__
+    procPackA = PackMatrixA_omp_6r;
+    procPackB = PackMatrixB_16c;
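+    // Mirror of the branch above: with n >= m, A is packed once and shared,
+    // while the parallel j loop gives every thread its own packed B panel
+    // and private packedC tile.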
+    procAddDot = AddDot6x16;
+#else
+    procPackA = PackMatrixA_omp_6r;
+    procPackB = PackMatrixB_8c;
+    procAddDot = AddDot6x8;
+#endif
+
+    packedA = static_cast<float *>(
+        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+    procPackA(MC, KC, MC % MR, A, lda, packedA);
+    packedB = static_cast<float *>(
+        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
+  }
+  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
+  packedC = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));
+
+  if (m > n) {
+#pragma omp parallel for
+    for (int i = 0; i < m; i += MC) {
+#ifdef _OPENMP
+      int local_threads = omp_get_thread_num();
+#else
+      int local_threads = 0;
+#endif
+
+      int mc;
+      mc = s_min(m - i, MC);
+      float *local_A = packedA + MC * KC * local_threads;
+      float *local_C = packedC + MC * NC * local_threads;
+      procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
+      InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C, &C(i, 0),
+                        ldc, relu, new_scale + i, new_bias + i);
+    }
+  } else {
+#pragma omp parallel for
+    for (int j = 0; j < n; j += NC) {
+#ifdef _OPENMP
+      int local_threads = omp_get_thread_num();
+#else
+      int local_threads = 0;
+#endif
+
+      int nc;
+      nc = s_min(n - j, NC);
+      float *local_B = packedB + KC * NC * local_threads;
+      float *local_C = packedC + MC * NC * local_threads;
+      procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
+      InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C, &C(0, j),
+                        ldc, relu, new_scale, new_bias);
+    }
+  }
+
+  paddle_mobile::memory::Free(packedA);
+  paddle_mobile::memory::Free(packedB);
+  paddle_mobile::memory::Free(packedC);
+  paddle_mobile::memory::Free(zero);
+}
+
+void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
+#if __ARM_NEON
+#if __aarch64__
+
+  // init C
+  float32x4_t cv0 = vdupq_n_f32(0.0);
+  float32x4_t cv1 = vdupq_n_f32(0.0);
+  float32x4_t cv2 = vdupq_n_f32(0.0);
+  float32x4_t cv3 = vdupq_n_f32(0.0);
+  float32x4_t cv4 = vdupq_n_f32(0.0);
+  float32x4_t cv5 = vdupq_n_f32(0.0);
+  float32x4_t cv6 = vdupq_n_f32(0.0);
+  float32x4_t cv7 = vdupq_n_f32(0.0);
+  float32x4_t cv8 = vdupq_n_f32(0.0);
+  float32x4_t cv9 = vdupq_n_f32(0.0);
+  float32x4_t cv10 = vdupq_n_f32(0.0);
+  float32x4_t cv11 = vdupq_n_f32(0.0);
+
+  float32x4_t av;
+  float32x4_t bv0;
+  float32x4_t bv1;
+
+  float32x2_t av01;
+  float32x2_t av23;
+  float32x2_t av45;
+
+  for (int p = 0; p < k; p += 1) {
+    av = vld1q_f32(a);
+    av01 = vget_low_f32(av);
+    av23 = vget_high_f32(av);
+    av45 = vld1_f32(a + 4);
+    bv0 = vld1q_f32(b);
+    bv1 = vld1q_f32(b + 4);
+
+    cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0);
+    cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0);
+    cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1);
+    cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1);
+
+    cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0);
+    cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0);
+    cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1);
+    cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1);
+
+    cv8 = vmlaq_lane_f32(cv8, bv0, av45, 0);
+    cv9 = vmlaq_lane_f32(cv9, bv1, av45, 0);
+    cv10 = vmlaq_lane_f32(cv10, bv0, av45, 1);
+    cv11 = vmlaq_lane_f32(cv11, bv1, av45, 1);
+
+    a += MR;
+    b += NR;
+  }
+
+  vst1q_f32(c, cv0);
+  vst1q_f32(c + 4, cv1);
+  vst1q_f32(c + ldc, cv2);
+  vst1q_f32(c + ldc + 4, cv3);
+  vst1q_f32(c + 2 * ldc, cv4);
+  vst1q_f32(c + 2 * ldc + 4, cv5);
+  vst1q_f32(c + 3 * ldc, cv6);
+  vst1q_f32(c + 3 * ldc + 4, cv7);
+  vst1q_f32(c + 4 * ldc, cv8);
+  vst1q_f32(c + 4 * ldc + 4, cv9);
+  vst1q_f32(c + 5 * ldc, cv10);
+  vst1q_f32(c + 5 * ldc + 4,
cv11); + +#else + + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 4; + int kc2 = k % 4; + int step = 4 * ldc; + asm volatile( + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" + + "vmov.f32 q4, #0.0 \n\t" + "vmov.f32 q5, #0.0 \n\t" + "vmov.f32 q6, #0.0 \n\t" + "vmov.f32 q7, #0.0 \n\t" + "vmov.f32 q8, #0.0 \n\t" + "vmov.f32 q9, #0.0 \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "vmov.f32 q14, #0.0 \n\t" + "vmov.f32 q15, #0.0 \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + + // "pld [%[a_ptr], #128] \n\t" + // "pld [%[b_ptr], #128] \n\t" + // "pld [%[a_ptr], #192] \n\t" + // "pld [%[b_ptr], #192] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "blt end_kc2_%= \n\t" + "loop_kc2_%=: \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! 
\n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "bge loop_kc2_%= \n\t" + "end_kc2_%=: \n\t" + + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" + "vst1.32 {q4, q5}, [r5], r6 \n\t" + "vst1.32 {q6, q7}, [r5], r6 \n\t" + "vst1.32 {q8, q9}, [r5], r6 \n\t" + "vst1.32 {q10, q11}, [r5], r6 \n\t" + "vst1.32 {q12, q13}, [r5], r6 \n\t" + "vst1.32 {q14, q15}, [r5] \n\t" + + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + +#endif // __aarch64__ +#else + +#endif // __ARM_NEON +} + +#if __aarch64__ +void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k; + int step = 4 * ldc; + asm volatile( + "dup v5.4s, wzr \n\t" + "dup v6.4s, wzr \n\t" + "dup v7.4s, wzr \n\t" + "dup v8.4s, wzr \n\t" + "dup v9.4s, wzr \n\t" + "dup v10.4s, wzr \n\t" + "dup v11.4s, wzr \n\t" + "dup v12.4s, wzr \n\t" + "dup v13.4s, wzr \n\t" + "dup v14.4s, wzr \n\t" + "dup v15.4s, wzr \n\t" + "dup v16.4s, wzr \n\t" + + "dup v17.4s, wzr \n\t" + "dup v18.4s, wzr \n\t" + "dup v19.4s, wzr \n\t" + "dup v20.4s, wzr \n\t" + "dup v21.4s, wzr \n\t" + "dup v22.4s, wzr \n\t" + "dup v23.4s, wzr \n\t" + "dup v24.4s, wzr \n\t" + "dup v25.4s, wzr \n\t" + "dup v26.4s, wzr \n\t" + "dup v27.4s, wzr \n\t" + "dup v28.4s, wzr \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + + "prfm pldl1keep, [%[a_ptr], #32] \n\t" + "prfm pldl1keep, [%[b_ptr], #48] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[a_ptr]], #32 \n\t" + "ld1 {v2.4s, v3.4s, v4.4s}, [%[b_ptr]], #48 \n\t" + + "fmla v5.4s, v2.4s, v0.s[0] \n\t" + "fmla v6.4s, v3.4s, v0.s[0] \n\t" + "fmla v7.4s, v4.4s, v0.s[0] \n\t" + "fmla v8.4s, v2.4s, v0.s[1] \n\t" + "fmla v9.4s, v3.4s, v0.s[1] \n\t" + "fmla v10.4s, v4.4s, v0.s[1] \n\t" + "fmla v11.4s, v2.4s, v0.s[2] \n\t" + "fmla v12.4s, v3.4s, v0.s[2] \n\t" + "fmla v13.4s, v4.4s, v0.s[2] \n\t" + "fmla v14.4s, v2.4s, v0.s[3] \n\t" + "fmla v15.4s, v3.4s, v0.s[3] \n\t" + "fmla v16.4s, v4.4s, v0.s[3] \n\t" + + "fmla v17.4s, v2.4s, v1.s[0] \n\t" + "fmla v18.4s, v3.4s, v1.s[0] \n\t" + "fmla v19.4s, v4.4s, v1.s[0] \n\t" + "fmla v20.4s, v2.4s, v1.s[1] \n\t" + "fmla v21.4s, v3.4s, v1.s[1] \n\t" + "fmla v22.4s, v4.4s, v1.s[1] \n\t" + "fmla v23.4s, v2.4s, v1.s[2] \n\t" + "fmla v24.4s, v3.4s, v1.s[2] \n\t" + "fmla v25.4s, v4.4s, v1.s[2] \n\t" + "fmla v26.4s, v2.4s, v1.s[3] \n\t" + "fmla v27.4s, v3.4s, v1.s[3] \n\t" + "fmla v28.4s, v4.4s, v1.s[3] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "st1 {v5.4s, v6.4s, v7.4s}, [%[c]], %[step] \n\t" + "st1 {v8.4s, v9.4s, v10.4s}, [%[c]], %[step] \n\t" + "st1 {v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t" + "st1 {v14.4s, v15.4s, v16.4s}, [%[c]], %[step] \n\t" + "st1 {v17.4s, v18.4s, v19.4s}, [%[c]], %[step] \n\t" + "st1 {v20.4s, v21.4s, v22.4s}, [%[c]], %[step] \n\t" + "st1 {v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t" + "st1 {v26.4s, v27.4s, v28.4s}, [%[c]], %[step] \n\t" + : + : [a_ptr] "r"(a_ptr), 
[b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [step] "r"(step) + : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); +} + +void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k; + int step = 4 * ldc; + int step1 = 4 * 6; + asm volatile( + + "dup v6.4s, wzr \n\t" + "dup v7.4s, wzr \n\t" + "dup v8.4s, wzr \n\t" + "dup v9.4s, wzr \n\t" + "dup v10.4s, wzr \n\t" + "dup v11.4s, wzr \n\t" + "dup v12.4s, wzr \n\t" + "dup v13.4s, wzr \n\t" + + "dup v14.4s, wzr \n\t" + "dup v15.4s, wzr \n\t" + "dup v16.4s, wzr \n\t" + "dup v17.4s, wzr \n\t" + "dup v18.4s, wzr \n\t" + "dup v19.4s, wzr \n\t" + "dup v20.4s, wzr \n\t" + "dup v21.4s, wzr \n\t" + + "dup v22.4s, wzr \n\t" + "dup v23.4s, wzr \n\t" + "dup v24.4s, wzr \n\t" + "dup v25.4s, wzr \n\t" + "dup v26.4s, wzr \n\t" + "dup v27.4s, wzr \n\t" + "dup v28.4s, wzr \n\t" + "dup v29.4s, wzr \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + + "prfm pldl1keep, [%[a_ptr], #24] \n\t" + "prfm pldl1keep, [%[b_ptr], #64] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[a_ptr]], %[step1] \n\t" + "ld1 {v2.4s, v3.4s, v4.4s, v5.4s}, [%[b_ptr]], #64 \n\t" + + "fmla v6.4s, v2.4s, v0.s[0] \n\t" + "fmla v7.4s, v3.4s, v0.s[0] \n\t" + "fmla v8.4s, v4.4s, v0.s[0] \n\t" + "fmla v9.4s, v5.4s, v0.s[0] \n\t" + + "fmla v10.4s, v2.4s, v0.s[1] \n\t" + "fmla v11.4s, v3.4s, v0.s[1] \n\t" + "fmla v12.4s, v4.4s, v0.s[1] \n\t" + "fmla v13.4s, v5.4s, v0.s[1] \n\t" + + "fmla v14.4s, v2.4s, v0.s[2] \n\t" + "fmla v15.4s, v3.4s, v0.s[2] \n\t" + "fmla v16.4s, v4.4s, v0.s[2] \n\t" + "fmla v17.4s, v5.4s, v0.s[2] \n\t" + + "fmla v18.4s, v2.4s, v0.s[3] \n\t" + "fmla v19.4s, v3.4s, v0.s[3] \n\t" + "fmla v20.4s, v4.4s, v0.s[3] \n\t" + "fmla v21.4s, v5.4s, v0.s[3] \n\t" + + "fmla v22.4s, v2.4s, v1.s[0] \n\t" + "fmla v23.4s, v3.4s, v1.s[0] \n\t" + "fmla v24.4s, v4.4s, v1.s[0] \n\t" + "fmla v25.4s, v5.4s, v1.s[0] \n\t" + + "fmla v26.4s, v2.4s, v1.s[1] \n\t" + "fmla v27.4s, v3.4s, v1.s[1] \n\t" + "fmla v28.4s, v4.4s, v1.s[1] \n\t" + "fmla v29.4s, v5.4s, v1.s[1] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "st1 {v6.4s, v7.4s, v8.4s, v9.4s}, [%[c]], %[step] \n\t" + "st1 {v10.4s, v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t" + "st1 {v14.4s, v15.4s, v16.4s, v17.4s}, [%[c]], %[step] \n\t" + "st1 {v18.4s, v19.4s, v20.4s, v21.4s}, [%[c]], %[step] \n\t" + "st1 {v22.4s, v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t" + "st1 {v26.4s, v27.4s, v28.4s, v29.4s}, [%[c]], %[step] \n\t" + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29"); +} + +#endif // __aarch64__ + } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index 73d773987b871033e29c4dfbac806afaf91892d8..40199faa4c30ec965a3980f44f1dbb6ae7d6799b 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -19,12 +19,13 @@ limitations under the License. 
*/
 #define B(i, j) B[(i)*ldb + (j)]
 #define C(i, j) C[(i)*ldc + (j)]
-// Block sizes for the blocked computation; mc and kc are the m and k of one block
-#define MC 128
-#define KC 128
-#define NC 1024
-#define MR 4
-#define NR 4
+#if __aarch64__
+#define MR 6
+#define NR 16
+#else
+#define MR 6
+#define NR 8
+#endif

 #define s_min(i, j) ((i) < (j) ? (i) : (j))

@@ -32,49 +33,128 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {

+/*
 // Copy blocks of matrix A into contiguous memory (ColMajor)
-void PackMatrixA(int m, int k, int paddingM, const float *A, int lda,
+void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                  float *buffer);

 // Copy blocks of matrix B into contiguous memory (ColMajor)
-void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb,
+void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                  float *buffer);
+*/

 // Copy blocks of matrix A into contiguous memory (RowMajor)
-void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda,
-                  float *buffer);
+void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer);
+void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer);
+void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer);
+void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
+                        float *buffer);
+void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
+                        float *buffer);

 // Copy blocks of matrix B into contiguous memory (RowMajor)
-void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb,
-                  float *buffer);
+void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
+                    float *buffer);
+void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
+                     float *buffer);
+void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
+                     float *buffer);
+void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
+                        float *buffer);
+void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
+                         float *buffer);
+void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
+                         float *buffer);

 // Blocked matrix multiplication
-void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
-                 const float *B, int ldb, float beta, float *C, int ldc,
-                 int first_time);
-
+void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
+                 float beta, float *c, float *C, int ldc, bool relu);
+void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
+                         const float *b, float beta, float *c, float *C,
+                         int ldc, bool relu, float *bias);
+
+void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
+                       const float *b, float beta, float *c, float *C, int ldc,
+                       bool relu, float *new_scale, float *new_bias);
+/*
 // Vector-matrix multiplication (M = 1)
 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
-                  const float *B, int ldb, float beta, float *C, int ldc);
-
-// Compute a smaller 4 * 4 block of C
-void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,
-               int ldb, float beta, float *C, int ldc, int mc, int nc);
-
-void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
-                    int ldb, float beta, float *C, int ldc, int mc, int nc,
-                    bool relu);
+                  const float *B, int ldb, float beta, float *C, int ldc,
+                  bool relu);
+
+void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
+                        int lda, const float *B, int ldb, float beta, float *C,
+                        int ldc, bool relu, float *new_scale, float *new_bias);
+*/

 // Compute a smaller block of C
+void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
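+// Naming note: AddDotMxN accumulates an M x N (= MR x NR) tile of C over k
+// steps. As implemented in gemm.cpp in this patch, the armv7 AddDot6x8 keeps
+// the whole 6x8 tile in twelve q registers (q4-q15), while the aarch64
+// AddDot6x16 holds its 6x16 tile in twenty-four v registers (v6-v29).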
+void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
+
+// Write back the result of the blocked matrix multiplication
+// C = A * B
+void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
+// C = alpha * A * B + beta * C
+void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B + C
+void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B + bias
+void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias);
+// C = A * B + C, relu(C)
+void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B + bias, relu(C)
+void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
+                        float *bias);
+// C = A * B, batchnorm(C)
+void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
+                 float *new_bias);
+// C = A * B, batchnorm(C), relu(C)
+void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+                     float *new_scale, float *new_bias);
+
+/*
+// Write back the result of the vector-matrix multiplication
+// C = A * B
+void VecWriteBasic(int n, float *c, float *C, int ldc);
+// C = alpha * A * B + beta * C
+void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
+// C = A * B + C
+void VecWriteWithAdd(int n, float *c, float *C, int ldc);
+// C = A * B + C, relu(C)
+void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
+// C = A * B, batchnorm(C)
+void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
+                    float *new_bias);
+// C = A * B, batchnorm(C), relu(C)
+void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
+                        float *new_bias);
+*/

 // 32-bit float matrix multiplication
-void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc);
+void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+           const float *B, int ldb, float beta, float *C, int ldc, bool relu,
+           float *bias);
+
+// 32-bit float matrix multiplication, with batchnorm applied to the result
+void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+                 const float *B, int ldb, float beta, float *C, int ldc,
+                 bool relu, float *new_scale, float *new_bias);

-void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
-                const float *B, int ldb, float beta, float *C, int ldc);
+// 32-bit float matrix multiplication (OpenMP multithreaded version)
+void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
+               const float *B, int ldb, float beta, float *C, int ldc,
+               bool relu, float *bias);

-// 64-bit double matrix multiplication
-void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
-           const double *B, int ldb, float beta, double *C, int ldc);
+// 32-bit float matrix multiplication, with batchnorm applied to the result
+// (OpenMP multithreaded version)
+void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
+                     const float *B, int ldb, float beta, float *C, int ldc,
+                     bool relu, float *new_scale, float *new_bias);

 }  // namespace math
 }  // namespace operators
diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp
index 625d120705aab8fcc3ea8d232b4077e213941ec4..7b0b974b542a83d381727128887bef8a48ce937f 100644
--- a/src/operators/math/im2col.cpp
+++ b/src/operators/math/im2col.cpp
@@ -15,7 +15,7 @@ limitations under the License.
*/ #include "operators/math/im2col.h" #include #ifdef __ARM_NEON -#include "arm_neon.h" +#include #endif #include "common/types.h" namespace paddle_mobile { @@ -69,7 +69,7 @@ class Im2ColFunctor { int channels_col = im_channels * filter_height * filter_width; const T *im_data = im.data(); T *col_data = col->data(); -#ifdef __ARM_NEON +#if __ARM_NEON const int osize = col_height; const int isize = im_height; bool pad1 = padding[0] > 0; diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp index fd4106038c7446e659736c6b3c61b5aa05127e72..381624250af87f4eeff7cf316a2f0f346c399137 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ -22,7 +22,8 @@ namespace math { template <> void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, float alpha, - framework::Tensor *matrix_out, float beta, bool relu) { + framework::Tensor *matrix_out, float beta, bool relu, + float *bias) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); @@ -39,22 +40,23 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, int M = dim_out[0]; int N = dim_out[1]; - int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + int K = (!trans_a) ? dim_a[1] : dim_a[0]; - if (relu) { - sgemm_relu(M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, matrix_out->data(), N); - } else { - sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, - beta, matrix_out->data(), N); - } +#ifdef _OPENMP + Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), + N, beta, matrix_out->data(), N, relu, bias); +#else + Sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, + beta, matrix_out->data(), N, relu, bias); +#endif } template <> -void matmul(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, - double alpha, framework::Tensor *matrix_out, double beta, - bool relu) { +void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, + const framework::Tensor &matrix_b, bool trans_b, + float alpha, framework::Tensor *matrix_out, float beta, + bool relu, framework::Tensor *new_scale, + framework::Tensor *new_bias, int group) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); @@ -71,7 +73,19 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, int M = dim_out[0]; int N = dim_out[1]; - int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + int K = (!trans_a) ? 
 template <>
-void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
-                    const framework::Tensor &matrix_b, bool trans_b,
-                    double alpha, framework::Tensor *matrix_out, double beta,
-                    bool relu) {
+void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
+                         const framework::Tensor &matrix_b, bool trans_b,
+                         float alpha, framework::Tensor *matrix_out, float beta,
+                         bool relu, framework::Tensor *new_scale,
+                         framework::Tensor *new_bias, int group) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
@@ -71,7 +73,19 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
   int M = dim_out[0];
   int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+  int K = (!trans_a) ? dim_a[1] : dim_a[0];
+
+#ifdef _OPENMP
+  SgemmWithBn_omp(M, N, K, alpha, matrix_a.data<float>(), K,
+                  matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N,
+                  relu, new_scale->data<float>() + group,
+                  new_bias->data<float>() + group);
+#else
+  SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
+              N, beta, matrix_out->data<float>(), N, relu,
+              new_scale->data<float>() + group,
+              new_bias->data<float>() + group);
+#endif
 }

 }  // namespace math
diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h
index 0b953ec6a3b2a03a94a91884b9daf3ed88523a22..74a3f5b8f58f5817c3de426d723a273a8a041614 100644
--- a/src/operators/math/math_function.h
+++ b/src/operators/math/math_function.h
@@ -21,11 +21,18 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {

-// matrix multiply with continuous memory
 template <typename T>
 void matmul(const framework::Tensor &matrix_a, bool trans_a,
             const framework::Tensor &matrix_b, bool trans_b, T alpha,
-            framework::Tensor *matrix_out, T beta, bool relu = false);
+            framework::Tensor *matrix_out, T beta, bool relu = false,
+            float *bias = nullptr);
+
+template <typename T>
+void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
+                  const framework::Tensor &matrix_b, bool trans_b, T alpha,
+                  framework::Tensor *matrix_out, T beta, bool relu,
+                  framework::Tensor *new_scale, framework::Tensor *new_bias,
+                  int group);

 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp
index 96d277c136b4656dbb1fd682489bd7dee5c3af0e..0a2d96d4d065d7938e6872b4f073e080d7be8c3a 100644
--- a/src/operators/math/pool_2x2.cpp
+++ b/src/operators/math/pool_2x2.cpp
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #ifdef POOL_OP
-#include "pool_2x2.h"
+#include "operators/math/pool_2x2.h"
+#include <algorithm>
+#include <vector>

 namespace paddle_mobile {
 namespace operators {
@@ -22,6 +24,9 @@ namespace math {
 void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
                 Tensor *output) {
 #if __ARM_NEON
+
+#if __aarch64__
+#else
   const int batch_size = input->dims()[0];

   const int input_height = input->dims()[2];
@@ -90,11 +95,16 @@
     output_data += output_batch_stride;
   }
 #endif
+#else
+#endif
 }

 void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
                 Tensor *output) {
 #if __ARM_NEON
+
+#if __aarch64__
+#else
   const int batch_size = input->dims()[0];

   const int input_height = input->dims()[2];
@@ -164,6 +174,9 @@
   input_data += input_batch_stride;
   output_data += output_batch_stride;
 }
+
+#endif
+#else
 #endif
 }

diff --git a/src/operators/math/pool_2x2.h b/src/operators/math/pool_2x2.h
index 3fb0d24ba2ce854e8e63c066222e355e2c84dabb..ae32a3912b677efb50d8558700741a225e3eb3f8 100644
--- a/src/operators/math/pool_2x2.h
+++ b/src/operators/math/pool_2x2.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #pragma once

 #include "framework/tensor.h"
-#if __ARM_NEON
+#ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif  // __ARM_NEON
 namespace paddle_mobile {
diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp
index f404b644d78cb1b94eb96a2d587fead2575b3814..28547b71fca6caea2ff4341b3f832c0035436a72 100644
--- a/src/operators/math/pool_3x3.cpp
+++ b/src/operators/math/pool_3x3.cpp
@@ -13,13 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
*/
 #ifdef POOL_OP
-#define __ARM_NEON true
-#include "pool_3x3.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 #include "framework/tensor.h"
+#include "operators/math/pool_3x3.h"
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif  // __ARM_NEON
-
+#include <algorithm>
 namespace paddle_mobile {
 namespace operators {
 namespace math {
@@ -27,6 +29,499 @@ using framework::Tensor;
 using std::max;
 using std::min;
 using std::vector;
+void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
+#if __ARM_NEON
+  const int batch_size = input->dims()[0];
+
+  const int h_in = input->dims()[2];
+
+  const int w_in = input->dims()[3];
+
+  const int output_channels = output->dims()[1];
+
+  const int h_out = output->dims()[2];
+  const int w_out = output->dims()[3];
+  const int outputdata_channel_stride = h_out * w_out;
+  const int inputdata_channel_stride = h_in * w_in;
+  const int input_batch_stride = output_channels * inputdata_channel_stride;
+  const int output_batch_stride = output_channels * outputdata_channel_stride;
+  float *out_data = output->data<float>();
+  const float *input_data = input->data<float>();
+
+  const float coef = 1.0 / 9.0;
+  for (int k = 0; k < batch_size; ++k) {
+#pragma omp parallel for
+    for (int c = 0; c < output_channels; ++c) {
+      const float *input_seg = input_data + c * inputdata_channel_stride;
+      float *output_seg = out_data + c * outputdata_channel_stride;
+      // the four corner points
+      output_seg[0] = (input_seg[0] + input_seg[1] + input_seg[w_in] +
+                       input_seg[w_in + 1]) *
+                      coef;
+      output_seg[w_out - 1] =
+          (input_seg[w_in - 2] + input_seg[w_in - 1] + input_seg[w_in * 2 - 2] +
+           input_seg[2 * w_in - 1]) *
+          coef;
+      output_seg[(h_out - 1) * w_out] =
+          (input_seg[(h_in - 2) * w_in] + input_seg[(h_in - 2) * w_in + 1] +
+           input_seg[(h_in - 1) * w_in] + input_seg[(h_in - 1) * w_in + 1]) *
+          coef;
+      output_seg[h_out * w_out - 1] =
+          (input_seg[h_in * w_in - 1] + input_seg[h_in * w_in - 2] +
+           input_seg[(h_in - 1) * w_in - 1] +
+           input_seg[(h_in - 1) * w_in - 2]) *
+          coef;
+      // left side & right side
+      for (int i = 1; i < h_in - 1; ++i) {
+        output_seg[i * w_out] =
+            (input_seg[i * w_in - w_in] + input_seg[i * w_in - w_in + 1] +
+             input_seg[i * w_in] + input_seg[i * w_in + 1] +
+             input_seg[i * w_in + w_in] + input_seg[i * w_in + w_in + 1]) *
+            coef;
+        output_seg[i * w_out + w_out - 1] =
+            (input_seg[i * w_in - w_in + w_in - 2] +
+             input_seg[i * w_in - w_in + 1 + w_in - 2] +
+             input_seg[i * w_in + w_in - 2] +
+             input_seg[i * w_in + 1 + w_in - 2] +
+             input_seg[i * w_in + w_in + w_in - 2] +
+             input_seg[i * w_in + w_in + 1 + w_in - 2]) *
+            coef;
+      }
+      // top 1 row & bottom 1 row
+      const float *input_tmp = input_seg;
+
+      float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
+          tmp3, tmp4, tmp5, sum, out0;
+      float32x4_t v_coef = vdupq_n_f32(coef);
+      in0 = vld1q_f32(input_tmp);
+      in2 = vld1q_f32(input_tmp + w_in);
+      const float *input_tmp_end = input_tmp + (h_in - 2) * w_in;
+      in4 = vld1q_f32(input_tmp_end);
+      in6 = vld1q_f32(input_tmp_end + w_in);
+      int c_mid = w_out - 2;
+      auto output_ptr = output_seg + 1;
+      for (; c_mid > 3; c_mid -= 4) {
+        in1 = vld1q_f32(input_tmp + 4);
+        in3 = vld1q_f32(input_tmp + w_in + 4);
+
+        tmp0 = vextq_f32(in0, in1, 1);
+        tmp1 = vextq_f32(in0, in1, 2);
+
+        tmp2 = vextq_f32(in2, in3, 1);
+        tmp3 = vextq_f32(in2, in3, 2);
+
+        sum = vaddq_f32(in0, tmp0);
+        sum = vaddq_f32(sum, tmp1);
+        sum = vaddq_f32(sum, in2);
+        sum = vaddq_f32(sum, tmp2);
+        sum = vaddq_f32(sum, tmp3);
+
+        vst1q_f32(output_ptr, vmulq_f32(sum, v_coef));
+
+        in5 = vld1q_f32(input_tmp_end + 4);
+        in7 = vld1q_f32(input_tmp_end + w_in + 4);
+
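+        // vextq_f32(lo, hi, s) slides the 8-float window by s lanes, so
+        // tmp0/tmp1 are the row shifted by one and two columns; together with
+        // the unshifted row they form the three horizontal taps of the 3x3
+        // window for four adjacent outputs at once.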
+        tmp0 = vextq_f32(in4, in5, 1);
+        tmp1 = vextq_f32(in4, in5, 2);
+        tmp2 = vextq_f32(in6, in7, 1);
+        tmp3 = vextq_f32(in6, in7, 2);
+
+        sum = vaddq_f32(in4, tmp0);
+        sum = vaddq_f32(sum, tmp1);
+        sum = vaddq_f32(sum, in6);
+        sum = vaddq_f32(sum, tmp2);
+        sum = vaddq_f32(sum, tmp3);
+
+        vst1q_f32(output_ptr + (h_out - 1) * w_out, vmulq_f32(sum, v_coef));
+
+        // could be optimized to an 8-wide stride.
+        input_tmp += 4;
+        input_tmp_end += 4;
+        output_ptr += 4;
+        in0 = in1;
+        in2 = in3;
+        in4 = in5;
+        in6 = in7;
+      }
+      // top right remain
+      float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]);
+      float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]);
+
+      tmp0 = vextq_f32(in0, pad0, 1);
+      tmp1 = vextq_f32(in0, pad0, 2);
+      tmp2 = vextq_f32(in2, pad1, 1);
+      tmp3 = vextq_f32(in2, pad1, 2);
+
+      sum = vaddq_f32(in0, tmp0);
+      sum = vaddq_f32(sum, tmp1);
+      sum = vaddq_f32(sum, in2);
+      sum = vaddq_f32(sum, tmp2);
+      sum = vaddq_f32(sum, tmp3);
+      out0 = vmulq_f32(sum, v_coef);
+
+      for (int i = 0; i < c_mid; ++i) {
+        if (i == 0) {
+          vst1q_lane_f32(output_ptr + i, out0, 0);
+        }
+        if (i == 1) {
+          vst1q_lane_f32(output_ptr + i, out0, 1);
+        }
+        if (i == 2) {
+          vst1q_lane_f32(output_ptr + i, out0, 2);
+        }
+      }
+
+      // bottom_right remain
+      float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]);
+      float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]);
+
+      tmp0 = vextq_f32(in4, pad2, 1);
+      tmp1 = vextq_f32(in4, pad2, 2);
+      tmp2 = vextq_f32(in6, pad3, 1);
+      tmp3 = vextq_f32(in6, pad3, 2);
+
+      sum = vaddq_f32(in4, tmp0);
+      sum = vaddq_f32(sum, tmp1);
+      sum = vaddq_f32(sum, in6);
+      sum = vaddq_f32(sum, tmp2);
+      sum = vaddq_f32(sum, tmp3);
+      out0 = vmulq_f32(sum, v_coef);
+
+      for (int i = 0; i < c_mid; ++i) {
+        if (i == 0) {
+          vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 0);
+        }
+        if (i == 1) {
+          vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 1);
+        }
+        if (i == 2) {
+          vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 2);
+        }
+      }
+      // mid
+      for (int j = 0; j < h_out - 2; ++j) {
+        output_ptr = output_seg + w_out * (j + 1) + 1;
+        input_tmp = input_seg + j * w_in;
+
+        in0 = vld1q_f32(input_tmp);
+        in2 = vld1q_f32(input_tmp + w_in);
+        in4 = vld1q_f32(input_tmp + 2 * w_in);
+        c_mid = w_out - 2;
+        for (; c_mid > 3; c_mid -= 4) {
+          in1 = vld1q_f32(input_tmp + 4);
+          in3 = vld1q_f32(input_tmp + w_in + 4);
+          in5 = vld1q_f32(input_tmp + 2 * w_in + 4);
+
+          tmp0 = vextq_f32(in0, in1, 1);
+          tmp1 = vextq_f32(in0, in1, 2);
+          tmp2 = vextq_f32(in2, in3, 1);
+          tmp3 = vextq_f32(in2, in3, 2);
+          tmp4 = vextq_f32(in4, in5, 1);
+          tmp5 = vextq_f32(in4, in5, 2);
+
+          sum = vaddq_f32(in0, tmp0);
+          sum = vaddq_f32(sum, tmp1);
+          sum = vaddq_f32(sum, in2);
+          sum = vaddq_f32(sum, tmp2);
+          sum = vaddq_f32(sum, tmp3);
+          sum = vaddq_f32(sum, in4);
+          sum = vaddq_f32(sum, tmp4);
+          sum = vaddq_f32(sum, tmp5);
+
+          out0 = vmulq_f32(sum, v_coef);
+          vst1q_f32(output_ptr, out0);
+          output_ptr += 4;
+          input_tmp += 4;
+          in0 = in1;
+          in2 = in3;
+          in4 = in5;
+        }
+        // mid remain
+        float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]);
+        float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
+        float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 3) * w_in - 1]);
+
+        tmp0 = vextq_f32(in0, pad0, 1);
+        tmp1 = vextq_f32(in0, pad0, 2);
+        tmp2 = vextq_f32(in2, pad1, 1);
+        tmp3 = vextq_f32(in2, pad1, 2);
+        tmp4 = vextq_f32(in4, pad2, 1);
+        tmp5 = vextq_f32(in4, pad2, 2);
+
+        sum = vaddq_f32(in0, tmp0);
+        sum = vaddq_f32(sum, tmp1);
+        sum = vaddq_f32(sum, in2);
+        sum = vaddq_f32(sum, tmp2);
+        sum = vaddq_f32(sum, tmp3);
+        sum = vaddq_f32(sum, in4);
+        sum = vaddq_f32(sum, tmp4);
+        sum = vaddq_f32(sum, tmp5);
+        out0 = vmulq_f32(sum, v_coef);
+
+        for (int i = 0; i < c_mid; ++i) {
+          if (i == 0) {
+            vst1q_lane_f32(output_ptr + i, out0, 0);
+          }
+          if (i == 1) {
+            vst1q_lane_f32(output_ptr + i, out0, 1);
+          }
+          if (i == 2) {
+            vst1q_lane_f32(output_ptr + i, out0, 2);
+          }
+        }
+      }
+      // input_data += inputdata_channel_stride;
+      // out_data += outputdata_channel_stride;
+    }
+    input_data += input_batch_stride;
+    out_data += output_batch_stride;
+  }
+#endif
+}
+
+void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
+#if __ARM_NEON
+  const int batch_size = input->dims()[0];
+
+  const int h_in = input->dims()[2];
+
+  const int w_in = input->dims()[3];
+
+  const int output_channels = output->dims()[1];
+
+  const int h_out = output->dims()[2];
+  const int w_out = output->dims()[3];
+  const int outputdata_channel_stride = h_out * w_out;
+  const int inputdata_channel_stride = h_in * w_in;
+  const int input_batch_stride = output_channels * inputdata_channel_stride;
+  const int output_batch_stride = output_channels * outputdata_channel_stride;
+  float *out_data = output->data<float>();
+  const float *input_data = input->data<float>();
+  for (int k = 0; k < batch_size; ++k) {
+#pragma omp parallel for
+    for (int c = 0; c < output_channels; ++c) {
+      const float *input_seg = input_data + c * inputdata_channel_stride;
+      float *output_seg = out_data + c * outputdata_channel_stride;
+      // four corner points
+      output_seg[0] = std::max(std::max(input_seg[0], input_seg[1]),
+                               std::max(input_seg[w_in], input_seg[w_in + 1]));
+      output_seg[w_out - 1] =
+          std::max(std::max(input_seg[w_in - 2], input_seg[w_in - 1]),
+                   std::max(input_seg[w_in * 2 - 2], input_seg[2 * w_in - 1]));
+      output_seg[(h_out - 1) * w_out] =
+          std::max(std::max(input_seg[(h_in - 2) * w_in],
+                            input_seg[(h_in - 2) * w_in + 1]),
+                   std::max(input_seg[(h_in - 1) * w_in],
+                            input_seg[(h_in - 1) * w_in + 1]));
+      output_seg[h_out * w_out - 1] = std::max(
+          std::max(input_seg[(h_in - 1) * w_in - 1],
+                   input_seg[(h_in - 1) * w_in - 2]),
+          std::max(input_seg[h_in * w_in - 1], input_seg[h_in * w_in - 2]));
+      // left side & right side
+      for (int i = 1; i < h_in - 1; ++i) {
+        float max1 = std::max(input_seg[i * w_in - w_in],
+                              input_seg[i * w_in - w_in + 1]);
+        float max2 = std::max(input_seg[i * w_in], input_seg[i * w_in + 1]);
+        float max3 = std::max(input_seg[i * w_in + w_in],
+                              input_seg[i * w_in + w_in + 1]);
+        output_seg[i * w_out] = std::max(std::max(max1, max2), max3);
+
+        max1 = std::max(input_seg[i * w_in - w_in + w_in - 2],
+                        input_seg[i * w_in - w_in + 1 + w_in - 2]);
+        max2 = std::max(input_seg[i * w_in + w_in - 2],
+                        input_seg[i * w_in + 1 + w_in - 2]);
+        max3 = std::max(input_seg[i * w_in + w_in + w_in - 2],
+                        input_seg[i * w_in + w_in + 1 + w_in - 2]);
+        output_seg[i * w_out + w_out - 1] =
+            std::max(std::max(max1, max2), max3);
+      }
+      // top 1 row & bottom 1 row
+      const float *input_tmp = input_seg;
+
+      float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
+          tmp3, tmp4, tmp5, max;
+      in0 = vld1q_f32(input_tmp);
+      in2 = vld1q_f32(input_tmp + w_in);
+      const float *input_tmp_end = input_tmp + (h_in - 2) * w_in;
+      in4 = vld1q_f32(input_tmp_end);
+      in6 = vld1q_f32(input_tmp_end + w_in);
+      int c_mid = w_out - 2;
+      auto output_ptr = output_seg + 1;
+      for (; c_mid > 3; c_mid -= 4) {
+        in1 = vld1q_f32(input_tmp + 4);
+        in3 = vld1q_f32(input_tmp + w_in + 4);
+
+        tmp0 = vextq_f32(in0, in1, 1);
+        tmp1 = vextq_f32(in0, in1, 2);
+
+        tmp2 = vextq_f32(in2, in3, 1);
+        tmp3 =
vextq_f32(in2, in3, 2); + + max = vmaxq_f32(in0, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in2); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + + vst1q_f32(output_ptr, max); + + in5 = vld1q_f32(input_tmp_end + 4); + in7 = vld1q_f32(input_tmp_end + w_in + 4); + + tmp0 = vextq_f32(in4, in5, 1); + tmp1 = vextq_f32(in4, in5, 2); + tmp2 = vextq_f32(in6, in7, 1); + tmp3 = vextq_f32(in6, in7, 2); + + max = vmaxq_f32(in4, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in6); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + + vst1q_f32(output_ptr + (h_out - 1) * w_out, max); + + input_tmp += 4; + input_tmp_end += 4; + output_ptr += 4; + in0 = in1; + in2 = in3; + in4 = in5; + in6 = in7; + } + // top right remain + float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]); + float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + + max = vmaxq_f32(in0, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in2); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, max, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, max, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, max, 2); + } + } + + // bottom_right remain + float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]); + float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]); + + tmp0 = vextq_f32(in4, pad2, 1); + tmp1 = vextq_f32(in4, pad2, 2); + tmp2 = vextq_f32(in6, pad3, 1); + tmp3 = vextq_f32(in6, pad3, 2); + + max = vmaxq_f32(in4, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in6); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 2); + } + } + // mid + for (int j = 0; j < h_out - 2; ++j) { + output_ptr = output_seg + (j + 1) * w_out + 1; + input_tmp = input_seg + j * w_in; + + in0 = vld1q_f32(input_tmp); + in2 = vld1q_f32(input_tmp + w_in); + in4 = vld1q_f32(input_tmp + 2 * w_in); + c_mid = w_out - 2; + for (; c_mid > 3; c_mid -= 4) { + in1 = vld1q_f32(input_tmp + 4); + in3 = vld1q_f32(input_tmp + w_in + 4); + in5 = vld1q_f32(input_tmp + 2 * w_in + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + tmp4 = vextq_f32(in4, in5, 1); + tmp5 = vextq_f32(in4, in5, 2); + + max = vmaxq_f32(in0, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in2); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + max = vmaxq_f32(max, in4); + max = vmaxq_f32(max, tmp4); + max = vmaxq_f32(max, tmp5); + + vst1q_f32(output_ptr, max); + output_ptr += 4; + input_tmp += 4; + in0 = in1; + in2 = in3; + in4 = in5; + } + // mid remain + float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]); + float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); + float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 3) * w_in - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + tmp4 = vextq_f32(in4, pad2, 1); + tmp5 = vextq_f32(in4, pad2, 2); + + max = vmaxq_f32(in0, 
tmp0);
+        max = vmaxq_f32(max, tmp1);
+        max = vmaxq_f32(max, in2);
+        max = vmaxq_f32(max, tmp2);
+        max = vmaxq_f32(max, tmp3);
+        max = vmaxq_f32(max, in4);
+        max = vmaxq_f32(max, tmp4);
+        max = vmaxq_f32(max, tmp5);
+
+        for (int i = 0; i < c_mid; ++i) {
+          if (i == 0) {
+            vst1q_lane_f32(output_ptr + i, max, 0);
+          }
+          if (i == 1) {
+            vst1q_lane_f32(output_ptr + i, max, 1);
+          }
+          if (i == 2) {
+            vst1q_lane_f32(output_ptr + i, max, 2);
+          }
+        }
+      }
+      // input_data += inputdata_channel_stride;
+      // out_data += outputdata_channel_stride;
+    }
+    input_data += input_batch_stride;
+    out_data += output_batch_stride;
+  }
+#else
+
+#endif
+}
 void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
                 Tensor *output) {
@@ -41,11 +536,11 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
   const int output_height = output->dims()[2];
   const int output_width = output->dims()[3];
-  const int _kernel_size = 3;
-  const int stride_height = strides[0];
-  const int stride_width = strides[1];
-  const int padding_height = paddings[0];
-  const int padding_width = paddings[1];
+  // const int _kernel_size = 3;
+  const int stride = strides[0];
+  // const int stride_width = strides[1];
+  const int padding = paddings[0];
+  // const int padding_width = paddings[1];
   const float negative_max = -INT_MAX;
   const int input_channel_stride = input_height * input_width;
   const int output_channel_stride = output_height * output_width;
@@ -55,38 +550,52 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
   const int input_batch_stride = output_channels * input_channel_stride;
   const int output_batch_stride = output_channels * output_channel_stride;
-  const float *pos1, *pos2, *pos3, *output_ptr;
+  const float *pos1, *output_ptr;
   int hstart, wstart, hend, wend;
   for (int i = 0; i < batch_size; ++i) {
+#pragma omp parallel for
     for (int c = 0; c < output_channels; ++c) {
+      const float *input_seg = input_data + c * input_channel_stride;
+      float *output_seg = output_data + c * output_channel_stride;
       for (int ph = 0; ph < output_height; ph++) {
         for (int pw = 0; pw < output_width; pw++) {
-          hstart = ph * stride_height - padding_height;
-          wstart = pw * stride_width - padding_width;
-          hend = min(hstart + _kernel_size, input_height + padding_height);
-          wend = min(wstart + _kernel_size, input_width + padding_width);
+          int hstart = ph * stride - padding;
+          int wstart = pw * stride - padding;
+          int hend = min(hstart + 3, input_height + padding);
+          int wend = min(wstart + 3, input_width + padding);
           hstart = max(hstart, 0);
           wstart = max(wstart, 0);
           hend = min(hend, input_height);
           wend = min(wend, input_width);
-          pos1 = input_data + hstart * input_width + wstart;
-          pos2 = input_data + (hstart + 1) * input_width + wstart;
-          pos3 = input_data + (hstart + 2) * input_width + wstart;
-          output_ptr = output_data + ph * output_width + pw;
+          const float *pos1 = input_seg + hstart * input_width + wstart;
+          const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
+          const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;
+          output_ptr = output_seg + ph * output_width + pw;
           if (hend - hstart != 3 || wend - wstart != 3) {
             float max_value = -INT_MAX;
             for (int h = hstart; h < hend; h++) {
               for (int w = wstart; w < wend; w++) {
-                float value = input_data[h * input_width + w];
+                float value = input_seg[h * input_width + w];
                 if (value > max_value) {
                   max_value = value;
                 }
               }
             }
-            output_data[ph * output_width + pw] = max_value;
+            output_seg[ph * output_width + pw] = max_value;
          } else {
-#if
defined(ARMV7) +#if __aarch64__ + const float32x4_t data1 = vld1q_f32(pos1); + const float32x4_t data2 = vld1q_f32(pos1 + input_width); + const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width); + const float32x4_t max_data = + vmaxq_f32(vmaxq_f32(data1, data2), data3); + float32x2_t res = + vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)), + vget_low_f32(max_data)); + res = vpmax_f32(res, res); + output_seg[ph * output_width + pw] = vget_lane_f32(res, 0); +#else asm volatile( "vld1.32 {q1}, [%[pos1]] \n\t" "vld1.32 {q2}, [%[pos2]] \n\t" @@ -98,27 +607,14 @@ void Pool3x3Max(vector strides, vector paddings, const Tensor *input, "vpmax.f32 d7, d6, d6 \n\t" "vst1.32 {d7[0]},[%[output_ptr]] \n\t" : - : [input_data] "r"(input_data), [pos1] "r"(pos1), + : [input_seg] "r"(input_seg), [pos1] "r"(pos1), [pos2] "r"(pos2), [pos3] "r"(pos3), [output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max) : "memory", "q1", "q2", "q3", "q4"); -#else - const float32x4_t data1 = vld1q_f32(pos1); - const float32x4_t data2 = vld1q_f32(pos2); - const float32x4_t data3 = vld1q_f32(pos3); - const float32x4_t max_data = - vmaxq_f32(vmaxq_f32(data1, data3), data2); - float32x2_t res = - vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)), - vget_low_f32(max_data)); - res = vpmax_f32(res, res); - output_data[ph * output_width + pw] = vget_lane_f32(res, 0); #endif } } } - input_data += input_channel_stride; - output_data += output_channel_stride; } input_data += input_batch_stride; output_data += output_batch_stride; @@ -139,11 +635,8 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; - const int _kernel_size = 3; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int stride = strides[0]; + const int padding = paddings[0]; const int input_channel_stride = input_height * input_width; const int output_channel_stride = output_height * output_width; @@ -157,33 +650,36 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, const int input_batch_stride = output_channels * input_channel_stride; const int output_batch_stride = output_channels * output_channel_stride; for (int i = 0; i < batch_size; ++i) { +#pragma omp parallel for for (int c = 0; c < output_channels; ++c) { + const float *input_seg = input_data + c * input_channel_stride; + float *output_seg = output_data + c * output_channel_stride; for (int ph = 0; ph < output_height; ph++) { for (int pw = 0; pw < output_width; pw++) { - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int hend = min(hstart + _kernel_size, input_height + padding_height); - int wend = min(wstart + _kernel_size, input_width + padding_width); + int hstart = ph * stride - padding; + int wstart = pw * stride - padding; + int hend = min(hstart + 3, input_height + padding); + int wend = min(wstart + 3, input_width + padding); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, input_height); wend = min(wend, input_width); - const float *pos1 = input_data + hstart * input_width + wstart; - const float *pos2 = input_data + (hstart + 1) * input_width + wstart; - const float *pos3 = input_data + (hstart + 2) * input_width + wstart; - const float *output_ptr = output_data + ph * output_width + pw; + const float *pos1 = input_seg + hstart * input_width + wstart; 
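+          // pos1 above and pos2/pos3 below address the three consecutive
+          // input rows under the current 3x3 window; clipped border windows
+          // take the scalar branch that follows, full windows the NEON path.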
+          const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
+          const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;
+          float *output_ptr = output_seg + ph * output_width + pw;
           if (hend - hstart != 3 || wend - wstart != 3) {
             float sum = 0;
             for (int h = hstart; h < hend; h++) {
               for (int w = wstart; w < wend; w++) {
-                sum += input_data[h * input_width + w];
+                sum += input_seg[h * input_width + w];
               }
             }
-            output_data[ph * output_width + pw] = sum / 9.0;
+            output_seg[ph * output_width + pw] = sum / 9.0;
           } else {
-#if defined(ARMV7)
-
+#if __aarch64__
+#else
             asm volatile(
                 "vld1.32 {q1}, [%[pos1]] \n\t"
                 "vld1.32 {q2}, [%[pos2]] \n\t"
@@ -197,12 +693,12 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
                 "vmul.f32 d6,d7 \n\t"
                 "vst1.32 {d6[0]},[%[output_ptr]] \n\t"
                 :
-                : [input_data] "r"(input_data), [pos1] "r"(pos1),
+                : [input_seg] "r"(input_seg), [pos1] "r"(pos1),
                   [pos2] "r"(pos2), [pos3] "r"(pos3),
                   [output_ptr] "r"(output_ptr), [zero] "r"(zero),
                   [nine_ptr] "r"(nine_ptr)
                 : "memory", "r6", "q1", "q2", "q3", "q4");
-#else
+#endif
             const float32x4_t data1 = vld1q_f32(pos1);
             const float32x4_t data2 = vld1q_f32(pos2);
             const float32x4_t data3 = vld1q_f32(pos3);
@@ -212,17 +708,15 @@
                 vpadd_f32(vget_high_f32(vsetq_lane_f32(0, sum_data, 3)),
                           vget_low_f32(sum_data));
             res = vpadd_f32(res, res);
-            output_data[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
-#endif
+            output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
           }
         }
       }
-      input_data += input_channel_stride;
-      output_data += output_channel_stride;
     }
     input_data += input_batch_stride;
     output_data += output_batch_stride;
   }
+#else
 #endif
 }
 }  // namespace math
diff --git a/src/operators/math/pool_3x3.h b/src/operators/math/pool_3x3.h
index 22a398084390701aefc8815c9aa93b82b4c4ec7b..ac1eb16a4c0e077c625267545767b8f29144b8f1 100644
--- a/src/operators/math/pool_3x3.h
+++ b/src/operators/math/pool_3x3.h
@@ -15,7 +15,11 @@ limitations under the License. */
 #ifdef POOL_OP
 #pragma once
-
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <algorithm>
+#include <vector>
 #include "framework/tensor.h"
 #if __ARM_NEON
 #include <arm_neon.h>
@@ -26,7 +30,8 @@ namespace operators {
 namespace math {
 using framework::Tensor;
 using std::vector;
-
+void Pool3x3Avgs1p1(const Tensor *input, Tensor *output);
+void Pool3x3Maxs1p1(const Tensor *input, Tensor *output);
 void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
                 Tensor *output);
diff --git a/src/operators/math/pooling.cpp b/src/operators/math/pooling.cpp
index 4287408394f1a7f407154938f3e83e9fac3543a2..24db2e272e3124a223c22c6f687d868d42126f6b 100644
--- a/src/operators/math/pooling.cpp
+++ b/src/operators/math/pooling.cpp
@@ -16,6 +16,9 @@ limitations under the License.
 */
 #include "pooling.h"
 #include "common/types.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 namespace paddle_mobile {
 namespace operators {
@@ -57,8 +60,8 @@ class PoolFunctor<CPU, PoolProcess, T> {
     T *output_data = output->mutable_data<T>();
     for (int i = 0; i < batch_size; i++) {
-      #pragma omp parallel for
       for (int c = 0; c < output_channels; ++c) {
+#pragma omp parallel for
         for (int ph = 0; ph < output_height; ++ph) {
           int hstart = ph * stride_height - padding_height;
           int hend = std::min(hstart + ksize_height, input_height);
diff --git a/src/operators/math/pooling.h b/src/operators/math/pooling.h
index bc2ecf41d224c2b0fd518d44fecc3f688d98ee19..3ca868fa4de4b9fefdcd8c18c0d7107cc9f60b4f 100644
--- a/src/operators/math/pooling.h
+++ b/src/operators/math/pooling.h
@@ -65,7 +65,7 @@ class PoolFunctor {
                   const std::vector<int> &paddings, PoolProcess pool_compute,
                   framework::Tensor *output);
 };
-}
+}  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/math/softmax.cpp b/src/operators/math/softmax.cpp
index a1eb4f13d82376d86da258101b15e6ae5e8bdc97..dba88c93969014f2ad0d2636b4141c734dbc2ed5 100644
--- a/src/operators/math/softmax.cpp
+++ b/src/operators/math/softmax.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "operators/math/softmax.h"
 #include "common/types.h"
-#if __ARM_NEON
+#ifdef __ARM_NEON
 #include <math.h>
 #include <algorithm>
 #include "operators/math/math_func_neon.h"
@@ -29,7 +29,7 @@ using framework::DDim;
 using framework::Tensor;
 template <typename DeviceType, typename T>
 class SoftmaxFuntor {
-#if __ARM_NEON
+#ifdef __ARM_NEON
   void sum(float *input, float *sumptr, int inner_size, int outter_size) {
     float32x4_t acc = vdupq_n_f32(0);
     float sum_ = 0;
@@ -135,6 +135,7 @@ class SoftmaxFuntor {
       }
     }
   }
+#else
 #endif  // ARM_NEON
 public:
@@ -144,7 +145,7 @@ class SoftmaxFuntor {
       framework::Tensor sub_X = X->Slice(i, i + 1);
       framework::Tensor sub_Y = Y->Slice(i, i + 1);
-#if __ARM_NEON
+#ifdef __ARM_NEON
       SoftmaxCacl(&sub_X, &sub_Y);
 #endif
     }
diff --git a/src/operators/mul_op.cpp b/src/operators/mul_op.cpp
index d97c6ec3e470bb2b083ef7e5234168c6fdfc34c1..044da7012eccde57a87d417f4f3c00b82e01da42 100644
--- a/src/operators/mul_op.cpp
+++ b/src/operators/mul_op.cpp
@@ -50,16 +50,16 @@ void MulOp<Dtype, T>::InferShape() const {
   framework::DDim ddim = framework::make_ddim(output_dims);
   this->param_.Out()->Resize(ddim);
 }
-template class MulOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(mul);
 REGISTER_OPERATOR_CPU(mul, ops::MulOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/mul_op.h b/src/operators/mul_op.h
index ad5c9a3702348455cb559c28453df82d81e1c4c8..64b811b01091418c9febdfb8d03bacd77421dcf5 100644
--- a/src/operators/mul_op.h
+++ b/src/operators/mul_op.h
@@ -46,4 +46,13 @@ class MulOp : public framework::OperatorWithKernel<
 }  // namespace operators
 }  // namespace paddle_mobile
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(mul);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(mul);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
diff --git a/src/operators/multiclass_nms_op.cpp b/src/operators/multiclass_nms_op.cpp
index 52adf6cc627d76b18b3b48928c344545327ca99e..4324cab35298a45ece7e375299909994648a27a4 100644
--- a/src/operators/multiclass_nms_op.cpp
+++ b/src/operators/multiclass_nms_op.cpp
@@ -34,13 +34,12 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const {
   // pre size, will change in Compute.
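   // ({input_bboxes_dims[1], 6} reserves one row per prior box: each output
   //  row of multiclass NMS is [label, confidence, xmin, ymin, xmax, ymax],
   //  and the real row count is only known once Compute has filtered the
   //  candidate boxes.)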
this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6})); } -template class MultiClassNMSOp; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(multiclass_nms); REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/multiclass_nms_op.h b/src/operators/multiclass_nms_op.h index 30cf8f67942f7888599e8f0057baff1ddd5d6cea..425f7d33e35e0864b5f5a7739dbfa18bc8eb0c30 100644 --- a/src/operators/multiclass_nms_op.h +++ b/src/operators/multiclass_nms_op.h @@ -52,4 +52,12 @@ class MultiClassNMSOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(multiclass_nms); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/op_param.h b/src/operators/op_param.h index ad7de0ee44db3a727ec06d5fabfca203226215f4..4d1d5af29b81b044ca6d89b4a48a078f73dcabc9 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -22,6 +22,9 @@ limitations under the License. */ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" +#ifdef PADDLE_MOBILE_FPGA +#include "fpga/api/fpga_api.h" +#endif namespace paddle_mobile { namespace operators { @@ -195,8 +198,7 @@ class OpParam { class ConvParam : OpParam { public: ConvParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { + const AttributeMap &attrs, const Scope &scope) { filter_ = FilterFrom(inputs, scope); input_ = InputFrom(inputs, scope); output_ = OutputFrom(outputs, scope); @@ -233,16 +235,14 @@ class ConvParam : OpParam { Print &operator<<(Print &printer, const ConvParam &conv_param); #endif -#ifdef ELEMENTWISEADD_OP class ElementwiseAddParam : OpParam { public: ElementwiseAddParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_y_ = InputYFrom(inputs, scope); + out_ = OutFrom(outputs, scope); axis_ = GetAttr("axis", attrs); } @@ -259,19 +259,29 @@ class ElementwiseAddParam : OpParam { Tensor *input_y_; Tensor *out_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::EWAddArgs fpga_EW_add_args; + + public: + const fpga::EWAddArgs &FpgaArgs() const { return fpga_EW_add_args; } + void SetFpgaArgs(const fpga::EWAddArgs &args) { fpga_EW_add_args = args; } +#endif }; +#ifdef FUSION_ELEMENTWISEADDRELU_OP +using ElementwiseAddReluParam = ElementwiseAddParam; #endif #ifdef MUL_OP class MulParam : OpParam { public: MulParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_y_ = InputYFrom(inputs, scope); + out_ = OutFrom(outputs, scope); x_num_col_dims_ = GetAttr("x_num_col_dims", attrs); y_num_col_dims_ = GetAttr("y_num_col_dims", attrs); } @@ -299,10 +309,9 @@ class MulParam : OpParam { class 
ConcatParam : public OpParam { public: ConcatParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { + const AttributeMap &attrs, const Scope &scope) { inputs_ = InputMultiFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + out_ = OutFrom(outputs, scope); axis_ = GetAttr("axis", attrs); } @@ -323,11 +332,10 @@ class ConcatParam : public OpParam { class LrnParam : public OpParam { public: LrnParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); - mid_out_ = MidOutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + mid_out_ = MidOutFrom(outputs, scope); n_ = GetAttr("n", attrs); alpha_ = GetAttr("alpha", attrs); beta_ = GetAttr("beta", attrs); @@ -367,17 +375,16 @@ class LrnParam : public OpParam { class BatchNormParam : OpParam { public: BatchNormParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - output_y_ = OutputYFrom(outputs, scope); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, scope); - input_variance_ = InputVarianceFrom(inputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + output_y_ = OutputYFrom(outputs, scope); + input_bias_ = InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); epsilon_ = GetAttr("epsilon", attrs); momentum_ = GetAttr("momentum", attrs); - is_test_ = GetAttr("is_test", attrs); + // is_test_ = GetAttr("is_test", attrs); } const Tensor *InputX() const { return input_x_; } @@ -418,17 +425,16 @@ class BatchNormParam : OpParam { class PoolParam : public OpParam { public: PoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_ = InputXFrom(inputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_ = InputXFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + output_ = OutFrom(outputs, scope); pooling_type_ = GetAttr("pooling_type", attrs); ksize_ = GetAttr>("ksize", attrs); strides_ = GetAttr>("strides", attrs); paddings_ = GetAttr>("paddings", attrs); ceil_mode_ = GetAttr("ceil_mode", attrs); - gloabal_pooling_ = GetAttr("global_pooling", attrs); + global_pooling_ = GetAttr("global_pooling", attrs); } const Tensor *Input() const { return input_; } @@ -445,7 +451,7 @@ class PoolParam : public OpParam { bool isCeilMode() const { return ceil_mode_; } - bool isGlobalPooling() const { return gloabal_pooling_; } + bool isGlobalPooling() const { return global_pooling_; } private: Tensor *input_; @@ -455,22 +461,28 @@ class PoolParam : public OpParam { vector strides_; vector paddings_; bool ceil_mode_; - bool gloabal_pooling_ = false; -}; + bool global_pooling_ = false; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::PoolingArgs fpga_pool_args; + public: + const fpga::PoolingArgs &FpgaArgs() const { return fpga_pool_args; } + void SetFpgaArgs(const fpga::PoolingArgs &args) { fpga_pool_args = args; } +#endif +}; 
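+// A sketch of how the FpgaArgs()/SetFpgaArgs() pair above is meant to be
+// consumed, assuming the usual Init()/Compute() kernel split; the kernel body
+// and the dispatch call below are illustrative only, not APIs this patch
+// defines:
+//
+//   bool PoolKernel<FPGA, float>::Init(PoolParam *param) {
+//     fpga::PoolingArgs args;  // derived from param->Input(), ksize, strides
+//     param->SetFpgaArgs(args);
+//     return true;
+//   }
+//   void PoolKernel<FPGA, float>::Compute(const PoolParam &param) const {
+//     fpga::ComputePool(param.FpgaArgs());  // hypothetical dispatch call
+//   }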
#endif #ifdef PRIORBOX_OP class PriorBoxParam : public OpParam { public: PriorBoxParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_ = InputFrom(inputs, scope); - input_image_ = InputImageFrom(inputs, scope); - output_boxes_ = OutputBoxesFrom(outputs, scope); - output_variances_ = - OutputVariancesFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_ = InputFrom(inputs, scope); + input_image_ = InputImageFrom(inputs, scope); + output_boxes_ = OutputBoxesFrom(outputs, scope); + output_variances_ = OutputVariancesFrom(outputs, scope); min_sizes_ = GetAttr>("min_sizes", attrs); max_sizes_ = GetAttr>("max_sizes", attrs); aspect_ratios_ = GetAttr>("aspect_ratios", attrs); @@ -528,13 +540,11 @@ class PriorBoxParam : public OpParam { class BoxCoderParam : public OpParam { public: BoxCoderParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_priorbox_ = InputPriorBoxFrom(inputs, scope); - input_priorboxvar_ = - InputPriorBoxVarFrom(inputs, scope); - input_targetbox_ = InputTargetBoxFrom(inputs, scope); - output_box_ = OutputBoxFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_priorbox_ = InputPriorBoxFrom(inputs, scope); + input_priorboxvar_ = InputPriorBoxVarFrom(inputs, scope); + input_targetbox_ = InputTargetBoxFrom(inputs, scope); + output_box_ = OutputBoxFrom(outputs, scope); code_type_ = GetAttr("code_type", attrs); } const Tensor *InputPriorBox() const { return input_priorbox_; } @@ -560,10 +570,9 @@ class BoxCoderParam : public OpParam { class SoftmaxParam : public OpParam { public: SoftmaxParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); } const Tensor *InputX() const { return input_x_; } Tensor *Out() const { return out_; } @@ -578,10 +587,9 @@ class SoftmaxParam : public OpParam { class SigmoidParam : public OpParam { public: SigmoidParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); } const Tensor *InputX() const { return input_x_; } Tensor *Out() const { return out_; } @@ -643,10 +651,10 @@ class MultiClassNMSParam : public OpParam { class FeedParam : public OpParam { public: FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); - auto var = scope.Var("batch_size"); + const AttributeMap &attrs, Scope *scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); + auto var = scope->Var("batch_size"); batch_size = var->GetValue(); } const Tensor *InputX() const { return input_x_; } @@ -657,15 +665,24 @@ class FeedParam : public OpParam { Tensor *input_x_; Tensor *out_; int batch_size; + +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::BypassArgs fpga_bypass_args; + + public: + 
const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } + void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } +#endif }; class FetchParam : public OpParam { public: FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); } const Tensor *InputX() const { return input_x_; } Tensor *Out() const { return out_; } @@ -729,6 +746,123 @@ class ReshapeParam : public OpParam { }; #endif +#ifdef SCALE_OP +class ScaleParam : public OpParam { + public: + ScaleParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_bias_ = InputBiasFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + inplace_ = GetAttr("inplace", attrs); + has_bias_ = GetAttr("has_bias", attrs); + scales_ = GetAttr>("scales", attrs); + biases_ = GetAttr>("biases", attrs); + } + + const Tensor *InputX() const { return input_x_; } + + const Tensor *InputBias() const { return input_bias_; } + + Tensor *Out() const { return out_; } + + const bool &Inplace() const { return inplace_; } + + const bool &HasBias() const { return has_bias_; } + + const vector &Scales() const { return scales_; } + + const vector &Biases() const { return biases_; } + + private: + Tensor *input_x_; + Tensor *input_bias_; + Tensor *out_; + bool inplace_; + bool has_bias_; + vector scales_; + vector biases_; +}; +#endif + +#ifdef SLICE_OP +class SliceParam : public OpParam { + public: + SliceParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_shape_ = InputShapeFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + axis_ = GetAttr("axis", attrs); + slice_points_ = GetAttr>("slice_points", attrs); + inplace_ = GetAttr("inplace", attrs); + } + + const Tensor *InputX() const { return input_x_; } + + const Tensor *InputShape() const { return input_shape_; } + + Tensor *Out() const { return out_; } + + const int &Axis() const { return axis_; } + + const vector &SlicePoints() const { return slice_points_; } + + const bool &Inplace() const { return inplace_; } + + private: + Tensor *input_x_; + Tensor *input_shape_; + Tensor *out_; + int axis_; + vector slice_points_; + bool inplace_; +}; +#endif + +#ifdef RESIZE_OP +class ResizeParam : public OpParam { + public: + ResizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_shape_ = InputShapeFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + is_pyramid_test_ = GetAttr("is_pyramid_test", attrs); + height_ = GetAttr("height", attrs); + width_ = GetAttr("width", attrs); + out_height_scale_ = GetAttr("out_height_scale", attrs); + out_width_scale_ = GetAttr("out_width_scale", attrs); + } + + const Tensor *InputX() const { return input_x_; } + + const Tensor *InputShape() const { return input_shape_; } + + Tensor *Out() const { return out_; } + + const bool &IsPyramidTest() const { return is_pyramid_test_; } + + const int &Height() const { return height_; } + + const int &Width() const { return width_; } + + const float &OutHeightScale() const { return 
out_height_scale_; } + + const float &OutWidthScale() const { return out_width_scale_; } + + private: + Tensor *input_x_; + Tensor *input_shape_; + Tensor *out_; + bool is_pyramid_test_; + int height_; + int width_; + float out_height_scale_; + float out_width_scale_; +}; +#endif + #ifdef RELU_OP /* * @b op 层实例化好这个 param 传递给 kernel 层使用 @@ -751,7 +885,27 @@ class ReluParam : public OpParam { }; #endif -#ifdef FUSION_FC_OP +#ifdef PRELU_OP +class PReluParam : public OpParam { + public: + PReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + slopes_ = GetAttr>("slopes", attrs); + } + + const Tensor *InputX() const { return input_x_; } + Tensor *Out() const { return out_; } + const vector &Slopes() const { return slopes_; } + + private: + Tensor *input_x_; + Tensor *out_; + vector slopes_; +}; +#endif + class FusionFcParam : public OpParam { public: FusionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -766,7 +920,11 @@ class FusionFcParam : public OpParam { } const Tensor *InputX() const { return input_x_; } +#ifdef PADDLE_MOBILE_FPGA + Tensor *InputY() const { return input_y_; } +#else const Tensor *InputY() const { return input_y_; } +#endif const Tensor *InputZ() const { return input_z_; } @@ -786,10 +944,21 @@ class FusionFcParam : public OpParam { int x_num_col_dims_; int y_num_col_dims_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::ConvArgs fpga_conv_args; + + public: + const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } +#endif }; + +#ifdef FUSION_FCRELU_OP +using FusionFcReluParam = FusionFcParam; #endif -#ifdef FUSION_CONVADD_OP class FusionConvAddParam : public OpParam { public: FusionConvAddParam(const VariableNameMap &inputs, @@ -811,7 +980,11 @@ class FusionConvAddParam : public OpParam { const Tensor *Input() const { return input_; } +#ifdef PADDLE_MOBILE_FPGA + Tensor *Filter() const { return filter_; } +#else const Tensor *Filter() const { return filter_; } +#endif Tensor *Output() const { return output_; } @@ -833,12 +1006,20 @@ class FusionConvAddParam : public OpParam { vector paddings_; vector dilations_; int groups; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::ConvArgs fpga_conv_args; + + public: + const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } +#endif }; Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); -#endif -#ifdef FUSION_CONVADD_RELU_OP +#ifdef FUSION_CONVADDRELU_OP class FusionConvAddReluParam : public FusionConvAddParam { public: FusionConvAddReluParam(const VariableNameMap &inputs, @@ -848,5 +1029,450 @@ class FusionConvAddReluParam : public FusionConvAddParam { }; #endif +#ifdef FUSION_CONVADDBNRELU_OP +class FusionConvAddBNReluParam : public OpParam { + public: + FusionConvAddBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + bias_ = InputYFrom(inputs, scope); + axis_ = GetAttr("axis", attrs); + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_ = OutFrom(outputs, scope); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + input_bias_ = 
InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); + epsilon_ = GetAttr("epsilon", attrs); + momentum_ = GetAttr("momentum", attrs); + // is_test_ = GetAttr("is_test", attrs); + } + Tensor *Bias() const { return bias_; } + + const int &Axis() const { return axis_; } + + const Tensor *Input() const { return input_; } + +#ifdef PADDLE_MOBILE_FPGA + Tensor *Filter() const { return filter_; } +#else + const Tensor *Filter() const { return filter_; } +#endif + + Tensor *Output() const { return output_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() const { return groups; } + + const Tensor *InputBias() const { return input_bias_; } + + const Tensor *InputMean() const { return input_mean_; } + + const Tensor *InputScale() const { return input_scale_; } + + const Tensor *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; } + + const Tensor *NewScale() const { return new_scale_; } + + const Tensor *NewBias() const { return new_bias_; } + + protected: + Tensor *bias_; + int axis_; + Tensor *input_; + Tensor *output_; + Tensor *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; + Tensor *input_bias_; + Tensor *input_mean_; + Tensor *input_scale_; + Tensor *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + Tensor *new_bias_; + Tensor *new_scale_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::ConvArgs fpga_conv_args; + + public: + const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } +#endif +}; +#endif + +#ifdef FUSION_CONVADDBN_OP +class FusionConvAddBNParam : public OpParam { + public: + FusionConvAddBNParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + bias_ = InputYFrom(inputs, scope); + axis_ = GetAttr("axis", attrs); + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_y_ = OutputYFrom(outputs, scope); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + input_bias_ = InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); + epsilon_ = GetAttr("epsilon", attrs); + momentum_ = GetAttr("momentum", attrs); + // is_test_ = GetAttr("is_test", attrs); + } + Tensor *Bias() const { return bias_; } + + const int &Axis() const { return axis_; } + + const Tensor *Input() const { return input_; } + +#ifdef PADDLE_MOBILE_FPGA + Tensor *Filter() const { return filter_; } +#else + const Tensor *Filter() const { return filter_; } +#endif + Tensor *Output() const { return output_y_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() 
const { return groups; } + + const Tensor *InputBias() const { return input_bias_; } + + const Tensor *InputMean() const { return input_mean_; } + + const Tensor *InputScale() const { return input_scale_; } + + const Tensor *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; } + + const Tensor *NewScale() const { return new_scale_; } + + const Tensor *NewBias() const { return new_bias_; } + + protected: + Tensor *bias_; + int axis_; + Tensor *input_; + Tensor *output_y_; + Tensor *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; + Tensor *input_bias_; + Tensor *input_mean_; + Tensor *input_scale_; + Tensor *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + Tensor *new_bias_; + Tensor *new_scale_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::ConvArgs fpga_conv_args; + + public: + const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } +#endif +}; +#endif + +#ifdef FUSION_DWCONVBNRELU_OP +class FusionDWConvBNReluParam : public OpParam { + public: + FusionDWConvBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_ = OutFrom(outputs, scope); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + input_bias_ = InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); + epsilon_ = GetAttr("epsilon", attrs); + momentum_ = GetAttr("momentum", attrs); + // is_test_ = GetAttr("is_test", attrs); + } + + const Tensor *Input() const { return input_; } + + const Tensor *Filter() const { return filter_; } + + Tensor *Output() const { return output_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() const { return groups; } + + const Tensor *InputBias() const { return input_bias_; } + + const Tensor *InputMean() const { return input_mean_; } + + const Tensor *InputScale() const { return input_scale_; } + + const Tensor *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; } + + const Tensor *NewScale() const { return new_scale_; } + + const Tensor *NewBias() const { return new_bias_; } + + protected: + Tensor *input_; + Tensor *output_; + Tensor *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; + Tensor *input_bias_; + Tensor *input_mean_; + Tensor *input_scale_; + Tensor *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + Tensor *new_bias_; + Tensor *new_scale_; +}; + +Print &operator<<(Print &printer, const 
FusionConvAddParam &conv_param); +#endif + +#ifdef FUSION_CONVBNRELU_OP +class FusionConvBNReluParam : public OpParam { + public: + FusionConvBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_ = OutFrom(outputs, scope); + + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + input_bias_ = InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); + epsilon_ = GetAttr("epsilon", attrs); + momentum_ = GetAttr("momentum", attrs); + // is_test_ = GetAttr("is_test", attrs); + } + + const Tensor *Input() const { return input_; } + + const Tensor *Filter() const { return filter_; } + + Tensor *Output() const { return output_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() const { return groups; } + + const Tensor *InputBias() const { return input_bias_; } + + const Tensor *InputMean() const { return input_mean_; } + + const Tensor *InputScale() const { return input_scale_; } + + const Tensor *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; } + + const Tensor *NewScale() const { return new_scale_; } + + const Tensor *NewBias() const { return new_bias_; } + + protected: + Tensor *input_; + Tensor *output_; + Tensor *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; + Tensor *input_bias_; + Tensor *input_mean_; + Tensor *input_scale_; + Tensor *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + Tensor *new_bias_; + Tensor *new_scale_; +}; +#endif + +#ifdef IM2SEQUENCE_OP +class Im2SequenceParam : public OpParam { + public: + Im2SequenceParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + kernels_ = GetAttr>("kernels", attrs); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + } + + const Tensor *Input() const { return input_x_; } + + Tensor *Output() const { return out_; } + + const vector &Kernels() const { return kernels_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + private: + Tensor *input_x_; + Tensor *out_; + vector kernels_; + vector strides_; + vector paddings_; +}; +#endif + +#ifdef DROPOUT_OP +class DropoutParam : public OpParam { + public: + DropoutParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + } + + const Tensor *InputX() const { return input_x_; } + + Tensor *Out() const { return out_; } + + private: + Tensor *input_x_; + Tensor *out_; +}; +#endif + +#ifdef CONV_TRANSPOSE +class ConvTransposeParam : public OpParam 
{ + public: + ConvTransposeParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_ = OutputFrom(outputs, scope); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + } + + const Tensor *Input() const { return input_; } + + const Tensor *Filter() const { return filter_; } + + Tensor *Output() const { return output_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() const { return groups; } + + private: + Tensor *input_; + Tensor *output_; + Tensor *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; +}; +#endif + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/pool_op.cpp b/src/operators/pool_op.cpp index e8a469d43141f0b880605b52216094c292ca50fb..dd23059ea01a332aff45137b7f7ed4c9f6c2e1bb 100644 --- a/src/operators/pool_op.cpp +++ b/src/operators/pool_op.cpp @@ -54,18 +54,19 @@ void PoolOp::InferShape() const { } this->param_.Output()->Resize(framework::make_ddim(output_shape)); } -template class PoolOp; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(pool2d); REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp); #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(pool2d, ops::PoolOp); #endif #endif diff --git a/src/operators/pool_op.h b/src/operators/pool_op.h index 5b436fb18bdc055add21acd37e5a1a9c7b6e5b02..bc823e70c5a8e7b229a0101726316d1e825b7b54 100644 --- a/src/operators/pool_op.h +++ b/src/operators/pool_op.h @@ -48,4 +48,14 @@ class PoolOp : public OperatorWithKernel +void PReluOp::InferShape() const { + auto input_dims = this->param_.InputX()->dims(); + this->param_.Out()->Resize(input_dims); +} + +} // namespace operators +} // namespace paddle_mobile + +/* + * @b 每一个 op 都需要注册一下的, + * USE_OP的参数 和 REGISTER_OPERATOR的第一个参数 + * 都是需要和model中类型对应起来的 + * */ +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(prelu); +REGISTER_OPERATOR_CPU(prelu, ops::PReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(prelu); +REGISTER_OPERATOR_MALI_GPU(prelu, ops::PReluOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/prelu_op.h b/src/operators/prelu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..330d1d4a567b10bddf5879b4897a945350a34ac5 --- /dev/null +++ b/src/operators/prelu_op.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#ifdef PRELU_OP
+
+#pragma once
+
+#include <string>
+
+#include "framework/operator.h"
+#include "operators/kernel/prelu_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class PReluOp
+    : public framework::OperatorWithKernel<
+          DeviceType, PReluParam, operators::PReluKernel<DeviceType, T>> {
+ public:
+  PReluOp(const std::string &type, const VariableNameMap &inputs,
+          const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+          std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, PReluParam, operators::PReluKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, PReluParam,
+      operators::PReluKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/prior_box_op.cpp b/src/operators/prior_box_op.cpp
index 44e1741b66f301aee55f1f4d33b9bb1173e6004d..a05a0ddcec5ba9d442b58846468a121e9b655a6a 100644
--- a/src/operators/prior_box_op.cpp
+++ b/src/operators/prior_box_op.cpp
@@ -44,13 +44,12 @@ void PriorBoxOp<Dtype, T>::InferShape() const {
   this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec));
   this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec));
 }
-template class PriorBoxOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(prior_box);
 REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
diff --git a/src/operators/prior_box_op.h b/src/operators/prior_box_op.h
index 5b3e3fffd6787360b69ff3af2d19bc8e05549c04..6fcaa07c74f0e005fd5b91ae04ec7219e0394064 100644
--- a/src/operators/prior_box_op.h
+++ b/src/operators/prior_box_op.h
@@ -51,4 +51,12 @@ class PriorBoxOp
 }  // namespace operators
 }  // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(prior_box);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp
index cf495d8bdace83f5dd7f86d372d07b3241867af9..2a771e81e7a5a0e869984990b52b98d15036543a 100644
--- a/src/operators/relu_op.cpp
+++ b/src/operators/relu_op.cpp
@@ -23,7 +23,7 @@ void ReluOp<DeviceType, T>::InferShape() const {
   auto input_dims = this->param_.InputX()->dims();
   this->param_.Out()->Resize(input_dims);
 }
-template class ReluOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
@@ -34,10 +34,10 @@ template class ReluOp<CPU, float>;
  * */
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(relu);
 REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/relu_op.h b/src/operators/relu_op.h
index 8f9e55cf8a2d5bb58e85c21cd2cee3647b00fa22..204ec3d29c147d0d52b9b05d16de6807211a5e57 100644
--- a/src/operators/relu_op.h
+++ b/src/operators/relu_op.h
@@ -53,4 +53,13 @@ class ReluOp
 }  // namespace operators
 }  // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(relu);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(relu);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
diff --git a/src/operators/reshape_op.cpp b/src/operators/reshape_op.cpp
index 0fdcaf4d1a95ccd2a0ceccdc6d890b30a1d66368..dcc15009af2b23129552d58b3fa22c3c67684dce 100644
--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
@@ -27,16 +27,16 @@ void ReshapeOp<Dtype, T>::InferShape() const {
   auto out_dims = ValidateShape(shape, input_x_dims);
   this->param_.Out()->Resize(out_dims);
 }
-template class ReshapeOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(reshape);
 REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/reshape_op.h b/src/operators/reshape_op.h
index 90d31153135f629585d56eb89ae12830215900d8..da2328ec3570359ccdb45ce1511c02f322498aa1 100644
--- a/src/operators/reshape_op.h
+++ b/src/operators/reshape_op.h
@@ -51,4 +51,14 @@ class ReshapeOp
 }  // namespace operators
 }  // namespace paddle_mobile
 
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(reshape);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(reshape);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
diff --git a/src/operators/resize_op.cpp b/src/operators/resize_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..02c50b662665fc9bd2f662922cb88dbce9fc5d53
--- /dev/null
+++ b/src/operators/resize_op.cpp
@@ -0,0 +1,43 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESIZE_OP
+
+#include "operators/resize_op.h"
+#include <vector>
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+void ResizeOp<DeviceType, T>::InferShape() const {
+  auto out_dims = CalOutputShape(this->param_);
+  this->param_.Out()->Resize(out_dims);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(resize);
+REGISTER_OPERATOR_CPU(resize, ops::ResizeOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(resize);
+REGISTER_OPERATOR_MALI_GPU(resize, ops::ResizeOp);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/resize_op.h b/src/operators/resize_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cab048dea350d668c92fda56f6b6b197c38093d
--- /dev/null
+++ b/src/operators/resize_op.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESIZE_OP
+
+#pragma once
+
+#include <string>
+
+#include "framework/operator.h"
+#include "operators/kernel/resize_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class ResizeOp
+    : public framework::OperatorWithKernel<
+          DeviceType, ResizeParam, operators::ResizeKernel<DeviceType, T>> {
+ public:
+  ResizeOp(const std::string &type, const VariableNameMap &inputs,
+           const VariableNameMap &outputs,
+           const framework::AttributeMap &attrs,
+           std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, ResizeParam, operators::ResizeKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, ResizeParam,
+      operators::ResizeKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/scale_op.cpp b/src/operators/scale_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..968fcd4098e92a47899c9a733c0261d91c314c29
--- /dev/null
+++ b/src/operators/scale_op.cpp
@@ -0,0 +1,43 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SCALE_OP
+
+#include "operators/scale_op.h"
+#include <vector>
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+void ScaleOp<DeviceType, T>::InferShape() const {
+  auto input_dims = this->param_.InputX()->dims();
+  this->param_.Out()->Resize(input_dims);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(scale);
+REGISTER_OPERATOR_CPU(scale, ops::ScaleOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(scale);
+REGISTER_OPERATOR_MALI_GPU(scale, ops::ScaleOp);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/scale_op.h b/src/operators/scale_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6a900d646a53cb8aa2a0ab624a31781f420db06
--- /dev/null
+++ b/src/operators/scale_op.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SCALE_OP
+
+#pragma once
+
+#include <string>
+
+#include "framework/operator.h"
+#include "operators/kernel/scale_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class ScaleOp
+    : public framework::OperatorWithKernel<
+          DeviceType, ScaleParam, operators::ScaleKernel<DeviceType, T>> {
+ public:
+  ScaleOp(const std::string &type, const VariableNameMap &inputs,
+          const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+          std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, ScaleParam, operators::ScaleKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, ScaleParam,
+      operators::ScaleKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/sigmoid_op.cpp b/src/operators/sigmoid_op.cpp
index 79190e6c3368b9d375770062d948580779393f04..8ea4c98942e0630f5b69133991583ee1192c8153 100644
--- a/src/operators/sigmoid_op.cpp
+++ b/src/operators/sigmoid_op.cpp
@@ -22,13 +22,12 @@ template <typename DeviceType, typename T>
 void SigmoidOp<DeviceType, T>::InferShape() const {
   this->param_.Out()->Resize(this->param_.InputX()->dims());
 }
-template class SigmoidOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(sigmoid);
 REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
diff --git a/src/operators/sigmoid_op.h b/src/operators/sigmoid_op.h
index bd914a63783f65c7b55d783f2bbcdf19c303c00f..bffef7880b2bb6057f5d489eaac6dea7a3fb3ab5 100644
--- a/src/operators/sigmoid_op.h
+++ b/src/operators/sigmoid_op.h
@@ -46,4 +46,12 @@ class SigmoidOp
 }  // namespace operators
 }  // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(sigmoid);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
diff --git a/src/operators/slice_op.cpp b/src/operators/slice_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b77a675e10ed030443e1d4074239a715ddedf772
--- /dev/null
+++ b/src/operators/slice_op.cpp
@@ -0,0 +1,42 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SLICE_OP
+
+#include "operators/slice_op.h"
+#include <vector>
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+void SliceOp<DeviceType, T>::InferShape() const {
+  /// todo: add InputShape() detection.
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(slice);
+REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(slice);
+REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/slice_op.h b/src/operators/slice_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c9d074b24c1aaf1bf28c862f3731ca130f3c462
--- /dev/null
+++ b/src/operators/slice_op.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SLICE_OP
+
+#pragma once
+
+#include <string>
+
+#include "framework/operator.h"
+#include "operators/kernel/slice_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class SliceOp
+    : public framework::OperatorWithKernel<
+          DeviceType, SliceParam, operators::SliceKernel<DeviceType, T>> {
+ public:
+  SliceOp(const std::string &type, const VariableNameMap &inputs,
+          const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+          std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, SliceParam, operators::SliceKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, SliceParam,
+      operators::SliceKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp
index e25b59198f3206357a770a104080f99bafa84dc5..c9edfccf4ff08e5a12d735526c3d63c689711357 100644
--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -22,16 +22,16 @@ template <typename DeviceType, typename T>
 void SoftmaxOp<DeviceType, T>::InferShape() const {
   this->param_.Out()->Resize(this->param_.InputX()->dims());
 }
-template class SoftmaxOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(softmax);
 REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif
diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h
index 1445ca055ea0472cdaa02d7496ff895feb9174bc..f645d7edf7a3b9f7a92cf286feec58e960a5e3b7 100644
--- a/src/operators/softmax_op.h
+++ b/src/operators/softmax_op.h
@@ -48,4 +48,13 @@ class SoftmaxOp
 }  // namespace operators
 }  // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(softmax);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(softmax);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
diff --git a/src/operators/transpose_op.cpp b/src/operators/transpose_op.cpp
index 989b277b9d58a8c029e041a89a1982f8994bae44..5f193f96396c8d4d7cb58143573015384e7a7c28 100644
--- a/src/operators/transpose_op.cpp
+++ b/src/operators/transpose_op.cpp
@@ -47,13 +47,12 @@ void
TransposeOp::InferShape() const { } this->param_.Out()->Resize(out_dims); } -template class TransposeOp; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(transpose); REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/transpose_op.h b/src/operators/transpose_op.h index 349220b58ff3e0daec8c7dc2e2dec969ced8b289..25cf07c4c9253736d513505e5f8eba6147f3740c 100644 --- a/src/operators/transpose_op.h +++ b/src/operators/transpose_op.h @@ -50,4 +50,12 @@ class TransposeOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(transpose); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9bfc55c93daa2f69200941bfb49a8a6312fa9eb1..468cbd4ed6d579f7b39f8628a3e052e90ae26644 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,26 +1,32 @@ set(dir ${CMAKE_CURRENT_SOURCE_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build") -if (googlenet) +if ("googlenet" IN_LIST NET) # gen test ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-googlenet paddle-mobile) -elseif (mobilenet) +elseif ("mobilenet" IN_LIST NET) # gen test ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-mobilenet paddle-mobile) -elseif (yolo) + + # gen test + ADD_EXECUTABLE(test-mobilenet-combine net/test_mobilenet_combine.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-mobilenet-combine paddle-mobile) + +elseif ("yolo" IN_LIST NET) # gen test ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-yolo paddle-mobile) -elseif (squeezenet) +elseif ("squeezenet" IN_LIST NET) # gen test ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-squeezenet paddle-mobile) -elseif(resnet) +elseif("resnet" IN_LIST NET) # gen test ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet paddle-mobile) +elseif("FPGAnets" IN_LIST NET) else () # gen test @@ -99,6 +105,10 @@ else () ADD_EXECUTABLE(test-load framework/test_load.cpp) target_link_libraries(test-load paddle-mobile) + ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp) + target_link_libraries(test-inference-api paddle-mobile) + + # gen test log # gen test ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp) @@ -114,8 +124,12 @@ else () target_link_libraries(test-softmax paddle-mobile) # gen test - ADD_EXECUTABLE(test-gemm common/test_gemm.cpp) - target_link_libraries(test-gemm paddle-mobile) + ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp) + target_link_libraries(test-gemm-accuracy paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp) + target_link_libraries(test-gemm-perf paddle-mobile) # gen test ADD_EXECUTABLE(test-enforce common/test_enforce.cpp) @@ -129,6 +143,10 @@ else () ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-mobilenetssd paddle-mobile) + # gen test + 
ADD_EXECUTABLE(test-mobilenet-combine net/test_mobilenet_combine.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-mobilenet-combine paddle-mobile) + # gen test ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h) target_link_libraries(test-sigmoid paddle-mobile) @@ -145,6 +163,18 @@ else () ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-conv-add-relu-op paddle-mobile) + # gen test + ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-conv-add-bn-relu-op paddle-mobile) + #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) + + +endif() + +if(FPGA) + ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-tensor-quant paddle-mobile) + endif() diff --git a/test/common/test_gemm.cpp b/test/common/test_gemm.cpp deleted file mode 100644 index f385bf960e266df1ddfd317c3281904fea1a21ee..0000000000000000000000000000000000000000 --- a/test/common/test_gemm.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "common/log.h" -#include "operators/math/gemm.h" - -#define a(i, j) a[(i)*lda + (j)] -#define b(i, j) b[(i)*ldb + (j)] -#define c1(i, j) c1[(i)*ldc + (j)] - -#define m 62 -#define n 63 -#define k 74 - -int main() { - int lda = k; - int ldb = n; - int ldc = n; - - float a[62 * 74]; - float b[74 * 63]; - float c[62 * 63] = {0}; - float c1[62 * 63] = {0}; - for (int i = 0; i < m * k; ++i) { - a[i] = 2; - } - for (int i = 0; i < k * n; ++i) { - b[i] = 2; - } - for (int i = 0; i < m * n; ++i) { - c[i] = 2; - c1[i] = 2; - } - - paddle_mobile::operators::math::sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, - ldc); - for (int i = 0; i < m * n; ++i) { - std::cout << c[i] << " | "; - if (i % n == (n - 1)) { - std::cout << std::endl; - } - } - for (int j = 0; j < n; ++j) { - for (int i = 0; i < m; ++i) { - c1(i, j) *= 0.3; - for (int p = 0; p < k; ++p) { - c1(i, j) += 0.9 * a(i, p) * b(p, j); - } - } - } - std::cout << "正确结果对比:" << std::endl; - for (int i = 0; i < m * n; ++i) { - std::cout << c1[i] << " | "; - if (i % n == (n - 1)) { - std::cout << std::endl; - } - } - return 0; -} diff --git a/test/common/test_gemm_accuracy.cpp b/test/common/test_gemm_accuracy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..35241fbd535e062be1c7f1f28eb3860d118a3455 --- /dev/null +++ b/test/common/test_gemm_accuracy.cpp @@ -0,0 +1,136 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cstdlib>
+#include <ctime>
+#include <iostream>
+#include "../test_helper.h"
+#include "common/log.h"
+#include "memory/t_malloc.h"
+#include "operators/math/gemm.h"
+
+#define a(i, j) a[(i)*lda + (j)]
+#define b(i, j) b[(i)*ldb + (j)]
+#define c(i, j) c[(i)*ldc + (j)]
+#define c1(i, j) c1[(i)*ldc + (j)]
+
+void print_matirx(int m, int n, int ldc, float *c) {
+  for (int i = 0; i < m; ++i) {
+    std::cout << c(i, 0);
+    for (int j = 1; j < n; ++j) {
+      std::cout << " | " << c(i, j);
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+}
+
+int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
+  int lda = k;
+  int ldb = n;
+  int ldc = n;
+
+  float *a = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * m * k));
+  float *b = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * k * n));
+  float *c = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * m * n));
+  float *c1 = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * m * n));
+  float *scale =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
+  float *bias =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
+
+  srand(unsigned(time(0)));
+  for (int i = 0; i < m * k; ++i) {
+    a[i] = t1 + rand() % t2;
+  }
+  for (int i = 0; i < k * n; ++i) {
+    b[i] = t1 + rand() % t2;
+  }
+  for (int i = 0; i < m; ++i) {
+    scale[i] = t1 + rand() % t2;
+  }
+  for (int i = 0; i < m; ++i) {
+    bias[i] = t1 + rand() % t2;
+  }
+
+  for (int i = 0; i < m; ++i) {
+    for (int j = 0; j < n; ++j) {
+      float r = 0;
+      for (int p = 0; p < k; p++) {
+        r += a(i, p) * b(p, j);
+      }
+      r *= scale[i];
+      r += bias[i];
+      if (relu && (r < 0)) {
+        r = 0;
+      }
+      c1(i, j) = r;
+    }
+  }
+
+  paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3,
+                                              c, ldc, relu, scale, bias);
+  int eq = 0;
+  int neq = 0;
+  for (int i = 0; i < m * n; ++i) {
+    if (static_cast<int>(c[i]) == static_cast<int>(c1[i])) {
+      ++eq;
+    } else {
+      ++neq;
+    }
+  }
+
+  if (pr > 0) {
+    std::cout << "A:" << std::endl;
+    print_matirx(m, k, lda, a);
+    std::cout << "B:" << std::endl;
+    print_matirx(k, n, ldb, b);
+    std::cout << "C:" << std::endl;
+    print_matirx(m, n, ldc, c);
+    std::cout << "C1:" << std::endl;
+    print_matirx(m, n, ldc, c1);
+  }
+
+  std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
+            << " eq=" << eq << " neq=" << neq << std::endl;
+
+  paddle_mobile::memory::Free(a);
+  paddle_mobile::memory::Free(b);
+  paddle_mobile::memory::Free(c);
+  paddle_mobile::memory::Free(c1);
+  paddle_mobile::memory::Free(scale);
+  paddle_mobile::memory::Free(bias);
+
+  return 0;
+}
+
+int main() {
+  do_sgemm(9, 9, 9, true, 10, 10, 10);
+  do_sgemm(10, 6, 12, false, 10, 10, 0);
+  do_sgemm(512, 256, 384, false, 10, 10, 0);
+  do_sgemm(1366, 768, 256, false, 10, 10, 0);
+  do_sgemm(1255, 755, 333, false, 10, 10, 0);
+  do_sgemm(555, 777, 999, false, 10, 10, 0);
+
+  do_sgemm(10, 6, 12, true, -4, 10, 0);
+  do_sgemm(512, 256, 384, true, -4, 10, 0);
+  do_sgemm(1366, 768, 256, true, -4, 10, 0);
+  do_sgemm(1255, 755, 333, true, -4, 10, 0);
+  do_sgemm(555, 777, 999, true, -4, 10, 0);
+  return 0;
+}
diff --git a/test/common/test_gemm_perf.cpp b/test/common/test_gemm_perf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..386c09d71a3d5709842991bffd2e8ea039edc940
--- /dev/null
+++ b/test/common/test_gemm_perf.cpp
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include "../test_helper.h"
+#include "../test_include.h"
+#include "operators/math/gemm.h"
+#include "operators/math/math_function.h"
+
+#define a(i, j) a[(i)*lda + (j)]
+#define b(i, j) b[(i)*ldb + (j)]
+#define c1(i, j) c1[(i)*ldc + (j)]
+
+#define m 1024
+#define n 1024
+#define k 1024
+
+int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
+  Tensor aa, bb, cc, scale, bias;
+  auto aaptr = aa.mutable_data<float>({m, k});
+  auto bbptr = bb.mutable_data<float>({k, n});
+  auto ccptr = cc.mutable_data<float>({m, n});
+  auto scaleptr = scale.mutable_data<float>({m});
+  auto biasptr = bias.mutable_data<float>({m});
+
+  for (int i = 0; i < m * k; ++i) {
+    aaptr[i] = 2;
+  }
+  for (int i = 0; i < k * n; ++i) {
+    bbptr[i] = 2;
+  }
+  for (int i = 0; i < m * n; ++i) {
+    ccptr[i] = 2;
+  }
+  for (int i = 0; i < m; ++i) {
+    scaleptr[i] = 1;
+    biasptr[i] = 0;
+  }
+
+  auto time1 = time();
+  for (int j = 0; j < 10; ++j) {
+    paddle_mobile::operators::math::matmul<float>(
+        aa, false, bb, false, static_cast<float>(1), &cc,
+        static_cast<float>(0), false, biasptr);
+
+    // paddle_mobile::operators::math::matmulWithBn<float>(
+    //     aa, false, bb, false, static_cast<float>(1), &cc,
+    //     static_cast<float>(0), true, &scale, &bias, 0);
+  }
+  auto time2 = time();
+  std::cout << "gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
+
+  return 0;
+}
diff --git a/test/executor_for_test.h b/test/executor_for_test.h
index 0d3051327a57202e2b8d1dcbdda571fd244de108..93847af20a6d48a6df33dc50f6c6a1db76facf51 100644
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
@@ -19,7 +19,7 @@ limitations under the License. */
*/ #include "common/log.h" #include "framework/op_registry.h" -#include "io/io.h" +#include "io/executor.h" #include "operators/conv_op.h" #include "operators/elementwise_add_op.h" #include "operators/pool_op.h" @@ -43,7 +43,7 @@ template class Executor4Test : public Executor { public: Executor4Test(Program p, string op_type, - bool use_optimize = false) + bool use_optimize = false, int predict_op_count = 1) : Executor() { this->use_optimize_ = use_optimize; this->program_ = p; @@ -57,12 +57,14 @@ class Executor4Test : public Executor { LOG(paddle_mobile::LogLevel::kLOG_ERROR) << "to_predict_program_ == nullptr"; } + const std::vector> blocks = this->to_predict_program_->Blocks(); for (std::shared_ptr block_desc : blocks) { std::vector> ops = block_desc->Ops(); - for (std::shared_ptr op : ops) { - if (op->Type() == op_type) { + for (int i = 0; i < ops.size(); ++i) { + auto op = ops[i]; + if (op->Type() == op_type && i < predict_op_count) { DLOG << "匹配到: " << op->Type(); /// test first meeting op in program @@ -72,11 +74,17 @@ class Executor4Test : public Executor { op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), this->program_.scope); this->ops_of_block_[*block_desc.get()].push_back(op_ptr); - break; } } } this->InitMemory(); + + std::shared_ptr to_predict_block = + this->to_predict_program_->Block(0); + auto &ops = this->ops_of_block_[*to_predict_block.get()]; + for (const auto &op : ops) { + op->Init(); + } } template @@ -130,9 +138,6 @@ class Executor4Test : public Executor { auto *output_tensor = con_output->GetMutable(); output_tensor->mutable_data(dDim); - std::shared_ptr out_tensor = std::make_shared(); - out_tensor.reset(output_tensor); - std::shared_ptr to_predict_block = this->to_predict_program_->Block(0); for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size(); @@ -141,6 +146,7 @@ class Executor4Test : public Executor { op->Run(); } - return out_tensor; + return std::make_shared( + paddle_mobile::framework::Tensor(*output_tensor)); } }; diff --git a/test/fpga/test_tensor_quant.cpp b/test/fpga/test_tensor_quant.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3835c395a4764c3c978b6bba9c1af48305be1d58 --- /dev/null +++ b/test/fpga/test_tensor_quant.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + bool optimize = false; + if (paddle_mobile.Load(g_googlenet, optimize)) { + auto time1 = time(); + DLOG << "load cost: " << time_diff(time1, time1) << "ms"; + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224, &input, dims); + auto time3 = time(); + auto vec_result = paddle_mobile.Predict(input, dims); + auto time4 = time(); + DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + } + return 0; +} diff --git a/test/framework/test_inference_api.cpp b/test/framework/test_inference_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7dec2fe29753c75ee70f31428d104450acce9404 --- /dev/null +++ b/test/framework/test_inference_api.cpp @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "io/paddle_inference_api.h" + +using namespace paddle_mobile; + +PaddleMobileConfig GetConfig() { + PaddleMobileConfig config; + config.precision = PaddleMobileConfig::FP32; + config.device = PaddleMobileConfig::kCPU; + config.model_dir = "../models/mobilenet/"; + config.thread_num = 4; + return config; +} + +int main() { + PaddleMobileConfig config = GetConfig(); + auto predictor = + CreatePaddlePredictor(config); + + float data[1 * 3 * 224 * 224] = {1.0f}; + + PaddleTensor tensor; + tensor.shape = std::vector({1, 3, 224, 224}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::FLOAT32; + std::vector paddle_tensor_feeds(1, tensor); + + PaddleTensor tensor_out; + tensor_out.shape = std::vector({}); + tensor_out.data = PaddleBuf(); + tensor_out.dtype = PaddleDType::FLOAT32; + std::vector outputs(1, tensor_out); + + assert(predictor->Run(paddle_tensor_feeds, &outputs)); + + float* data_o = static_cast(outputs[0].data.data()); + for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) { + std::cout << "output[" << j << "]: " << data_o[j] << std::endl; + } + + return 0; +} diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index 8c76eb1dde3ef39a342d19e7f3d4e26fc1be2b2f..25cad4feaa706899122902dee2a8f0c915e78975 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
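test_inference_api.cpp above is the first consumer of the new io/paddle_inference_api.h facade: fill a PaddleMobileConfig, build a predictor from it, then exchange PaddleTensor/PaddleBuf buffers. A condensed sketch of the same flow, assuming a factory templated on the config type (the full template-argument list of CreatePaddlePredictor may also name an engine kind; it is not readable above):

// Sketch under stated assumptions; mirrors the test above.
using namespace paddle_mobile;

std::vector<float> RunMobilenetOnce(float *data, size_t bytes) {
  PaddleMobileConfig config;
  config.precision = PaddleMobileConfig::FP32;
  config.device = PaddleMobileConfig::kCPU;
  config.model_dir = "../models/mobilenet/";
  config.thread_num = 4;
  auto predictor = CreatePaddlePredictor<PaddleMobileConfig>(config);  // assumed args

  PaddleTensor in;
  in.shape = std::vector<int>({1, 3, 224, 224});
  in.data = PaddleBuf(data, bytes);  // wraps caller-owned memory (assumed: no copy)
  in.dtype = PaddleDType::FLOAT32;
  std::vector<PaddleTensor> feeds(1, in), outputs(1, PaddleTensor());

  std::vector<float> result;
  if (predictor->Run(feeds, &outputs)) {
    const float *p = static_cast<const float *>(outputs[0].data.data());
    result.assign(p, p + outputs[0].data.length() / sizeof(float));
  }
  return result;
}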
*/ +#include + #include "../test_helper.h" -#include "io/io.h" +#include "io/loader.h" int main() { paddle_mobile::Loader loader; // ../../../test/models/googlenet // ../../../test/models/mobilenet - auto program = loader.Load(g_mobilenet_ssd, false, false); - // auto program = loader.Load(g_googlenet_combine + "/model", - // g_googlenet_combine + - // "/params", true); + // auto program = loader.Load(g_googlenet, true); + // auto program = loader.Load(g_mobilenet_ssd, true); + auto program = loader.Load(std::string(g_ocr) + "/model", + std::string(g_ocr) + "/params", false); // program.originProgram->Description("program desc: "); return 0; } diff --git a/test/framework/test_optimize.cpp b/test/framework/test_optimize.cpp index 32574764e1ba538ab0bea31d1e238096e7098dfc..3cae963eca048da221d69c4c336dd4fdfecbb584 100644 --- a/test/framework/test_optimize.cpp +++ b/test/framework/test_optimize.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "../test_helper.h" #include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/program_optimize.h" -#include "io/io.h" +#include "io/loader.h" int main() { paddle_mobile::Loader loader; diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index 1695995a8d60d20e0d6c5f8911c39a948426a82a..02882bedb01df49b8032325e506c9118f3434a2f 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -12,30 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include #include "../test_helper.h" #include "../test_include.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); bool optimize = true; auto time1 = time(); - // auto program = loader.Load(g_googlenet, optimize); - auto program = loader.Load(g_googlenet_combine + "/model", - g_googlenet_combine + "/params", optimize); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time2) << "ms\n"; - paddle_mobile::Executor executor(program, 1, optimize); - std::vector input; - std::vector dims{1, 3, 224, 224}; - GetInput(g_test_image_1x3x224x224, &input, dims); - auto time3 = time(); + if (paddle_mobile.Load(g_googlenet, optimize)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224, &input, dims); + // 预热一次 + auto vec_result = paddle_mobile.Predict(input, dims); + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = time(); - for (int i = 0; i < 10; ++i) { - executor.Predict(input, dims); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; } - - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n"; return 0; } diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp index 097d03ad710468a881050ff729e8352f029d664f..ae6c40961ca96ea032b1822f17a663baedc8f661 100644 --- a/test/net/test_mobilenet+ssd.cpp +++ b/test/net/test_mobilenet+ssd.cpp @@ -12,28 +12,35 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
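The googlenet test above shows the benchmarking pattern every reworked net test now follows; the comment 预热一次 means "warm up once": one untimed Predict call first, then the mean latency over ten timed runs.

// Benchmark pattern used across the net tests (sketch; paddle_mobile,
// input and dims come from the surrounding test).
auto warmup = paddle_mobile.Predict(input, dims);  // untimed warm-up run
auto t0 = time();
for (int i = 0; i < 10; ++i) {
  paddle_mobile.Predict(input, dims);
}
auto t1 = time();
std::cout << "predict cost :" << time_diff(t0, t1) / 10 << "ms" << std::endl;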
*/ -#include +#include #include "../test_helper.h" #include "../test_include.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); auto time1 = time(); - auto program = loader.Load(g_mobilenet_ssd, true); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor executor(program, 1, true); + auto isok = paddle_mobile.Load( + std::string(g_mobilenet_ssd_gesture) + "/model", + std::string(g_mobilenet_ssd_gesture) + "/params", true); + // auto isok = paddle_mobile.Load(g_mobilenet_ssd, false); + if (isok) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; - std::vector dims{1, 3, 300, 300}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 300, 300}, static_cast(0), - static_cast(1)); + std::vector input; + std::vector dims{1, 3, 300, 300}; + GetInput(g_hand, &input, dims); - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - auto time3 = time(); - executor.Predict(input, dims); - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + // 预热一次 + auto output = paddle_mobile.Predict(input, dims); + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto output = paddle_mobile.Predict(input, dims); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } return 0; } diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index b58d9c1d94a57ede04840210540c0b58e405bfe6..56234c3c72b58869775238d78875c8bd3b94cf7c 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -12,36 +12,45 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
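The mobilenet+ssd test above feeds a captured image dump (g_hand) through GetInput rather than a synthetic tensor. GetInput, defined in test/test_helper.h at the end of this diff, simply reads a headerless blob of raw floats whose element count is the product of the dims:

// Reading a raw float tensor dump the way GetInput does (sketch);
// assumes the file holds exactly 1*3*300*300 floats and no header.
#include <fstream>
#include <vector>

std::vector<float> input;
const int64_t size = 1 * 3 * 300 * 300;
input.resize(size);
std::ifstream in("../images/hand_image", std::ios::in | std::ios::binary);
in.read(reinterpret_cast<char *>(input.data()), size * sizeof(float));
in.close();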
*/ -#include +#include #include "../test_helper.h" #include "../test_include.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); auto time1 = time(); - // auto program = loader.Load(g_mobilenet_combine, true); - auto program = loader.Load(g_mobilenet_combine + "/model", - g_mobilenet_combine + "/params", true); - - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor executor(program, 1, true); - - std::vector dims{1, 3, 224, 224}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(0), - static_cast(1)); - - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - auto time3 = time(); - auto vec_result = executor.Predict(input, dims); - float sum = 0; - for (const auto item : vec_result) { - sum += item; + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", + // std::string(g_mobilenet_detect) + "/params", true); + + auto isok = paddle_mobile.Load(g_mobilenet, true); + if (isok) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; + + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224_banana, &input, dims); + + // 预热一次 + auto vec_result = paddle_mobile.Predict(input, dims); + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + DLOG << vec_result; + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; } - DLOG << "mobilenet output sum =" << sum; - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + + std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " + "是否存在?" + << std::endl; return 0; } diff --git a/test/net/test_mobilenet_combine.cpp b/test/net/test_mobilenet_combine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..af93d105ea0c290b1dd3a80310a39e0f52c8abaa --- /dev/null +++ b/test/net/test_mobilenet_combine.cpp @@ -0,0 +1,51 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
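test_mobilenet.cpp derives the top-1 class directly from the raw output vector: the index of the largest score is the predicted label.

#include <algorithm>  // std::max_element
#include <iterator>   // std::distance

// Top-1 extraction as done in the test (sketch); vec_result is the
// flat score vector returned by Predict.
auto biggest = std::max_element(vec_result.begin(), vec_result.end());
auto top1 = std::distance(vec_result.begin(), biggest);
std::cout << "Max element is " << *biggest << " at position " << top1
          << std::endl;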
*/ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); + auto time1 = time(); + if (paddle_mobile.Load(std::string(g_mobilenet_combined) + "/model", + std::string(g_mobilenet_combined) + "/params", true)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; + + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224_banana, &input, dims); + + // 预热一次 + auto vec_result = paddle_mobile.Predict(input, dims); + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } + std::cout + << "如果结果Nan请查看: test/images/test_image_1x3x224x224_float 是否存在?" + << std::endl; + return 0; +} diff --git a/test/net/test_resnet.cpp b/test/net/test_resnet.cpp index 55f4c5efef209c421fc550c1f17422acd64b11b9..73ac88ef77b0c02545ef55b6493d4681c61c192d 100644 --- a/test/net/test_resnet.cpp +++ b/test/net/test_resnet.cpp @@ -12,28 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include #include "../test_helper.h" #include "../test_include.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); auto time1 = time(); - auto program = loader.Load(g_resnet, false); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor executor(program, 1, false); + if (paddle_mobile.Load(g_resnet, true)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; + std::vector dims{1, 3, 32, 32}; + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 32, 32}, static_cast(0), + static_cast(1)); - std::vector dims{1, 3, 32, 32}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 32, 32}, static_cast(0), - static_cast(1)); + std::vector input(input_tensor.data(), + input_tensor.data() + input_tensor.numel()); + // 预热一次 + paddle_mobile.Predict(input, dims); + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) << "ms" + << std::endl; + } - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - auto time3 = time(); - executor.Predict(input, dims); - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; return 0; } diff --git a/test/net/test_squeezenet.cpp b/test/net/test_squeezenet.cpp index 30460018fe8cc008e0031c1c713150745767fa28..4c14f63bde40675a7e0016e28d900788431ff2ae 100644 --- a/test/net/test_squeezenet.cpp +++ b/test/net/test_squeezenet.cpp @@ -12,30 +12,36 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
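test_mobilenet_combine.cpp exercises the "combined" model format, where all weights live in a single params file next to the model file instead of one file per parameter; the tests use the two Load overloads side by side:

// Directory layout: one file per parameter.
paddle_mobile.Load(g_mobilenet, true);
// Combined layout: explicit model file plus a single params file.
paddle_mobile.Load(std::string(g_mobilenet_combined) + "/model",
                   std::string(g_mobilenet_combined) + "/params", true);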
*/ -#include +#include #include "../test_helper.h" #include "../test_include.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(2); // ../../../test/models/googlenet // ../../../test/models/mobilenet auto time1 = time(); - auto program = loader.Load(g_squeezenet, false); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor executor(program, 1, false); + if (paddle_mobile.Load(g_squeezenet, true)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; + std::vector dims{1, 3, 227, 227}; + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 227, 227}, static_cast(0), + static_cast(1)); - std::vector dims{1, 3, 227, 227}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 227, 227}, static_cast(0), - static_cast(1)); + std::vector input(input_tensor.data(), + input_tensor.data() + input_tensor.numel()); + // 预热一次 + paddle_mobile.Predict(input, dims); + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - auto time3 = time(); - executor.Predict(input, dims); - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; return 0; } diff --git a/test/net/test_yolo.cpp b/test/net/test_yolo.cpp index c82443e23953def917826fe4ec3b2c484b588f59..83508cff335c55f5cc416c6652d83706a4626c1a 100644 --- a/test/net/test_yolo.cpp +++ b/test/net/test_yolo.cpp @@ -12,30 +12,36 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
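Nets without a captured input dump (squeezenet above, resnet and yolo nearby) keep the SetupTensor path, which fills the input with uniform random values in [lower, upper). The helper's body is not part of this diff; an illustrative equivalent:

#include <random>

// Illustrative uniform fill in the spirit of SetupTensor; the real helper
// in test/test_helper.h may seed and draw differently.
template <typename T>
void FillUniform(paddle_mobile::framework::Tensor *input,
                 paddle_mobile::framework::DDim dims, T lower, T upper) {
  T *ptr = input->mutable_data<T>(dims);
  std::mt19937 rng(100);  // fixed seed keeps timing runs comparable
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  for (int i = 0; i < input->numel(); ++i) {
    ptr[i] = static_cast<T>(dist(rng) * (upper - lower)) + lower;
  }
}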
*/ -#include +#include #include "../test_helper.h" #include "../test_include.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(2); // ../../../test/models/googlenet // ../../../test/models/mobilenet auto time1 = time(); - auto program = loader.Load(g_yolo, false); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor executor(program, 1, false); + if (paddle_mobile.Load(g_yolo, true)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl; - std::vector dims{1, 3, 227, 227}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 227, 227}, static_cast(0), - static_cast(1)); + std::vector dims{1, 3, 227, 227}; + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 227, 227}, static_cast(0), + static_cast(1)); - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - auto time3 = time(); - executor.Predict(input, dims); - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + std::vector input(input_tensor.data(), + input_tensor.data() + input_tensor.numel()); + // 预热一次 + paddle_mobile.Predict(input, dims); + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } return 0; } diff --git a/test/operators/test_fusion_conv_add_bn_relu_op.cpp b/test/operators/test_fusion_conv_add_bn_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..81400d987195364c06b4b93d0859469b43f90e7b --- /dev/null +++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../test_include.h" +#include "operators/fusion_conv_add_bn_relu_op.h" + +int main() { + paddle_mobile::Loader loader; + // ../models/image_classification_resnet.inference.model + auto program = loader.Load(g_mobilenet, true); + + PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, + "program file read fail"); + + Executor4Test> + executor(program, "fusion_conv_add_bn_relu", true); + + std::cout << "executor 4 test: " << std::endl; + + paddle_mobile::framework::Tensor input; + GetInput(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224}); + // // use SetupTensor if not has local input image . 
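The next test covers the new fusion_conv_add_bn_relu operator. The fusion is worthwhile because at inference time batch-norm is a fixed per-channel affine transform, so it can be folded into a scale/bias pair applied to the convolution output, which is exactly the contract SgemmWithBn exercised in test_gemm_accuracy.cpp (r = r * scale[i] + bias[i], then optional relu). Illustrative folding, not necessarily the exact transform the fused kernel performs:

#include <cmath>
#include <vector>

// Fold BN(gamma, beta, mean, var, eps) into a per-channel scale/bias pair.
void FoldBatchNorm(const std::vector<float> &gamma,
                   const std::vector<float> &beta,
                   const std::vector<float> &mean,
                   const std::vector<float> &var, float eps,
                   std::vector<float> *scale, std::vector<float> *bias) {
  for (size_t c = 0; c < gamma.size(); ++c) {
    float inv_std = 1.0f / std::sqrt(var[c] + eps);
    (*scale)[c] = gamma[c] * inv_std;                      // multiplies conv output
    (*bias)[c] = beta[c] - mean[c] * gamma[c] * inv_std;   // absorbs the mean shift
  }
}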
+ // SetupTensor(&input, {1, 3, 224, 224}, static_cast(0), + // static_cast(1)); + + DLOG << " fuck: " << input; + + auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112}); + std::cout << "before predict: " << std::endl; + auto output = + executor.Predict(input, "data", "conv2_1_dw_bn.tmp_2", out_ddim); + std::cout << "after predict " << std::endl; + auto output_ptr = output->data(); + + int stride = output->numel() / 100; + for (int i = 0; i < 100; i++) { + DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride]; + } + + // for (int i = 0; i < 100; i++) { + // DLOG << " index:" << i << " value: "<< output_ptr[i]; + // } + + // for (int j = 0; j < output->numel(); ++j) { + // std::cout << " (index: " << j << " value: " << output_ptr[j] << ") "; + // } + std::cout << std::endl; + return 0; +} diff --git a/test/operators/test_im2sequence_op.cpp b/test/operators/test_im2sequence_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a7512d3bf3cffcb100fe292e50fc7b7b23fa0aa0 --- /dev/null +++ b/test/operators/test_im2sequence_op.cpp @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../executor_for_test.h" +#include "../test_include.h" +#include "operators/im2sequence_op.h" + +int main() { + paddle_mobile::Loader loader; + auto program = loader.Load(g_ocr_recg); + PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, + "program file read fail"); + + Executor4Test> + executor(program, "im2sequence"); + + // 1. input_tensors; + vector input_tensors; + + Tensor input1; + auto input1_data = CreateInput(&input1, {2, 2, 3, 3}, -1, 1); + input_tensors.push_back(input1); + + // 2. input_names + vector input_names({ + "conv2d_19.tmp_1", + }); + + // 3. output_names + vector output_names({"im2sequence_0.tmp_0"}); + + // 4. out_dims; + vector out_ddims; + auto out_ddim = paddle_mobile::framework::make_ddim({8, 9}); + out_ddims.push_back(out_ddim); + + auto output = executor.Predict(input_tensors, input_names, + output_names, out_ddims); + + auto output0_data = output[0]->data(); + + for (int j = 0; j < input_tensors[0].numel(); ++j) { + DLOG << " value of input: " << input1_data[j]; + } + + for (int j = 0; j < output[0]->numel(); ++j) { + DLOG << " value of output: " << output0_data[j]; + } + return 0; +} diff --git a/test/operators/test_prelu_op.cpp b/test/operators/test_prelu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e93d8732d18496721b24cfba1df296250169f8b2 --- /dev/null +++ b/test/operators/test_prelu_op.cpp @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
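The PReLU operator added earlier in this diff, and exercised by the test that follows, has simple elementwise semantics: a ReLU whose negative slope is learned, shared, per-channel, or per-element depending on the op's mode attribute.

// PReLU reference semantics (sketch): identity for positive inputs,
// alpha-scaled passthrough for negative ones.
inline float prelu(float x, float alpha) { return x > 0.0f ? x : alpha * x; }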
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../executor_for_test.h" +#include "../test_include.h" +#include "operators/prelu_op.h" + +int main() { + paddle_mobile::Loader loader; + auto program = loader.Load(g_resnet); + PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, + "program file read fail"); + + Executor4Test> + executor(program, "prelu"); + + // 1. input_tensors; + vector input_tensors; + + Tensor input1; + auto input1_data = CreateInput(&input1, {1, 2, 3, 4}, -1, 1); + input_tensors.push_back(input1); + + // 2. input_names + vector input_names({ + "batch_norm_0.tmp_2", + }); + + // 3. output_names + vector output_names({"batch_norm_0.tmp_3"}); + + // 4. out_dims; + vector out_ddims; + auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4}); + out_ddims.push_back(out_ddim); + + auto output = executor.Predict(input_tensors, input_names, + output_names, out_ddims); + + auto output0_data = output[0]->data(); + + for (int j = 0; j < output[0]->numel(); ++j) { + DLOG << " value of output: " << output0_data[j]; + } + return 0; +} diff --git a/test/operators/test_resize_op.cpp b/test/operators/test_resize_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f4dcaa6885d92a727e8c97d5106c3b6913a4ab33 --- /dev/null +++ b/test/operators/test_resize_op.cpp @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../test_include.h" +#include "operators/resize_op.h" + +int main() { + paddle_mobile::Loader loader; + auto program = loader.Load(std::string(g_mobilenet_ssd)); + if (program.originProgram == nullptr) { + DLOG << "program read file"; + } + Executor4Test> + executor(program, "resize"); + paddle_mobile::framework::Tensor input; + SetupTensor(&input, {2, 3, 3, 2}, static_cast(0), + static_cast(1)); + auto input_ptr = input.data(); + auto out_ddim = paddle_mobile::framework::make_ddim({2, 9, 2}); + auto output = + executor.Predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim); + auto *output_ptr = output->data(); + + DLOG << "input : "; + for (int j = 0; j < input.numel(); ++j) { + DLOG << " index " << j << " : " << input_ptr[j]; + } + + DLOG << "output : "; + for (int j = 0; j < output->numel(); ++j) { + DLOG << " index " << j << " : " << output_ptr[j]; + } + + return 0; +} diff --git a/test/operators/test_scale_op.cpp b/test/operators/test_scale_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..574779d71e5ebc5f06fe5cd8fb33422726f39464 --- /dev/null +++ b/test/operators/test_scale_op.cpp @@ -0,0 +1,18 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../test_include.h" +#include "operators/scale_op.h" + +int main() {} diff --git a/test/operators/test_sigmoid_op.cpp b/test/operators/test_sigmoid_op.cpp index 4ed3efaf28aa986f0b679729c46cb386150583e3..c8fac6b9eee5c5777ddb0147bc81d361d4dd09f5 100644 --- a/test/operators/test_sigmoid_op.cpp +++ b/test/operators/test_sigmoid_op.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "../../src/operators/kernel/sigmoid_kernel.h" #include "../test_helper.h" -#include "io/io.h" +#include "io/executor.h" int main() { paddle_mobile::framework::Tensor input; diff --git a/test/operators/test_slice_op.cpp b/test/operators/test_slice_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9306bc53c6ae23b10c27a71071c11c9ddf1c0d25 --- /dev/null +++ b/test/operators/test_slice_op.cpp @@ -0,0 +1,18 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../test_include.h" +#include "operators/slice_op.h" + +int main() {} diff --git a/test/test_helper.h b/test/test_helper.h index fe720ded8270f2bc02a4f1e72625954962184069..fef175951e834a176c7987a77d53f2b5b4eecc5b 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -14,38 +14,37 @@ limitations under the License. 
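test_scale_op.cpp and test_slice_op.cpp above are compile-only placeholders (empty main). A fleshed-out scale test would follow the same Executor4Test recipe as the prelu and resize tests; the model choice and tensor names below are placeholders, not taken from a real test:

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(g_mobilenet_ssd);  // placeholder model
  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                        "program file read fail");

  Executor4Test<paddle_mobile::CPU,
                paddle_mobile::operators::ScaleOp<paddle_mobile::CPU, float>>
      executor(program, "scale");

  paddle_mobile::framework::Tensor input;
  SetupTensor<float>(&input, {1, 3, 16, 16}, static_cast<float>(0),
                     static_cast<float>(1));
  auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 16, 16});
  // "scale_in" / "scale_out" are hypothetical variable names.
  auto output = executor.Predict(input, "scale_in", "scale_out", out_ddim);
  return 0;
}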
 */
 #pragma once
-#include <chrono>
 #include <fstream>
 #include <random>
+#include <string>
+#include <vector>
+#include "common/common.h"
 #include "common/log.h"
 #include "framework/ddim.h"
 #include "framework/tensor.h"
-static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
-static const std::string g_squeezenet = "../models/squeezenet";
-static const std::string g_googlenet = "../models/googlenet";
-static const std::string g_mobilenet = "../models/mobilenet";
-static const std::string g_resnet_50 = "../models/resnet_50";
-static const std::string g_resnet = "../models/resnet";
-static const std::string g_googlenet_combine = "../models/googlenet_combine";
-static const std::string g_yolo = "../models/yolo";
-static const std::string g_test_image_1x3x224x224 =
+static const char *g_ocr = "../models/ocr";
+static const char *g_mobilenet_ssd = "../models/mobilenet+ssd";
+static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture";
+static const char *g_mobilenet_combined = "../models/mobilenet_combine";
+static const char *g_mobilenet_detect = "../models/mobilenet-detect";
+static const char *g_squeezenet = "../models/squeezenet";
+static const char *g_googlenet = "../models/googlenet";
+static const char *g_mobilenet = "../models/mobilenet";
+static const char *g_resnet_50 = "../models/resnet_50";
+static const char *g_resnet = "../models/resnet";
+static const char *g_googlenet_combine = "../models/googlenet_combine";
+static const char *g_yolo = "../models/yolo";
+static const char *g_test_image_1x3x224x224 =
     "../images/test_image_1x3x224x224_float";
+static const char *g_test_image_1x3x224x224_banana =
+    "../images/input_3x224x224_banana";
+static const char *g_hand = "../images/hand_image";
+
 using paddle_mobile::framework::DDim;
 using paddle_mobile::framework::Tensor;
-using Time = decltype(std::chrono::high_resolution_clock::now());
-
-Time time() { return std::chrono::high_resolution_clock::now(); }
-
-double time_diff(Time t1, Time t2) {
-  typedef std::chrono::microseconds ms;
-  auto diff = t2 - t1;
-  ms counter = std::chrono::duration_cast<ms>(diff);
-  return counter.count() / 1000.0;
-}
-
 template <typename T>
 void SetupTensor(paddle_mobile::framework::Tensor *input,
                  paddle_mobile::framework::DDim dims, T lower, T upper) {
@@ -73,9 +72,9 @@ void GetInput(const std::string &input_name, std::vector<T> *input,
     size *= dim;
   }
-  T *input_ptr = (T *)malloc(sizeof(T) * size);
+  T *input_ptr = reinterpret_cast<T *>(malloc(sizeof(T) * size));
   std::ifstream in(input_name, std::ios::in | std::ios::binary);
-  in.read((char *)(input_ptr), size * sizeof(T));
+  in.read(reinterpret_cast<char *>(input_ptr), size * sizeof(T));
   in.close();
   for (int i = 0; i < size; ++i) {
     input->push_back(input_ptr[i]);
@@ -90,6 +89,6 @@ void GetInput(const std::string &input_name,
   T *input_ptr = input->mutable_data<T>(dims);
   std::ifstream in(input_name, std::ios::in | std::ios::binary);
-  in.read((char *)(input_ptr), input->numel() * sizeof(T));
+  in.read(reinterpret_cast<char *>(input_ptr), input->numel() * sizeof(T));
   in.close();
 }
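As a quick illustration (hypothetical code, not part of this diff), the SetupTensor helper above is what the operator tests call to build random inputs:

```cpp
#include "test_helper.h"

int main() {
  paddle_mobile::framework::Tensor input;
  // Fill a 1x3x224x224 float tensor with values drawn uniformly from [-1, 1];
  // GetInput works the same way but reads the values from a binary dump.
  SetupTensor<float>(&input,
                     paddle_mobile::framework::make_ddim({1, 3, 224, 224}),
                     static_cast<float>(-1), static_cast<float>(1));
  return 0;
}
```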
diff --git a/test/test_include.h b/test/test_include.h
index 2d89dc8c9ed1de1ad49ebca07724b6649e2a12a7..4728a469334010e7353e6ab1f3695ec23f3e7456 100644
--- a/test/test_include.h
+++ b/test/test_include.h
@@ -30,4 +30,4 @@ limitations under the License. */
 #include "framework/scope.h"
 #include "framework/tensor.h"
 #include "framework/variable.h"
-#include "io/io.h"
+#include "io/paddle_mobile.h"
diff --git a/tools/android-cmake/android.toolchain.cmake b/tools/android-cmake/android.toolchain.cmake
index 4db5cd41b46246f92882f1548290fb87fc915aae..55b90ba65260b99d9af4a29832ed6f8ff5b235c8 100644
--- a/tools/android-cmake/android.toolchain.cmake
+++ b/tools/android-cmake/android.toolchain.cmake
@@ -37,7 +37,7 @@
 #       ANDROID_DISABLE_FORMAT_STRING_CHECKS
 #       ANDROID_CCACHE
-cmake_minimum_required(VERSION 3.6.0)
+# cmake_minimum_required(VERSION 3.6.0)
 
 # Inhibit all of CMake's own NDK handling code.
 set(CMAKE_SYSTEM_VERSION 1)
@@ -65,6 +65,8 @@ endif()
 file(TO_CMAKE_PATH "${ANDROID_NDK}" ANDROID_NDK)
 
 # Android NDK revision
+message("${ANDROID_NDK}")
+
 file(READ "${ANDROID_NDK}/source.properties" ANDROID_NDK_SOURCE_PROPERTIES)
 set(ANDROID_NDK_SOURCE_PROPERTIES_REGEX
   "^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.")
@@ -159,7 +161,7 @@ endif()
 
 # Default values for configurable variables.
 if(NOT ANDROID_TOOLCHAIN)
-  set(ANDROID_TOOLCHAIN clang)
+  set(ANDROID_TOOLCHAIN gcc)
 endif()
 if(NOT ANDROID_ABI)
   set(ANDROID_ABI armeabi-v7a)
diff --git a/tools/push2android.sh b/tools/android-debug-script/push2android.sh
similarity index 59%
rename from tools/push2android.sh
rename to tools/android-debug-script/push2android.sh
index d7d1ad9950d58f415804834b8ebc0740a3e796cb..fae1a856123bd16cf3f7a115f61b3e4473ff58a3 100644
--- a/tools/push2android.sh
+++ b/tools/android-debug-script/push2android.sh
@@ -1,10 +1,10 @@
 #!/usr/bin/env sh
 push_fn () {
-MODELS_PATH="../test/models/*"
-MODELS_SRC="../test/models"
-IMAGE_PATH="../test/images/*"
-EXE_FILE="../test/build/*"
+MODELS_PATH="../../test/models/*"
+MODELS_SRC="../../test/models"
+IMAGE_PATH="../../test/images/*"
+EXE_FILE="../../test/build/*"
 EXE_DIR="data/local/tmp/bin"
 adb shell mkdir ${EXE_DIR}
 MODELS_DIR="data/local/tmp/models"
@@ -14,9 +14,14 @@
 do
   adb shell mkdir ${MODELS_DIR}"/"${file}
 done
 
+if [[ -d "../../src/operators/kernel/mali/ACL_Android/build" ]]; then
+ACL_BUILD_PATH="../../src/operators/kernel/mali/ACL_Android/build/*"
+adb push ${ACL_BUILD_PATH} ${EXE_DIR}
+fi
+
 IMAGES_DIR="data/local/tmp/images"
 adb shell mkdir ${IMAGES_DIR}
-LIB_PATH="../build/release/arm-v7a/build/*"
+LIB_PATH="../../build/release/arm-v7a/build/*"
 adb push ${EXE_FILE} ${EXE_DIR}
 adb push ${LIB_PATH} ${EXE_DIR}
 if [[ $1 != "npm" ]]; then
diff --git a/tools/scripts/run_on_android.sh b/tools/android-debug-script/run_on_android.sh
similarity index 100%
rename from tools/scripts/run_on_android.sh
rename to tools/android-debug-script/run_on_android.sh
diff --git a/tools/arm-platform.cmake b/tools/arm-platform.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..9f2b6d5e89d92255848af54321ea09ebdb058691
--- /dev/null
+++ b/tools/arm-platform.cmake
@@ -0,0 +1,9 @@
+
+set(ARCH "armv7-a")
+
+set(FLOAT_ABI "softfp" CACHE STRING "-mfloat-abi chosen")
+set_property(CACHE FLOAT_ABI PROPERTY STRINGS "softfp" "soft" "hard")
+
+set(FPU "neon")
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${ARCH} -mfloat-abi=${FLOAT_ABI} -mfpu=${FPU}")
diff --git a/tools/build.sh b/tools/build.sh
index 4ac63315a94798d3aca63fb62aef511c4146cd3c..bf3545ef162c86c16c0877f5f25f3a1e09de1fd4 100755
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -1,4 +1,6 @@
 #!/usr/bin/env bash
+NETS=""
+declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet")
 
 build_for_mac() {
    if [ ! `which brew` ]; then
@@ -31,14 +33,15 @@ build_for_mac() {
 }
 
 build_for_android() {
-    rm -rf "../build"
-    if [ -z "${ANDROID_NDK}" ]; then
-        echo "ANDROID_NDK not found!"
+    #rm -rf "../build"
+    if [ -z "${NDK_ROOT}" ]; then
+        echo "NDK_ROOT not found!"
         exit -1
     fi
 
     if [ -z "$PLATFORM" ]; then
-        PLATFORM="arm-v7a" # Users could choose "arm-v8a" or other platforms from the command line.
+        PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform.
+#        PLATFORM="arm-v8a"
     fi
 
     if [ "${PLATFORM}" = "arm-v7a" ]; then
@@ -56,11 +59,11 @@ build_for_android() {
 
     MODE="Release"
-    ANDROID_PLATFORM_VERSION="android-15"
+    ANDROID_PLATFORM_VERSION="android-22"
     TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
     ANDROID_ARM_MODE="arm"
-    if [ $# -eq 1 ]; then
-        NET=$1
+
+    if [ "${#NETS}" -gt 1 ]; then
         cmake .. \
             -B"../build/release/${PLATFORM}" \
             -DANDROID_ABI="${ABI}" \
@@ -70,7 +73,7 @@ build_for_android() {
             -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
             -DANDROID_STL=c++_static \
             -DANDROID=true \
-            -D"${NET}=true" \
+            -DNET="${NETS}" \
            -D"${ARM_PLATFORM}"=true
     else
@@ -90,85 +93,78 @@ build_for_android() {
 }
 
 build_for_ios() {
-    rm -rf "../build"
+#    rm -rf "../build"
     PLATFORM="ios"
     MODE="Release"
-    BUILD_DIR=../build/release/"${PLATFORM}"
+    BUILD_DIR=../build/release/"${PLATFORM}"/
     TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
-    C_FLAGS="-fobjc-abi-version=2 -fobjc-arc -isysroot ${CMAKE_OSX_SYSROOT}"
-    CXX_FLAGS="-fobjc-abi-version=2 -fobjc-arc -std=gnu++14 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT}"
     mkdir -p "${BUILD_DIR}"
-    if [ $# -eq 1 ]; then
-        NET=$1
+    if [ "${#NETS}" -gt 1 ]; then
        cmake .. \
            -B"${BUILD_DIR}" \
            -DCMAKE_BUILD_TYPE="${MODE}" \
-           -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
            -DIOS_PLATFORM=OS \
-           -DCMAKE_C_FLAGS="${C_FLAGS}" \
-           -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-           -D"${NET}"=true \
+           -DIOS_ARCH="${IOS_ARCH}" \
+           -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
+           -DNET="${NETS}" \
            -DIS_IOS="true"
     else
        cmake .. \
            -B"${BUILD_DIR}" \
            -DCMAKE_BUILD_TYPE="${MODE}" \
-           -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
            -DIOS_PLATFORM=OS \
-           -DCMAKE_C_FLAGS="${C_FLAGS}" \
-           -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
+           -DIOS_ARCH="${IOS_ARCH}" \
+           -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
            -DIS_IOS="true"
     fi
     cd "${BUILD_DIR}"
     make -j 8
+    cp ../../../src/ios_io/PaddleMobile.h ./build/PaddleMobile.h
+    cd ./build
+    # generate the symbol table
+    ranlib *.a
 }
 
 build_error() {
-    echo "unknown argument"
+    echo "unknown target : $1"
 }
 
 if [ $# -lt 1 ]; then
     echo "error: target missing!"
- echo "available targets: mac|linux|ios|android" - echo "sample usage: ./build.sh mac" + echo "available targets: ios|android" + echo "sample usage: ./build.sh android" else - if [ $# -eq 2 ]; then - if [ $2 != "googlenet" -a $2 != "mobilenet" -a $2 != "yolo" -a $2 != "squeezenet" -a $2 != "resnet" ]; then - if [ $1 = "mac" ]; then - build_for_mac - elif [ $1 = "linux" ]; then - build_for_linux - elif [ $1 = "android" ]; then - build_for_android - elif [ $1 = "ios" ]; then - build_for_ios - else - build_error - fi - else - if [ $1 = "mac" ]; then - build_for_mac $2 - elif [ $1 = "linux" ]; then - build_for_linux $2 - elif [ $1 = "android" ]; then - build_for_android $2 - elif [ $1 = "ios" ]; then - build_for_ios $2 - else - build_error - fi + params=($@) + for(( i=1; i<$#; i++ )); do + if [ ${i} != 1 ]; then + NETS=$NETS$";" + fi + NETS=$NETS$"${params[i]}" + done + params=${@:2} + + supported=false + for name in ${params[@]}; do + for net in ${supportedNets[@]}; do + match=false + if [ "$name"x = "$net"x ];then + supported=true + match=true + break 1 + fi + done + if [ "$match" = false ];then + echo "${name} not supported!" + echo "supported nets are: ${supportedNets[@]}" + exit -1 fi + done + + if [ $1 = "android" ]; then + build_for_android + elif [ $1 = "ios" ]; then + build_for_ios else - if [ $1 = "mac" ]; then - build_for_mac - elif [ $1 = "linux" ]; then - build_for_linux - elif [ $1 = "android" ]; then - build_for_android - elif [ $1 = "ios" ]; then - build_for_ios - else - build_error - fi - fi -fi + build_error "$1" + fi +fi \ No newline at end of file diff --git a/tools/ios-cmake/ios.toolchain.cmake b/tools/ios-cmake/ios.toolchain.cmake index a8735adc8d853a5825a23f1ddf129d0a95199275..4db079d01de8db35fca8fbe63b59e58fd5a3463e 100644 --- a/tools/ios-cmake/ios.toolchain.cmake +++ b/tools/ios-cmake/ios.toolchain.cmake @@ -34,6 +34,7 @@ set (CMAKE_SYSTEM_VERSION 1) set (UNIX True) set (APPLE True) set (IOS True) +set (IOS_ARCH armv7 armv7s arm64) # Required as of cmake 2.8.10 set (CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE) @@ -159,7 +160,6 @@ set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS su # set the architecture for iOS if (${IOS_PLATFORM} STREQUAL "OS") - set (IOS_ARCH armv7 armv7s arm64) elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR") set (IOS_ARCH i386) elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64") diff --git a/tools/net-detail.awk b/tools/net-detail.awk new file mode 100644 index 0000000000000000000000000000000000000000..84d0166ac777b5b7fbd9801665031bb2d51fedbb --- /dev/null +++ b/tools/net-detail.awk @@ -0,0 +1,91 @@ +BEGIN { +print "digraph G {" +} +/op:/ { + id++ + opname[id] = $NF +} +/input/ { + type = "input" + para = $NF + if (input[id]) { + input[id] = input[id] "|" + } + input[id] = input[id] "<" para ">" para +} +/output/ { + type = "output" + para = $NF + if (output[id]) { + output[id] = output[id] "|" + } + output[id] = output[id] "<" para ">" para +} +/attr/ { + type = "attr" + aname = $NF + if (attr_key[id]) { + attr_key[id] = attr_key[id] "|" + attr_value[id] = attr_value[id] "|" + } + attr_key[id] = attr_key[id] $NF +} +/argument/ { + if (type == "attr") { + split($0, arr, " - ") + attr_value[id] = attr_value[id] arr[2] + } else if ((type == "input") || (type == "output")) { + if (!var2id[$NF]) { + var_id++ + var[var_id] = $NF + var2id[$NF] = var_id + } + varid = var2id[$NF] + lid++ + if (type == "input") { + line[lid] = "var_" varid " -> " "op_" id ":<" para ">" + if 
(xout[$NF]) { + xi++ + xline[xi] = "xop_" xout[$NF] " -> " "xop_" id + } + } else if (type == "output") { + line[lid] = "op_" id ":<" para ">" " -> " "var_" varid + xout[$NF] = id + } + } +} +/var name/ { + varname = $NF + vid = var2id[varname] +} +/var tensor desc dim / { + if (tensor[vid]) tensor[vid] = tensor[vid] " x " + tensor[vid] = tensor[vid] $NF +} +END { + +print "subgraph cluster_G0 {" +for (i = 1; i <= id; i++) { + print "xop_" i "[label=\"" i ". " opname[i] "\"]" +} +for (i = 1; i <= xi; i++) { + print xline[i] +} +print "}" + +for (i = 1; i <= id; i++) { +print "op_" i "[group=op;shape=record;label=\"{{" input[i] "}|" i ". " opname[i] "|{" output[i] "}}\"]" +} +for (i = 1; i <= var_id; i++) { +print "var_" i "[label=\"" var[i] " [" tensor[i] "]\"]" +} +for (i = 1; i <= lid; i++) { +print line[i] +} +for (i = 1; i <= id; i++) { +print "attr_" i "[shape=record;label=\"{" attr_key[i] "}|{" attr_value[i] "}\"]" +print "attr_" i " -> " "op_" i ":" +} +print "}" +} + diff --git a/tools/net.awk b/tools/net.awk new file mode 100644 index 0000000000000000000000000000000000000000..25689c90d871618fc445bba5044446fa7198b2c5 --- /dev/null +++ b/tools/net.awk @@ -0,0 +1,27 @@ +BEGIN { + print "digraph {" +} +/op:/ { + id++ + op = $NF + opname = op "_" id + print opname "[\"label\"=\"" op " [" id "]" "\"]" +} +/input/ { + type = "input" +} +/output/ { + type = "output" +} +/argument/ { + if (type == "output") { + output[$NF] = opname + } else if (type == "input") { + if (output[$NF]) { + print output[$NF] " -> " opname + } + } +} +END { + print "}" +} diff --git a/tools/op.cmake b/tools/op.cmake new file mode 100644 index 0000000000000000000000000000000000000000..0eab67267032d3956a52b80ab7494c6572df7074 --- /dev/null +++ b/tools/op.cmake @@ -0,0 +1,249 @@ +set(FOUND_MATCH OFF) +if ("googlenet" IN_LIST NET) + message("googlenet enabled") + set(CONCAT_OP ON) + set(CONV_OP ON) + set(LRN_OP ON) + set(MUL_OP ON) + set(ELEMENTWISEADD_OP ON) + set(FUSION_FC_OP ON) + set(POOL_OP ON) + set(RELU_OP ON) + set(FUSION_CONVADD_OP ON) + set(FUSION_CONVADDRELU_OP ON) + + set(FOUND_MATCH ON) +endif() + +if ("mobilenet" IN_LIST NET) + message("mobilenet enabled") + set(CONV_OP ON) + set(ELEMENTWISEADD_OP ON) + set(RELU_OP ON) + set(SOFTMAX_OP ON) + set(MUL_OP ON) + set(DEPTHWISECONV_OP ON) + set(BATCHNORM_OP ON) + set(POOL_OP ON) + set(RESHAPE_OP ON) + set(FUSION_CONVADDBNRELU_OP ON) + set(FUSION_CONVADD_OP ON) + + set(FOUND_MATCH ON) +endif() + + +if ("yolo" IN_LIST NET) + message("yolo enabled") + set(BATCHNORM_OP ON) + set(CONV_OP ON) + set(RELU_OP ON) + set(ELEMENTWISEADD_OP ON) + + set(FOUND_MATCH ON) +endif() + +if ("squeezenet" IN_LIST NET) + message("squeezenet enabled") + set(CONCAT_OP ON) + set(CONV_OP ON) + set(RELU_OP ON) + set(ELEMENTWISEADD_OP ON) + set(POOL_OP ON) + set(RESHAPE_OP ON) + set(SOFTMAX_OP ON) + + set(FOUND_MATCH ON) +endif() + + +if ("resnet" IN_LIST NET) + message("resnet enabled") + set(CONCAT_OP ON) + set(CONV_OP ON) + set(RELU_OP ON) + set(ELEMENTWISEADD_OP ON) + set(POOL_OP ON) + set(RESHAPE_OP ON) + set(SOFTMAX_OP ON) + + set(FOUND_MATCH ON) +endif() + +if ("FPGAnets" IN_LIST NET) + message("FPGAnets enabled") + set(FUSION_CONVADDRELU_OP ON) + set(FUSION_CONVADDBNRELU_OP ON) + set(FUSION_CONVADDBN_OP ON) + set(FUSION_ELEMENTWISEADDRELU_OP ON) + set(FUSION_FC_OP ON) + set(FUSION_FCRELU_OP ON) + set(POOL_OP ON) + set(CONCAT_OP ON) + set(SOFTMAX_OP ON) + set(DROPOUT_OP ON) + + set(FOUND_MATCH ON) +endif() + + +if(NOT FOUND_MATCH) + message("--default--") + set(BATCHNORM_OP ON) + 
set(CONV_TRANSPOSE_OP ON) + set(BOXCODER_OP ON) + set(CONCAT_OP ON) + set(CONV_OP ON) + set(DEPTHWISECONV_OP ON) + set(ELEMENTWISEADD_OP ON) + set(FUSION_CONVADD_OP ON) + set(FUSION_CONVADDRELU_OP ON) + set(FUSION_FC_OP ON) + set(LRN_OP ON) + set(MUL_OP ON) + set(MULTICLASSNMS_OP ON) + set(POOL_OP ON) + set(PRIORBOX_OP ON) + set(RELU_OP ON) + set(RESHAPE_OP ON) + set(SIGMOID_OP ON) + set(SOFTMAX_OP ON) + set(TRANSPOSE_OP ON) + set(FUSION_CONVADDBNRELU_OP ON) + set(FUSION_DWCONVBNRELU_OP ON) + set(FUSION_CONVBNRELU_OP ON) + set(PRELU_OP ON) + set(RESIZE_OP ON) + set(SCALE_OP ON) + set(SLICE_OP ON) + set(DROPOUT_OP ON) + set(IM2SEQUENCE_OP ON) +endif() + + # option(BATCHNORM_OP "" ON) + # option(BOXCODER_OP "" ON) + # option(CONCAT_OP "" ON) + # option(CONV_OP "" ON) + # option(DEPTHWISECONV_OP "" ON) + # option(ELEMENTWISEADD_OP "" ON) + # option(FUSION_CONVADD_OP "" ON) + # option(FUSION_CONVADDRELU_OP "" ON) + # option(FUSION_FC_OP "" ON) + # option(LRN_OP "" ON) + # option(MUL_OP "" ON) + # option(MULTICLASSNMS_OP "" ON) + # option(POOL_OP "" ON) + # option(PRIORBOX_OP "" ON) + # option(RELU_OP "" ON) + # option(RESHAPE_OP "" ON) + # option(SIGMOID_OP "" ON) + # option(SOFTMAX_OP "" ON) + # option(TRANSPOSE_OP "" ON) +# endif () + +if (BATCHNORM_OP) + add_definitions(-DBATCHNORM_OP) +endif() +if (BOXCODER_OP) + add_definitions(-DBOXCODER_OP) +endif() +if (CONCAT_OP) + add_definitions(-DCONCAT_OP) +endif() +if (CONV_OP) + add_definitions(-DCONV_OP) +endif() +if (DEPTHWISECONV_OP) + add_definitions(-DDEPTHWISECONV_OP) +endif() +if (ELEMENTWISEADD_OP) + add_definitions(-DELEMENTWISEADD_OP) +endif() +if (FUSION_CONVADD_OP) + add_definitions(-DFUSION_CONVADD_OP) +endif() +if (FUSION_CONVADDRELU_OP) + add_definitions(-DFUSION_CONVADDRELU_OP) +endif() +if (FUSION_FC_OP) + add_definitions(-DFUSION_FC_OP) +endif() +if (LRN_OP) + add_definitions(-DLRN_OP) +endif() +if (MUL_OP) + add_definitions(-DMUL_OP) +endif() +if (MULTICLASSNMS_OP) + add_definitions(-DMULTICLASSNMS_OP) +endif() +if (POOL_OP) + add_definitions(-DPOOL_OP) +endif() +if (PRIORBOX_OP) + add_definitions(-DPRIORBOX_OP) +endif() +if (RELU_OP) + add_definitions(-DRELU_OP) +endif() +if (RESHAPE_OP) + add_definitions(-DRESHAPE_OP) +endif() +if (SIGMOID_OP) + add_definitions(-DSIGMOID_OP) +endif() +if (SOFTMAX_OP) + add_definitions(-DSOFTMAX_OP) +endif() +if (TRANSPOSE_OP) + add_definitions(-DTRANSPOSE_OP) +endif() +if (FUSION_CONVADDBNRELU_OP) + add_definitions(-DFUSION_CONVADDBNRELU_OP) +endif() +if (FUSION_DWCONVBNRELU_OP) + add_definitions(-DFUSION_DWCONVBNRELU_OP) +endif() + +if (FUSION_CONVBNRELU_OP) + add_definitions(-DFUSION_CONVBNRELU_OP) +endif() + +if (PRELU_OP) + add_definitions(-DPRELU_OP) +endif() +if (RESIZE_OP) + add_definitions(-DRESIZE_OP) +endif() +if (SCALE_OP) + add_definitions(-DSCALE_OP) +endif() +if (SLICE_OP) + add_definitions(-DSLICE_OP) +endif() +if (DROPOUT_OP) + add_definitions(-DDROPOUT_OP) +endif() +if (IM2SEQUENCE_OP) + add_definitions(-DIM2SEQUENCE_OP) +endif() + +if (FUSION_CONVADDBN_OP) + add_definitions(-DFUSION_CONVADDBN_OP) +endif() +if (FUSION_FCRELU_OP) + add_definitions(-DFUSION_FCRELU_OP) +endif() +if (FUSION_POOLBN_OP) + add_definitions(-DFUSION_POOLBN_OP) +endif() +if (FUSION_ELEMENTWISEADDRELU_OP) + add_definitions(-DFUSION_ELEMENTWISEADDRELU_OP) +endif() +if (REGION_OP) + add_definitions(-DREGION_OP) +endif() + +if (CONV_TRANSPOSE_OP) + add_definitions(-DCONV_TRANSPOSE) +endif() diff --git a/tools/pre-commit.hooks/clang-format.hook b/tools/pre-commit.hooks/clang-format.hook index 
4fa4253bad78fe287fb92863a684a5d7def71061..ece9ebc598e3fa63d1d76409dc0068854aaec851 100644
--- a/tools/pre-commit.hooks/clang-format.hook
+++ b/tools/pre-commit.hooks/clang-format.hook
@@ -14,6 +14,10 @@ fi
 # https://medicineyeh.wordpress.com/2017/07/13/clang-format-with-pragma/
 shift
-perl -i -pe 's|#pragma\s+omp|// #pragma omp|' "$@"
-clang-format -i $@
+perl -i -pe 's|^\s+#pragma\s+omp|// #pragma omp|' "$@"
+(
+# exclude the ios_io folder from clang-format
+flist=$(echo "$@" | perl -pe 's|src/ios_io/[^ ]*||')
+clang-format -i $flist
+)
 perl -i -pe 's|// ||' "$@"
diff --git a/tools/quantification/CMakeLists.txt b/tools/quantification/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5f1ca7fdc2b65638c7158b0933b924c71eadc4a0
--- /dev/null
+++ b/tools/quantification/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.6)
+project(quali)
+add_definitions(-DENABLE_EXCEPTION)
+
+set(CMAKE_CXX_STANDARD 11)
+file(GLOB_RECURSE QUANTIFICATION_CC src/*.cc src/*.cpp src/*.c src/*.mm)
+file(GLOB_RECURSE QUANTIFICATION_H src/*.h)
+include_directories(. src/)
+
+#add_library(paddle-mobile SHARED ${QUANTIFICATION_CC} ${QUANTIFICATION_H} convert.cpp)
+
+add_executable(quantify convert.cpp ${QUANTIFICATION_CC} ${QUANTIFICATION_H})
\ No newline at end of file
diff --git a/tools/quantification/README.md b/tools/quantification/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ac729af01e7e73328b884097009dad1d468e7997
--- /dev/null
+++ b/tools/quantification/README.md
@@ -0,0 +1,39 @@
+# Model Quantization Script
+
+#### Usage Guide
+1. Start from the PaddleMobile project directory (e.g. ~/PaddleProject/paddle-mobile).
+
+2. cd into the tools/quantification/ directory.
+
+3. Build with cmake:
+
+    ``` sh
+    cmake .
+    make
+    ```
+
+4. Run the quantization script:
+    ```sh
+    ./quantify (0: separated, 1: combined) (input path) (output path)
+    # quantify the separated googlenet model from /Users/xiebaiyuan/PaddleProject/quali/models/googlenet to ./googlenet_min
+    ./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min
+
+    ```
+
+*Note:*
+*In the quantization tool,*
+*1. the model file of a separated model is named "__model__" by default;*
+*2. the model file of a combined model is named "model" and its parameter file "params" by default.*
+
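+For reference, here is a minimal sketch (not part of the tool itself) of how a
+loader could map the quantized uint8 values back to float32, assuming the
+per-tensor `min`/`max` floats that `convert.cpp` writes just before the uint8
+data:
+
+```cpp
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+// Inverse of round((v - min) / (max - min) * 255): maps q in [0, 255]
+// back onto [min_value, max_value].
+std::vector<float> Dequantize(const uint8_t *q, size_t n, float min_value,
+                              float max_value) {
+  std::vector<float> out(n);
+  const float scale = (max_value - min_value) / 255.0f;
+  for (size_t i = 0; i < n; ++i) {
+    out[i] = min_value + scale * q[i];
+  }
+  return out;
+}
+```
+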
+##### Putting it all together:
+Taking the separated (non-combined) googlenet model as an example:
+
+```sh
+cd tools/quantification/
+cmake .
+make
+./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min
+```
+
+
diff --git a/tools/quantification/convert.cpp b/tools/quantification/convert.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..282b22073fc96ddb2ed0d421f113604aadcc4afc
--- /dev/null
+++ b/tools/quantification/convert.cpp
@@ -0,0 +1,275 @@
+
+
+#include "src/enforce.h"
+#include "src/var_desc.h"
+#include "src/program_desc.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <memory>
+#include "src/framework.pb-c.h"
+#include "src/protobuf-c.h"
+#include <cmath>
+#include <limits>
+#include <algorithm>
+
+const size_t kSize64 = sizeof(uint64_t);
+const size_t kSize32 = sizeof(uint32_t);
+
+char *Get_binary_data(const std::string &filename) {
+  FILE *file = fopen(filename.c_str(), "rb");
+
+  PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
+                        filename.c_str());
+  fseek(file, 0, SEEK_END);
+  int64_t size = ftell(file);
+
+  PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
+  rewind(file);
+  auto *data = new char[size];
+  size_t bytes_read = fread(data, 1, static_cast<size_t>(size), file);
+  PADDLE_MOBILE_ENFORCE(bytes_read == static_cast<size_t>(size),
+                        "read binary file bytes do not match with fseek");
+  fclose(file);
+  return data;
+}
+
+static size_t ReadBuffer(const char *file_name, uint8_t **out) {
+  FILE *fp;
+  fp = fopen(file_name, "rb");
+  PADDLE_MOBILE_ENFORCE(fp != nullptr, " %s open failed !", file_name);
+  fseek(fp, 0, SEEK_END);
+  auto size = static_cast<size_t>(ftell(fp));
+  rewind(fp);
+  *out = reinterpret_cast<uint8_t *>(malloc(size));
+  size_t cur_len = 0;
+  size_t nread;
+  while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
+    cur_len += nread;
+  }
+  fclose(fp);
+  return cur_len;
+}
+
+std::shared_ptr<ProgramDesc> loadParams(const std::string &model_path) {
+  PaddleMobile__Framework__Proto__ProgramDesc *c_program;
+  uint8_t *buf = nullptr;
+  size_t read_size = ReadBuffer(model_path.c_str(), &buf);
+  PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null");
+  c_program = paddle_mobile__framework__proto__program_desc__unpack(
+      nullptr, read_size, buf);
+  PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null");
+  auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
+  return originProgramDesc;
+}
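+// Note (added for clarity; not part of the original tool): for every
+// persistable variable, LoadWithDump below walks one serialized LoDTensor
+// record and re-emits it with the float32 payload quantized to uint8.
+// Assuming float32 weights, each written record is laid out as:
+//   [version:u32][lod_level:u64][lod data ...][tensor version:u32]
+//   [desc size:i32][desc bytes][min:f32][max:f32][memory_size x u8]
+// where every byte is round((v - min) / (max - min) * 255).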
+void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc,
+                  char **dataP, FILE *out_file) {
+  // 1. version
+  uint32_t version = *reinterpret_cast<uint32_t *>(*dataP);
+
+  // write version
+  fwrite(&version, kSize32, 1, out_file);
+
+  *dataP += kSize32;
+
+  // 2. Lod information
+  auto *lod_level_ptr = new uint64_t();
+  memcpy(lod_level_ptr, *dataP, kSize64);
+
+  uint64_t lod_level = 0;
+  // write lod information
+  fwrite(&lod_level, kSize64, 1, out_file);
+  delete lod_level_ptr;
+
+  *dataP += kSize64;
+
+  for (uint64_t i = 0; i < lod_level; ++i) {
+    uint64_t size = *reinterpret_cast<uint64_t *>(*dataP);
+    // write lod size
+    fwrite(&size, kSize64, 1, out_file);
+    (*dataP) += kSize64;
+
+    std::vector<size_t> tmp(size / sizeof(size_t));
+    for (size_t &k : tmp) {
+      k = *reinterpret_cast<size_t *>(*dataP);
+      (*dataP) += sizeof(size_t);
+    }
+    // write lod size vector
+    fwrite(tmp.data(), sizeof(size_t), tmp.size(), out_file);
+  }
+
+  // 3. tensor version
+  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*dataP);
+  // write tensor version
+  fwrite(&tensor_version, kSize32, 1, out_file);
+  (*dataP) += kSize32;
+
+  // 4. tensor desc
+  int32_t size = *reinterpret_cast<int32_t *>(*dataP);
+  // write tensor desc
+  fwrite(&size, sizeof(int32_t), 1, out_file);
+  (*dataP) += sizeof(int32_t);
+
+  std::unique_ptr<char[]> buf(new char[size]);
+  for (int m = 0; m < size; ++m) {
+    buf.get()[m] = (*dataP)[m];
+  }
+
+  fwrite(buf.get(), sizeof(char), static_cast<size_t>(size), out_file);
+  (*dataP) += (sizeof(char) * size);
+
+  const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
+  int memory_size = 1;
+  for (auto l : desc.Dims()) {
+    memory_size *= l;
+  }
+
+  void *memory = nullptr;
+  int type_size = 0;
+  switch (desc.DataType()) {
+    case paddle_mobile::framework::VARTYPE_TYPE_FP16:
+      type_size = 2;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_FP32:
+      type_size = 4;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_FP64:
+      type_size = 8;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_INT32:
+      type_size = 4;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_INT64:
+      type_size = 8;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
+      type_size = 1;
+      break;
+    default:
+      break;
+  }
+  size_t tensorSize = sizeof(char) * memory_size * type_size;
+
+  memory = new char[tensorSize];
+
+  for (size_t n = 0; n < tensorSize; ++n) {
+    static_cast<char *>(memory)[n] = (*dataP)[n];
+  }
+  *dataP += tensorSize;
+
+  // for float 32
+  float min_value = std::numeric_limits<float>::max();
+  float max_value = std::numeric_limits<float>::lowest();
+
+  for (int k = 0; k < memory_size; ++k) {
+    min_value = std::min(min_value, static_cast<float *>(memory)[k]);
+    max_value = std::max(max_value, static_cast<float *>(memory)[k]);
+  }
+
+  fwrite(&min_value, sizeof(float), 1, out_file);
+  fwrite(&max_value, sizeof(float), 1, out_file);
+
+  for (int g = 0; g < memory_size; ++g) {
+    float value = static_cast<float *>(memory)[g];
+    auto factor = static_cast<uint8_t>(
+        round((value - min_value) / (max_value - min_value) * 255));
+    fwrite(&factor, sizeof(uint8_t), 1, out_file);
+  }
+  // release the staging buffer (it was allocated with new char[])
+  delete[] static_cast<char *>(memory);
+}
+
+void quantificate_combined(const std::string &model_path,
+                           const std::string &param_path,
+                           const std::string &param_min_path) {
+  auto program = loadParams(model_path);
+  char *origin_data = Get_binary_data(param_path);
+  char *data = origin_data;
+  FILE *out_file = fopen(param_min_path.c_str(), "wb");
+  for (const auto &block : program->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      if (var_desc->Persistable()) {
+        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+          continue;
+        }
+        LoadWithDump(*var_desc, &data, out_file);
+      }
+    }
+  }
+  fclose(out_file);
+  delete[] origin_data;
+}
+
+void quantificate_separated(const std::string &model_dir,
+                            const std::string &param_min_path) {
+  auto program = loadParams(model_dir + "/__model__");
+
+  std::string shell_command = "mkdir " + param_min_path;
+  system(shell_command.c_str());
+
+  for (const auto &block : program->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      if (var_desc->Persistable()) {
+        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+          continue;
+        }
+        std::string file_name = param_min_path + "/" + var_desc->Name();
+        FILE *out_file = fopen(file_name.c_str(), "wb");
+        char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
+        char *data = origin_data;
+        LoadWithDump(*var_desc, &data, out_file);
+        delete[] origin_data;
+        fclose(out_file);
+      }
+    }
+  }
+}
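+// Note (added for clarity; not part of the original tool): "separated" means
+// the model ships as a __model__ file plus one file per parameter, so each
+// parameter is quantized into its own file under the output directory;
+// "combined" means a single "model"/"params" pair, so all parameters are
+// quantized into one output file.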
+int main(int argc, char **argv) {
+  const std::string kNoteEg = "( eg: ./quantify 1 your_combined_model_path output_path or ./quantify 0 your_separated_model_path output_path)";
+
+  PADDLE_MOBILE_ENFORCE(argc > 1, "we need params. %s ", kNoteEg.c_str());
+
+  std::string action_type = argv[1];
+  PADDLE_MOBILE_ENFORCE(action_type == "1" || action_type == "0",
+                        "only 0 or 1 supported, current is %s  %s ",
+                        action_type.c_str(),
+                        kNoteEg.c_str());
+
+  PADDLE_MOBILE_ENFORCE(argc > 2, "we need your model path. %s ", kNoteEg.c_str());
+  std::string base_path = argv[2];
+
+  PADDLE_MOBILE_ENFORCE(argc > 3, "we need your output path. %s ", kNoteEg.c_str());
+  std::string output_path = argv[3];
+
+  if (action_type == "0") {
+    // for separated
+    const std::string &separated_min_dir = output_path;
+    quantificate_separated(base_path, separated_min_dir);
+    return 0;
+  }
+
+  if (action_type == "1") {
+    // for combined
+    const std::string &combined_min_dir = output_path;
+    std::string model_path = base_path + "/model";
+    std::string param_path = base_path + "/params";
+    quantificate_combined(model_path, param_path, combined_min_dir);
+    return 0;
+  }
+
+  return -1;
+}
diff --git a/tools/quantification/src/block_desc_local.cpp b/tools/quantification/src/block_desc_local.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8ad1982c05ed0b1b7c7bec5ef26aa8151f941cf3
--- /dev/null
+++ b/tools/quantification/src/block_desc_local.cpp
@@ -0,0 +1,48 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+//
+// Created by 谢柏渊 on 2018/7/25.
+//
+#include "src/block_desc_local.h"
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "src/framework.pb-c.h"
+
+std::vector<std::shared_ptr<paddle_mobile::framework::VarDesc>>
+BlockDesc::Vars() const {
+  return vars_;
+}
+
+BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc)
+    : index_(desc->idx), parent_index_(desc->parent_idx) {
+  for (size_t i = 0; i < desc->n_vars; ++i) {
+    PaddleMobile__Framework__Proto__VarDesc *var_desc = desc->vars[i];
+    vars_.emplace_back(std::shared_ptr<paddle_mobile::framework::VarDesc>(
+        new paddle_mobile::framework::VarDesc(var_desc)));
+  }
+
+  std::sort(vars_.begin(), vars_.end(),
+            [](std::shared_ptr<paddle_mobile::framework::VarDesc> left,
+               std::shared_ptr<paddle_mobile::framework::VarDesc> right) {
+              return left->Name() < right->Name();
+            });
+
+  //  for (int j = 0; j < desc->n_ops; ++j) {
+  //    PaddleMobile__Framework__Proto__OpDesc *op_desc = desc->ops[j];
+  //    ops_.emplace_back(new OpDesc(op_desc));
+  //  }
+}
diff --git a/tools/quantification/src/block_desc_local.h b/tools/quantification/src/block_desc_local.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ee8132af7f21ed0e62678c8da510bfd7fba9dbd
--- /dev/null
+++ b/tools/quantification/src/block_desc_local.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+//
+// Created by 谢柏渊 on 2018/7/25.
+//
+
+#ifndef TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
+#define TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
+
+#include <memory>
+#include <vector>
+#include "src/var_desc.h"
+
+class BlockDesc {
+ public:
+  friend class Node;
+  friend class ProgramOptimize;
+  BlockDesc() {}
+  explicit BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc);
+
+  const int &ID() const { return index_; }
+
+  const bool &MultiThread() const { return multi_thread_; }
+
+  const int &Parent() const { return parent_index_; }
+
+  bool operator==(const BlockDesc &in_block) const {
+    return this->ID() == in_block.ID() && this->Parent() == in_block.Parent();
+  }
+
+  bool operator<(const BlockDesc &in_block) const {
+    return this->ID() < in_block.ID() && this->Parent() < in_block.Parent();
+  }
+
+  std::vector<std::shared_ptr<paddle_mobile::framework::VarDesc>> Vars() const;
+
+ private:
+  int index_;
+  bool multi_thread_;
+  int parent_index_;
+  std::vector<std::shared_ptr<paddle_mobile::framework::VarDesc>> vars_;
+};
+
+#endif  // TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
diff --git a/tools/quantification/src/enforce.h b/tools/quantification/src/enforce.h
new file mode 100644
index 0000000000000000000000000000000000000000..51d2110e32433686d1b3353bc63b92a564a13e9d
--- /dev/null
+++ b/tools/quantification/src/enforce.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef ENABLE_EXCEPTION
+#include <cstdio>
+#include <exception>
+#include <string>
+
+#endif
+
+namespace paddle_mobile {
+
+#ifdef ENABLE_EXCEPTION
+struct PaddleMobileException : public std::exception {
+  const std::string exception_prefix = "paddle mobile C++ Exception: \n";
+  std::string message;
+
+  PaddleMobileException(const char *header, const char *detail,
+                        const char *file, const int line) {
+    char buffer[1500];
+    snprintf(buffer, sizeof(buffer),
+             "%s| %s \n| [in file] : %s\n| [on line] : %d\n| [detail] : %s\n",
+             exception_prefix.c_str(), header, file, line, detail);
+    message = std::string(buffer);
+  }
+  const char *what() const noexcept { return message.c_str(); }
+};
+
+#define PADDLE_MOBILE_THROW_EXCEPTION(...)                                  \
+  {                                                                         \
+    char buffer[1000];                                                      \
+    snprintf(buffer, sizeof(buffer), __VA_ARGS__);                          \
+    std::string detail(buffer);                                             \
+    throw paddle_mobile::PaddleMobileException("Custom Exception", buffer,  \
+                                               __FILE__, __LINE__);         \
+  }
+
+#define PADDLE_MOBILE_ENFORCE(stat, ...)                                    \
+  {                                                                         \
+    if (stat) {                                                             \
+    } else {                                                                \
+      char buffer[1000];                                                    \
+      snprintf(buffer, sizeof(buffer), __VA_ARGS__);                        \
+      std::string detail(buffer);                                           \
+      throw paddle_mobile::PaddleMobileException("paddle-mobile enforce",   \
+                                                 buffer, __FILE__,          \
+                                                 __LINE__);                 \
+    }                                                                       \
+  }
+#else
+#define PADDLE_MOBILE_THROW_EXCEPTION(...)
+#define PADDLE_MOBILE_ENFORCE(stat, ...)
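+// Note (added for clarity): when ENABLE_EXCEPTION is defined (the
+// quantification CMakeLists.txt above adds -DENABLE_EXCEPTION),
+// PADDLE_MOBILE_ENFORCE(cond, "fmt", ...) throws a PaddleMobileException
+// carrying the file and line when cond is false; in this fallback branch
+// both macros expand to nothing, so the checks are compiled out.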
+#endif + +} // namespace paddle_mobile diff --git a/tools/quantification/src/framework.pb-c.c b/tools/quantification/src/framework.pb-c.c new file mode 100644 index 0000000000000000000000000000000000000000..aed0a6c9c0614da74a82cea8c7aa705978dddafc --- /dev/null +++ b/tools/quantification/src/framework.pb-c.c @@ -0,0 +1,1403 @@ +/* Generated by the protocol buffer compiler. DO NOT EDIT! */ +/* Generated from: framework.proto */ + +/* Do not generate deprecated warnings for self */ +#ifndef PROTOBUF_C__NO_DEPRECATED +#define PROTOBUF_C__NO_DEPRECATED +#endif + +#include "framework.pb-c.h" +void paddle_mobile__framework__proto__op_desc__attr__init( + PaddleMobile__Framework__Proto__OpDesc__Attr *message) { + static const PaddleMobile__Framework__Proto__OpDesc__Attr init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__op_desc__var__init( + PaddleMobile__Framework__Proto__OpDesc__Var *message) { + static const PaddleMobile__Framework__Proto__OpDesc__Var init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__VAR__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__op_desc__init( + PaddleMobile__Framework__Proto__OpDesc *message) { + static const PaddleMobile__Framework__Proto__OpDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__op_desc__get_packed_size( + const PaddleMobile__Framework__Proto__OpDesc *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__op_desc__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__OpDesc * +paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__OpDesc *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__op_desc__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__op_desc__free_unpacked( + PaddleMobile__Framework__Proto__OpDesc *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__op_desc__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__op_proto__var__init( + PaddleMobile__Framework__Proto__OpProto__Var *message) { + static const PaddleMobile__Framework__Proto__OpProto__Var init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__op_proto__attr__init( + PaddleMobile__Framework__Proto__OpProto__Attr *message) { + static const PaddleMobile__Framework__Proto__OpProto__Attr init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__ATTR__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__op_proto__init( + PaddleMobile__Framework__Proto__OpProto *message) { + static const PaddleMobile__Framework__Proto__OpProto init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__op_proto__get_packed_size( + const PaddleMobile__Framework__Proto__OpProto *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__op_proto__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__OpProto * 
+paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__OpProto *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__op_proto__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__op_proto__free_unpacked( + PaddleMobile__Framework__Proto__OpProto *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__op_proto__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__var_type__tensor_desc__init( + PaddleMobile__Framework__Proto__VarType__TensorDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__TensorDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TENSOR_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init( + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__LoDTensorDesc + init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc + init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_ARRAY_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__reader_desc__init( + PaddleMobile__Framework__Proto__VarType__ReaderDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__ReaderDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__READER_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__channel_desc__init( + PaddleMobile__Framework__Proto__VarType__ChannelDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__ChannelDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__CHANNEL_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__tuple__init( + PaddleMobile__Framework__Proto__VarType__Tuple *message) { + static const PaddleMobile__Framework__Proto__VarType__Tuple init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TUPLE__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__init( + PaddleMobile__Framework__Proto__VarType *message) { + static const PaddleMobile__Framework__Proto__VarType init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__var_type__get_packed_size( + const PaddleMobile__Framework__Proto__VarType *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__var_type__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} +PaddleMobile__Framework__Proto__VarType * +paddle_mobile__framework__proto__var_type__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__VarType *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__var_type__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__var_type__free_unpacked( + PaddleMobile__Framework__Proto__VarType *message, + ProtobufCAllocator *allocator) 
{ + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__var_type__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__var_desc__init( + PaddleMobile__Framework__Proto__VarDesc *message) { + static const PaddleMobile__Framework__Proto__VarDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__var_desc__get_packed_size( + const PaddleMobile__Framework__Proto__VarDesc *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__var_desc__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__VarDesc * +paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__VarDesc *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__var_desc__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__var_desc__free_unpacked( + PaddleMobile__Framework__Proto__VarDesc *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__var_desc__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__block_desc__init( + PaddleMobile__Framework__Proto__BlockDesc *message) { + static const PaddleMobile__Framework__Proto__BlockDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__BLOCK_DESC__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__block_desc__get_packed_size( + const PaddleMobile__Framework__Proto__BlockDesc *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__block_desc__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__BlockDesc * +paddle_mobile__framework__proto__block_desc__unpack( + ProtobufCAllocator *allocator, size_t len, const uint8_t *data) { + return (PaddleMobile__Framework__Proto__BlockDesc *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__block_desc__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__block_desc__free_unpacked( + PaddleMobile__Framework__Proto__BlockDesc *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__block_desc__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__program_desc__init( + PaddleMobile__Framework__Proto__ProgramDesc *message) { + static const PaddleMobile__Framework__Proto__ProgramDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__program_desc__get_packed_size( + const PaddleMobile__Framework__Proto__ProgramDesc *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__program_desc__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__ProgramDesc * +paddle_mobile__framework__proto__program_desc__unpack( + ProtobufCAllocator *allocator, size_t len, const uint8_t *data) { + return (PaddleMobile__Framework__Proto__ProgramDesc *) + 
protobuf_c_message_unpack( + &paddle_mobile__framework__proto__program_desc__descriptor, allocator, + len, data); +} +void paddle_mobile__framework__proto__program_desc__free_unpacked( + PaddleMobile__Framework__Proto__ProgramDesc *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__program_desc__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_desc__attr__field_descriptors[12] = { + { + "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, name), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, type), + &paddle_mobile__framework__proto__attr_type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "i", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_i), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, i), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "f", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_FLOAT, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_f), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, f), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "s", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, s), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "ints", 6, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_ints), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, ints), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "floats", 7, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_FLOAT, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_floats), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, floats), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "strings", 8, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_STRING, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_strings), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, strings), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "b", 10, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_b), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, b), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "bools", 11, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_bools), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, bools), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "block_idx", 12, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, + has_block_idx), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, block_idx), + NULL, NULL, 0, /* flags */ 
+ 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "l", 13, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT64, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_l), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, l), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name[] = { + 8, /* field[8] = b */ + 10, /* field[10] = block_idx */ + 9, /* field[9] = bools */ + 3, /* field[3] = f */ + 6, /* field[6] = floats */ + 2, /* field[2] = i */ + 5, /* field[5] = ints */ + 11, /* field[11] = l */ + 0, /* field[0] = name */ + 4, /* field[4] = s */ + 7, /* field[7] = strings */ + 1, /* field[1] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_desc__attr__number_ranges[2 + 1] = { + {1, 0}, {10, 8}, {0, 12}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__attr__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpDesc.Attr", + "Attr", + "PaddleMobile__Framework__Proto__OpDesc__Attr", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpDesc__Attr), + 12, + paddle_mobile__framework__proto__op_desc__attr__field_descriptors, + paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name, + 2, + paddle_mobile__framework__proto__op_desc__attr__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__op_desc__attr__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_desc__var__field_descriptors[2] = { + { + "parameter", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, parameter), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "arguments", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_STRING, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, n_arguments), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, arguments), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_desc__var__field_indices_by_name[] = { + 1, /* field[1] = arguments */ + 0, /* field[0] = parameter */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_desc__var__number_ranges[1 + 1] = { + {1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__var__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpDesc.Var", + "Var", + "PaddleMobile__Framework__Proto__OpDesc__Var", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpDesc__Var), + 2, + paddle_mobile__framework__proto__op_desc__var__field_descriptors, + paddle_mobile__framework__proto__op_desc__var__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_desc__var__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__op_desc__var__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_desc__is_target__default_value = 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_desc__field_descriptors[5] = { + { + "inputs", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpDesc, n_inputs), + 
offsetof(PaddleMobile__Framework__Proto__OpDesc, inputs), + &paddle_mobile__framework__proto__op_desc__var__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "outputs", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpDesc, n_outputs), + offsetof(PaddleMobile__Framework__Proto__OpDesc, outputs), + &paddle_mobile__framework__proto__op_desc__var__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "type", 3, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc, type), NULL, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "attrs", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpDesc, n_attrs), + offsetof(PaddleMobile__Framework__Proto__OpDesc, attrs), + &paddle_mobile__framework__proto__op_desc__attr__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "is_target", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpDesc, has_is_target), + offsetof(PaddleMobile__Framework__Proto__OpDesc, is_target), NULL, + &paddle_mobile__framework__proto__op_desc__is_target__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_desc__field_indices_by_name[] = { + 3, /* field[3] = attrs */ + 0, /* field[0] = inputs */ + 4, /* field[4] = is_target */ + 1, /* field[1] = outputs */ + 2, /* field[2] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_desc__number_ranges[1 + 1] = {{1, 0}, + {0, 5}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpDesc", + "OpDesc", + "PaddleMobile__Framework__Proto__OpDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpDesc), + 5, + paddle_mobile__framework__proto__op_desc__field_descriptors, + paddle_mobile__framework__proto__op_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_desc__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__op_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_proto__var__duplicable__default_value = + 0; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_proto__var__intermediate__default_value = + 0; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_proto__var__dispensable__default_value = + 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_proto__var__field_descriptors[5] = { + { + "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, name), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "comment", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, comment), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "duplicable", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, + has_duplicable), + 
offsetof(PaddleMobile__Framework__Proto__OpProto__Var, duplicable), + NULL, + &paddle_mobile__framework__proto__op_proto__var__duplicable__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "intermediate", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, + has_intermediate), + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, + intermediate), + NULL, + &paddle_mobile__framework__proto__op_proto__var__intermediate__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "dispensable", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, + has_dispensable), + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, dispensable), + NULL, + &paddle_mobile__framework__proto__op_proto__var__dispensable__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_proto__var__field_indices_by_name[] = { + 1, /* field[1] = comment */ + 4, /* field[4] = dispensable */ + 2, /* field[2] = duplicable */ + 3, /* field[3] = intermediate */ + 0, /* field[0] = name */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_proto__var__number_ranges[1 + 1] = { + {1, 0}, {0, 5}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__var__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpProto.Var", + "Var", + "PaddleMobile__Framework__Proto__OpProto__Var", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpProto__Var), + 5, + paddle_mobile__framework__proto__op_proto__var__field_descriptors, + paddle_mobile__framework__proto__op_proto__var__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_proto__var__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__op_proto__var__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_proto__attr__generated__default_value = + 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_proto__attr__field_descriptors[4] = { + { + "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, name), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, type), + &paddle_mobile__framework__proto__attr_type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "comment", 3, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, comment), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "generated", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, + has_generated), + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, generated), + NULL, + &paddle_mobile__framework__proto__op_proto__attr__generated__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_proto__attr__field_indices_by_name[] = { + 
2, /* field[2] = comment */ + 3, /* field[3] = generated */ + 0, /* field[0] = name */ + 1, /* field[1] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_proto__attr__number_ranges[1 + 1] = { + {1, 0}, {0, 4}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__attr__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpProto.Attr", + "Attr", + "PaddleMobile__Framework__Proto__OpProto__Attr", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpProto__Attr), + 4, + paddle_mobile__framework__proto__op_proto__attr__field_descriptors, + paddle_mobile__framework__proto__op_proto__attr__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_proto__attr__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__op_proto__attr__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_proto__field_descriptors[5] = { + { + "type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto, type), NULL, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "inputs", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpProto, n_inputs), + offsetof(PaddleMobile__Framework__Proto__OpProto, inputs), + &paddle_mobile__framework__proto__op_proto__var__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "outputs", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpProto, n_outputs), + offsetof(PaddleMobile__Framework__Proto__OpProto, outputs), + &paddle_mobile__framework__proto__op_proto__var__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "attrs", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpProto, n_attrs), + offsetof(PaddleMobile__Framework__Proto__OpProto, attrs), + &paddle_mobile__framework__proto__op_proto__attr__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "comment", 5, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto, comment), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_proto__field_indices_by_name[] = { + 3, /* field[3] = attrs */ + 4, /* field[4] = comment */ + 1, /* field[1] = inputs */ + 2, /* field[2] = outputs */ + 0, /* field[0] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_proto__number_ranges[1 + 1] = {{1, 0}, + {0, 5}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpProto", + "OpProto", + "PaddleMobile__Framework__Proto__OpProto", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpProto), + 5, + paddle_mobile__framework__proto__op_proto__field_descriptors, + paddle_mobile__framework__proto__op_proto__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_proto__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__op_proto__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const 
ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__tensor_desc__field_descriptors + [2] = { + { + "data_type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc, + data_type), + &paddle_mobile__framework__proto__var_type__type__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "dims", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT64, + offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc, + n_dims), + offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc, + dims), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__tensor_desc__field_indices_by_name + [] = { + 0, /* field[0] = data_type */ + 1, /* field[1] = dims */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__tensor_desc__number_ranges[1 + + 1] = { + {1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__tensor_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.TensorDesc", + "TensorDesc", + "PaddleMobile__Framework__Proto__VarType__TensorDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__TensorDesc), + 2, + paddle_mobile__framework__proto__var_type__tensor_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__tensor_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__tensor_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__tensor_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const int32_t + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__lod_level__default_value = + 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_descriptors + [2] = { + { + "tensor", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc, + tensor), + &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "lod_level", 2, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc, + has_lod_level), + offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc, + lod_level), + NULL, + &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__lod_level__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_indices_by_name + [] = { + 1, /* field[1] = lod_level */ + 0, /* field[0] = tensor */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__number_ranges + [1 + 1] = {{1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.LoDTensorDesc", + "LoDTensorDesc", + "PaddleMobile__Framework__Proto__VarType__LoDTensorDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc), + 2, + 
paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const int32_t + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__lod_level__default_value = + 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_descriptors + [2] = { + { + "tensor", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc, + tensor), + &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "lod_level", 2, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_INT32, + offsetof( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc, + has_lod_level), + offsetof( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc, + lod_level), + NULL, + &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__lod_level__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_indices_by_name + [] = { + 1, /* field[1] = lod_level */ + 0, /* field[0] = tensor */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__number_ranges + [1 + 1] = {{1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc", + "LoDTensorArrayDesc", + "PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc), + 2, + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__reader_desc__field_descriptors[1] = { + { + "lod_tensor", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__VarType__ReaderDesc, + n_lod_tensor), + offsetof(PaddleMobile__Framework__Proto__VarType__ReaderDesc, + lod_tensor), + &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__reader_desc__field_indices_by_name + [] = { + 0, /* field[0] = lod_tensor */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__reader_desc__number_ranges[1 + + 1] = { + {1, 0}, {0, 1}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__reader_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + 
"paddle_mobile.framework.proto.VarType.ReaderDesc", + "ReaderDesc", + "PaddleMobile__Framework__Proto__VarType__ReaderDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__ReaderDesc), + 1, + paddle_mobile__framework__proto__var_type__reader_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__reader_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__reader_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__reader_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__channel_desc__field_descriptors + [2] = { + { + "data_type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType__ChannelDesc, + data_type), + &paddle_mobile__framework__proto__var_type__type__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "capacity", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT64, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType__ChannelDesc, + capacity), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__channel_desc__field_indices_by_name + [] = { + 1, /* field[1] = capacity */ + 0, /* field[0] = data_type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__channel_desc__number_ranges[1 + + 1] = + {{1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__channel_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.ChannelDesc", + "ChannelDesc", + "PaddleMobile__Framework__Proto__VarType__ChannelDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__ChannelDesc), + 2, + paddle_mobile__framework__proto__var_type__channel_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__channel_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__channel_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__channel_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__tuple__field_descriptors[1] = { + { + "element_type", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_ENUM, + offsetof(PaddleMobile__Framework__Proto__VarType__Tuple, + n_element_type), + offsetof(PaddleMobile__Framework__Proto__VarType__Tuple, + element_type), + &paddle_mobile__framework__proto__var_type__type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__tuple__field_indices_by_name[] = + { + 0, /* field[0] = element_type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__tuple__number_ranges[1 + 1] = { + {1, 0}, {0, 1}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__tuple__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.Tuple", + "Tuple", + "PaddleMobile__Framework__Proto__VarType__Tuple", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__Tuple), + 1, + 
paddle_mobile__framework__proto__var_type__tuple__field_descriptors, + paddle_mobile__framework__proto__var_type__tuple__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__tuple__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__tuple__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCEnumValue + paddle_mobile__framework__proto__var_type__type__enum_values_by_number[19] = + { + {"BOOL", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL", + 0}, + {"INT16", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16", + 1}, + {"INT32", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32", + 2}, + {"INT64", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64", + 3}, + {"FP16", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16", + 4}, + {"FP32", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32", + 5}, + {"FP64", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64", + 6}, + {"LOD_TENSOR", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR", 7}, + {"SELECTED_ROWS", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS", + 8}, + {"FEED_MINIBATCH", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH", + 9}, + {"FETCH_LIST", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST", 10}, + {"STEP_SCOPES", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES", + 11}, + {"LOD_RANK_TABLE", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE", + 12}, + {"LOD_TENSOR_ARRAY", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_" + "ARRAY", + 13}, + {"PLACE_LIST", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST", 14}, + {"READER", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER", 15}, + {"CHANNEL", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL", 16}, + {"RAW", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW", 17}, + {"TUPLE", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE", + 18}, +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__type__value_ranges[] = {{0, 0}, + {0, 19}}; +static const ProtobufCEnumValueIndex + paddle_mobile__framework__proto__var_type__type__enum_values_by_name[19] = { + {"BOOL", 0}, + {"CHANNEL", 16}, + {"FEED_MINIBATCH", 9}, + {"FETCH_LIST", 10}, + {"FP16", 4}, + {"FP32", 5}, + {"FP64", 6}, + {"INT16", 1}, + {"INT32", 2}, + {"INT64", 3}, + {"LOD_RANK_TABLE", 12}, + {"LOD_TENSOR", 7}, + {"LOD_TENSOR_ARRAY", 13}, + {"PLACE_LIST", 14}, + {"RAW", 17}, + {"READER", 15}, + {"SELECTED_ROWS", 8}, + {"STEP_SCOPES", 11}, + {"TUPLE", 18}, +}; +const ProtobufCEnumDescriptor + paddle_mobile__framework__proto__var_type__type__descriptor = { + PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.Type", + "Type", + "PaddleMobile__Framework__Proto__VarType__Type", + "paddle_mobile.framework.proto", + 19, + paddle_mobile__framework__proto__var_type__type__enum_values_by_number, + 19, + paddle_mobile__framework__proto__var_type__type__enum_values_by_name, + 1, + paddle_mobile__framework__proto__var_type__type__value_ranges, + NULL, + NULL, + NULL, + NULL /* reserved[1234] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__field_descriptors[7] = { + { + "type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, type), + &paddle_mobile__framework__proto__var_type__type__descriptor, NULL, + 0, /* 
flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "selected_rows", 2, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_MESSAGE, 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, selected_rows), + &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "lod_tensor", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, lod_tensor), + &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "tensor_array", 4, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_MESSAGE, 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, tensor_array), + &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "reader", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, reader), + &paddle_mobile__framework__proto__var_type__reader_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "channel", 6, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, channel), + &paddle_mobile__framework__proto__var_type__channel_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "tuple", 7, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, tuple), + &paddle_mobile__framework__proto__var_type__tuple__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__field_indices_by_name[] = { + 5, /* field[5] = channel */ + 2, /* field[2] = lod_tensor */ + 4, /* field[4] = reader */ + 1, /* field[1] = selected_rows */ + 3, /* field[3] = tensor_array */ + 6, /* field[6] = tuple */ + 0, /* field[0] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__number_ranges[1 + 1] = {{1, 0}, + {0, 7}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType", + "VarType", + "PaddleMobile__Framework__Proto__VarType", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType), + 7, + paddle_mobile__framework__proto__var_type__field_descriptors, + paddle_mobile__framework__proto__var_type__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__var_type__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const protobuf_c_boolean + paddle_mobile__framework__proto__var_desc__persistable__default_value = 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_desc__field_descriptors[3] = { + { + "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarDesc, name), NULL, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "type", 2, PROTOBUF_C_LABEL_REQUIRED, 
PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarDesc, type), + &paddle_mobile__framework__proto__var_type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "persistable", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__VarDesc, has_persistable), + offsetof(PaddleMobile__Framework__Proto__VarDesc, persistable), + NULL, + &paddle_mobile__framework__proto__var_desc__persistable__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_desc__field_indices_by_name[] = { + 0, /* field[0] = name */ + 2, /* field[2] = persistable */ + 1, /* field[1] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_desc__number_ranges[1 + 1] = {{1, 0}, + {0, 3}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarDesc", + "VarDesc", + "PaddleMobile__Framework__Proto__VarDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarDesc), + 3, + paddle_mobile__framework__proto__var_desc__field_descriptors, + paddle_mobile__framework__proto__var_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_desc__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__var_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const int32_t + paddle_mobile__framework__proto__block_desc__forward_block_idx__default_value = + -1; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__block_desc__field_descriptors[5] = { + { + "idx", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT32, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__BlockDesc, idx), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "parent_idx", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT32, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__BlockDesc, parent_idx), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "vars", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__BlockDesc, n_vars), + offsetof(PaddleMobile__Framework__Proto__BlockDesc, vars), + &paddle_mobile__framework__proto__var_desc__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "ops", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__BlockDesc, n_ops), + offsetof(PaddleMobile__Framework__Proto__BlockDesc, ops), + &paddle_mobile__framework__proto__op_desc__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "forward_block_idx", 5, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__BlockDesc, + has_forward_block_idx), + offsetof(PaddleMobile__Framework__Proto__BlockDesc, + forward_block_idx), + NULL, + &paddle_mobile__framework__proto__block_desc__forward_block_idx__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__block_desc__field_indices_by_name[] = { + 4, /* field[4] = forward_block_idx */ + 0, /* field[0] = idx */ + 3, /* field[3] = ops */ + 1, /* field[1] = parent_idx */ + 2, /* 
field[2] = vars */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__block_desc__number_ranges[1 + 1] = { + {1, 0}, {0, 5}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__block_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.BlockDesc", + "BlockDesc", + "PaddleMobile__Framework__Proto__BlockDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__BlockDesc), + 5, + paddle_mobile__framework__proto__block_desc__field_descriptors, + paddle_mobile__framework__proto__block_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__block_desc__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__block_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__program_desc__field_descriptors[1] = { + { + "blocks", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__ProgramDesc, n_blocks), + offsetof(PaddleMobile__Framework__Proto__ProgramDesc, blocks), + &paddle_mobile__framework__proto__block_desc__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__program_desc__field_indices_by_name[] = { + 0, /* field[0] = blocks */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__program_desc__number_ranges[1 + 1] = { + {1, 0}, {0, 1}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__program_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.ProgramDesc", + "ProgramDesc", + "PaddleMobile__Framework__Proto__ProgramDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__ProgramDesc), + 1, + paddle_mobile__framework__proto__program_desc__field_descriptors, + paddle_mobile__framework__proto__program_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__program_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__program_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCEnumValue + paddle_mobile__framework__proto__attr_type__enum_values_by_number[10] = { + {"INT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT", 0}, + {"FLOAT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT", 1}, + {"STRING", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING", 2}, + {"INTS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS", 3}, + {"FLOATS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS", 4}, + {"STRINGS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS", 5}, + {"BOOLEAN", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN", 6}, + {"BOOLEANS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS", 7}, + {"BLOCK", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK", 8}, + {"LONG", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG", 9}, +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__attr_type__value_ranges[] = {{0, 0}, + {0, 10}}; +static const ProtobufCEnumValueIndex + paddle_mobile__framework__proto__attr_type__enum_values_by_name[10] = { + {"BLOCK", 8}, {"BOOLEAN", 6}, {"BOOLEANS", 7}, {"FLOAT", 1}, + {"FLOATS", 4}, {"INT", 0}, {"INTS", 3}, {"LONG", 9}, + {"STRING", 2}, {"STRINGS", 5}, +}; +const ProtobufCEnumDescriptor + paddle_mobile__framework__proto__attr_type__descriptor = { + PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC, + 
"paddle_mobile.framework.proto.AttrType", + "AttrType", + "PaddleMobile__Framework__Proto__AttrType", + "paddle_mobile.framework.proto", + 10, + paddle_mobile__framework__proto__attr_type__enum_values_by_number, + 10, + paddle_mobile__framework__proto__attr_type__enum_values_by_name, + 1, + paddle_mobile__framework__proto__attr_type__value_ranges, + NULL, + NULL, + NULL, + NULL /* reserved[1234] */ +}; diff --git a/tools/quantification/src/framework.pb-c.h b/tools/quantification/src/framework.pb-c.h new file mode 100644 index 0000000000000000000000000000000000000000..3d63bad76ad188d02986971bd911d8f30cf0af81 --- /dev/null +++ b/tools/quantification/src/framework.pb-c.h @@ -0,0 +1,579 @@ +/* Generated by the protocol buffer compiler. DO NOT EDIT! */ +/* Generated from: framework.proto */ + +#ifndef PROTOBUF_C_framework_2eproto__INCLUDED +#define PROTOBUF_C_framework_2eproto__INCLUDED + +#include "protobuf-c.h" + +PROTOBUF_C__BEGIN_DECLS + +#if PROTOBUF_C_VERSION_NUMBER < 1000000 +# error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers. +#elif 1003000 < PROTOBUF_C_MIN_COMPILER_VERSION +# error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c. +#endif + +typedef struct _PaddleMobile__Framework__Proto__OpDesc + PaddleMobile__Framework__Proto__OpDesc; +typedef struct _PaddleMobile__Framework__Proto__OpDesc__Attr + PaddleMobile__Framework__Proto__OpDesc__Attr; +typedef struct _PaddleMobile__Framework__Proto__OpDesc__Var + PaddleMobile__Framework__Proto__OpDesc__Var; +typedef struct _PaddleMobile__Framework__Proto__OpProto + PaddleMobile__Framework__Proto__OpProto; +typedef struct _PaddleMobile__Framework__Proto__OpProto__Var + PaddleMobile__Framework__Proto__OpProto__Var; +typedef struct _PaddleMobile__Framework__Proto__OpProto__Attr + PaddleMobile__Framework__Proto__OpProto__Attr; +typedef struct _PaddleMobile__Framework__Proto__VarType + PaddleMobile__Framework__Proto__VarType; +typedef struct _PaddleMobile__Framework__Proto__VarType__TensorDesc + PaddleMobile__Framework__Proto__VarType__TensorDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__LoDTensorDesc + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__ReaderDesc + PaddleMobile__Framework__Proto__VarType__ReaderDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__ChannelDesc + PaddleMobile__Framework__Proto__VarType__ChannelDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__Tuple + PaddleMobile__Framework__Proto__VarType__Tuple; +typedef struct _PaddleMobile__Framework__Proto__VarDesc + PaddleMobile__Framework__Proto__VarDesc; +typedef struct _PaddleMobile__Framework__Proto__BlockDesc + PaddleMobile__Framework__Proto__BlockDesc; +typedef struct _PaddleMobile__Framework__Proto__ProgramDesc + PaddleMobile__Framework__Proto__ProgramDesc; + +/* --- enums --- */ + +typedef enum _PaddleMobile__Framework__Proto__VarType__Type { + /* + * Pod Types + */ + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL = 0, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16 = 1, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32 = 2, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64 = 3, + 
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6, + /* + * Other types that may need additional descriptions + */ + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR = 7, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS = 8, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH = 9, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST = 10, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES = 11, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE = 12, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_ARRAY = 13, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST = 14, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER = 15, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL = 16, + /* + * Any runtime decided variable type is raw + * raw variables should manage their own allocations + * in operators like nccl_op + */ + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW = 17, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE = + 18 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE( + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE) +} PaddleMobile__Framework__Proto__VarType__Type; +typedef enum _PaddleMobile__Framework__Proto__AttrType { + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT = 0, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT = 1, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING = 2, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS = 3, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS = 4, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS = 5, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN = 6, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = + 9 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE( + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE) +} PaddleMobile__Framework__Proto__AttrType; + +/* --- messages --- */ + +struct _PaddleMobile__Framework__Proto__OpDesc__Attr { + ProtobufCMessage base; + char *name; + PaddleMobile__Framework__Proto__AttrType type; + protobuf_c_boolean has_i; + int32_t i; + protobuf_c_boolean has_f; + float f; + char *s; + size_t n_ints; + int32_t *ints; + size_t n_floats; + float *floats; + size_t n_strings; + char **strings; + protobuf_c_boolean has_b; + protobuf_c_boolean b; + size_t n_bools; + protobuf_c_boolean *bools; + protobuf_c_boolean has_block_idx; + int32_t block_idx; + protobuf_c_boolean has_l; + int64_t l; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_desc__attr__descriptor) \ + , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, 0, 0, 0, 0, NULL, \ + 0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0 \ + } + +struct _PaddleMobile__Framework__Proto__OpDesc__Var { + ProtobufCMessage base; + char *parameter; + size_t n_arguments; + char **arguments; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__VAR__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_desc__var__descriptor) \ + , NULL, 0, NULL \ + } + +/* + * OpDesc describes an instance of a C++ framework::OperatorBase + * derived class type. 
+ */ +struct _PaddleMobile__Framework__Proto__OpDesc { + ProtobufCMessage base; + char *type; + size_t n_inputs; + PaddleMobile__Framework__Proto__OpDesc__Var **inputs; + size_t n_outputs; + PaddleMobile__Framework__Proto__OpDesc__Var **outputs; + size_t n_attrs; + PaddleMobile__Framework__Proto__OpDesc__Attr **attrs; + protobuf_c_boolean has_is_target; + protobuf_c_boolean is_target; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_desc__descriptor) \ + , NULL, 0, NULL, 0, NULL, 0, NULL, 0, 0 \ + } + +/* + * VarProto describes the C++ type framework::Variable. + */ +struct _PaddleMobile__Framework__Proto__OpProto__Var { + ProtobufCMessage base; + char *name; + char *comment; + protobuf_c_boolean has_duplicable; + protobuf_c_boolean duplicable; + protobuf_c_boolean has_intermediate; + protobuf_c_boolean intermediate; + protobuf_c_boolean has_dispensable; + protobuf_c_boolean dispensable; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_proto__var__descriptor) \ + , NULL, NULL, 0, 0, 0, 0, 0, 0 \ + } + +/* + * AttrProto describes the C++ type Attribute. + */ +struct _PaddleMobile__Framework__Proto__OpProto__Attr { + ProtobufCMessage base; + char *name; + PaddleMobile__Framework__Proto__AttrType type; + char *comment; + /* + * If that attribute is generated, it means the Paddle third + * language binding has responsibility to fill that + * attribute. End-User should not set that attribute. + */ + protobuf_c_boolean has_generated; + protobuf_c_boolean generated; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__ATTR__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_proto__attr__descriptor) \ + , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, NULL, 0, 0 \ + } + +/* + * OpProto describes a C++ framework::OperatorBase derived class. + */ +struct _PaddleMobile__Framework__Proto__OpProto { + ProtobufCMessage base; + char *type; + size_t n_inputs; + PaddleMobile__Framework__Proto__OpProto__Var **inputs; + size_t n_outputs; + PaddleMobile__Framework__Proto__OpProto__Var **outputs; + size_t n_attrs; + PaddleMobile__Framework__Proto__OpProto__Attr **attrs; + char *comment; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_proto__descriptor) \ + , NULL, 0, NULL, 0, NULL, 0, NULL, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarType__TensorDesc { + ProtobufCMessage base; + /* + * Should only be PODType. 
Is enforced in C++ + */ + PaddleMobile__Framework__Proto__VarType__Type data_type; + /* + * [UNK, 640, 480] is saved as [-1, 640, 480] + */ + size_t n_dims; + int64_t *dims; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TENSOR_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor) \ + , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, 0, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarType__LoDTensorDesc { + ProtobufCMessage base; + PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor; + protobuf_c_boolean has_lod_level; + int32_t lod_level; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor) \ + , NULL, 0, 0 \ + } + +struct _PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc { + ProtobufCMessage base; + PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor; + protobuf_c_boolean has_lod_level; + int32_t lod_level; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_ARRAY_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor) \ + , NULL, 0, 0 \ + } + +struct _PaddleMobile__Framework__Proto__VarType__ReaderDesc { + ProtobufCMessage base; + size_t n_lod_tensor; + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc **lod_tensor; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__READER_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__reader_desc__descriptor) \ + , 0, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarType__ChannelDesc { + ProtobufCMessage base; + PaddleMobile__Framework__Proto__VarType__Type data_type; + int64_t capacity; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__CHANNEL_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__channel_desc__descriptor) \ + , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, 0 \ + } + +struct _PaddleMobile__Framework__Proto__VarType__Tuple { + ProtobufCMessage base; + size_t n_element_type; + PaddleMobile__Framework__Proto__VarType__Type *element_type; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TUPLE__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__tuple__descriptor) \ + , 0, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarType { + ProtobufCMessage base; + PaddleMobile__Framework__Proto__VarType__Type type; + PaddleMobile__Framework__Proto__VarType__TensorDesc *selected_rows; + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *lod_tensor; + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *tensor_array; + PaddleMobile__Framework__Proto__VarType__ReaderDesc *reader; + PaddleMobile__Framework__Proto__VarType__ChannelDesc *channel; + PaddleMobile__Framework__Proto__VarType__Tuple *tuple; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__descriptor) \ + , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, NULL, NULL, NULL, \ + NULL, NULL, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarDesc { + ProtobufCMessage base; + char *name; + PaddleMobile__Framework__Proto__VarType *type; + protobuf_c_boolean has_persistable; + protobuf_c_boolean persistable; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT \ + { \ + 
PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_desc__descriptor) \ + , NULL, NULL, 0, 0 \ + } + +struct _PaddleMobile__Framework__Proto__BlockDesc { + ProtobufCMessage base; + int32_t idx; + int32_t parent_idx; + size_t n_vars; + PaddleMobile__Framework__Proto__VarDesc **vars; + size_t n_ops; + PaddleMobile__Framework__Proto__OpDesc **ops; + protobuf_c_boolean has_forward_block_idx; + int32_t forward_block_idx; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__BLOCK_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__block_desc__descriptor) \ + , 0, 0, 0, NULL, 0, NULL, 0, -1 \ + } + +/* + * Please refer to + * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md + * for more details. + * TODO(panyx0718): A model can have multiple programs. Need a + * way to distinguish them. Maybe ID or name? + */ +struct _PaddleMobile__Framework__Proto__ProgramDesc { + ProtobufCMessage base; + size_t n_blocks; + PaddleMobile__Framework__Proto__BlockDesc **blocks; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__program_desc__descriptor) \ + , 0, NULL \ + } + +/* PaddleMobile__Framework__Proto__OpDesc__Attr methods */ +void paddle_mobile__framework__proto__op_desc__attr__init( + PaddleMobile__Framework__Proto__OpDesc__Attr *message); +/* PaddleMobile__Framework__Proto__OpDesc__Var methods */ +void paddle_mobile__framework__proto__op_desc__var__init( + PaddleMobile__Framework__Proto__OpDesc__Var *message); +/* PaddleMobile__Framework__Proto__OpDesc methods */ +void paddle_mobile__framework__proto__op_desc__init( + PaddleMobile__Framework__Proto__OpDesc *message); + +size_t paddle_mobile__framework__proto__op_desc__get_packed_size( + const PaddleMobile__Framework__Proto__OpDesc *message); + +PaddleMobile__Framework__Proto__OpDesc * +paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__op_desc__free_unpacked( + PaddleMobile__Framework__Proto__OpDesc *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__OpProto__Var methods */ +void paddle_mobile__framework__proto__op_proto__var__init( + PaddleMobile__Framework__Proto__OpProto__Var *message); +/* PaddleMobile__Framework__Proto__OpProto__Attr methods */ +void paddle_mobile__framework__proto__op_proto__attr__init( + PaddleMobile__Framework__Proto__OpProto__Attr *message); +/* PaddleMobile__Framework__Proto__OpProto methods */ +void paddle_mobile__framework__proto__op_proto__init( + PaddleMobile__Framework__Proto__OpProto *message); +size_t paddle_mobile__framework__proto__op_proto__get_packed_size( + const PaddleMobile__Framework__Proto__OpProto *message); +PaddleMobile__Framework__Proto__OpProto * +paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__op_proto__free_unpacked( + PaddleMobile__Framework__Proto__OpProto *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__VarType__TensorDesc methods */ +void paddle_mobile__framework__proto__var_type__tensor_desc__init( + PaddleMobile__Framework__Proto__VarType__TensorDesc *message); +/* PaddleMobile__Framework__Proto__VarType__LoDTensorDesc methods */ +void paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init( + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message); +/* 
PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc methods */ +void paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message); +/* PaddleMobile__Framework__Proto__VarType__ReaderDesc methods */ +void paddle_mobile__framework__proto__var_type__reader_desc__init( + PaddleMobile__Framework__Proto__VarType__ReaderDesc *message); +/* PaddleMobile__Framework__Proto__VarType__ChannelDesc methods */ +void paddle_mobile__framework__proto__var_type__channel_desc__init( + PaddleMobile__Framework__Proto__VarType__ChannelDesc *message); +/* PaddleMobile__Framework__Proto__VarType__Tuple methods */ +void paddle_mobile__framework__proto__var_type__tuple__init( + PaddleMobile__Framework__Proto__VarType__Tuple *message); +/* PaddleMobile__Framework__Proto__VarType methods */ +void paddle_mobile__framework__proto__var_type__init( + PaddleMobile__Framework__Proto__VarType *message); +size_t paddle_mobile__framework__proto__var_type__get_packed_size( + const PaddleMobile__Framework__Proto__VarType *message); +PaddleMobile__Framework__Proto__VarType * +paddle_mobile__framework__proto__var_type__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__var_type__free_unpacked( + PaddleMobile__Framework__Proto__VarType *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__VarDesc methods */ +void paddle_mobile__framework__proto__var_desc__init( + PaddleMobile__Framework__Proto__VarDesc *message); +size_t paddle_mobile__framework__proto__var_desc__get_packed_size( + const PaddleMobile__Framework__Proto__VarDesc *message); +PaddleMobile__Framework__Proto__VarDesc * +paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__var_desc__free_unpacked( + PaddleMobile__Framework__Proto__VarDesc *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__BlockDesc methods */ +void paddle_mobile__framework__proto__block_desc__init( + PaddleMobile__Framework__Proto__BlockDesc *message); +size_t paddle_mobile__framework__proto__block_desc__get_packed_size( + const PaddleMobile__Framework__Proto__BlockDesc *message); +PaddleMobile__Framework__Proto__BlockDesc * +paddle_mobile__framework__proto__block_desc__unpack( + ProtobufCAllocator *allocator, size_t len, const uint8_t *data); +void paddle_mobile__framework__proto__block_desc__free_unpacked( + PaddleMobile__Framework__Proto__BlockDesc *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__ProgramDesc methods */ +void paddle_mobile__framework__proto__program_desc__init( + PaddleMobile__Framework__Proto__ProgramDesc *message); +size_t paddle_mobile__framework__proto__program_desc__get_packed_size( + const PaddleMobile__Framework__Proto__ProgramDesc *message); +PaddleMobile__Framework__Proto__ProgramDesc * +paddle_mobile__framework__proto__program_desc__unpack( + ProtobufCAllocator *allocator, size_t len, const uint8_t *data); +void paddle_mobile__framework__proto__program_desc__free_unpacked( + PaddleMobile__Framework__Proto__ProgramDesc *message, + ProtobufCAllocator *allocator); +/* --- per-message closures --- */ + +typedef void (*PaddleMobile__Framework__Proto__OpDesc__Attr_Closure)( + const PaddleMobile__Framework__Proto__OpDesc__Attr *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpDesc__Var_Closure)( + const 
PaddleMobile__Framework__Proto__OpDesc__Var *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpDesc_Closure)( + const PaddleMobile__Framework__Proto__OpDesc *message, void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpProto__Var_Closure)( + const PaddleMobile__Framework__Proto__OpProto__Var *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpProto__Attr_Closure)( + const PaddleMobile__Framework__Proto__OpProto__Attr *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpProto_Closure)( + const PaddleMobile__Framework__Proto__OpProto *message, void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__TensorDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__LoDTensorDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message, + void *closure_data); +typedef void ( + *PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__ReaderDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__ReaderDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__ChannelDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__ChannelDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__Tuple_Closure)( + const PaddleMobile__Framework__Proto__VarType__Tuple *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType_Closure)( + const PaddleMobile__Framework__Proto__VarType *message, void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarDesc_Closure)( + const PaddleMobile__Framework__Proto__VarDesc *message, void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__BlockDesc_Closure)( + const PaddleMobile__Framework__Proto__BlockDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__ProgramDesc_Closure)( + const PaddleMobile__Framework__Proto__ProgramDesc *message, + void *closure_data); + +/* --- services --- */ + +/* --- descriptors --- */ + +extern const ProtobufCEnumDescriptor + paddle_mobile__framework__proto__attr_type__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__attr__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__var__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__var__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__attr__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__tensor_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor; +extern const 
ProtobufCMessageDescriptor
+    paddle_mobile__framework__proto__var_type__reader_desc__descriptor;
+extern const ProtobufCMessageDescriptor
+    paddle_mobile__framework__proto__var_type__channel_desc__descriptor;
+extern const ProtobufCMessageDescriptor
+    paddle_mobile__framework__proto__var_type__tuple__descriptor;
+extern const ProtobufCEnumDescriptor
+    paddle_mobile__framework__proto__var_type__type__descriptor;
+extern const ProtobufCMessageDescriptor
+    paddle_mobile__framework__proto__var_desc__descriptor;
+extern const ProtobufCMessageDescriptor
+    paddle_mobile__framework__proto__block_desc__descriptor;
+extern const ProtobufCMessageDescriptor
+    paddle_mobile__framework__proto__program_desc__descriptor;
+
+PROTOBUF_C__END_DECLS
+
+#endif /* PROTOBUF_C_framework_2eproto__INCLUDED */
diff --git a/tools/quantification/src/program_desc.cpp b/tools/quantification/src/program_desc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f9984832ada5061c7691aeb7fadba86cb5b8c0c
--- /dev/null
+++ b/tools/quantification/src/program_desc.cpp
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+//
+// Created by 谢柏渊 on 2018/7/25.
+//
+
+#include "src/program_desc.h"
+#include <vector>
+
+ProgramDesc::ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc) {
+  for (int i = 0; i < desc->n_blocks; ++i) {
+    blocks_.emplace_back(std::make_shared<BlockDesc>(desc->blocks[i]));
+  }
+}
+
+const std::vector<std::shared_ptr<BlockDesc>> ProgramDesc::Blocks() {
+  return blocks_;
+}
diff --git a/tools/quantification/src/program_desc.h b/tools/quantification/src/program_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..60a0f757b0c907165d7639a41e35a407ef083b59
--- /dev/null
+++ b/tools/quantification/src/program_desc.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+//
+// Created by 谢柏渊 on 2018/7/25.
+//
+
+#ifndef TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_
+#define TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_
+
+#include <memory>
+#include <vector>
+#include "src/block_desc_local.h"
+#include "src/framework.pb-c.h"
+
+class ProgramDesc {
+ public:
+  //    friend class Node;
+  //
+  //    friend class ProgramOptimize;
+
+  explicit ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc);
+
+  const std::vector<std::shared_ptr<BlockDesc>> Blocks();
+
+ private:
+  std::vector<std::shared_ptr<BlockDesc>> blocks_;
+};
+
+#endif  // TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_
diff --git a/tools/quantification/src/protobuf-c.c b/tools/quantification/src/protobuf-c.c
new file mode 100644
index 0000000000000000000000000000000000000000..1092e3f78b02a343d8c8965ea7b2d777a6fac9ae
--- /dev/null
+++ b/tools/quantification/src/protobuf-c.c
@@ -0,0 +1,2098 @@
+/*
+ * Copyright (c) 2008-2015, Dave Benson and the protobuf-c authors.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*! \file
+ * Support library for `protoc-c` generated code.
+ *
+ * This file implements the public API used by the code generated
+ * by `protoc-c`.
+ *
+ * \authors Dave Benson and the protobuf-c authors
+ *
+ * \copyright 2008-2014. Licensed under the terms of the [BSD-2-Clause] license.
+ */
+
+/**
+ * \todo 64-BIT OPTIMIZATION: certain implementations use 32-bit math
+ * even on 64-bit platforms (uint64_size, uint64_pack, parse_uint64).
+ *
+ * \todo Use size_t consistently.
+ */
+
+#include <stdlib.h> /* for malloc, free */
+#include <string.h> /* for strcmp, strlen, memcpy, memmove, memset */
+
+#include "protobuf-c.h"
+
+#define TRUE 1
+#define FALSE 0
+
+#define PROTOBUF_C__ASSERT_NOT_REACHED() assert(0)
+
+/* Workaround for Microsoft compilers. */
+#ifdef _MSC_VER
+#define inline __inline
+#endif
+
+/**
+ * \defgroup internal Internal functions and macros
+ *
+ * These are not exported by the library but are useful to developers working
+ * on `libprotobuf-c` itself.
+ */
+
+/**
+ * \defgroup macros Utility macros for manipulating structures
+ *
+ * Macros and constants used to manipulate the base "classes" generated by
+ * `protobuf-c`. They also define limits and check correctness.
+ *
+ * \ingroup internal
+ * @{
+ */
+
+/** The maximum length of a 64-bit integer in varint encoding.
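+ * (Illustrative aside, not upstream text: 10 == ceil(64 / 7), since each
+ * varint byte carries 7 payload bits plus one continuation bit.)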
*/ +#define MAX_UINT64_ENCODED_SIZE 10 + +#ifndef PROTOBUF_C_UNPACK_ERROR +#define PROTOBUF_C_UNPACK_ERROR(...) +#endif + +const char protobuf_c_empty_string[] = ""; + +/** + * Internal `ProtobufCMessage` manipulation macro. + * + * Base macro for manipulating a `ProtobufCMessage`. Used by STRUCT_MEMBER() and + * STRUCT_MEMBER_PTR(). + */ +#define STRUCT_MEMBER_P(struct_p, struct_offset) \ + ((void *)((uint8_t *)(struct_p) + (struct_offset))) + +/** + * Return field in a `ProtobufCMessage` based on offset. + * + * Take a pointer to a `ProtobufCMessage` and find the field at the offset. + * Cast it to the passed type. + */ +#define STRUCT_MEMBER(member_type, struct_p, struct_offset) \ + (*(member_type *)STRUCT_MEMBER_P((struct_p), (struct_offset))) + +/** + * Return field in a `ProtobufCMessage` based on offset. + * + * Take a pointer to a `ProtobufCMessage` and find the field at the offset. Cast + * it to a pointer to the passed type. + */ +#define STRUCT_MEMBER_PTR(member_type, struct_p, struct_offset) \ + ((member_type *)STRUCT_MEMBER_P((struct_p), (struct_offset))) + +/* Assertions for magic numbers. */ + +#define ASSERT_IS_ENUM_DESCRIPTOR(desc) \ + assert((desc)->magic == PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC) + +#define ASSERT_IS_MESSAGE_DESCRIPTOR(desc) \ + assert((desc)->magic == PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC) + +#define ASSERT_IS_MESSAGE(message) \ + ASSERT_IS_MESSAGE_DESCRIPTOR((message)->descriptor) + +#define ASSERT_IS_SERVICE_DESCRIPTOR(desc) \ + assert((desc)->magic == PROTOBUF_C__SERVICE_DESCRIPTOR_MAGIC) + +/**@}*/ + +/* --- version --- */ + +const char *protobuf_c_version(void) { return PROTOBUF_C_VERSION; } + +uint32_t protobuf_c_version_number(void) { return PROTOBUF_C_VERSION_NUMBER; } + +/* --- allocator --- */ + +static void *system_alloc(void *allocator_data, size_t size) { + return malloc(size); +} + +static void system_free(void *allocator_data, void *data) { free(data); } + +static inline void *do_alloc(ProtobufCAllocator *allocator, size_t size) { + return allocator->alloc(allocator->allocator_data, size); +} + +static inline void do_free(ProtobufCAllocator *allocator, void *data) { + if (data != NULL) allocator->free(allocator->allocator_data, data); +} + +/* + * This allocator uses the system's malloc() and free(). It is the default + * allocator used if NULL is passed as the ProtobufCAllocator to an exported + * function. + */ +static ProtobufCAllocator protobuf_c__allocator = { + .alloc = &system_alloc, + .free = &system_free, + .allocator_data = NULL, +}; + +/* === buffer-simple === */ + +void protobuf_c_buffer_simple_append(ProtobufCBuffer *buffer, size_t len, + const uint8_t *data) { + ProtobufCBufferSimple *simp = (ProtobufCBufferSimple *)buffer; + size_t new_len = simp->len + len; + + if (new_len > simp->alloced) { + ProtobufCAllocator *allocator = simp->allocator; + size_t new_alloced = simp->alloced * 2; + uint8_t *new_data; + + if (allocator == NULL) allocator = &protobuf_c__allocator; + while (new_alloced < new_len) new_alloced += new_alloced; + new_data = do_alloc(allocator, new_alloced); + if (!new_data) return; + memcpy(new_data, simp->data, simp->len); + if (simp->must_free_data) + do_free(allocator, simp->data); + else + simp->must_free_data = TRUE; + simp->data = new_data; + simp->alloced = new_alloced; + } + memcpy(simp->data + simp->len, data, len); + simp->len = new_len; +} + +/** + * \defgroup packedsz protobuf_c_message_get_packed_size() implementation + * + * Routines mainly used by protobuf_c_message_get_packed_size(). 
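+ *
+ * (Illustrative aside, not upstream text: callers typically use this to size
+ * an output buffer before serializing, e.g. with the public
+ * protobuf_c_message_pack():
+ *
+ *     size_t n = protobuf_c_message_get_packed_size(msg);
+ *     uint8_t *buf = malloc(n);
+ *     if (buf != NULL) protobuf_c_message_pack(msg, buf);
+ *
+ * where msg points to any generated message.)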
+ * + * \ingroup internal + * @{ + */ + +/** + * Return the number of bytes required to store the tag for the field. Includes + * 3 bits for the wire-type, and a single bit that denotes the end-of-tag. + * + * \param number + * Field tag to encode. + * \return + * Number of bytes required. + */ +static inline size_t get_tag_size(uint32_t number) { + if (number < (1UL << 4)) { + return 1; + } else if (number < (1UL << 11)) { + return 2; + } else if (number < (1UL << 18)) { + return 3; + } else if (number < (1UL << 25)) { + return 4; + } else { + return 5; + } +} + +/** + * Return the number of bytes required to store a variable-length unsigned + * 32-bit integer in base-128 varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. + */ +static inline size_t uint32_size(uint32_t v) { + if (v < (1UL << 7)) { + return 1; + } else if (v < (1UL << 14)) { + return 2; + } else if (v < (1UL << 21)) { + return 3; + } else if (v < (1UL << 28)) { + return 4; + } else { + return 5; + } +} + +/** + * Return the number of bytes required to store a variable-length signed 32-bit + * integer in base-128 varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. + */ +static inline size_t int32_size(int32_t v) { + if (v < 0) { + return 10; + } else if (v < (1L << 7)) { + return 1; + } else if (v < (1L << 14)) { + return 2; + } else if (v < (1L << 21)) { + return 3; + } else if (v < (1L << 28)) { + return 4; + } else { + return 5; + } +} + +/** + * Return the ZigZag-encoded 32-bit unsigned integer form of a 32-bit signed + * integer. + * + * \param v + * Value to encode. + * \return + * ZigZag encoded integer. + */ +static inline uint32_t zigzag32(int32_t v) { + if (v < 0) + return (-(uint32_t)v) * 2 - 1; + else + return (uint32_t)(v)*2; +} + +/** + * Return the number of bytes required to store a signed 32-bit integer, + * converted to an unsigned 32-bit integer with ZigZag encoding, using base-128 + * varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. + */ +static inline size_t sint32_size(int32_t v) { return uint32_size(zigzag32(v)); } + +/** + * Return the number of bytes required to store a 64-bit unsigned integer in + * base-128 varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. + */ +static inline size_t uint64_size(uint64_t v) { + uint32_t upper_v = (uint32_t)(v >> 32); + + if (upper_v == 0) { + return uint32_size((uint32_t)v); + } else if (upper_v < (1UL << 3)) { + return 5; + } else if (upper_v < (1UL << 10)) { + return 6; + } else if (upper_v < (1UL << 17)) { + return 7; + } else if (upper_v < (1UL << 24)) { + return 8; + } else if (upper_v < (1UL << 31)) { + return 9; + } else { + return 10; + } +} + +/** + * Return the ZigZag-encoded 64-bit unsigned integer form of a 64-bit signed + * integer. + * + * \param v + * Value to encode. + * \return + * ZigZag encoded integer. + */ +static inline uint64_t zigzag64(int64_t v) { + if (v < 0) + return (-(uint64_t)v) * 2 - 1; + else + return (uint64_t)(v)*2; +} + +/** + * Return the number of bytes required to store a signed 64-bit integer, + * converted to an unsigned 64-bit integer with ZigZag encoding, using base-128 + * varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. 
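+ *
+ * (Illustrative aside, not upstream text: ZigZag keeps small magnitudes
+ * small, e.g. zigzag32(0) == 0, zigzag32(-1) == 1, zigzag32(1) == 2,
+ * zigzag32(-2) == 3; hence sint32_size(-1) == 1 byte, while int32_size(-1)
+ * == 10 bytes.)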
+ */ +static inline size_t sint64_size(int64_t v) { return uint64_size(zigzag64(v)); } + +/** + * Calculate the serialized size of a single required message field, including + * the space needed by the preceding tag. + * + * \param field + * Field descriptor for member. + * \param member + * Field to encode. + * \return + * Number of bytes required. + */ +static size_t required_field_get_packed_size( + const ProtobufCFieldDescriptor *field, const void *member) { + size_t rv = get_tag_size(field->id); + + switch (field->type) { + case PROTOBUF_C_TYPE_SINT32: + return rv + sint32_size(*(const int32_t *)member); + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + return rv + int32_size(*(const int32_t *)member); + case PROTOBUF_C_TYPE_UINT32: + return rv + uint32_size(*(const uint32_t *)member); + case PROTOBUF_C_TYPE_SINT64: + return rv + sint64_size(*(const int64_t *)member); + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + return rv + uint64_size(*(const uint64_t *)member); + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + return rv + 4; + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + return rv + 8; + case PROTOBUF_C_TYPE_BOOL: + return rv + 1; + case PROTOBUF_C_TYPE_FLOAT: + return rv + 4; + case PROTOBUF_C_TYPE_DOUBLE: + return rv + 8; + case PROTOBUF_C_TYPE_STRING: { + const char *str = *(char *const *)member; + size_t len = str ? strlen(str) : 0; + return rv + uint32_size(len) + len; + } + case PROTOBUF_C_TYPE_BYTES: { + size_t len = ((const ProtobufCBinaryData *)member)->len; + return rv + uint32_size(len) + len; + } + case PROTOBUF_C_TYPE_MESSAGE: { + const ProtobufCMessage *msg = *(ProtobufCMessage *const *)member; + size_t subrv = msg ? protobuf_c_message_get_packed_size(msg) : 0; + return rv + uint32_size(subrv) + subrv; + } + } + PROTOBUF_C__ASSERT_NOT_REACHED(); + return 0; +} + +/** + * Calculate the serialized size of a single oneof message field, including + * the space needed by the preceding tag. Returns 0 if the oneof field isn't + * selected or is not set. + * + * \param field + * Field descriptor for member. + * \param oneof_case + * Enum value that selects the field in the oneof. + * \param member + * Field to encode. + * \return + * Number of bytes required. + */ +static size_t oneof_field_get_packed_size(const ProtobufCFieldDescriptor *field, + uint32_t oneof_case, + const void *member) { + if (oneof_case != field->id) { + return 0; + } + if (field->type == PROTOBUF_C_TYPE_MESSAGE || + field->type == PROTOBUF_C_TYPE_STRING) { + const void *ptr = *(const void *const *)member; + if (ptr == NULL || ptr == field->default_value) return 0; + } + return required_field_get_packed_size(field, member); +} + +/** + * Calculate the serialized size of a single optional message field, including + * the space needed by the preceding tag. Returns 0 if the optional field isn't + * set. + * + * \param field + * Field descriptor for member. + * \param has + * True if the field exists, false if not. + * \param member + * Field to encode. + * \return + * Number of bytes required. 
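+ *
+ * (Illustrative aside, not upstream text: for an optional int32 field with
+ * tag number 1 set to 150, the cost is get_tag_size(1) + int32_size(150)
+ * == 1 + 2 == 3 bytes, matching the classic 08 96 01 wire example.)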
+ */ +static size_t optional_field_get_packed_size( + const ProtobufCFieldDescriptor *field, const protobuf_c_boolean has, + const void *member) { + if (field->type == PROTOBUF_C_TYPE_MESSAGE || + field->type == PROTOBUF_C_TYPE_STRING) { + const void *ptr = *(const void *const *)member; + if (ptr == NULL || ptr == field->default_value) return 0; + } else { + if (!has) return 0; + } + return required_field_get_packed_size(field, member); +} + +static protobuf_c_boolean field_is_zeroish( + const ProtobufCFieldDescriptor *field, const void *member) { + protobuf_c_boolean ret = FALSE; + + switch (field->type) { + case PROTOBUF_C_TYPE_BOOL: + ret = (0 == *(const protobuf_c_boolean *)member); + break; + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_SINT32: + case PROTOBUF_C_TYPE_INT32: + case PROTOBUF_C_TYPE_UINT32: + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + ret = (0 == *(const uint32_t *)member); + break; + case PROTOBUF_C_TYPE_SINT64: + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + ret = (0 == *(const uint64_t *)member); + break; + case PROTOBUF_C_TYPE_FLOAT: + ret = (0 == *(const float *)member); + break; + case PROTOBUF_C_TYPE_DOUBLE: + ret = (0 == *(const double *)member); + break; + case PROTOBUF_C_TYPE_STRING: + ret = (NULL == *(const char *const *)member) || + ('\0' == **(const char *const *)member); + break; + case PROTOBUF_C_TYPE_BYTES: + case PROTOBUF_C_TYPE_MESSAGE: + ret = (NULL == *(const void *const *)member); + break; + default: + ret = TRUE; + break; + } + + return ret; +} + +/** + * Calculate the serialized size of a single unlabeled message field, including + * the space needed by the preceding tag. Returns 0 if the field isn't set or + * if it is set to a "zeroish" value (null pointer or 0 for numerical values). + * Unlabeled fields are supported only in proto3. + * + * \param field + * Field descriptor for member. + * \param member + * Field to encode. + * \return + * Number of bytes required. + */ +static size_t unlabeled_field_get_packed_size( + const ProtobufCFieldDescriptor *field, const void *member) { + if (field_is_zeroish(field, member)) return 0; + return required_field_get_packed_size(field, member); +} + +/** + * Calculate the serialized size of repeated message fields, which may consist + * of any number of values (including 0). Includes the space needed by the + * preceding tags (as needed). + * + * \param field + * Field descriptor for member. + * \param count + * Number of repeated field members. + * \param member + * Field to encode. + * \return + * Number of bytes required. 
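+ *
+ * (Illustrative aside, not upstream text: packing is what makes repeated
+ * scalars compact. For 100 bool elements with tag number 1, unpacked costs
+ * 100 * (1 tag byte + 1 payload byte) == 200 bytes, while packed costs
+ * 1 tag byte + 1 length byte + 100 payload bytes == 102 bytes.)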
+ */ +static size_t repeated_field_get_packed_size( + const ProtobufCFieldDescriptor *field, size_t count, const void *member) { + size_t header_size; + size_t rv = 0; + unsigned i; + void *array = *(void *const *)member; + + if (count == 0) return 0; + header_size = get_tag_size(field->id); + if (0 == (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED)) header_size *= count; + + switch (field->type) { + case PROTOBUF_C_TYPE_SINT32: + for (i = 0; i < count; i++) rv += sint32_size(((int32_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + for (i = 0; i < count; i++) rv += int32_size(((int32_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_UINT32: + for (i = 0; i < count; i++) rv += uint32_size(((uint32_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_SINT64: + for (i = 0; i < count; i++) rv += sint64_size(((int64_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + for (i = 0; i < count; i++) rv += uint64_size(((uint64_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + rv += 4 * count; + break; + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + rv += 8 * count; + break; + case PROTOBUF_C_TYPE_BOOL: + rv += count; + break; + case PROTOBUF_C_TYPE_STRING: + for (i = 0; i < count; i++) { + size_t len = strlen(((char **)array)[i]); + rv += uint32_size(len) + len; + } + break; + case PROTOBUF_C_TYPE_BYTES: + for (i = 0; i < count; i++) { + size_t len = ((ProtobufCBinaryData *)array)[i].len; + rv += uint32_size(len) + len; + } + break; + case PROTOBUF_C_TYPE_MESSAGE: + for (i = 0; i < count; i++) { + size_t len = + protobuf_c_message_get_packed_size(((ProtobufCMessage **)array)[i]); + rv += uint32_size(len) + len; + } + break; + } + + if (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED)) + header_size += uint32_size(rv); + return header_size + rv; +} + +/** + * Calculate the serialized size of an unknown field, i.e. one that is passed + * through mostly uninterpreted. This is required for forward compatibility if + * new fields are added to the message descriptor. + * + * \param field + * Unknown field type. + * \return + * Number of bytes required. + */ +static inline size_t unknown_field_get_packed_size( + const ProtobufCMessageUnknownField *field) { + return get_tag_size(field->tag) + field->len; +} + +/**@}*/ + +/* + * Calculate the serialized size of the message. 
+ */ +size_t protobuf_c_message_get_packed_size(const ProtobufCMessage *message) { + unsigned i; + size_t rv = 0; + + ASSERT_IS_MESSAGE(message); + for (i = 0; i < message->descriptor->n_fields; i++) { + const ProtobufCFieldDescriptor *field = message->descriptor->fields + i; + const void *member = ((const char *)message) + field->offset; + const void *qmember = ((const char *)message) + field->quantifier_offset; + + if (field->label == PROTOBUF_C_LABEL_REQUIRED) { + rv += required_field_get_packed_size(field, member); + } else if ((field->label == PROTOBUF_C_LABEL_OPTIONAL || + field->label == PROTOBUF_C_LABEL_NONE) && + (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_ONEOF))) { + rv += oneof_field_get_packed_size(field, *(const uint32_t *)qmember, + member); + } else if (field->label == PROTOBUF_C_LABEL_OPTIONAL) { + rv += optional_field_get_packed_size( + field, *(protobuf_c_boolean *)qmember, member); + } else if (field->label == PROTOBUF_C_LABEL_NONE) { + rv += unlabeled_field_get_packed_size(field, member); + } else { + rv += repeated_field_get_packed_size(field, *(const size_t *)qmember, + member); + } + } + for (i = 0; i < message->n_unknown_fields; i++) + rv += unknown_field_get_packed_size(&message->unknown_fields[i]); + return rv; +} + +/** + * \defgroup pack protobuf_c_message_pack() implementation + * + * Routines mainly used by protobuf_c_message_pack(). + * + * \ingroup internal + * @{ + */ + +/** + * Pack an unsigned 32-bit integer in base-128 varint encoding and return the + * number of bytes written, which must be 5 or less. + * + * \param value + * Value to encode. + * \param[out] out + * Packed value. + * \return + * Number of bytes written to `out`. + */ +static inline size_t uint32_pack(uint32_t value, uint8_t *out) { + unsigned rv = 0; + + if (value >= 0x80) { + out[rv++] = value | 0x80; + value >>= 7; + if (value >= 0x80) { + out[rv++] = value | 0x80; + value >>= 7; + if (value >= 0x80) { + out[rv++] = value | 0x80; + value >>= 7; + if (value >= 0x80) { + out[rv++] = value | 0x80; + value >>= 7; + } + } + } + } + /* assert: value<128 */ + out[rv++] = value; + return rv; +} + +/** + * Pack a 64-bit unsigned integer using base-128 varint encoding and return the + * number of bytes written. + * + * \param value + * Value to encode. + * \param[out] out + * Packed value. + * \return + * Number of bytes written to `out`. + */ +static size_t uint64_pack(uint64_t value, uint8_t *out) { + uint32_t hi = (uint32_t)(value >> 32); + uint32_t lo = (uint32_t)value; + unsigned rv; + + if (hi == 0) return uint32_pack((uint32_t)lo, out); + out[0] = (lo) | 0x80; + out[1] = (lo >> 7) | 0x80; + out[2] = (lo >> 14) | 0x80; + out[3] = (lo >> 21) | 0x80; + if (hi < 8) { + out[4] = (hi << 4) | (lo >> 28); + return 5; + } else { + out[4] = ((hi & 7) << 4) | (lo >> 28) | 0x80; + hi >>= 3; + } + rv = 5; + while (hi >= 128) { + out[rv++] = hi | 0x80; + hi >>= 7; + } + out[rv++] = hi; + return rv; +} + +/** + * Pack a ProtobufCBinaryData and return the number of bytes written. The output + * includes a length delimiter. + * + * \param bd + * ProtobufCBinaryData to encode. + * \param[out] out + * Packed value. + * \return + * Number of bytes written to `out`. + */ +static inline size_t binary_data_pack(const ProtobufCBinaryData *bd, + uint8_t *out) { + size_t len = bd->len; + size_t rv = uint32_pack(len, out); + memcpy(out + rv, bd->data, len); + return rv + len; +} + +/** + * Pack a field tag. + * + * Wire-type will be added in required_field_pack(). 
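+ *
+ * (Illustrative aside, not upstream text: the key is the field number
+ * shifted left by 3, leaving the low three bits for the wire type; e.g.
+ * field 1 as a varint yields the single key byte (1 << 3) | 0 == 0x08.)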
+ *
+ * \todo Just call uint64_pack on 64-bit platforms.
+ *
+ * \param id
+ *      Tag value to encode.
+ * \param[out] out
+ *      Packed value.
+ * \return
+ *      Number of bytes written to `out`.
+ */
+static size_t tag_pack(uint32_t id, uint8_t *out) {
+  if (id < (1UL << (32 - 3)))
+    return uint32_pack(id << 3, out);
+  else
+    return uint64_pack(((uint64_t)id) << 3, out);
+}
+
+/**
+ * Given a field type, return the in-memory size.
+ *
+ * \todo Implement as a table lookup.
+ *
+ * \param type
+ *      Field type.
+ * \return
+ *      Size of the field.
+ */
+static inline size_t sizeof_elt_in_repeated_array(ProtobufCType type) {
+  switch (type) {
+    case PROTOBUF_C_TYPE_SINT32:
+    case PROTOBUF_C_TYPE_INT32:
+    case PROTOBUF_C_TYPE_UINT32:
+    case PROTOBUF_C_TYPE_SFIXED32:
+    case PROTOBUF_C_TYPE_FIXED32:
+    case PROTOBUF_C_TYPE_FLOAT:
+    case PROTOBUF_C_TYPE_ENUM:
+      return 4;
+    case PROTOBUF_C_TYPE_SINT64:
+    case PROTOBUF_C_TYPE_INT64:
+    case PROTOBUF_C_TYPE_UINT64:
+    case PROTOBUF_C_TYPE_SFIXED64:
+    case PROTOBUF_C_TYPE_FIXED64:
+    case PROTOBUF_C_TYPE_DOUBLE:
+      return 8;
+    case PROTOBUF_C_TYPE_BOOL:
+      return sizeof(protobuf_c_boolean);
+    case PROTOBUF_C_TYPE_STRING:
+    case PROTOBUF_C_TYPE_MESSAGE:
+      return sizeof(void *);
+    case PROTOBUF_C_TYPE_BYTES:
+      return sizeof(ProtobufCBinaryData);
+  }
+  PROTOBUF_C__ASSERT_NOT_REACHED();
+  return 0;
+}
+
+static inline int int_range_lookup(unsigned n_ranges,
+                                   const ProtobufCIntRange *ranges, int value) {
+  unsigned n;
+  unsigned start;
+
+  if (n_ranges == 0) return -1;
+  start = 0;
+  n = n_ranges;
+  while (n > 1) {
+    unsigned mid = start + n / 2;
+
+    if (value < ranges[mid].start_value) {
+      n = mid - start;
+    } else if (value >=
+               ranges[mid].start_value +
+                   (int)(ranges[mid + 1].orig_index - ranges[mid].orig_index)) {
+      unsigned new_start = mid + 1;
+      n = start + n - new_start;
+      start = new_start;
+    } else
+      return (value - ranges[mid].start_value) + ranges[mid].orig_index;
+  }
+  if (n > 0) {
+    unsigned start_orig_index = ranges[start].orig_index;
+    unsigned range_size = ranges[start + 1].orig_index - start_orig_index;
+
+    if (ranges[start].start_value <= value &&
+        value < (int)(ranges[start].start_value + range_size)) {
+      return (value - ranges[start].start_value) + start_orig_index;
+    }
+  }
+  return -1;
+}
+
+static size_t parse_tag_and_wiretype(size_t len, const uint8_t *data,
+                                     uint32_t *tag_out,
+                                     ProtobufCWireType *wiretype_out) {
+  unsigned max_rv = len > 5 ? 5 : len;
+  uint32_t tag = (data[0] & 0x7f) >> 3;
+  unsigned shift = 4;
+  unsigned rv;
+
+  *wiretype_out = data[0] & 7;
+  if ((data[0] & 0x80) == 0) {
+    *tag_out = tag;
+    return 1;
+  }
+  for (rv = 1; rv < max_rv; rv++) {
+    if (data[rv] & 0x80) {
+      tag |= (data[rv] & 0x7f) << shift;
+      shift += 7;
+    } else {
+      tag |= data[rv] << shift;
+      *tag_out = tag;
+      return rv + 1;
+    }
+  }
+  return 0; /* error: bad header */
+}
+
+/* sizeof(ScannedMember) must be <= (1UL<<BOUND_SIZEOF_SCANNED_MEMBER_LOG2) */
+#define BOUND_SIZEOF_SCANNED_MEMBER_LOG2 5
+typedef struct ScannedMember ScannedMember;
+/** Field as it's being scanned. */
+struct ScannedMember {
+  uint32_t tag;              /**< Field tag. */
+  uint8_t wire_type;         /**< Field wire type. */
+  uint8_t length_prefix_len; /**< Prefix length. */
+  const ProtobufCFieldDescriptor *field; /**< Field descriptor. */
+  size_t len;                /**< Field length. */
+  const uint8_t *data;       /**< Pointer to field data. */
+};
+
+static inline uint32_t scan_length_prefixed_data(size_t len,
+                                                 const uint8_t *data,
+                                                 size_t *prefix_len_out) {
+  unsigned hdr_max = len < 5 ? len : 5;
+  unsigned hdr_len;
+  uint32_t val = 0;
+  unsigned i;
+  unsigned shift = 0;
+
+  for (i = 0; i < hdr_max; i++) {
+    val |= (uint32_t)(data[i] & 0x7f) << shift;
+    shift += 7;
+    if ((data[i] & 0x80) == 0) break;
+  }
+  if (i == hdr_max) {
+    PROTOBUF_C_UNPACK_ERROR("error parsing length for length-prefixed data");
+    return 0;
+  }
+  hdr_len = i + 1;
+  *prefix_len_out = hdr_len;
+  if (hdr_len + val > len) {
+    PROTOBUF_C_UNPACK_ERROR("data too short after length-prefix of %u", val);
+    return 0;
+  }
+  return hdr_len + val;
+}
+
+static size_t max_b128_numbers(size_t len, const uint8_t *data) {
+  size_t rv = 0;
+  while (len--)
+    if ((*data++ & 0x80) == 0) ++rv;
+  return rv;
+}
+
+/**@}*/
+
+/**
+ * Merge earlier message into a latter message.
+ *
+ * For numeric types and strings, if the same value appears multiple
+ * times, the parser accepts the last value it sees. For embedded
+ * message fields, the parser merges multiple instances of the same
+ * field.
That is, all singular scalar fields in the latter instance + * replace those in the former, singular embedded messages are merged, + * and repeated fields are concatenated. + * + * The earlier message should be freed after calling this function, as + * some of its fields may have been reused and changed to their default + * values during the merge. + */ +static protobuf_c_boolean merge_messages(ProtobufCMessage *earlier_msg, + ProtobufCMessage *latter_msg, + ProtobufCAllocator *allocator) { + unsigned i; + const ProtobufCFieldDescriptor *fields = latter_msg->descriptor->fields; + for (i = 0; i < latter_msg->descriptor->n_fields; i++) { + if (fields[i].label == PROTOBUF_C_LABEL_REPEATED) { + size_t *n_earlier = + STRUCT_MEMBER_PTR(size_t, earlier_msg, fields[i].quantifier_offset); + uint8_t **p_earlier = + STRUCT_MEMBER_PTR(uint8_t *, earlier_msg, fields[i].offset); + size_t *n_latter = + STRUCT_MEMBER_PTR(size_t, latter_msg, fields[i].quantifier_offset); + uint8_t **p_latter = + STRUCT_MEMBER_PTR(uint8_t *, latter_msg, fields[i].offset); + + if (*n_earlier > 0) { + if (*n_latter > 0) { + /* Concatenate the repeated field */ + size_t el_size = sizeof_elt_in_repeated_array(fields[i].type); + uint8_t *new_field; + + new_field = do_alloc(allocator, (*n_earlier + *n_latter) * el_size); + if (!new_field) return FALSE; + + memcpy(new_field, *p_earlier, *n_earlier * el_size); + memcpy(new_field + *n_earlier * el_size, *p_latter, + *n_latter * el_size); + + do_free(allocator, *p_latter); + do_free(allocator, *p_earlier); + *p_latter = new_field; + *n_latter = *n_earlier + *n_latter; + } else { + /* Zero copy the repeated field from the earlier message */ + *n_latter = *n_earlier; + *p_latter = *p_earlier; + } + /* Make sure the field does not get double freed */ + *n_earlier = 0; + *p_earlier = 0; + } + } else if (fields[i].label == PROTOBUF_C_LABEL_OPTIONAL || + fields[i].label == PROTOBUF_C_LABEL_NONE) { + const ProtobufCFieldDescriptor *field; + uint32_t *earlier_case_p = + STRUCT_MEMBER_PTR(uint32_t, earlier_msg, fields[i].quantifier_offset); + uint32_t *latter_case_p = + STRUCT_MEMBER_PTR(uint32_t, latter_msg, fields[i].quantifier_offset); + protobuf_c_boolean need_to_merge = FALSE; + void *earlier_elem; + void *latter_elem; + const void *def_val; + + if (fields[i].flags & PROTOBUF_C_FIELD_FLAG_ONEOF) { + if (*latter_case_p == 0) { + /* lookup correct oneof field */ + int field_index = int_range_lookup( + latter_msg->descriptor->n_field_ranges, + latter_msg->descriptor->field_ranges, *earlier_case_p); + field = latter_msg->descriptor->fields + field_index; + } else { + /* Oneof is present in the latter message, move on */ + continue; + } + } else { + field = &fields[i]; + } + + earlier_elem = STRUCT_MEMBER_P(earlier_msg, field->offset); + latter_elem = STRUCT_MEMBER_P(latter_msg, field->offset); + def_val = field->default_value; + + switch (field->type) { + case PROTOBUF_C_TYPE_MESSAGE: { + ProtobufCMessage *em = *(ProtobufCMessage **)earlier_elem; + ProtobufCMessage *lm = *(ProtobufCMessage **)latter_elem; + if (em != NULL) { + if (lm != NULL) { + if (!merge_messages(em, lm, allocator)) return FALSE; + /* Already merged */ + need_to_merge = FALSE; + } else { + /* Zero copy the message */ + need_to_merge = TRUE; + } + } + break; + } + case PROTOBUF_C_TYPE_BYTES: { + uint8_t *e_data = ((ProtobufCBinaryData *)earlier_elem)->data; + uint8_t *l_data = ((ProtobufCBinaryData *)latter_elem)->data; + const ProtobufCBinaryData *d_bd = (ProtobufCBinaryData *)def_val; + + need_to_merge = + (e_data != 
NULL && (d_bd == NULL || e_data != d_bd->data)) && + (l_data == NULL || (d_bd != NULL && l_data == d_bd->data)); + break; + } + case PROTOBUF_C_TYPE_STRING: { + char *e_str = *(char **)earlier_elem; + char *l_str = *(char **)latter_elem; + const char *d_str = def_val; + + need_to_merge = e_str != d_str && l_str == d_str; + break; + } + default: { + /* Could be has field or case enum, the logic is + * equivalent, since 0 (FALSE) means not set for + * oneof */ + need_to_merge = (*earlier_case_p != 0) && (*latter_case_p == 0); + break; + } + } + + if (need_to_merge) { + size_t el_size = sizeof_elt_in_repeated_array(field->type); + memcpy(latter_elem, earlier_elem, el_size); + /* + * Reset the element from the old message to 0 + * to make sure earlier message deallocation + * doesn't corrupt zero-copied data in the new + * message, earlier message will be freed after + * this function is called anyway + */ + memset(earlier_elem, 0, el_size); + + if (field->quantifier_offset != 0) { + /* Set the has field or the case enum, + * if applicable */ + *latter_case_p = *earlier_case_p; + *earlier_case_p = 0; + } + } + } + } + return TRUE; +} + +/** + * Count packed elements. + * + * Given a raw slab of packed-repeated values, determine the number of + * elements. This function detects certain kinds of errors but not + * others; the remaining error checking is done by + * parse_packed_repeated_member(). + */ +static protobuf_c_boolean count_packed_elements(ProtobufCType type, size_t len, + const uint8_t *data, + size_t *count_out) { + switch (type) { + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + if (len % 4 != 0) { + PROTOBUF_C_UNPACK_ERROR( + "length must be a multiple of 4 for fixed-length 32-bit types"); + return FALSE; + } + *count_out = len / 4; + return TRUE; + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + if (len % 8 != 0) { + PROTOBUF_C_UNPACK_ERROR( + "length must be a multiple of 8 for fixed-length 64-bit types"); + return FALSE; + } + *count_out = len / 8; + return TRUE; + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + case PROTOBUF_C_TYPE_SINT32: + case PROTOBUF_C_TYPE_UINT32: + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_SINT64: + case PROTOBUF_C_TYPE_UINT64: + *count_out = max_b128_numbers(len, data); + return TRUE; + case PROTOBUF_C_TYPE_BOOL: + *count_out = len; + return TRUE; + case PROTOBUF_C_TYPE_STRING: + case PROTOBUF_C_TYPE_BYTES: + case PROTOBUF_C_TYPE_MESSAGE: + default: + PROTOBUF_C_UNPACK_ERROR("bad protobuf-c type %u for packed-repeated", + type); + return FALSE; + } +} + +static inline uint32_t parse_uint32(unsigned len, const uint8_t *data) { + uint32_t rv = data[0] & 0x7f; + if (len > 1) { + rv |= ((uint32_t)(data[1] & 0x7f) << 7); + if (len > 2) { + rv |= ((uint32_t)(data[2] & 0x7f) << 14); + if (len > 3) { + rv |= ((uint32_t)(data[3] & 0x7f) << 21); + if (len > 4) rv |= ((uint32_t)(data[4]) << 28); + } + } + } + return rv; +} + +static inline uint32_t parse_int32(unsigned len, const uint8_t *data) { + return parse_uint32(len, data); +} + +static inline int32_t unzigzag32(uint32_t v) { + if (v & 1) + return -(v >> 1) - 1; + else + return v >> 1; +} + +static inline uint32_t parse_fixed_uint32(const uint8_t *data) { +#if !defined(WORDS_BIGENDIAN) + uint32_t t; + memcpy(&t, data, 4); + return t; +#else + return data[0] | ((uint32_t)(data[1]) << 8) | ((uint32_t)(data[2]) << 16) | + ((uint32_t)(data[3]) << 24); +#endif +} + +static uint64_t 
parse_uint64(unsigned len, const uint8_t *data) { + unsigned shift, i; + uint64_t rv; + + if (len < 5) return parse_uint32(len, data); + rv = ((uint64_t)(data[0] & 0x7f)) | ((uint64_t)(data[1] & 0x7f) << 7) | + ((uint64_t)(data[2] & 0x7f) << 14) | ((uint64_t)(data[3] & 0x7f) << 21); + shift = 28; + for (i = 4; i < len; i++) { + rv |= (((uint64_t)(data[i] & 0x7f)) << shift); + shift += 7; + } + return rv; +} + +static inline int64_t unzigzag64(uint64_t v) { + if (v & 1) + return -(v >> 1) - 1; + else + return v >> 1; +} + +static inline uint64_t parse_fixed_uint64(const uint8_t *data) { +#if !defined(WORDS_BIGENDIAN) + uint64_t t; + memcpy(&t, data, 8); + return t; +#else + return (uint64_t)parse_fixed_uint32(data) | + (((uint64_t)parse_fixed_uint32(data + 4)) << 32); +#endif +} + +static protobuf_c_boolean parse_boolean(unsigned len, const uint8_t *data) { + unsigned i; + for (i = 0; i < len; i++) + if (data[i] & 0x7f) return TRUE; + return FALSE; +} + +static protobuf_c_boolean parse_required_member( + ScannedMember *scanned_member, void *member, ProtobufCAllocator *allocator, + protobuf_c_boolean maybe_clear) { + unsigned len = scanned_member->len; + const uint8_t *data = scanned_member->data; + ProtobufCWireType wire_type = scanned_member->wire_type; + + switch (scanned_member->field->type) { + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(int32_t *)member = parse_int32(len, data); + return TRUE; + case PROTOBUF_C_TYPE_UINT32: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(uint32_t *)member = parse_uint32(len, data); + return TRUE; + case PROTOBUF_C_TYPE_SINT32: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(int32_t *)member = unzigzag32(parse_uint32(len, data)); + return TRUE; + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + if (wire_type != PROTOBUF_C_WIRE_TYPE_32BIT) return FALSE; + *(uint32_t *)member = parse_fixed_uint32(data); + return TRUE; + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(uint64_t *)member = parse_uint64(len, data); + return TRUE; + case PROTOBUF_C_TYPE_SINT64: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(int64_t *)member = unzigzag64(parse_uint64(len, data)); + return TRUE; + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + if (wire_type != PROTOBUF_C_WIRE_TYPE_64BIT) return FALSE; + *(uint64_t *)member = parse_fixed_uint64(data); + return TRUE; + case PROTOBUF_C_TYPE_BOOL: + *(protobuf_c_boolean *)member = parse_boolean(len, data); + return TRUE; + case PROTOBUF_C_TYPE_STRING: { + char **pstr = member; + unsigned pref_len = scanned_member->length_prefix_len; + + if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE; + + if (maybe_clear && *pstr != NULL) { + const char *def = scanned_member->field->default_value; + if (*pstr != NULL && *pstr != def) do_free(allocator, *pstr); + } + *pstr = do_alloc(allocator, len - pref_len + 1); + if (*pstr == NULL) return FALSE; + memcpy(*pstr, data + pref_len, len - pref_len); + (*pstr)[len - pref_len] = 0; + return TRUE; + } + case PROTOBUF_C_TYPE_BYTES: { + ProtobufCBinaryData *bd = member; + const ProtobufCBinaryData *def_bd; + unsigned pref_len = scanned_member->length_prefix_len; + + if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE; + + def_bd = scanned_member->field->default_value; + 
if (maybe_clear && bd->data != NULL && + (def_bd == NULL || bd->data != def_bd->data)) { + do_free(allocator, bd->data); + } + if (len - pref_len > 0) { + bd->data = do_alloc(allocator, len - pref_len); + if (bd->data == NULL) return FALSE; + memcpy(bd->data, data + pref_len, len - pref_len); + } else { + bd->data = NULL; + } + bd->len = len - pref_len; + return TRUE; + } + case PROTOBUF_C_TYPE_MESSAGE: { + ProtobufCMessage **pmessage = member; + ProtobufCMessage *subm; + const ProtobufCMessage *def_mess; + protobuf_c_boolean merge_successful = TRUE; + unsigned pref_len = scanned_member->length_prefix_len; + + if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE; + + def_mess = scanned_member->field->default_value; + subm = + protobuf_c_message_unpack(scanned_member->field->descriptor, + allocator, len - pref_len, data + pref_len); + + if (maybe_clear && *pmessage != NULL && *pmessage != def_mess) { + if (subm != NULL) + merge_successful = merge_messages(*pmessage, subm, allocator); + /* Delete the previous message */ + protobuf_c_message_free_unpacked(*pmessage, allocator); + } + *pmessage = subm; + if (subm == NULL || !merge_successful) return FALSE; + return TRUE; + } + } + return FALSE; +} + +static protobuf_c_boolean parse_oneof_member(ScannedMember *scanned_member, + void *member, + ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + uint32_t *oneof_case = STRUCT_MEMBER_PTR( + uint32_t, message, scanned_member->field->quantifier_offset); + + /* If we have already parsed a member of this oneof, free it. */ + if (*oneof_case != 0) { + /* lookup field */ + int field_index = + int_range_lookup(message->descriptor->n_field_ranges, + message->descriptor->field_ranges, *oneof_case); + const ProtobufCFieldDescriptor *old_field = + message->descriptor->fields + field_index; + size_t el_size = sizeof_elt_in_repeated_array(old_field->type); + + switch (old_field->type) { + case PROTOBUF_C_TYPE_STRING: { + char **pstr = member; + const char *def = old_field->default_value; + if (*pstr != NULL && *pstr != def) do_free(allocator, *pstr); + break; + } + case PROTOBUF_C_TYPE_BYTES: { + ProtobufCBinaryData *bd = member; + const ProtobufCBinaryData *def_bd = old_field->default_value; + if (bd->data != NULL && (def_bd == NULL || bd->data != def_bd->data)) { + do_free(allocator, bd->data); + } + break; + } + case PROTOBUF_C_TYPE_MESSAGE: { + ProtobufCMessage **pmessage = member; + const ProtobufCMessage *def_mess = old_field->default_value; + if (*pmessage != NULL && *pmessage != def_mess) + protobuf_c_message_free_unpacked(*pmessage, allocator); + break; + } + default: + break; + } + + memset(member, 0, el_size); + } + if (!parse_required_member(scanned_member, member, allocator, TRUE)) + return FALSE; + + *oneof_case = scanned_member->tag; + return TRUE; +} + +static protobuf_c_boolean parse_optional_member(ScannedMember *scanned_member, + void *member, + ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + if (!parse_required_member(scanned_member, member, allocator, TRUE)) + return FALSE; + if (scanned_member->field->quantifier_offset != 0) + STRUCT_MEMBER(protobuf_c_boolean, message, + scanned_member->field->quantifier_offset) = TRUE; + return TRUE; +} + +static protobuf_c_boolean parse_repeated_member(ScannedMember *scanned_member, + void *member, + ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + const ProtobufCFieldDescriptor *field = scanned_member->field; + size_t *p_n = STRUCT_MEMBER_PTR(size_t, message, field->quantifier_offset); + size_t 
siz = sizeof_elt_in_repeated_array(field->type); + char *array = *(char **)member; + + if (!parse_required_member(scanned_member, array + siz * (*p_n), allocator, + FALSE)) { + return FALSE; + } + *p_n += 1; + return TRUE; +} + +static unsigned scan_varint(unsigned len, const uint8_t *data) { + unsigned i; + if (len > 10) len = 10; + for (i = 0; i < len; i++) + if ((data[i] & 0x80) == 0) break; + if (i == len) return 0; + return i + 1; +} + +static protobuf_c_boolean parse_packed_repeated_member( + ScannedMember *scanned_member, void *member, ProtobufCMessage *message) { + const ProtobufCFieldDescriptor *field = scanned_member->field; + size_t *p_n = STRUCT_MEMBER_PTR(size_t, message, field->quantifier_offset); + size_t siz = sizeof_elt_in_repeated_array(field->type); + void *array = *(char **)member + siz * (*p_n); + const uint8_t *at = scanned_member->data + scanned_member->length_prefix_len; + size_t rem = scanned_member->len - scanned_member->length_prefix_len; + size_t count = 0; + unsigned i; + + switch (field->type) { + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + count = (scanned_member->len - scanned_member->length_prefix_len) / 4; +#if !defined(WORDS_BIGENDIAN) + goto no_unpacking_needed; +#else + for (i = 0; i < count; i++) { + ((uint32_t *)array)[i] = parse_fixed_uint32(at); + at += 4; + } + break; +#endif + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + count = (scanned_member->len - scanned_member->length_prefix_len) / 8; +#if !defined(WORDS_BIGENDIAN) + goto no_unpacking_needed; +#else + for (i = 0; i < count; i++) { + ((uint64_t *)array)[i] = parse_fixed_uint64(at); + at += 8; + } + break; +#endif + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated int32 value"); + return FALSE; + } + ((int32_t *)array)[count++] = parse_int32(s, at); + at += s; + rem -= s; + } + break; + case PROTOBUF_C_TYPE_SINT32: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated sint32 value"); + return FALSE; + } + ((int32_t *)array)[count++] = unzigzag32(parse_uint32(s, at)); + at += s; + rem -= s; + } + break; + case PROTOBUF_C_TYPE_UINT32: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated enum or uint32 value"); + return FALSE; + } + ((uint32_t *)array)[count++] = parse_uint32(s, at); + at += s; + rem -= s; + } + break; + + case PROTOBUF_C_TYPE_SINT64: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated sint64 value"); + return FALSE; + } + ((int64_t *)array)[count++] = unzigzag64(parse_uint64(s, at)); + at += s; + rem -= s; + } + break; + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated int64/uint64 value"); + return FALSE; + } + ((int64_t *)array)[count++] = parse_uint64(s, at); + at += s; + rem -= s; + } + break; + case PROTOBUF_C_TYPE_BOOL: + count = rem; + for (i = 0; i < count; i++) { + if (at[i] > 1) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated boolean value"); + return FALSE; + } + ((protobuf_c_boolean *)array)[i] = at[i]; + } + break; + default: + PROTOBUF_C__ASSERT_NOT_REACHED(); + } + *p_n += count; + return TRUE; + +#if 
!defined(WORDS_BIGENDIAN) +no_unpacking_needed: + memcpy(array, at, count * siz); + *p_n += count; + return TRUE; +#endif +} + +static protobuf_c_boolean is_packable_type(ProtobufCType type) { + return type != PROTOBUF_C_TYPE_STRING && type != PROTOBUF_C_TYPE_BYTES && + type != PROTOBUF_C_TYPE_MESSAGE; +} + +static protobuf_c_boolean parse_member(ScannedMember *scanned_member, + ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + const ProtobufCFieldDescriptor *field = scanned_member->field; + void *member; + + if (field == NULL) { + ProtobufCMessageUnknownField *ufield = + message->unknown_fields + (message->n_unknown_fields++); + ufield->tag = scanned_member->tag; + ufield->wire_type = scanned_member->wire_type; + ufield->len = scanned_member->len; + ufield->data = do_alloc(allocator, scanned_member->len); + if (ufield->data == NULL) return FALSE; + memcpy(ufield->data, scanned_member->data, ufield->len); + return TRUE; + } + member = (char *)message + field->offset; + switch (field->label) { + case PROTOBUF_C_LABEL_REQUIRED: + return parse_required_member(scanned_member, member, allocator, TRUE); + case PROTOBUF_C_LABEL_OPTIONAL: + case PROTOBUF_C_LABEL_NONE: + if (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_ONEOF)) { + return parse_oneof_member(scanned_member, member, message, allocator); + } else { + return parse_optional_member(scanned_member, member, message, + allocator); + } + case PROTOBUF_C_LABEL_REPEATED: + if (scanned_member->wire_type == PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED && + (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED) || + is_packable_type(field->type))) { + return parse_packed_repeated_member(scanned_member, member, message); + } else { + return parse_repeated_member(scanned_member, member, message, + allocator); + } + } + PROTOBUF_C__ASSERT_NOT_REACHED(); + return 0; +} + +/** + * Initialise messages generated by old code. + * + * This function is used if desc->message_init == NULL (which occurs + * for old code, and which would be useful to support allocating + * descriptors dynamically). + */ +static void message_init_generic(const ProtobufCMessageDescriptor *desc, + ProtobufCMessage *message) { + unsigned i; + + memset(message, 0, desc->sizeof_message); + message->descriptor = desc; + for (i = 0; i < desc->n_fields; i++) { + if (desc->fields[i].default_value != NULL && + desc->fields[i].label != PROTOBUF_C_LABEL_REPEATED) { + void *field = STRUCT_MEMBER_P(message, desc->fields[i].offset); + const void *dv = desc->fields[i].default_value; + + switch (desc->fields[i].type) { + case PROTOBUF_C_TYPE_INT32: + case PROTOBUF_C_TYPE_SINT32: + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_UINT32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + case PROTOBUF_C_TYPE_ENUM: + memcpy(field, dv, 4); + break; + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_SINT64: + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_UINT64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + memcpy(field, dv, 8); + break; + case PROTOBUF_C_TYPE_BOOL: + memcpy(field, dv, sizeof(protobuf_c_boolean)); + break; + case PROTOBUF_C_TYPE_BYTES: + memcpy(field, dv, sizeof(ProtobufCBinaryData)); + break; + + case PROTOBUF_C_TYPE_STRING: + case PROTOBUF_C_TYPE_MESSAGE: + /* + * The next line essentially implements a cast + * from const, which is totally unavoidable. + */ + *(const void **)field = dv; + break; + } + } + } +} + +/**@}*/ + +/* + * ScannedMember slabs (an unpacking implementation detail). 
Before doing real + * unpacking, we first scan through the elements to see how many there are (for + * repeated fields), and which field to use (for non-repeated fields given + * twice). + * + * In order to avoid allocations for small messages, we keep a stack-allocated + * slab of ScannedMembers of size FIRST_SCANNED_MEMBER_SLAB_SIZE (16). After we + * fill that up, we allocate each slab twice as large as the previous one. + */ +#define FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2 4 + +/* + * The number of slabs, including the stack-allocated ones; choose the number so + * that we would overflow if we needed a slab larger than provided. + */ +#define MAX_SCANNED_MEMBER_SLAB \ + (sizeof(unsigned int) * 8 - 1 - BOUND_SIZEOF_SCANNED_MEMBER_LOG2 - \ + FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2) + +#define REQUIRED_FIELD_BITMAP_SET(index) \ + (required_fields_bitmap[(index) / 8] |= (1UL << ((index) % 8))) + +#define REQUIRED_FIELD_BITMAP_IS_SET(index) \ + (required_fields_bitmap[(index) / 8] & (1UL << ((index) % 8))) + +ProtobufCMessage *protobuf_c_message_unpack( + const ProtobufCMessageDescriptor *desc, ProtobufCAllocator *allocator, + size_t len, const uint8_t *data) { + ProtobufCMessage *rv; + size_t rem = len; + const uint8_t *at = data; + const ProtobufCFieldDescriptor *last_field = desc->fields + 0; + ScannedMember first_member_slab[1UL << FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2]; + + /* + * scanned_member_slabs[i] is an array of arrays of ScannedMember. + * The first slab (scanned_member_slabs[0] is just a pointer to + * first_member_slab), above. All subsequent slabs will be allocated + * using the allocator. + */ + ScannedMember *scanned_member_slabs[MAX_SCANNED_MEMBER_SLAB + 1]; + unsigned which_slab = 0; /* the slab we are currently populating */ + unsigned in_slab_index = 0; /* number of members in the slab */ + size_t n_unknown = 0; + unsigned f; + unsigned j; + unsigned i_slab; + unsigned last_field_index = 0; + unsigned required_fields_bitmap_len; + unsigned char required_fields_bitmap_stack[16]; + unsigned char *required_fields_bitmap = required_fields_bitmap_stack; + protobuf_c_boolean required_fields_bitmap_alloced = FALSE; + + ASSERT_IS_MESSAGE_DESCRIPTOR(desc); + + if (allocator == NULL) allocator = &protobuf_c__allocator; + + rv = do_alloc(allocator, desc->sizeof_message); + if (!rv) return (NULL); + scanned_member_slabs[0] = first_member_slab; + + required_fields_bitmap_len = (desc->n_fields + 7) / 8; + if (required_fields_bitmap_len > sizeof(required_fields_bitmap_stack)) { + required_fields_bitmap = do_alloc(allocator, required_fields_bitmap_len); + if (!required_fields_bitmap) { + do_free(allocator, rv); + return (NULL); + } + required_fields_bitmap_alloced = TRUE; + } + memset(required_fields_bitmap, 0, required_fields_bitmap_len); + + /* + * Generated code always defines "message_init". However, we provide a + * fallback for (1) users of old protobuf-c generated-code that do not + * provide the function, and (2) descriptors constructed from some other + * source (most likely, direct construction from the .proto file). 
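+   *
+   * (Illustrative aside, not upstream text: callers normally just do
+   *
+   *     SomeMsg *m = (SomeMsg *)protobuf_c_message_unpack(
+   *         &some_msg__descriptor, NULL, len, data);
+   *     ...
+   *     protobuf_c_message_free_unpacked((ProtobufCMessage *)m, NULL);
+   *
+   * where SomeMsg and some_msg__descriptor stand in for any generated
+   * type and its descriptor.)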
+ */ + if (desc->message_init != NULL) + protobuf_c_message_init(desc, rv); + else + message_init_generic(desc, rv); + + while (rem > 0) { + uint32_t tag; + ProtobufCWireType wire_type; + size_t used = parse_tag_and_wiretype(rem, at, &tag, &wire_type); + const ProtobufCFieldDescriptor *field; + ScannedMember tmp; + + if (used == 0) { + PROTOBUF_C_UNPACK_ERROR("error parsing tag/wiretype at offset %u", + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + /* + * \todo Consider optimizing for field[1].id == tag, if field[1] + * exists! + */ + if (last_field == NULL || last_field->id != tag) { + /* lookup field */ + int field_index = + int_range_lookup(desc->n_field_ranges, desc->field_ranges, tag); + if (field_index < 0) { + field = NULL; + n_unknown++; + } else { + field = desc->fields + field_index; + last_field = field; + last_field_index = field_index; + } + } else { + field = last_field; + } + + if (field != NULL && field->label == PROTOBUF_C_LABEL_REQUIRED) + REQUIRED_FIELD_BITMAP_SET(last_field_index); + + at += used; + rem -= used; + tmp.tag = tag; + tmp.wire_type = wire_type; + tmp.field = field; + tmp.data = at; + tmp.length_prefix_len = 0; + + switch (wire_type) { + case PROTOBUF_C_WIRE_TYPE_VARINT: { + unsigned max_len = rem < 10 ? rem : 10; + unsigned i; + + for (i = 0; i < max_len; i++) + if ((at[i] & 0x80) == 0) break; + if (i == max_len) { + PROTOBUF_C_UNPACK_ERROR("unterminated varint at offset %u", + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + tmp.len = i + 1; + break; + } + case PROTOBUF_C_WIRE_TYPE_64BIT: + if (rem < 8) { + PROTOBUF_C_UNPACK_ERROR("too short after 64bit wiretype at offset %u", + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + tmp.len = 8; + break; + case PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED: { + size_t pref_len; + + tmp.len = scan_length_prefixed_data(rem, at, &pref_len); + if (tmp.len == 0) { + /* NOTE: scan_length_prefixed_data calls UNPACK_ERROR */ + goto error_cleanup_during_scan; + } + tmp.length_prefix_len = pref_len; + break; + } + case PROTOBUF_C_WIRE_TYPE_32BIT: + if (rem < 4) { + PROTOBUF_C_UNPACK_ERROR("too short after 32bit wiretype at offset %u", + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + tmp.len = 4; + break; + default: + PROTOBUF_C_UNPACK_ERROR("unsupported tag %u at offset %u", wire_type, + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + + if (in_slab_index == + (1UL << (which_slab + FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2))) { + size_t size; + + in_slab_index = 0; + if (which_slab == MAX_SCANNED_MEMBER_SLAB) { + PROTOBUF_C_UNPACK_ERROR("too many fields"); + goto error_cleanup_during_scan; + } + which_slab++; + size = sizeof(ScannedMember) + << (which_slab + FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2); + scanned_member_slabs[which_slab] = do_alloc(allocator, size); + if (scanned_member_slabs[which_slab] == NULL) + goto error_cleanup_during_scan; + } + scanned_member_slabs[which_slab][in_slab_index++] = tmp; + + if (field != NULL && field->label == PROTOBUF_C_LABEL_REPEATED) { + size_t *n = STRUCT_MEMBER_PTR(size_t, rv, field->quantifier_offset); + if (wire_type == PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED && + (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED) || + is_packable_type(field->type))) { + size_t count; + if (!count_packed_elements(field->type, tmp.len - tmp.length_prefix_len, + tmp.data + tmp.length_prefix_len, &count)) { + PROTOBUF_C_UNPACK_ERROR("counting packed elements"); + goto error_cleanup_during_scan; + } + *n += count; + } else { + *n += 1; + } + } 
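+    /* Advance past this member's payload: the scan pass only records where
+       each field lives; values are decoded in the "do real parsing" pass
+       below. */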
+ + at += tmp.len; + rem -= tmp.len; + } + + /* allocate space for repeated fields, also check that all required fields + * have been set */ + for (f = 0; f < desc->n_fields; f++) { + const ProtobufCFieldDescriptor *field = desc->fields + f; + if (field->label == PROTOBUF_C_LABEL_REPEATED) { + size_t siz = sizeof_elt_in_repeated_array(field->type); + size_t *n_ptr = STRUCT_MEMBER_PTR(size_t, rv, field->quantifier_offset); + if (*n_ptr != 0) { + unsigned n = *n_ptr; + void *a; + *n_ptr = 0; + assert(rv->descriptor != NULL); +#define CLEAR_REMAINING_N_PTRS() \ + for (f++; f < desc->n_fields; f++) { \ + field = desc->fields + f; \ + if (field->label == PROTOBUF_C_LABEL_REPEATED) \ + STRUCT_MEMBER(size_t, rv, field->quantifier_offset) = 0; \ + } + a = do_alloc(allocator, siz * n); + if (!a) { + CLEAR_REMAINING_N_PTRS(); + goto error_cleanup; + } + STRUCT_MEMBER(void *, rv, field->offset) = a; + } + } else if (field->label == PROTOBUF_C_LABEL_REQUIRED) { + if (field->default_value == NULL && !REQUIRED_FIELD_BITMAP_IS_SET(f)) { + CLEAR_REMAINING_N_PTRS(); + PROTOBUF_C_UNPACK_ERROR("message '%s': missing required field '%s'", + desc->name, field->name); + goto error_cleanup; + } + } + } +#undef CLEAR_REMAINING_N_PTRS + + /* allocate space for unknown fields */ + if (n_unknown) { + rv->unknown_fields = + do_alloc(allocator, n_unknown * sizeof(ProtobufCMessageUnknownField)); + if (rv->unknown_fields == NULL) goto error_cleanup; + } + + /* do real parsing */ + for (i_slab = 0; i_slab <= which_slab; i_slab++) { + unsigned max = + (i_slab == which_slab) ? in_slab_index : (1UL << (i_slab + 4)); + ScannedMember *slab = scanned_member_slabs[i_slab]; + + for (j = 0; j < max; j++) { + if (!parse_member(slab + j, rv, allocator)) { + PROTOBUF_C_UNPACK_ERROR( + "error parsing member %s of %s", + slab->field ? 
slab->field->name : "*unknown-field*", desc->name); + goto error_cleanup; + } + } + } + + /* cleanup */ + for (j = 1; j <= which_slab; j++) do_free(allocator, scanned_member_slabs[j]); + if (required_fields_bitmap_alloced) + do_free(allocator, required_fields_bitmap); + return rv; + +error_cleanup: + protobuf_c_message_free_unpacked(rv, allocator); + for (j = 1; j <= which_slab; j++) do_free(allocator, scanned_member_slabs[j]); + if (required_fields_bitmap_alloced) + do_free(allocator, required_fields_bitmap); + return NULL; + +error_cleanup_during_scan: + do_free(allocator, rv); + for (j = 1; j <= which_slab; j++) do_free(allocator, scanned_member_slabs[j]); + if (required_fields_bitmap_alloced) + do_free(allocator, required_fields_bitmap); + return NULL; +} + +void protobuf_c_message_free_unpacked(ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + const ProtobufCMessageDescriptor *desc; + unsigned f; + + if (message == NULL) return; + + desc = message->descriptor; + + ASSERT_IS_MESSAGE(message); + + if (allocator == NULL) allocator = &protobuf_c__allocator; + message->descriptor = NULL; + for (f = 0; f < desc->n_fields; f++) { + if (0 != (desc->fields[f].flags & PROTOBUF_C_FIELD_FLAG_ONEOF) && + desc->fields[f].id != + STRUCT_MEMBER(uint32_t, message, + desc->fields[f].quantifier_offset)) { + /* This is not the selected oneof, skip it */ + continue; + } + + if (desc->fields[f].label == PROTOBUF_C_LABEL_REPEATED) { + size_t n = + STRUCT_MEMBER(size_t, message, desc->fields[f].quantifier_offset); + void *arr = STRUCT_MEMBER(void *, message, desc->fields[f].offset); + + if (arr != NULL) { + if (desc->fields[f].type == PROTOBUF_C_TYPE_STRING) { + unsigned i; + for (i = 0; i < n; i++) do_free(allocator, ((char **)arr)[i]); + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_BYTES) { + unsigned i; + for (i = 0; i < n; i++) + do_free(allocator, ((ProtobufCBinaryData *)arr)[i].data); + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_MESSAGE) { + unsigned i; + for (i = 0; i < n; i++) + protobuf_c_message_free_unpacked(((ProtobufCMessage **)arr)[i], + allocator); + } + do_free(allocator, arr); + } + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_STRING) { + char *str = STRUCT_MEMBER(char *, message, desc->fields[f].offset); + + if (str && str != desc->fields[f].default_value) do_free(allocator, str); + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_BYTES) { + void *data = + STRUCT_MEMBER(ProtobufCBinaryData, message, desc->fields[f].offset) + .data; + const ProtobufCBinaryData *default_bd; + + default_bd = desc->fields[f].default_value; + if (data != NULL && (default_bd == NULL || default_bd->data != data)) { + do_free(allocator, data); + } + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_MESSAGE) { + ProtobufCMessage *sm; + + sm = STRUCT_MEMBER(ProtobufCMessage *, message, desc->fields[f].offset); + if (sm && sm != desc->fields[f].default_value) + protobuf_c_message_free_unpacked(sm, allocator); + } + } + + for (f = 0; f < message->n_unknown_fields; f++) + do_free(allocator, message->unknown_fields[f].data); + if (message->unknown_fields != NULL) + do_free(allocator, message->unknown_fields); + + do_free(allocator, message); +} + +void protobuf_c_message_init(const ProtobufCMessageDescriptor *descriptor, + void *message) { + descriptor->message_init((ProtobufCMessage *)(message)); +} + +protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *message) { + unsigned i; + + if (!message || !message->descriptor || + message->descriptor->magic != 
+
+protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *message) {
+  unsigned i;
+
+  if (!message || !message->descriptor ||
+      message->descriptor->magic != PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC) {
+    return FALSE;
+  }
+
+  for (i = 0; i < message->descriptor->n_fields; i++) {
+    const ProtobufCFieldDescriptor *f = message->descriptor->fields + i;
+    ProtobufCType type = f->type;
+    ProtobufCLabel label = f->label;
+    void *field = STRUCT_MEMBER_P(message, f->offset);
+
+    if (label == PROTOBUF_C_LABEL_REPEATED) {
+      size_t *quantity = STRUCT_MEMBER_P(message, f->quantifier_offset);
+
+      if (*quantity > 0 && *(void **)field == NULL) {
+        return FALSE;
+      }
+
+      if (type == PROTOBUF_C_TYPE_MESSAGE) {
+        ProtobufCMessage **submessage = *(ProtobufCMessage ***)field;
+        unsigned j;
+        for (j = 0; j < *quantity; j++) {
+          if (!protobuf_c_message_check(submessage[j])) return FALSE;
+        }
+      } else if (type == PROTOBUF_C_TYPE_STRING) {
+        char **string = *(char ***)field;
+        unsigned j;
+        for (j = 0; j < *quantity; j++) {
+          if (!string[j]) return FALSE;
+        }
+      } else if (type == PROTOBUF_C_TYPE_BYTES) {
+        ProtobufCBinaryData *bd = *(ProtobufCBinaryData **)field;
+        unsigned j;
+        for (j = 0; j < *quantity; j++) {
+          if (bd[j].len > 0 && bd[j].data == NULL) return FALSE;
+        }
+      }
+
+    } else { /* PROTOBUF_C_LABEL_REQUIRED or PROTOBUF_C_LABEL_OPTIONAL */
+
+      if (type == PROTOBUF_C_TYPE_MESSAGE) {
+        ProtobufCMessage *submessage = *(ProtobufCMessage **)field;
+        if (label == PROTOBUF_C_LABEL_REQUIRED || submessage != NULL) {
+          if (!protobuf_c_message_check(submessage)) return FALSE;
+        }
+      } else if (type == PROTOBUF_C_TYPE_STRING) {
+        char *string = *(char **)field;
+        if (label == PROTOBUF_C_LABEL_REQUIRED && string == NULL) return FALSE;
+      } else if (type == PROTOBUF_C_TYPE_BYTES) {
+        protobuf_c_boolean *has =
+            STRUCT_MEMBER_P(message, f->quantifier_offset);
+        ProtobufCBinaryData *bd = field;
+        if (label == PROTOBUF_C_LABEL_REQUIRED || *has == TRUE) {
+          if (bd->len > 0 && bd->data == NULL) return FALSE;
+        }
+      }
+    }
+  }
+
+  return TRUE;
+}
+
+/* === services === */
+
+typedef void (*GenericHandler)(void *service, const ProtobufCMessage *input,
+                               ProtobufCClosure closure, void *closure_data);
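Since protobuf_c_message_check() walks the whole message tree, it is typically used as a guard just before serialization. A hedged sketch (note: `protobuf_c_message_pack()` is the packing entry point described in the header's documentation; this trimmed copy of the header may not declare it, in which case the generated per-message `__pack()` function plays the same role):

~~~{.cpp}
#include "protobuf-c.h"

// Sketch: refuse to serialize a structurally invalid message. The caller must
// have sized `out` with protobuf_c_message_get_packed_size() beforehand.
size_t checked_pack(const ProtobufCMessage *msg, uint8_t *out) {
  if (!protobuf_c_message_check(msg))
    return 0;  // a required field or submessage is missing somewhere
  return protobuf_c_message_pack(msg, out);  // returns number of bytes written
}
~~~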
diff --git a/tools/quantification/src/protobuf-c.h b/tools/quantification/src/protobuf-c.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd85695b868af6c7b91590196339bc4f7826a256
--- /dev/null
+++ b/tools/quantification/src/protobuf-c.h
@@ -0,0 +1,921 @@
+/*
+ * Copyright (c) 2008-2017, Dave Benson and the protobuf-c authors.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*! \file
+ * \mainpage Introduction
+ *
+ * This is [protobuf-c], a C implementation of [Protocol Buffers].
+ *
+ * This file defines the public API for the `libprotobuf-c` support library.
+ * This API includes interfaces that can be used directly by client code as well
+ * as the interfaces used by the code generated by the `protoc-c` compiler.
+ *
+ * The `libprotobuf-c` support library performs the actual serialization and
+ * deserialization of Protocol Buffers messages. It interacts with structures,
+ * definitions, and metadata generated by the `protoc-c` compiler from .proto
+ * files.
+ *
+ * \authors Dave Benson and the `protobuf-c` authors.
+ *
+ * \copyright 2008-2014. Licensed under the terms of the [BSD-2-Clause] license.
+ *
+ * [protobuf-c]:       https://github.com/protobuf-c/protobuf-c
+ * [Protocol Buffers]: https://developers.google.com/protocol-buffers/
+ * [BSD-2-Clause]:     http://opensource.org/licenses/BSD-2-Clause
+ *
+ * \page gencode Generated Code
+ *
+ * For each enum, we generate a C enum. For each message, we generate a C
+ * structure which can be cast to a `ProtobufCMessage`.
+ *
+ * For each enum and message, we generate a descriptor object that allows us to
+ * implement a kind of reflection on the structures.
+ *
+ * First, some naming conventions:
+ *
+ * - The name of the type for enums and messages and services is camel case
+ *   (meaning WordsAreCrammedTogether) except that double underscores are used
+ *   to delimit scopes. For example, the following `.proto` file:
+ *
+~~~{.proto}
+        package foo.bar;
+        message BazBah {
+                optional int32 val = 1;
+        }
+~~~
+ *
+ * would generate a C type `Foo__Bar__BazBah`.
+ *
+ * - Identifiers for functions and globals are all lowercase, with camel case
+ *   words separated by single underscores. For example, one of the function
+ *   prototypes generated by `protoc-c` for the above example:
+ *
+~~~{.c}
+Foo__Bar__BazBah *
+    foo__bar__baz_bah__unpack
+        (ProtobufCAllocator  *allocator,
+         size_t               len,
+         const uint8_t       *data);
+~~~
+ *
+ * - Identifiers for enum values contain an uppercase prefix which embeds the
+ *   package name and the enum type name.
+ *
+ * - A double underscore is used to separate further components of identifier
+ *   names.
+ *
+ * For example, in the name of the unpack function above, the package name
+ * `foo.bar` has become `foo__bar`, the message name BazBah has become
+ * `baz_bah`, and the method name is `unpack`. These are all joined with double
+ * underscores to form the C identifier `foo__bar__baz_bah__unpack`.
+ *
+ * We also generate descriptor objects for messages and enums. These are
+ * declared in the `.pb-c.h` files:
+ *
+~~~{.c}
+extern const ProtobufCMessageDescriptor foo__bar__baz_bah__descriptor;
+~~~
+ *
+ * The message structures all begin with `ProtobufCMessageDescriptor *` which is
+ * sufficient to allow them to be cast to `ProtobufCMessage`.
+ *
+ * For each message defined in a `.proto` file, we generate a number of
+ * functions and macros. Each function name contains a prefix based on the
+ * package name and message name in order to make it a unique C identifier.
+ *
+ * - `INIT`. Statically initializes a message object, initializing its
+ *   descriptor and setting its fields to default values. Uninitialized
+ *   messages cannot be processed by the protobuf-c library.
+ *
+~~~{.c}
+#define FOO__BAR__BAZ_BAH__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&foo__bar__baz_bah__descriptor), 0 }
+~~~
+ * - `init()`. Initializes a message object, initializing its descriptor and
+ *   setting its fields to default values. Uninitialized messages cannot be
+ *   processed by the protobuf-c library.
+ *
+~~~{.c}
+void foo__bar__baz_bah__init
+        (Foo__Bar__BazBah *message);
+~~~
+ * - `unpack()`. Unpacks data for a particular message format. Note that the
+ *   `allocator` parameter is usually `NULL` to indicate that the system's
+ *   `malloc()` and `free()` functions should be used for dynamically allocating
+ *   memory.
+ *
+~~~{.c}
+Foo__Bar__BazBah *
+    foo__bar__baz_bah__unpack
+        (ProtobufCAllocator  *allocator,
+         size_t               len,
+         const uint8_t       *data);
+~~~
+ *
+ * - `free_unpacked()`. Frees a message object obtained with the `unpack()`
+ *   method. Freeing `NULL` is allowed (the same as with `free()`).
+ *
+~~~{.c}
+void foo__bar__baz_bah__free_unpacked
+        (Foo__Bar__BazBah *message,
+         ProtobufCAllocator *allocator);
+~~~
+ *
+ * - `get_packed_size()`. Calculates the length in bytes of the serialized
+ *   representation of the message object.
+ *
+~~~{.c}
+size_t foo__bar__baz_bah__get_packed_size
+        (const Foo__Bar__BazBah *message);
+~~~
+ *
+ * - `pack()`. Pack a message object into a preallocated buffer. Assumes that
+ *   the buffer is large enough. (Use `get_packed_size()` first.)
+ *
+~~~{.c}
+size_t foo__bar__baz_bah__pack
+        (const Foo__Bar__BazBah *message,
+         uint8_t *out);
+~~~
+ *
+ * - `pack_to_buffer()`. Packs a message into a "virtual buffer". This is an
+ *   object which defines an "append bytes" callback to consume data as it is
+ *   serialized.
+ *
+~~~{.c}
+size_t foo__bar__baz_bah__pack_to_buffer
+        (const Foo__Bar__BazBah *message,
+         ProtobufCBuffer *buffer);
+~~~
+ *
+ * \page pack Packing and unpacking messages
+ *
+ * To pack a message, first compute the packed size of the message with
+ * protobuf_c_message_get_packed_size(), then allocate a buffer of at least
+ * that size, then call protobuf_c_message_pack().
+ *
+ * Alternatively, a message can be serialized without calculating the final size
+ * first. Use the protobuf_c_message_pack_to_buffer() function and provide a
+ * ProtobufCBuffer object which implements an "append" method that consumes
+ * data.
+ *
+ * To unpack a message, call the protobuf_c_message_unpack() function. The
+ * result can be cast to an object of the type that matches the descriptor for
+ * the message.
+ *
+ * The result of unpacking a message should be freed with
+ * protobuf_c_message_free_unpacked().
+ */
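The unpack/free cycle described above is the pattern the quantification tool itself depends on. A short sketch using the hypothetical `Foo__Bar__BazBah` type from the naming examples (the generated header name is also assumed):

~~~{.cpp}
#include <cstdint>
#include "foo.bar.pb-c.h"  // hypothetical protoc-c output for package foo.bar

// Sketch of the documented unpack -> use -> free_unpacked lifecycle.
void handle_wire_message(const uint8_t *data, size_t len) {
  // NULL allocator selects the system's malloc()/free(), per the docs above.
  Foo__Bar__BazBah *msg = foo__bar__baz_bah__unpack(NULL, len, data);
  if (msg == NULL) return;  // malformed or truncated input
  // ... read msg->val here ...
  foo__bar__baz_bah__free_unpacked(msg, NULL);  // must pair with unpack()
}
~~~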
+
+#ifndef PROTOBUF_C_H
+#define PROTOBUF_C_H
+
+#include <assert.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+#define PROTOBUF_C__BEGIN_DECLS extern "C" {
+#define PROTOBUF_C__END_DECLS }
+#else
+#define PROTOBUF_C__BEGIN_DECLS
+#define PROTOBUF_C__END_DECLS
+#endif
+
+PROTOBUF_C__BEGIN_DECLS
+
+#if defined(_WIN32) && defined(PROTOBUF_C_USE_SHARED_LIB)
+#ifdef PROTOBUF_C_EXPORT
+#define PROTOBUF_C__API __declspec(dllexport)
+#else
+#define PROTOBUF_C__API __declspec(dllimport)
+#endif
+#else
+#define PROTOBUF_C__API
+#endif
+
+#if !defined(PROTOBUF_C__NO_DEPRECATED) && \
+    ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+#define PROTOBUF_C__DEPRECATED __attribute__((__deprecated__))
+#else
+#define PROTOBUF_C__DEPRECATED
+#endif
+
+#ifndef PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE
+#define PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(enum_name) \
+  , _##enum_name##_IS_INT_SIZE = INT_MAX
+#endif
+
+#define PROTOBUF_C__SERVICE_DESCRIPTOR_MAGIC 0x14159bc3
+#define PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC 0x28aaeef9
+#define PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC 0x114315af
+
+/* Empty string used for initializers */
+extern const char protobuf_c_empty_string[];
+
+/**
+ * \defgroup api Public API
+ *
+ * This is the public API for `libprotobuf-c`. These interfaces are stable and
+ * subject to Semantic Versioning guarantees.
+ *
+ * @{
+ */
+
+/**
+ * Values for the `flags` word in `ProtobufCFieldDescriptor`.
+ */
+typedef enum {
+  /** Set if the field is repeated and marked with the `packed` option. */
+  PROTOBUF_C_FIELD_FLAG_PACKED = (1 << 0),
+
+  /** Set if the field is marked with the `deprecated` option. */
+  PROTOBUF_C_FIELD_FLAG_DEPRECATED = (1 << 1),
+
+  /** Set if the field is a member of a oneof (union). */
+  PROTOBUF_C_FIELD_FLAG_ONEOF = (1 << 2),
+} ProtobufCFieldFlag;
+
+/**
+ * Message field rules.
+ *
+ * \see [Defining A Message Type] in the Protocol Buffers documentation.
+ *
+ * [Defining A Message Type]:
+ *      https://developers.google.com/protocol-buffers/docs/proto#simple
+ */
+typedef enum {
+  /** A well-formed message must have exactly one of this field. */
+  PROTOBUF_C_LABEL_REQUIRED,
+
+  /**
+   * A well-formed message can have zero or one of this field (but not
+   * more than one).
+   */
+  PROTOBUF_C_LABEL_OPTIONAL,
+
+  /**
+   * This field can be repeated any number of times (including zero) in a
+   * well-formed message. The order of the repeated values will be
+   * preserved.
+   */
+  PROTOBUF_C_LABEL_REPEATED,
+
+  /**
+   * This field has no label. This is valid only in proto3 and is
+   * equivalent to OPTIONAL but no "has" quantifier will be consulted.
+   */
+  PROTOBUF_C_LABEL_NONE,
+} ProtobufCLabel;
+
+/**
+ * Field value types.
+ *
+ * \see [Scalar Value Types] in the Protocol Buffers documentation.
+ *
+ * [Scalar Value Types]:
+ *      https://developers.google.com/protocol-buffers/docs/proto#scalar
+ */
+typedef enum {
+  PROTOBUF_C_TYPE_INT32,    /**< int32 */
+  PROTOBUF_C_TYPE_SINT32,   /**< signed int32 */
+  PROTOBUF_C_TYPE_SFIXED32, /**< signed int32 (4 bytes) */
+  PROTOBUF_C_TYPE_INT64,    /**< int64 */
+  PROTOBUF_C_TYPE_SINT64,   /**< signed int64 */
+  PROTOBUF_C_TYPE_SFIXED64, /**< signed int64 (8 bytes) */
+  PROTOBUF_C_TYPE_UINT32,   /**< unsigned int32 */
+  PROTOBUF_C_TYPE_FIXED32,  /**< unsigned int32 (4 bytes) */
+  PROTOBUF_C_TYPE_UINT64,   /**< unsigned int64 */
+  PROTOBUF_C_TYPE_FIXED64,  /**< unsigned int64 (8 bytes) */
+  PROTOBUF_C_TYPE_FLOAT,    /**< float */
+  PROTOBUF_C_TYPE_DOUBLE,   /**< double */
+  PROTOBUF_C_TYPE_BOOL,     /**< boolean */
+  PROTOBUF_C_TYPE_ENUM,     /**< enumerated type */
+  PROTOBUF_C_TYPE_STRING,   /**< UTF-8 or ASCII string */
+  PROTOBUF_C_TYPE_BYTES,    /**< arbitrary byte sequence */
+  PROTOBUF_C_TYPE_MESSAGE,  /**< nested message */
+} ProtobufCType;
+
+/**
+ * Field wire types.
+ *
+ * \see [Message Structure] in the Protocol Buffers documentation.
+ *
+ * [Message Structure]:
+ *      https://developers.google.com/protocol-buffers/docs/encoding#structure
+ */
+typedef enum {
+  PROTOBUF_C_WIRE_TYPE_VARINT = 0,
+  PROTOBUF_C_WIRE_TYPE_64BIT = 1,
+  PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED = 2,
+  /* "Start group" and "end group" wire types are unsupported. */
+  PROTOBUF_C_WIRE_TYPE_32BIT = 5,
+} ProtobufCWireType;
+
+struct ProtobufCAllocator;
+struct ProtobufCBinaryData;
+struct ProtobufCBuffer;
+struct ProtobufCBufferSimple;
+struct ProtobufCEnumDescriptor;
+struct ProtobufCEnumValue;
+struct ProtobufCEnumValueIndex;
+struct ProtobufCFieldDescriptor;
+struct ProtobufCIntRange;
+struct ProtobufCMessage;
+struct ProtobufCMessageDescriptor;
+struct ProtobufCMessageUnknownField;
+struct ProtobufCMethodDescriptor;
+struct ProtobufCService;
+struct ProtobufCServiceDescriptor;
+
+typedef struct ProtobufCAllocator ProtobufCAllocator;
+typedef struct ProtobufCBinaryData ProtobufCBinaryData;
+typedef struct ProtobufCBuffer ProtobufCBuffer;
+typedef struct ProtobufCBufferSimple ProtobufCBufferSimple;
+typedef struct ProtobufCEnumDescriptor ProtobufCEnumDescriptor;
+typedef struct ProtobufCEnumValue ProtobufCEnumValue;
+typedef struct ProtobufCEnumValueIndex ProtobufCEnumValueIndex;
+typedef struct ProtobufCFieldDescriptor ProtobufCFieldDescriptor;
+typedef struct ProtobufCIntRange ProtobufCIntRange;
+typedef struct ProtobufCMessage ProtobufCMessage;
+typedef struct ProtobufCMessageDescriptor ProtobufCMessageDescriptor;
+typedef struct ProtobufCMessageUnknownField ProtobufCMessageUnknownField;
+typedef struct ProtobufCMethodDescriptor ProtobufCMethodDescriptor;
+typedef struct ProtobufCService ProtobufCService;
+typedef struct ProtobufCServiceDescriptor ProtobufCServiceDescriptor;
+
+/** Boolean type. */
+typedef int protobuf_c_boolean;
+
+typedef void (*ProtobufCClosure)(const ProtobufCMessage *, void *closure_data);
+typedef void (*ProtobufCMessageInit)(ProtobufCMessage *);
+typedef void (*ProtobufCServiceDestroy)(ProtobufCService *);
+
+/**
+ * Structure for defining a custom memory allocator.
+ */
+struct ProtobufCAllocator {
+  /** Function to allocate memory. */
+  void *(*alloc)(void *allocator_data, size_t size);
+
+  /** Function to free memory. */
+  void (*free)(void *allocator_data, void *pointer);
+
+  /** Opaque pointer passed to `alloc` and `free` functions. */
+  void *allocator_data;
+};
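The allocator struct is the only customization point the unpack path consults. A hedged sketch of plugging one in (the tracking state is hypothetical, not part of the library):

~~~{.cpp}
#include <cstdlib>
#include "protobuf-c.h"

// Sketch: a counting allocator that tallies live allocations made during
// unpack; `allocator_data` round-trips through both callbacks.
struct TrackingState { size_t live = 0; };

static void *tracking_alloc(void *allocator_data, size_t size) {
  static_cast<TrackingState *>(allocator_data)->live++;
  return std::malloc(size);
}

static void tracking_free(void *allocator_data, void *pointer) {
  static_cast<TrackingState *>(allocator_data)->live--;
  std::free(pointer);
}

static TrackingState g_state;
static ProtobufCAllocator tracking_allocator = {tracking_alloc, tracking_free,
                                                &g_state};
// Pass &tracking_allocator to protobuf_c_message_unpack() and to the matching
// protobuf_c_message_free_unpacked(); the two calls must use the same allocator.
~~~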
+
+/**
+ * Structure for the protobuf `bytes` scalar type.
+ *
+ * The data contained in a `ProtobufCBinaryData` is an arbitrary sequence of
+ * bytes. It may contain embedded `NUL` characters and is not required to be
+ * `NUL`-terminated.
+ */
+struct ProtobufCBinaryData {
+  size_t len;    /**< Number of bytes in the `data` field. */
+  uint8_t *data; /**< Data bytes. */
+};
+
+/**
+ * Structure for defining a virtual append-only buffer. Used by
+ * protobuf_c_message_pack_to_buffer() to abstract the consumption of serialized
+ * bytes.
+ *
+ * `ProtobufCBuffer` "subclasses" may be defined on the stack. For example, to
+ * write to a `FILE` object:
+ *
+~~~{.c}
+typedef struct {
+        ProtobufCBuffer base;
+        FILE *fp;
+} BufferAppendToFile;
+
+static void
+my_buffer_file_append(ProtobufCBuffer *buffer,
+                      size_t len,
+                      const uint8_t *data)
+{
+        BufferAppendToFile *file_buf = (BufferAppendToFile *) buffer;
+        fwrite(data, len, 1, file_buf->fp); // XXX: No error handling!
+}
+~~~
+ *
+ * To use this new type of ProtobufCBuffer, it could be called as follows:
+ *
+~~~{.c}
+...
+BufferAppendToFile tmp = {0};
+tmp.base.append = my_buffer_file_append;
+tmp.fp = fp;
+protobuf_c_message_pack_to_buffer(&message, &tmp);
+...
+~~~
+ */
+struct ProtobufCBuffer {
+  /** Append function. Consumes the `len` bytes stored at `data`. */
+  void (*append)(ProtobufCBuffer *buffer, size_t len, const uint8_t *data);
+};
+
+/**
+ * Simple buffer "subclass" of `ProtobufCBuffer`.
+ *
+ * A `ProtobufCBufferSimple` object is declared on the stack and uses a
+ * scratch buffer provided by the user for the initial allocation. It performs
+ * exponential resizing, using dynamically allocated memory. A
+ * `ProtobufCBufferSimple` object can be created and used as follows:
+ *
+~~~{.c}
+uint8_t pad[128];
+ProtobufCBufferSimple simple = PROTOBUF_C_BUFFER_SIMPLE_INIT(pad);
+ProtobufCBuffer *buffer = (ProtobufCBuffer *) &simple;
+~~~
+ *
+ * `buffer` can now be used with `protobuf_c_message_pack_to_buffer()`. Once a
+ * message has been serialized to a `ProtobufCBufferSimple` object, the
+ * serialized data bytes can be accessed from the `.data` field.
+ *
+ * To free the memory allocated by a `ProtobufCBufferSimple` object, if any,
+ * call PROTOBUF_C_BUFFER_SIMPLE_CLEAR() on the object, for example:
+ *
+~~~{.c}
+PROTOBUF_C_BUFFER_SIMPLE_CLEAR(&simple);
+~~~
+ *
+ * \see PROTOBUF_C_BUFFER_SIMPLE_INIT
+ * \see PROTOBUF_C_BUFFER_SIMPLE_CLEAR
+ */
+struct ProtobufCBufferSimple {
+  /** "Base class". */
+  ProtobufCBuffer base;
+  /** Number of bytes allocated in `data`. */
+  size_t alloced;
+  /** Number of bytes currently stored in `data`. */
+  size_t len;
+  /** Data bytes. */
+  uint8_t *data;
+  /** Whether `data` must be freed. */
+  protobuf_c_boolean must_free_data;
+  /** Allocator to use. May be NULL to indicate the system allocator. */
+  ProtobufCAllocator *allocator;
+};
+
+/**
+ * Describes an enumeration as a whole, with all of its values.
+ */
+struct ProtobufCEnumDescriptor {
+  /** Magic value checked to ensure that the API is used correctly. */
+  uint32_t magic;
+
+  /** The qualified name (e.g., "namespace.Type"). */
+  const char *name;
+  /** The unqualified name as given in the .proto file (e.g., "Type"). */
+  const char *short_name;
+  /** Identifier used in generated C code. */
+  const char *c_name;
+  /** The dot-separated namespace. */
+  const char *package_name;
+
+  /** Number of elements in `values`. */
+  unsigned n_values;
+  /** Array of distinct values, sorted by numeric value. */
+  const ProtobufCEnumValue *values;
+
+  /** Number of elements in `values_by_name`. */
+  unsigned n_value_names;
+  /** Array of named values, including aliases, sorted by name. */
+  const ProtobufCEnumValueIndex *values_by_name;
+
+  /** Number of elements in `value_ranges`. */
+  unsigned n_value_ranges;
+  /** Value ranges, for faster lookups by numeric value. */
+  const ProtobufCIntRange *value_ranges;
+
+  /** Reserved for future use. */
+  void *reserved1;
+  /** Reserved for future use. */
+  void *reserved2;
+  /** Reserved for future use. */
+  void *reserved3;
+  /** Reserved for future use. */
+  void *reserved4;
+};
+
+/**
+ * Represents a single value of an enumeration.
+ */
+struct ProtobufCEnumValue {
+  /** The string identifying this value in the .proto file. */
+  const char *name;
+
+  /** The string identifying this value in generated C code. */
+  const char *c_name;
+
+  /** The numeric value assigned in the .proto file. */
+  int value;
+};
+
+/**
+ * Used by `ProtobufCEnumDescriptor` to look up enum values.
+ */
+struct ProtobufCEnumValueIndex {
+  /** Name of the enum value. */
+  const char *name;
+  /** Index into values[] array. */
+  unsigned index;
+};
+
+/**
+ * Describes a single field in a message.
+ */
+struct ProtobufCFieldDescriptor {
+  /** Name of the field as given in the .proto file. */
+  const char *name;
+
+  /** Tag value of the field as given in the .proto file. */
+  uint32_t id;
+
+  /** Whether the field is `REQUIRED`, `OPTIONAL`, or `REPEATED`. */
+  ProtobufCLabel label;
+
+  /** The type of the field. */
+  ProtobufCType type;
+
+  /**
+   * The offset in bytes of the message's C structure's quantifier field
+   * (the `has_MEMBER` field for optional members or the `n_MEMBER` field
+   * for repeated members or the case enum for oneofs).
+   */
+  unsigned quantifier_offset;
+
+  /**
+   * The offset in bytes into the message's C structure for the member
+   * itself.
+   */
+  unsigned offset;
+
+  /**
+   * A type-specific descriptor.
+   *
+   * If `type` is `PROTOBUF_C_TYPE_ENUM`, then `descriptor` points to the
+   * corresponding `ProtobufCEnumDescriptor`.
+   *
+   * If `type` is `PROTOBUF_C_TYPE_MESSAGE`, then `descriptor` points to
+   * the corresponding `ProtobufCMessageDescriptor`.
+   *
+   * Otherwise this field is NULL.
+   */
+  const void *descriptor; /* for MESSAGE and ENUM types */
+
+  /** The default value for this field, if defined. May be NULL. */
+  const void *default_value;
+
+  /**
+   * A flag word. Zero or more of the bits defined in the
+   * `ProtobufCFieldFlag` enum may be set.
+   */
+  uint32_t flags;
+
+  /** Reserved for future use. */
+  unsigned reserved_flags;
+  /** Reserved for future use. */
+  void *reserved2;
+  /** Reserved for future use. */
+  void *reserved3;
+};
+
+/**
+ * Helper structure for optimizing int => index lookups in the case
+ * where the keys are mostly consecutive values, as they presumably are for
+ * enums and fields.
+ *
+ * The data structure requires that the values in the original array are
+ * sorted.
+ */
+struct ProtobufCIntRange {
+  int start_value;
+  unsigned orig_index;
+  /*
+   * NOTE: the number of values in the range can be inferred by looking
+   * at the next element's orig_index. A dummy element is added to make
+   * this simple.
+   */
+};
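The `quantifier_offset`/`offset` pair documented in the field descriptor above is all the reflection machinery there is: the library's internal `STRUCT_MEMBER()` macros (seen in the .c file earlier) simply add these offsets to the message pointer. A hedged, illustration-only sketch of the same idea:

~~~{.cpp}
#include <cstddef>
#include "protobuf-c.h"

// Sketch: read the n_MEMBER count of a REPEATED field generically. `fd` must
// be a REPEATED field descriptor belonging to msg's own message descriptor.
static size_t repeated_count(const ProtobufCMessage *msg,
                             const ProtobufCFieldDescriptor *fd) {
  return *reinterpret_cast<const size_t *>(
      reinterpret_cast<const char *>(msg) + fd->quantifier_offset);
}
~~~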
+ * If the object is allocated from the stack, you can't really have a memory + * leak. + * + * This means that calls to functions like protobuf_c_message_unpack() which + * return a `ProtobufCMessage` must be paired with a call to a free function, + * like protobuf_c_message_free_unpacked(). + */ +struct ProtobufCMessage { + /** The descriptor for this message type. */ + const ProtobufCMessageDescriptor *descriptor; + /** The number of elements in `unknown_fields`. */ + unsigned n_unknown_fields; + /** The fields that weren't recognized by the parser. */ + ProtobufCMessageUnknownField *unknown_fields; +}; + +/** + * Describes a message. + */ +struct ProtobufCMessageDescriptor { + /** Magic value checked to ensure that the API is used correctly. */ + uint32_t magic; + + /** The qualified name (e.g., "namespace.Type"). */ + const char *name; + /** The unqualified name as given in the .proto file (e.g., "Type"). */ + const char *short_name; + /** Identifier used in generated C code. */ + const char *c_name; + /** The dot-separated namespace. */ + const char *package_name; + + /** + * Size in bytes of the C structure representing an instance of this + * type of message. + */ + size_t sizeof_message; + + /** Number of elements in `fields`. */ + unsigned n_fields; + /** Field descriptors, sorted by tag number. */ + const ProtobufCFieldDescriptor *fields; + /** Used for looking up fields by name. */ + const unsigned *fields_sorted_by_name; + + /** Number of elements in `field_ranges`. */ + unsigned n_field_ranges; + /** Used for looking up fields by id. */ + const ProtobufCIntRange *field_ranges; + + /** Message initialisation function. */ + ProtobufCMessageInit message_init; + + /** Reserved for future use. */ + void *reserved1; + /** Reserved for future use. */ + void *reserved2; + /** Reserved for future use. */ + void *reserved3; +}; + +/** + * An unknown message field. + */ +struct ProtobufCMessageUnknownField { + /** The tag number. */ + uint32_t tag; + /** The wire type of the field. */ + ProtobufCWireType wire_type; + /** Number of bytes in `data`. */ + size_t len; + /** Field data. */ + uint8_t *data; +}; + +/** + * Method descriptor. + */ +struct ProtobufCMethodDescriptor { + /** Method name. */ + const char *name; + /** Input message descriptor. */ + const ProtobufCMessageDescriptor *input; + /** Output message descriptor. */ + const ProtobufCMessageDescriptor *output; +}; + +/** + * Service. + */ +struct ProtobufCService { + /** Service descriptor. */ + const ProtobufCServiceDescriptor *descriptor; + /** Function to invoke the service. */ + void (*invoke)(ProtobufCService *service, unsigned method_index, + const ProtobufCMessage *input, ProtobufCClosure closure, + void *closure_data); + /** Function to destroy the service. */ + void (*destroy)(ProtobufCService *service); +}; + +/** + * Service descriptor. + */ +struct ProtobufCServiceDescriptor { + /** Magic value checked to ensure that the API is used correctly. */ + uint32_t magic; + + /** Service name. */ + const char *name; + /** Short version of service name. */ + const char *short_name; + /** C identifier for the service name. */ + const char *c_name; + /** Package name. */ + const char *package; + /** Number of elements in `methods`. */ + unsigned n_methods; + /** Method descriptors, in the order defined in the .proto file. */ + const ProtobufCMethodDescriptor *methods; + /** Sort index of methods. */ + const unsigned *method_indices_by_name; +}; + +/** + * Get the version of the protobuf-c library. 
+
+/**
+ * Get the version of the protobuf-c library. Note that this is the version of
+ * the library linked against, not the version of the headers compiled against.
+ *
+ * \return A string containing the version number of protobuf-c.
+ */
+PROTOBUF_C__API
+const char *protobuf_c_version(void);
+
+/**
+ * Get the version of the protobuf-c library. Note that this is the version of
+ * the library linked against, not the version of the headers compiled against.
+ *
+ * \return A 32 bit unsigned integer containing the version number of
+ *      protobuf-c, represented in base-10 as (MAJOR*1E6) + (MINOR*1E3) + PATCH.
+ */
+PROTOBUF_C__API
+uint32_t protobuf_c_version_number(void);
+
+/**
+ * The version of the protobuf-c headers, represented as a string using the same
+ * format as protobuf_c_version().
+ */
+#define PROTOBUF_C_VERSION "1.3.0"
+
+/**
+ * The version of the protobuf-c headers, represented as an integer using the
+ * same format as protobuf_c_version_number().
+ */
+#define PROTOBUF_C_VERSION_NUMBER 1003000
+
+/**
+ * The minimum protoc-c version which works with the current version of the
+ * protobuf-c headers.
+ */
+#define PROTOBUF_C_MIN_COMPILER_VERSION 1000000
+
+/**
+ * Determine the number of bytes required to store the serialised message.
+ *
+ * \param message
+ *      The message object to serialise.
+ * \return
+ *      Number of bytes.
+ */
+PROTOBUF_C__API
+size_t protobuf_c_message_get_packed_size(const ProtobufCMessage *message);
+
+/**
+ * Unpack a serialised message into an in-memory representation.
+ *
+ * \param descriptor
+ *      The message descriptor.
+ * \param allocator
+ *      `ProtobufCAllocator` to use for memory allocation. May be NULL to
+ *      specify the default allocator.
+ * \param len
+ *      Length in bytes of the serialised message.
+ * \param data
+ *      Pointer to the serialised message.
+ * \return
+ *      An unpacked message object.
+ * \retval NULL
+ *      If an error occurred during unpacking.
+ */
+PROTOBUF_C__API
+ProtobufCMessage *protobuf_c_message_unpack(
+    const ProtobufCMessageDescriptor *descriptor, ProtobufCAllocator *allocator,
+    size_t len, const uint8_t *data);
+
+/**
+ * Free an unpacked message object.
+ *
+ * This function should be used to deallocate the memory used by a call to
+ * protobuf_c_message_unpack().
+ *
+ * \param message
+ *      The message object to free. May be NULL.
+ * \param allocator
+ *      `ProtobufCAllocator` to use for memory deallocation. May be NULL to
+ *      specify the default allocator.
+ */
+PROTOBUF_C__API
+void protobuf_c_message_free_unpacked(ProtobufCMessage *message,
+                                      ProtobufCAllocator *allocator);
+
+/**
+ * Check the validity of a message object.
+ *
+ * Makes sure all required fields (`PROTOBUF_C_LABEL_REQUIRED`) are present.
+ * Recursively checks nested messages.
+ *
+ * \retval TRUE
+ *      Message is valid.
+ * \retval FALSE
+ *      Message is invalid.
+ */
+PROTOBUF_C__API
+protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *);
+
+/** Message initialiser. */
+#define PROTOBUF_C_MESSAGE_INIT(descriptor) \
+  { descriptor, 0, NULL }
+
+/**
+ * Initialise a message object from a message descriptor.
+ *
+ * \param descriptor
+ *      Message descriptor.
+ * \param message
+ *      Allocated block of memory of size `descriptor->sizeof_message`.
+ */
+PROTOBUF_C__API
+void protobuf_c_message_init(const ProtobufCMessageDescriptor *descriptor,
+                             void *message);
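The comments above distinguish the version compiled against (macros) from the version linked against (functions); a one-liner guard makes the distinction concrete:

~~~{.cpp}
#include <cstdio>
#include "protobuf-c.h"

// Sketch: detect a skewed build environment where the headers and the linked
// library come from different protobuf-c releases.
static void warn_on_version_skew(void) {
  if (protobuf_c_version_number() != PROTOBUF_C_VERSION_NUMBER) {
    std::fprintf(stderr, "protobuf-c headers %s vs linked library %s\n",
                 PROTOBUF_C_VERSION, protobuf_c_version());
  }
}
~~~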
+
+/**
+ * Initialise a `ProtobufCBufferSimple` object.
+ */
+#define PROTOBUF_C_BUFFER_SIMPLE_INIT(array_of_bytes)             \
+  {                                                               \
+    {protobuf_c_buffer_simple_append}, sizeof(array_of_bytes), 0, \
+        (array_of_bytes), 0, NULL                                 \
+  }
+
+/**
+ * Clear a `ProtobufCBufferSimple` object, freeing any allocated memory.
+ */
+#define PROTOBUF_C_BUFFER_SIMPLE_CLEAR(simp_buf)                              \
+  do {                                                                        \
+    if ((simp_buf)->must_free_data) {                                         \
+      if ((simp_buf)->allocator != NULL)                                      \
+        (simp_buf)->allocator->free((simp_buf)->allocator, (simp_buf)->data); \
+      else                                                                    \
+        free((simp_buf)->data);                                               \
+    }                                                                         \
+  } while (0)
+
+/**
+ * The `append` method for `ProtobufCBufferSimple`.
+ *
+ * \param buffer
+ *      The buffer object to append to. Must actually be a
+ *      `ProtobufCBufferSimple` object.
+ * \param len
+ *      Number of bytes in `data`.
+ * \param data
+ *      Data to append.
+ */
+PROTOBUF_C__API
+void protobuf_c_buffer_simple_append(ProtobufCBuffer *buffer, size_t len,
+                                     const unsigned char *data);
+
+/**@}*/
+
+PROTOBUF_C__END_DECLS
+
+#endif /* PROTOBUF_C_H */
diff --git a/tools/quantification/src/tensor_desc.h b/tools/quantification/src/tensor_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..4eadf341db998ae12939d252d585051ba54c3bf0
--- /dev/null
+++ b/tools/quantification/src/tensor_desc.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "src/framework.pb-c.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+enum VarType_Type {
+  VARTYPE_TYPE_BOOL = 0,
+  VARTYPE_TYPE_INT16 = 1,
+  VARTYPE_TYPE_INT32 = 2,
+  VARTYPE_TYPE_INT64 = 3,
+  VARTYPE_TYPE_FP16 = 4,
+  VARTYPE_TYPE_FP32 = 5,
+  VARTYPE_TYPE_FP64 = 6,
+  VARTYPE_TYPE_LOD_TENSOR = 7,
+  VARTYPE_TYPE_SELECTED_ROWS = 8,
+  VARTYPE_TYPE_FEED_MINIBATCH = 9,
+  VARTYPE_TYPE_FETCH_LIST = 10,
+  VARTYPE_TYPE_STEP_SCOPES = 11,
+  VARTYPE_TYPE_STEP_LOD_RANK_TABLE = 12,
+  VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY = 13,
+  VARTYPE_TYPE_STEP_PLACE_LIST = 14,
+  VARTYPE_TYPE_READER = 15,
+  VARTYPE_TYPE_CHANNEL = 16,
+  VARTYPE_TYPE_RAW = 17,
+  VARTYPE_TYPE_TUPLE = 18
+};
+
+class TensorDesc {
+ public:
+  TensorDesc() = default;
+  TensorDesc(const TensorDesc &desc) {
+    this->dims_ = desc.dims_;
+    this->data_type_ = desc.data_type_;
+  }
+
+  explicit TensorDesc(
+      PaddleMobile__Framework__Proto__VarType__TensorDesc *desc) {
+    for (int i = 0; i < desc->n_dims; ++i) {
+      int64_t d = desc->dims[i];
+      dims_.emplace_back(d);
+    }
+    data_type_ = (VarType_Type)desc->data_type;
+  }
+
+  std::vector<int64_t> Dims() const { return dims_; }
+  VarType_Type DataType() const { return data_type_; }
+
+ private:
+  std::vector<int64_t> dims_;
+  VarType_Type data_type_;
+};
+
+}  // namespace framework
+}  // namespace paddle_mobile
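TensorDesc is a thin value wrapper over the protobuf-c struct: the constructor copies the dims array and casts the data type once, so the unpacked proto can be freed afterwards. A hedged usage sketch (`pb_desc` is a hypothetical pointer taken from an unpacked program proto):

~~~{.cpp}
#include "src/tensor_desc.h"

// Sketch: compute the element count of an FP32 tensor described by the proto.
int64_t NumFp32Elements(
    PaddleMobile__Framework__Proto__VarType__TensorDesc *pb_desc) {
  paddle_mobile::framework::TensorDesc desc(pb_desc);
  if (desc.DataType() != paddle_mobile::framework::VARTYPE_TYPE_FP32) return 0;
  int64_t numel = 1;
  for (int64_t d : desc.Dims()) numel *= d;  // dims were copied out of the proto
  return numel;
}
~~~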
diff --git a/tools/quantification/src/var_desc.h b/tools/quantification/src/var_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b9c5ac4d672be2dd8a8a2a2695c2816f9cae05a
--- /dev/null
+++ b/tools/quantification/src/var_desc.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+#include "src/framework.pb-c.h"
+#include "src/tensor_desc.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+class VarDesc {
+ public:
+  VarDesc(const VarDesc &var_desc) {
+    this->data_type_ = var_desc.data_type_;
+    this->name_ = var_desc.name_;
+    this->persistable_ = var_desc.persistable_;
+    this->tensor_desc_ = var_desc.tensor_desc_;
+    this->type_ = var_desc.type_;
+  }
+  explicit VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) {
+    type_ = (VarType_Type)desc->type->type;
+    name_ = std::string(desc->name);
+    persistable_ = static_cast<bool>(desc->persistable);
+
+    switch (type_) {
+      case VARTYPE_TYPE_SELECTED_ROWS:
+        tensor_desc_ = TensorDesc(desc->type->selected_rows);
+        break;
+      case VARTYPE_TYPE_LOD_TENSOR:
+        tensor_desc_ = TensorDesc(desc->type->lod_tensor->tensor);
+        break;
+      case VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY:
+        // desc->type->tensor_array->tensor->data_type;
+        tensor_desc_ = TensorDesc(desc->type->tensor_array->tensor);
+        break;
+      default:
+        break;
+    }
+    switch (type_) {
+      case VARTYPE_TYPE_CHANNEL:
+        data_type_ = (VarType_Type)desc->type->channel->data_type;
+        break;
+      default:
+        data_type_ = tensor_desc_.DataType();
+        break;
+    }
+  }
+  std::string Name() const { return name_; }
+
+  VarType_Type Type() const { return type_; }
+
+  bool Persistable() const { return persistable_; }
+
+  const TensorDesc &Tensor_desc() const { return tensor_desc_; }
+
+ private:
+  std::string name_;
+  bool persistable_;
+  TensorDesc tensor_desc_;
+  VarType_Type type_;
+  VarType_Type data_type_;
+};
+
+}  // namespace framework
+}  // namespace paddle_mobile
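In the quantification tool, VarDesc is typically used to pick out the persistable LoD tensors whose weights get quantised. A hedged sketch; the list of unpacked `PaddleMobile__Framework__Proto__VarDesc` pointers is a hypothetical input, the real tool walks them out of an unpacked program/block proto:

~~~{.cpp}
#include <vector>
#include "src/var_desc.h"

// Sketch: keep only persistable LoD-tensor variables (candidate weights).
std::vector<paddle_mobile::framework::VarDesc> PersistableWeights(
    const std::vector<PaddleMobile__Framework__Proto__VarDesc *> &protos) {
  using namespace paddle_mobile::framework;
  std::vector<VarDesc> out;
  for (auto *p : protos) {
    VarDesc var(p);
    if (var.Persistable() && var.Type() == VARTYPE_TYPE_LOD_TENSOR)
      out.push_back(var);  // copies via the user-defined copy constructor
  }
  return out;
}
~~~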
diff --git a/tools/run.sh b/tools/run.sh
deleted file mode 100644
index aaf0f52f0335d6e73060ed9b8e86a78ba357c552..0000000000000000000000000000000000000000
--- a/tools/run.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env sh
-# auto build and run
-
-BUILDNET="mobilenetssd"
-TESTUNIT="test-mobilenetssd"
-
-push_fn () {
-sh build.sh android ${BUILDNET}
-MODELS_PATH="../test/models/*"
-MODELS_SRC="../test/models"
-IMAGE_PATH="../test/images/*"
-EXE_FILE="../test/build/*"
-EXE_DIR="data/local/tmp/bin"
-adb shell mkdir ${EXE_DIR}
-MODELS_DIR="data/local/tmp/models"
-adb shell mkdir ${MODELS_DIR}
-for file in `ls ${MODELS_SRC}`
-do
-adb shell mkdir ${MODELS_DIR}"/"${file}
-done
-
-IMAGES_DIR="data/local/tmp/images"
-adb shell mkdir ${IMAGES_DIR}
-LIB_PATH="../build/release/arm-v7a/build/*"
-adb push ${EXE_FILE} ${EXE_DIR}
-adb push ${LIB_PATH} ${EXE_DIR}
-if [[ $1 != "npm" ]]; then
-adb push ${IMAGE_PATH} ${IMAGES_DIR}
-adb push ${MODELS_PATH} ${MODELS_DIR}
-fi
-adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./${TESTUNIT}"
-}
-
-if [[ $1 == "npm" ]]; then
-push_fn $1
-else
-push_fn
-fi
diff --git a/tools/toolchains/arm-android-neon.cmake b/tools/toolchains/arm-android-neon.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..5e431059a974810b2fd0481e0942447f57bf1286
--- /dev/null
+++ b/tools/toolchains/arm-android-neon.cmake
@@ -0,0 +1,5 @@
+set(ANDROID_ARM_NEON ON)
+set(ANDROID_PIE TRUE)
+set(ANDROID_STL "c++_static")
+set(ANDROID_PLATFORM "android-22")
+include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
diff --git a/tools/toolchains/arm-linux-gnueabi.cmake b/tools/toolchains/arm-linux-gnueabi.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c2b1b853def5f470565e670751708f76c59e16c4
--- /dev/null
+++ b/tools/toolchains/arm-linux-gnueabi.cmake
@@ -0,0 +1,16 @@
+# CMake toolchain file for building ARM software on a Linux environment
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_VERSION 1)
+
+set(CMAKE_C_COMPILER /usr/bin/arm-linux-gnueabi-gcc)
+set(CMAKE_CXX_COMPILER /usr/bin/arm-linux-gnueabi-g++)
+set(CMAKE_STRIP /usr/bin/arm-linux-gnueabi-strip)
+
+set(CMAKE_FIND_ROOT_PATH /usr/arm-linux-gnueabi)
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+set(ARM_LINUX 1)