diff --git a/.gitignore b/.gitignore
index 532601bfe9222eae0d6be7378322ab1b9c2eb110..547e94ea11f048c9e007996d4ee716d22a13742e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -69,3 +69,10 @@ build
 # clion building directories
 cmake-build-debug
 cmake-build-release
+
+
+# ios demo
+demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/
+demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg
+demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a
+*.xcuserstate
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6f037c79962058dfc39b4cc598c0ee9106a10f88..a00d179a0d4972080c8fd392160f8ec451692e4d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,12 +6,25 @@ option(USE_OPENMP "openmp support" OFF)
 option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build
-option(CPU "cpu" OFF)
-option(MALI_GPU "mali gpu" ON)
+option(CPU "armv7 with neon" ON)
+option(MALI_GPU "mali gpu" OFF)
 option(FPGA "fpga" OFF)
+set(DEBUGING ON)
+
+if (ARM_LINUX)
+include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake")
+endif ()
+
+file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
+file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)

 if (CPU)
-  add_definitions(-DPADDLE_MOBILE_CPU)
+  add_definitions(-DPADDLE_MOBILE_CPU)
+else()
+  # list(REMOVE_ITEM ...) does not expand glob patterns; collect the arm
+  # kernel files first and drop them one by one, as the MALI_GPU branch does.
+  file(GLOB_RECURSE _arm_kernel_list src/operators/kernel/arm/*.cpp src/operators/kernel/arm/*.cc)
+  foreach(f ${_arm_kernel_list})
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+  endforeach()
+  file(GLOB_RECURSE _arm_kernel_list_h src/operators/kernel/arm/*.h)
+  foreach(f ${_arm_kernel_list_h})
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+  endforeach()
+endif()

 if (MALI_GPU)
@@ -27,32 +40,42 @@ if (MALI_GPU)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
+else()
+  file(GLOB_RECURSE _tmp_list src/operators/kernel/mali/*.cpp src/operators/kernel/mali/*.cc)
+  foreach(f ${_tmp_list})
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+  endforeach()
+
+  file(GLOB_RECURSE _tmp_list_h src/operators/kernel/mali/*.h)
+  foreach(f ${_tmp_list_h})
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+  endforeach()
 endif()

 if(FPGA)
   add_definitions(-DPADDLE_MOBILE_FPGA)
+else()
+  # same pattern as above: REMOVE_ITEM needs concrete paths, not globs
+  file(GLOB_RECURSE _fpga_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
+  foreach(f ${_fpga_list})
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+  endforeach()
+  file(GLOB_RECURSE _fpga_list_h src/operators/kernel/fpga/*.h)
+  foreach(f ${_fpga_list_h})
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+  endforeach()
 endif()

+set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
 if (DEBUGING)
-  set(CMAKE_BUILD_TYPE Debug)
-  set(CMAKE_CXX_FLAGS_DEBUG "-O3 -DNDEBUG")
-else()
-  set(CMAKE_BUILD_TYPE Release)
-  set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
-endif ()
-
-if(DEBUGING)
-  message(STATUS "debuging")
-  add_definitions(-DPADDLE_MOBILE_DEBUG)
-  if(ANDROID)
+  message(STATUS "debug")
+  set(CMAKE_BUILD_TYPE Debug)
+  set(CMAKE_CXX_FLAGS_DEBUG "-g -DNDEBUG")
+  add_definitions(-DPADDLE_MOBILE_DEBUG)
+  if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
+    add_definitions(-DARMV7)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
-  endif()
-
-else()
-  message(STATUS "releasing")
-  add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
-endif()
+  endif ()
+else ()
+  set(CMAKE_BUILD_TYPE Release)
+  set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
+  add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
+endif ()

 if (USE_EXCEPTION)
   message(STATUS "use exception")
@@ -66,110 +89,47 @@ if (LOG_PROFILE)
   add_definitions(-DPADDLE_MOBILE_PROFILE)
 endif()

-if(IS_MAC)
-  add_definitions(-DX86)
-elseif(IS_IOS)
- 
add_definitions(-DIOS) -elseif(V7) - add_definitions(-DARMV7) -elseif(V8) - add_definitions(-DARMV8) -else () - add_definitions(-DX86) +if(USE_OPENMP) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") + add_definitions(-DPADDLE_MOBILE_USE_OPENMP) endif() -set(CMAKE_VERBOSE_MAKEFILE ON) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build) -file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c) -file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) -if (NOT ANDROID) -list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.cpp) -list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.h) -list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h) + +if (NOT ANDROID_NDK_TOOLCHAIN_INCLUDED) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h) endif () include_directories(src/) -if(USE_OPENMP) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") - add_definitions(-DPADDLE_MOBILE_USE_OPENMP) -endif() +set(CMAKE_VERBOSE_MAKEFILE ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build) -if (googlenet) - add_definitions(-DCONCAT_OP) - add_definitions(-DCONV_OP) - add_definitions(-DLRN_OP) - add_definitions(-DMUL_OP) - add_definitions(-DELEMENTWISEADD_OP) - add_definitions(-DFUSION_FC_OP) - add_definitions(-DPOOL_OP) - add_definitions(-DRELU_OP) - add_definitions(-DFUSION_CONVADD_OP) - add_definitions(-DFUSION_CONVADD_RELU_OP) -elseif (mobilenet) - add_definitions(-DCONV_OP) - add_definitions(-DELEMENTWISEADD_OP) - add_definitions(-DRELU_OP) - add_definitions(-DSOFTMAX_OP) - add_definitions(-DSOFTMAX_OP) - add_definitions(-DDEPTHWISECONV_OP) - add_definitions(-DBATCHNORM_OP) - add_definitions(-DPOOL_OP) - add_definitions(-DRESHAPE_OP) -elseif (yolo) - add_definitions(-DBATCHNORM_OP) - add_definitions(-DCONV_OP) - add_definitions(-DRELU_OP) - add_definitions(-DELEMENTWISEADD_OP) -elseif (squeezenet) - add_definitions(-DCONCAT_OP) - add_definitions(-DCONV_OP) - add_definitions(-DRELU_OP) - add_definitions(-DELEMENTWISEADD_OP) - add_definitions(-DPOOL_OP) - add_definitions(-DRESHAPE_OP) - add_definitions(-DSOFTMAX_OP) -elseif(resnet) - add_definitions(-DCONV_OP) - add_definitions(-DBATCHNORM_OP) - add_definitions(-DELEMENTWISEADD_OP) - add_definitions(-DSOFTMAX_OP) - add_definitions(-DMUL_OP) - add_definitions(-DPOOL_OP) - add_definitions(-DRELU_OP) -else () - add_definitions(-DBATCHNORM_OP) - add_definitions(-DBOXCODER_OP) - add_definitions(-DCONCAT_OP) - add_definitions(-DCONV_OP) - add_definitions(-DDEPTHWISECONV_OP) - add_definitions(-DELEMENTWISEADD_OP) - add_definitions(-DFUSION_CONVADD_OP) - add_definitions(-DCONVADDRELU_OP) - add_definitions(-DFUSION_FC_OP) - add_definitions(-DLRN_OP) - add_definitions(-DMUL_OP) - add_definitions(-DMULTICLASSNMS_OP) - add_definitions(-DPOOL_OP) - add_definitions(-DPRIORBOX_OP) - add_definitions(-DRELU_OP) - add_definitions(-DRESHAPE_OP) - add_definitions(-DSIGMOID_OP) - add_definitions(-DSOFTMAX_OP) - add_definitions(-DTRANSPOSE_OP) - add_definitions(-DFUSION_CONVADD_RELU_OP) -endif() +# NET default +set(NET "defult" CACHE STRING "select 
net type")
+set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet")
+
+include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")

 if (IS_IOS)
 add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
-elseif(ANDROID)
-  add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
 else()
+  list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.h)
+  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.mm)
+  list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
+endif ()
+
+if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
+  list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
+  add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
+elseif(IS_IOS)
+else ()
   add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
 endif ()

@@ -177,4 +137,3 @@ if(DEBUGING)
   add_subdirectory(test)
 endif()
-
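The `USE_OPENMP` block above adds `-fopenmp` and defines `PADDLE_MOBILE_USE_OPENMP`. A minimal sketch of how a kernel loop can consume that toggle (the function below is illustrative, not a paddle-mobile symbol):

```c++
// Illustrative only: parallelizes when built with USE_OPENMP=ON
// (-fopenmp + -DPADDLE_MOBILE_USE_OPENMP), stays serial otherwise.
#ifdef PADDLE_MOBILE_USE_OPENMP
#include <omp.h>
#endif

void vector_add(const float *a, const float *b, float *c, int n) {
#ifdef PADDLE_MOBILE_USE_OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < n; ++i) {
    c[i] = a[i] + b[i];
  }
}
```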
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1a25d65e02afb09dabc96e1ec241346cff34f6f2..a33db73e109042276b686e8ab74261273df87390 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -183,6 +183,9 @@ upstream
 Next, wait for review. If changes are requested, update the corresponding branch in origin by following the steps above.

+![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294877166787.jpg)
+After that you can submit your code.
+
 ## Deleting the remote branch

 After the PR has been merged into the main repository, you can delete the remote branch from the PR page.
@@ -219,7 +222,7 @@
   - Reason: if only one file is modified but a dozen commits are submitted, each making a tiny change, reviewers suffer greatly: they have to inspect every commit to learn what changed, and commits may even overwrite each other's modifications.
   - Suggestion: keep each push to as few commits as possible; use `git commit --amend` to extend the previous commit. For multiple commits already pushed to the remote repository, see [squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed).
   - Please mind each commit message: it should reflect the commit's content and not be chosen casually.
-3. If the PR resolves an issue, add `fix #issue_number` to the **first** comment box of the Pull Request, so the corresponding issue closes automatically once the PUll Request is merged. Keywords include close, closes, closed, fix, fixes, fixed, resolve, resolves and resolved; pick the appropriate word. For details see [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages).
+3. If the PR resolves an issue, add `fix #issue_number` to the **first** comment box of the Pull Request, so the corresponding issue closes automatically once the Pull Request is merged. Keywords include close, closes, closed, fix, fixes, fixed, resolve, resolves and resolved; pick the appropriate word. For details see [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages).

 Besides, when replying to review comments, please follow these conventions:
diff --git a/Dockerfile b/Dockerfile
index 0a249af932f82e9f8d29dcaf739d8ea663965626..df7df032acefd39c20051e861d353644e3b91024 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,27 +1,37 @@
-FROM ubuntu:18.04
+FROM ubuntu:16.04

 RUN echo '\
-deb <mirror> bionic main restricted universe multiverse\n\
-deb <mirror> bionic-updates main restricted universe multiverse\n\
-deb <mirror> bionic-backports main restricted universe multiverse\n\
-deb <mirror> bionic-security main restricted universe multiverse\n'\
+deb <mirror> <version> main restricted universe multiverse\n\
+deb <mirror> <version>-updates main restricted universe multiverse\n\
+deb <mirror> <version>-backports main restricted universe multiverse\n\
+deb <mirror> <version>-security main restricted universe multiverse\n'\
 > /etc/apt/sources.list
 RUN sed -ie 's|<mirror>|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|' /etc/apt/sources.list
+RUN sed -ie 's|<version>|xenial|' /etc/apt/sources.list
 RUN apt-get update && apt-get upgrade -y
 RUN apt-get install -y --no-install-recommends \
         curl \
         unzip \
         git \
-        cmake \
+        make \
+        cmake-curses-gui \
         python \
         python-pip \
         python-setuptools \
         clang-format-5.0 \
-        graphviz
+        graphviz \
+        g++-arm-linux-gnueabi \
+        gcc-arm-linux-gnueabi
 RUN apt-get autoremove -y && apt-get clean
-RUN pip install wheel pre-commit
-RUN pre-commit autoupdate
 RUN ln -s clang-format-5.0 /usr/bin/clang-format
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit
 RUN cd /tmp && curl -O http://mirrors.neusoft.edu.cn/android/repository/android-ndk-r17b-linux-x86_64.zip
-RUN cd /opt && unzip /tmp/android-ndk-r17b-linux-x86_64.zip
\ No newline at end of file
+RUN curl -O https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \
+    tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \
+    mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \
+    mv /usr/bin/cmake /usr/bin/cmake.bak && ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake
+RUN cd /opt && unzip /tmp/android-ndk-r17b-linux-x86_64.zip
+ENV NDK_ROOT /opt/android-ndk-r17b
diff --git a/README.md b/README.md
index b6ae2beed999d146c64ffc9ee495373d9b77a175..90c68b87d625fd8e8311998a908835cd903dd372 100644
--- a/README.md
+++ b/README.md
@@ -1,36 +1,96 @@
-# Paddle-Mobile
-
+# Paddle-Mobile
 [![Build Status](https://travis-ci.org/PaddlePaddle/paddle-mobile.svg?branch=develop&longCache=true&style=flat-square)](https://travis-ci.org/PaddlePaddle/paddle-mobile)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/doc)
-[![License](https://img.shields.io/badge/license-Apache%202-brightgreen.svg)](LICENSE)
+[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
+
+
+Welcome to the Paddle-Mobile GitHub project.
+
+Paddle-Mobile is a project under the PaddlePaddle organization: a deep learning framework dedicated to embedded platforms. Its design stays highly consistent with the latest fluid version of PaddlePaddle, while adding extensive optimization for embedded targets. Performance, binary size, power consumption and hardware-platform coverage were all taken into account from the very beginning.
+
+## Features
+
+- **ARM CPU**
+
+  The ARM CPU is paddle-mobile's main target, and the CPU's generality has always been its advantage. Embedded deep learning needs a large amount of CPU assembly, and we are coding intensively to exploit every bit of acceleration the hardware offers.
+  CPU optimization is still in progress and currently uses conventional techniques: on an ARM A73, a single-core paddle-mobile run of MobileNet 1.0 takes 160+ ms today. That is clearly not our final goal; we are rewriting hot paths in assembly, so there is still large headroom for performance.
+
+- **Mali GPU**
+
+  Mali GPU support is developed jointly by Baidu and ARM; both teams have recently been working on running paddle ops seamlessly on ACL (ARM Compute Library). Several network models such as squeezenet, googlenet and resnet are already supported, and we will keep pushing until all mobile paddle ops run efficiently on Mali GPUs.
+
+- **GPU (Metal) for Apple devices**
+
+  A GPU inference library for Apple devices based on Metal is also being implemented; a runnable version will be available soon.
+
+- **FPGA**
+
+  An FPGA implementation is in progress, based on the Xilinx ZU5 target development board.
+
+- **Flexibility**
+
+  * The paddle-mobile CPU build depends on no third-party libraries and can be integrated quickly.
+  * Platform switching is done with template specialization, so CPU, GPU and other co-processors can be switched flexibly.
+  * Only the ops needed by a given common network can be compiled, reducing build time and package size.
+  * Docker-based builds provide a uniform build environment.
+  * High extensibility: other co-processors are easy to add, high-performance ARM operator implementations are provided, and co-processor developers can integrate conveniently.
+  * Directly compatible with paddle-fluid models; no extra conversion step is needed.
+
+- **Size**
+
+  paddle-mobile has taken mobile package size seriously since its design began; the CPU implementation has no external dependencies. Ops that a given network does not need are never compiled in, and build-option tuning helps compress the binary further.
+  Beyond binary size, we also keep the code itself from growing too large; the whole repository is very compact.
+
+
+## Documentation
+
+### Design documentation
+
+The paddle-mobile design documentation is linked below; many early designs and discussions can be found in the [issues](https://github.com/PaddlePaddle/paddle-mobile/issues).
+[Design documentation](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/design_doc.md)
+
+### Development documentation
+
+The development documentation mainly covers building, running and related topics. As a developer, you can use it together with the contribution documentation.
+[Development documentation](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_doc.md)
+### Contribution documentation
+- [Contribution documentation](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
+- The document above covers the main code-contribution workflow; if you hit other problems in practice, file an [issue](https://github.com/PaddlePaddle/paddle-mobile/issues) and we will handle it as soon as we see it.

-This project is used to develop the next version deep learning freamwork for mobile device.

-# Development
+## Obtaining models
+Currently Paddle-Mobile only supports models trained with Paddle fluid. Models of any other kind must be converted before they can run.
+### 1. Train directly with Paddle Fluid
+This is the most reliable way and the recommended one.
+### 2. Convert a caffe model to a Paddle Fluid model
+[Link](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
+### 3. ONNX
+ONNX stands for "Open Neural Network Exchange". The project's goal is interoperability between different neural-network frameworks.

-[Used model in development](https://mms-mis.cdn.bcebos.com/paddle-mobile/models.zip)
+Besides training a fluid model with PaddlePaddle directly, some Paddle fluid models can also be obtained through onnx conversion.

-## cross-compilation to android
+Baidu is also working on onnx support; the conversion project lives here: [paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx).

-* NDK is required
-* ANDROID_NDK environment variable is required
+![](http://otkwwi4x8.bkt.clouddn.com/2018-07-03-15305875853057.jpg)

-```bash
-sh build.sh android
-```
+### 4. Download some test models
+[Download link](https://mms-mis.cdn.bcebos.com/paddle-mobile/models.zip)

-## build for x86
-paddle-mobile is to run on arm platform. x86 only used to test not arm assembly code. So do not recommend compiling x86.
+## Issues

-Now only support osx.
+Feel free to raise (or solve) our issues; for questions, file a [GitHub issue](https://github.com/PaddlePaddle/paddle-mobile/issues).

-```
-sh build.sh mac
-```
+## Copyright and License
+Paddle-Mobile is released under the relatively permissive [Apache-2.0 license](LICENSE).

-## Old Version of Mobile-Deep-Learning
-The old version of MDL was I moved to here [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
+## Old version: Mobile-Deep-Learning
+The original MDL (Mobile-Deep-Learning) project has been moved to [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
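The README above notes that paddle-fluid models are consumed directly. For orientation, a hedged C++ sketch of what that usage looks like; every class and method name here (Loader, Executor, Predict) is an assumption inferred from the io.h interface referenced in the documentation below, not a verified signature:

```c++
// Hypothetical usage sketch; check src/io/io.h for the real API.
#include <vector>

#include "io/io.h"

int main() {
  // The template parameter selects the platform, per the design notes below.
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  // Combined-parameter model: one structure file plus one params file.
  auto program = loader.Load("./googlenet_combine/model",
                             "./googlenet_combine/params");
  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, true);

  std::vector<float> input(1 * 3 * 224 * 224, 0.5f);  // dummy NCHW input
  auto output = executor.Predict(input, {1, 3, 224, 224});  // assumed overload
  return 0;
}
```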
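diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.pbxproj b/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000000000000000000000000000000000000..8500c89c9af5ab2d56e08b576dc007a424262d15
--- /dev/null
+++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.pbxproj
@@ -0,0 +1,398 @@
+// !$*UTF8*$!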
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 50; + objects = { + +/* Begin PBXBuildFile section */ + FC12E93320EB6B2800807EF4 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = FC12E93220EB6B2800807EF4 /* AppDelegate.m */; }; + FC12E93620EB6B2800807EF4 /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = FC12E93520EB6B2800807EF4 /* ViewController.m */; }; + FC12E93920EB6B2800807EF4 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC12E93720EB6B2800807EF4 /* Main.storyboard */; }; + FC12E93B20EB6B2900807EF4 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = FC12E93A20EB6B2900807EF4 /* Assets.xcassets */; }; + FC12E93E20EB6B2900807EF4 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC12E93C20EB6B2900807EF4 /* LaunchScreen.storyboard */; }; + FC12E94120EB6B2900807EF4 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = FC12E94020EB6B2900807EF4 /* main.m */; }; + FC12E94A20EB6B6800807EF4 /* libpaddle-mobile.a in Frameworks */ = {isa = PBXBuildFile; fileRef = FC12E94820EB6B6800807EF4 /* libpaddle-mobile.a */; }; + FC12E94D20EB6BBB00807EF4 /* libstdc++.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = FC12E94C20EB6BBB00807EF4 /* libstdc++.tbd */; }; + FC12E95120EB6BED00807EF4 /* params in Resources */ = {isa = PBXBuildFile; fileRef = FC12E94F20EB6BED00807EF4 /* params */; }; + FC12E95220EB6BED00807EF4 /* model in Resources */ = {isa = PBXBuildFile; fileRef = FC12E95020EB6BED00807EF4 /* model */; }; + FC12E95420EB6C0D00807EF4 /* apple.jpg in Resources */ = {isa = PBXBuildFile; fileRef = FC12E95320EB6C0D00807EF4 /* apple.jpg */; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + FC12E92E20EB6B2800807EF4 /* PaddleMobileDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = PaddleMobileDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; + FC12E93120EB6B2800807EF4 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; + FC12E93220EB6B2800807EF4 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; + FC12E93420EB6B2800807EF4 /* ViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ViewController.h; sourceTree = ""; }; + FC12E93520EB6B2800807EF4 /* ViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ViewController.m; sourceTree = ""; }; + FC12E93820EB6B2800807EF4 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = ""; }; + FC12E93A20EB6B2900807EF4 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + FC12E93D20EB6B2900807EF4 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = ""; }; + FC12E93F20EB6B2900807EF4 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + FC12E94020EB6B2900807EF4 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = ""; }; + FC12E94820EB6B6800807EF4 /* libpaddle-mobile.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; path = "libpaddle-mobile.a"; sourceTree = ""; }; + FC12E94920EB6B6800807EF4 
/* PaddleMobile.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobile.h; sourceTree = ""; }; + FC12E94C20EB6BBB00807EF4 /* libstdc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libstdc++.tbd"; path = "usr/lib/libstdc++.tbd"; sourceTree = SDKROOT; }; + FC12E94F20EB6BED00807EF4 /* params */ = {isa = PBXFileReference; lastKnownFileType = file; path = params; sourceTree = ""; }; + FC12E95020EB6BED00807EF4 /* model */ = {isa = PBXFileReference; lastKnownFileType = file; path = model; sourceTree = ""; }; + FC12E95320EB6C0D00807EF4 /* apple.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = apple.jpg; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + FC12E92B20EB6B2800807EF4 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + FC12E94D20EB6BBB00807EF4 /* libstdc++.tbd in Frameworks */, + FC12E94A20EB6B6800807EF4 /* libpaddle-mobile.a in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + FC12E92520EB6B2800807EF4 = { + isa = PBXGroup; + children = ( + FC12E93020EB6B2800807EF4 /* PaddleMobileDemo */, + FC12E92F20EB6B2800807EF4 /* Products */, + FC12E94B20EB6BBB00807EF4 /* Frameworks */, + ); + sourceTree = ""; + }; + FC12E92F20EB6B2800807EF4 /* Products */ = { + isa = PBXGroup; + children = ( + FC12E92E20EB6B2800807EF4 /* PaddleMobileDemo.app */, + ); + name = Products; + sourceTree = ""; + }; + FC12E93020EB6B2800807EF4 /* PaddleMobileDemo */ = { + isa = PBXGroup; + children = ( + FC12E95320EB6C0D00807EF4 /* apple.jpg */, + FC12E94E20EB6BED00807EF4 /* googlenet_combine */, + FC12E94720EB6B6800807EF4 /* PaddleMobile */, + FC12E93120EB6B2800807EF4 /* AppDelegate.h */, + FC12E93220EB6B2800807EF4 /* AppDelegate.m */, + FC12E93420EB6B2800807EF4 /* ViewController.h */, + FC12E93520EB6B2800807EF4 /* ViewController.m */, + FC12E93720EB6B2800807EF4 /* Main.storyboard */, + FC12E93A20EB6B2900807EF4 /* Assets.xcassets */, + FC12E93C20EB6B2900807EF4 /* LaunchScreen.storyboard */, + FC12E93F20EB6B2900807EF4 /* Info.plist */, + FC12E94020EB6B2900807EF4 /* main.m */, + ); + path = PaddleMobileDemo; + sourceTree = ""; + }; + FC12E94720EB6B6800807EF4 /* PaddleMobile */ = { + isa = PBXGroup; + children = ( + FC12E94820EB6B6800807EF4 /* libpaddle-mobile.a */, + FC12E94920EB6B6800807EF4 /* PaddleMobile.h */, + ); + path = PaddleMobile; + sourceTree = ""; + }; + FC12E94B20EB6BBB00807EF4 /* Frameworks */ = { + isa = PBXGroup; + children = ( + FC12E94C20EB6BBB00807EF4 /* libstdc++.tbd */, + ); + name = Frameworks; + sourceTree = ""; + }; + FC12E94E20EB6BED00807EF4 /* googlenet_combine */ = { + isa = PBXGroup; + children = ( + FC12E94F20EB6BED00807EF4 /* params */, + FC12E95020EB6BED00807EF4 /* model */, + ); + path = googlenet_combine; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + FC12E92D20EB6B2800807EF4 /* PaddleMobileDemo */ = { + isa = PBXNativeTarget; + buildConfigurationList = FC12E94420EB6B2900807EF4 /* Build configuration list for PBXNativeTarget "PaddleMobileDemo" */; + buildPhases = ( + FC12E92A20EB6B2800807EF4 /* Sources */, + FC12E92B20EB6B2800807EF4 /* Frameworks */, + FC12E92C20EB6B2800807EF4 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = PaddleMobileDemo; + productName = PaddleMobileDemo; + productReference = 
FC12E92E20EB6B2800807EF4 /* PaddleMobileDemo.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + FC12E92620EB6B2800807EF4 /* Project object */ = { + isa = PBXProject; + attributes = { + LastUpgradeCheck = 0940; + ORGANIZATIONNAME = orange; + TargetAttributes = { + FC12E92D20EB6B2800807EF4 = { + CreatedOnToolsVersion = 9.4.1; + }; + }; + }; + buildConfigurationList = FC12E92920EB6B2800807EF4 /* Build configuration list for PBXProject "PaddleMobileDemo" */; + compatibilityVersion = "Xcode 9.3"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = FC12E92520EB6B2800807EF4; + productRefGroup = FC12E92F20EB6B2800807EF4 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + FC12E92D20EB6B2800807EF4 /* PaddleMobileDemo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + FC12E92C20EB6B2800807EF4 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + FC12E93E20EB6B2900807EF4 /* LaunchScreen.storyboard in Resources */, + FC12E95220EB6BED00807EF4 /* model in Resources */, + FC12E93B20EB6B2900807EF4 /* Assets.xcassets in Resources */, + FC12E95120EB6BED00807EF4 /* params in Resources */, + FC12E95420EB6C0D00807EF4 /* apple.jpg in Resources */, + FC12E93920EB6B2800807EF4 /* Main.storyboard in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + FC12E92A20EB6B2800807EF4 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + FC12E93620EB6B2800807EF4 /* ViewController.m in Sources */, + FC12E94120EB6B2900807EF4 /* main.m in Sources */, + FC12E93320EB6B2800807EF4 /* AppDelegate.m in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin PBXVariantGroup section */ + FC12E93720EB6B2800807EF4 /* Main.storyboard */ = { + isa = PBXVariantGroup; + children = ( + FC12E93820EB6B2800807EF4 /* Base */, + ); + name = Main.storyboard; + sourceTree = ""; + }; + FC12E93C20EB6B2900807EF4 /* LaunchScreen.storyboard */ = { + isa = PBXVariantGroup; + children = ( + FC12E93D20EB6B2900807EF4 /* Base */, + ); + name = LaunchScreen.storyboard; + sourceTree = ""; + }; +/* End PBXVariantGroup section */ + +/* Begin XCBuildConfiguration section */ + FC12E94220EB6B2900807EF4 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + 
CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CODE_SIGN_IDENTITY = "iPhone Developer"; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 11.4; + MTL_ENABLE_DEBUG_INFO = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + }; + name = Debug; + }; + FC12E94320EB6B2900807EF4 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CODE_SIGN_IDENTITY = "iPhone Developer"; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 11.4; + MTL_ENABLE_DEBUG_INFO = NO; + SDKROOT = iphoneos; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + FC12E94520EB6B2900807EF4 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = Z5M2UUN5YV; + ENABLE_BITCODE = NO; + INFOPLIST_FILE = PaddleMobileDemo/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 10.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + LIBRARY_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)/PaddleMobileDemo/PaddleMobile", + ); + PRODUCT_BUNDLE_IDENTIFIER = orange.PaddleMobileDemo; + PRODUCT_NAME = "$(TARGET_NAME)"; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + FC12E94620EB6B2900807EF4 /* Release */ = { + isa = XCBuildConfiguration; + 
buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				DEVELOPMENT_TEAM = Z5M2UUN5YV;
+				ENABLE_BITCODE = NO;
+				INFOPLIST_FILE = PaddleMobileDemo/Info.plist;
+				IPHONEOS_DEPLOYMENT_TARGET = 10.0;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				LIBRARY_SEARCH_PATHS = (
+					"$(inherited)",
+					"$(PROJECT_DIR)/PaddleMobileDemo/PaddleMobile",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = orange.PaddleMobileDemo;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		FC12E92920EB6B2800807EF4 /* Build configuration list for PBXProject "PaddleMobileDemo" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				FC12E94220EB6B2900807EF4 /* Debug */,
+				FC12E94320EB6B2900807EF4 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		FC12E94420EB6B2900807EF4 /* Build configuration list for PBXNativeTarget "PaddleMobileDemo" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				FC12E94520EB6B2900807EF4 /* Debug */,
+				FC12E94620EB6B2900807EF4 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = FC12E92620EB6B2800807EF4 /* Project object */;
+}
diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
new file mode 100644
index 0000000000000000000000000000000000000000..e4db9529ba656814e6a2bd889426662d914277eb
--- /dev/null
+++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "self:PaddleMobileDemo.xcodeproj">
+   </FileRef>
+</Workspace>
diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist b/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
new file mode 100644
index 0000000000000000000000000000000000000000..18d981003d68d0546c4804ac2ff47dd97c6e7921
--- /dev/null
+++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>IDEDidComputeMac32BitWarning</key>
+	<true/>
+</dict>
+</plist>
diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate b/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
new file mode 100644
index 0000000000000000000000000000000000000000..395136a63bb50378df8c37256880d4bbf9fd2f83
Binary files /dev/null and b/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate differ
diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist b/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
new file mode 100644
index 0000000000000000000000000000000000000000..7caa9222e77f1e53c0ee45c298aacb330e870688
--- /dev/null
+++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>SchemeUserState</key>
+	<dict>
+		<key>PaddleMobileDemo.xcscheme</key>
+		<dict>
+			<key>orderHint</key>
+			<integer>0</integer>
+		</dict>
+	</dict>
+</dict>
+</plist>
diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.h b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d2e035ab3c44617694f9ebe437d1a7289be0390
--- /dev/null
+++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.h
@@ -0,0 +1,23 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#import <UIKit/UIKit.h>
+
+@interface AppDelegate : UIResponder <UIApplicationDelegate>
+
+@property (strong, nonatomic) UIWindow *window;
+
+
+@end
+
diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.m b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.m
new file mode 100644
index 0000000000000000000000000000000000000000..6644c3c079ae1748de28a634b78c344640cd335a
--- /dev/null
+++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.m
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#import "AppDelegate.h"
+
+@interface AppDelegate ()
+
+@end
+
+@implementation AppDelegate
+
+
+- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
+    // Override point for customization after application launch.
+    return YES;
+}
+
+
+- (void)applicationWillResignActive:(UIApplication *)application {
+    // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
+    // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
+}
+
+
+- (void)applicationDidEnterBackground:(UIApplication *)application {
+    // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
+    // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
+}
+
+
+- (void)applicationWillEnterForeground:(UIApplication *)application {
+    // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
+} + + +- (void)applicationDidBecomeActive:(UIApplication *)application { + // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface. +} + + +- (void)applicationWillTerminate:(UIApplication *)application { + // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:. +} + + +@end diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000000000000000000000000000000000000..d8db8d65fd79fd541b2b7eba75c7378af3448f9c --- /dev/null +++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,98 @@ +{ + "images" : [ + { + "idiom" : "iphone", + "size" : "20x20", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "20x20", + "scale" : "3x" + }, + { + "idiom" : "iphone", + "size" : "29x29", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "29x29", + "scale" : "3x" + }, + { + "idiom" : "iphone", + "size" : "40x40", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "40x40", + "scale" : "3x" + }, + { + "idiom" : "iphone", + "size" : "60x60", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "60x60", + "scale" : "3x" + }, + { + "idiom" : "ipad", + "size" : "20x20", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "20x20", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "29x29", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "29x29", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "40x40", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "40x40", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "76x76", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "76x76", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "83.5x83.5", + "scale" : "2x" + }, + { + "idiom" : "ios-marketing", + "size" : "1024x1024", + "scale" : "1x" + } + ], + "info" : { + "version" : 1, + "author" : "xcode" + } +} \ No newline at end of file diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/Contents.json b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/Contents.json new file mode 100644 index 0000000000000000000000000000000000000000..da4a164c918651cdd1e11dca5cc62c333f097601 --- /dev/null +++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "version" : 1, + "author" : "xcode" + } +} \ No newline at end of file diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/LaunchScreen.storyboard b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/LaunchScreen.storyboard new file mode 100644 index 0000000000000000000000000000000000000000..f83f6fd5810b9c852cf98563d82d5ed1e84ff893 --- /dev/null +++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/LaunchScreen.storyboard @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/Main.storyboard b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/Main.storyboard new file mode 100644 index 0000000000000000000000000000000000000000..d7c78a1255c016bde922c849eef8555881c207b6 --- /dev/null +++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/Main.storyboard @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + 
+
+
+
+
+
+
+
+
+
diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Info.plist b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Info.plist
new file mode 100644
index 0000000000000000000000000000000000000000..16be3b681122de83e380d47b840b7d0486f71f86
--- /dev/null
+++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/Info.plist
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>$(DEVELOPMENT_LANGUAGE)</string>
+	<key>CFBundleExecutable</key>
+	<string>$(EXECUTABLE_NAME)</string>
+	<key>CFBundleIdentifier</key>
+	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>$(PRODUCT_NAME)</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleVersion</key>
+	<string>1</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>UILaunchStoryboardName</key>
+	<string>LaunchScreen</string>
+	<key>UIMainStoryboardFile</key>
+	<string>Main</string>
+	<key>UIRequiredDeviceCapabilities</key>
+	<array>
+		<string>armv7</string>
+	</array>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+	<key>UISupportedInterfaceOrientations~ipad</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+</dict>
+</plist>
diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/PaddleMobile.h b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/PaddleMobile.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec58371de032c265b2c32a5bac61ca6cf682ff28
--- /dev/null
+++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/PaddleMobile.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#import <CoreImage/CoreImage.h>
+#import <Foundation/Foundation.h>
+
+@interface PaddleMobile : NSObject
+
+- (instancetype)init;
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale;
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim;
+- (void)clear;
+
+@end
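The `means`/`scale` parameters above mirror the preprocessing applied when the model was trained. A sketch of the per-channel normalization this implies, assuming the common (pixel - mean) * scale form (the library's exact order of operations is not specified here):

```c++
// Sketch: (pixel - mean) * scale on an NCHW float buffer, channel by channel.
#include <vector>

std::vector<float> normalize_nchw(const std::vector<float> &pixels, int c,
                                  int h, int w,
                                  const std::vector<float> &means,
                                  float scale) {
  std::vector<float> out(pixels.size());
  for (int ch = 0; ch < c; ++ch) {
    for (int i = 0; i < h * w; ++i) {
      const int idx = ch * h * w + i;
      out[idx] = (pixels[idx] - means[ch]) * scale;
    }
  }
  return out;
}
```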
diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.h b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.h
new file mode 100644
index 0000000000000000000000000000000000000000..41e22092711c4fd1def105470c5b6610cce2257a
--- /dev/null
+++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.h
@@ -0,0 +1,21 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#import <UIKit/UIKit.h>
+
+@interface ViewController : UIViewController
+
+
+@end
+
diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.m b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.m
new file mode 100644
index 0000000000000000000000000000000000000000..eb9f7d2bb5329ac4bbf8b790dc83c256f164ec64
--- /dev/null
+++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.m
@@ -0,0 +1,49 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#import "PaddleMobile.h"
+#import "ViewController.h"
+
+@interface ViewController ()
+
+@end
+
+@implementation ViewController
+
+- (void)viewDidLoad {
+    [super viewDidLoad];
+    PaddleMobile *pam = [[PaddleMobile alloc] init];
+    NSString *modelPath = [[NSBundle mainBundle] pathForResource:@"model" ofType:nil];
+    NSString *paramPath = [[NSBundle mainBundle] pathForResource:@"params" ofType:nil];
+    if (modelPath.length == 0 || paramPath.length == 0) {
+        NSLog(@" need model and param");
+        return;
+    }
+
+    if ([pam load:modelPath andWeightsPath:paramPath]) {
+        NSLog(@"load success");
+        UIImage *inputImage = [UIImage imageNamed:@"apple.jpg"];
+        if (!inputImage) {
+            NSLog(@" input image is nil");
+            return;
+        }
+
+        NSDate *beforeDate = [NSDate date];
+        NSArray *res = [pam predict:inputImage.CGImage dim:@[@1, @3, @224, @224] means:@[@148, @148, @148] scale:1.0];
+        NSLog(@"res: %@", res);
+        NSLog(@"elapsed time: %f", [[NSDate date] timeIntervalSinceDate:beforeDate]);
+    }
+}
+
+@end
diff --git a/demo/ios/PaddleMobileDemo/PaddleMobileDemo/main.m b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/main.m
new file mode 100644
index 0000000000000000000000000000000000000000..cf2cf6aa80b1d3d3c0480b54a668780ea3324c8b
--- /dev/null
+++ b/demo/ios/PaddleMobileDemo/PaddleMobileDemo/main.m
@@ -0,0 +1,22 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#import <UIKit/UIKit.h>
+#import "AppDelegate.h"
+
+int main(int argc, char * argv[]) {
+    @autoreleasepool {
+        return UIApplicationMain(argc, argv, nil, NSStringFromClass([AppDelegate class]));
+    }
+}
diff --git a/doc/build.md b/doc/build.md
new file mode 100644
index 0000000000000000000000000000000000000000..6a8521b593ccdeab464687e7eae79192d131d51b
--- /dev/null
+++ b/doc/build.md
@@ -0,0 +1,59 @@
+# Environment Setup
+## Using docker
+### 1. Install docker
+To install docker, see the official documentation: [https://docs.docker.com/install/](https://docs.docker.com/install/)
+### 2. Set up the build environment with docker
+First enter the paddle-mobile directory and run `docker build`.
+The example is for Linux/Mac (on Windows, run it inside the 'Docker Quickstart Terminal'):
+```
+$ docker build -t paddle-mobile:dev - < Dockerfile
+```
+`docker images` now shows the freshly built image:
+```
+$ docker images
+REPOSITORY TAG IMAGE ID CREATED SIZE
+paddle-mobile dev 33b146787711 45 hours ago 372MB
+```
+### 3. Build with docker
+Enter the paddle-mobile directory and run docker run:
+```
+$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
+root@5affd29d4fc5:/ # cd /paddle-mobile
+# generate the Makefile for the android build
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
+# generate the Makefile for the linux build
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
+```
+### 4. Set build options
+Build options can be changed with ccmake:
+```
+root@5affd29d4fc5:/ # ccmake .
+ Page 1 of 1
+ CMAKE_ASM_FLAGS
+ CMAKE_ASM_FLAGS_DEBUG
+ CMAKE_ASM_FLAGS_RELEASE
+ CMAKE_BUILD_TYPE
+ CMAKE_INSTALL_PREFIX           /usr/local
+ CMAKE_TOOLCHAIN_FILE           /paddle-mobile/tools/toolchains/arm-android-neon.cmake
+ CPU                            ON
+ DEBUGING                       ON
+ FPGA                           OFF
+ LOG_PROFILE                    ON
+ MALI_GPU                       OFF
+ NET                            googlenet
+ USE_EXCEPTION                  ON
+ USE_OPENMP                     OFF
+```
+After changing options, press `c`, then `g` to regenerate the Makefile.
+### 5. Build
+Build with make:
+```
+root@5affd29d4fc5:/ # make
+```
+### 6. Inspect the build artifacts
+The build artifacts can be inspected on the host machine under build and test/build in the paddle-mobile directory; use adb or scp to transfer them to a device for execution.
+
+## Without docker
+Without docker, you can generate the Makefile directly with cmake and build. Building the Android library with the NDK requires NDK_ROOT to be set correctly. Building for Linux requires installing arm-linux-gnueabi-gcc or a similar cross-compiler; you may need to set the CC and CXX environment variables, adapt arm-linux-gnueabi.cmake under tools/toolchains/, or add a toolchain file of your own.
\ No newline at end of file
diff --git a/doc/design_doc.md b/doc/design_doc.md
new file mode 100644
index 0000000000000000000000000000000000000000..bf5f78e8d805465418cad8989945f2afa7ab5587
--- /dev/null
+++ b/doc/design_doc.md
@@ -0,0 +1,182 @@
+# paddle-mobile design documentation
+
+
+#### Execution flow of the paddle-mobile code:
+
+![execution flow](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305189473720.png)
+
+
+#### The main modules: Loader, Program, Executor, op, kernel, and scope / variable / Tensor
+
+#### Each module's role and design is described below.
+
+### 1. Loader
+Look at the model first. Models come in two layouts:
+In one, the parameter files are kept separate, as in the figure below; the red box marks the protobuf file describing the model structure, the rest are parameter files.
+
+![model layout](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305190629577.png)
+
+
+In the other, the parameters are combined into one file; the red box marks the protobuf file describing the model structure, the other file holds all parameters combined.
+
+![model layout, combined](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191057130.png)
+
+
+The loader module loads the model-structure information - the protobuf file in the red box - into memory, and optimizes the model structure (e.g. fusing several fine-grained ops into a coarse-grained one, such as fusing conv, add, batchnorm and relu into conv\_add\_batchnorm\_relu), which makes algorithmic optimization convenient.
+
+__Why does fusing ops enable algorithmic optimization?__
+
+Unfused, the conv add batchnorm relu computation is:
+
+```
+[n]
+[conv_res] = conv([n])
+
+for &res in conv_res {
+	res = add_biase(res)
+}
+
+for &res in conv_res {
+	res = batchnorm(res)
+}
+
+for &res in conv_res {
+	res = relu(res)
+}
+
+```
+After fusion, conv\_add\_batchnorm\_relu computes:
+
+```
+[n]
+[conv_res] = conv([n])
+
+for &res in conv_res {
+	res = relu(batchnorm(add_biase(res)))
+}
+
+```
+Since a conv can be rewritten as the product of two large matrices, and further as a number of one-row-by-one-column small-matrix products, the final computation (a concrete C++ sketch follows) is:
+
+```
+[n]
+for &res in [res] {
+	res = relu(batchnorm(add_biase(A * B)))
+}
+
+where A and B are 1 * k and k * 1 matrices
+
+```
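+To make the fusion concrete: batchnorm folds into a per-channel a * x + b that
+is computed once, so bias, batchnorm and relu together cost a single pass over
+the conv output. A self-contained sketch (illustrative variable names, not
+paddle-mobile's actual kernel code):
+
+```c++
+#include <cmath>
+#include <vector>
+
+// One output channel of the fused conv_add_batchnorm_relu epilogue.
+void fused_epilogue(std::vector<float> &conv_res, float bias, float mean,
+                    float variance, float gamma, float beta, float epsilon) {
+  // Fold batchnorm into a * x + b once, outside the loop.
+  const float a = gamma / std::sqrt(variance + epsilon);
+  const float b = beta - a * mean;
+  for (float &res : conv_res) {
+    res = a * (res + bias) + b;   // add_biase + batchnorm in one step
+    res = res > 0.f ? res : 0.f;  // relu
+  }
+}
+```
+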
+### 2. Program
+
+program is the output of the loader module. It holds the model-structure object both before and after optimization, and corresponds closely to the structure of a paddle model. For the underlying paddle model concepts and their detailed design, see [program.md](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). A brief overview:
+
+* a programDesc contains one or more nestable blocks (the common models - googlenet, mobilenet, yolo, squeezenet, resnet - have exactly one); with multiple blocks, an op in the first block may drive a series of ops in later blocks (this concept only exists when there are multiple blocks)
+* a block contains ops and vars
+* ops describe each op: its type, inputs and outputs, and required attributes
+* vars describe all the parameters needed by the op computations
+
+### 3. Executor
+
+executor handles the top-level scheduling of op execution. It has two jobs: executor instantiation, and the predict method exposed to callers.
+
+* Instantiating an executor mainly performs these steps:
+  1. initialize the operator objects from the program the loader produced
+  2. allocate all required memory, including every op's inputs/outputs and weight parameters; the weight files are currently laid out NCHW, and the ops' intermediate input/output matrices are NCHW as well
+  3. call each op's init method; init is where an op implementer preprocesses parameters, which helps reduce predict latency
+
+* predict takes external input, calls each op's run method in order, and returns the final result.
+
+
+### 4. op
+For the detailed design of the op module, see the [operator code-design issue](https://github.com/PaddlePaddle/paddle-mobile/issues/300). An operator mainly holds a kernel for computation and a param storing its attributes, and exposes three operations: Init, RunImp and InferShape.
+
+* Init: preprocesses parameters; e.g. batchNorm parameters can be preprocessed so that the batchNorm computation becomes one of the form a * x + b. This function also calls the kernel's Init to initialize the kernel
+* RunImp: calls the compute method of the op's own kernel
+* InferShape: derives the output shape from the inputs and attributes; it is called at executor instantiation time, before memory initialization
+
+Every operator must be registered before it can be used. Taking conv as the example, the bottom of conv_op.cpp reads:
+
+```c++
+// the conv op is registered for all three platforms
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(conv2d);
+REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
+#endif
+
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(conv2d);
+REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
+#endif
+
+#ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(conv2d);
+REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
+#endif
+
+```
+
+__A package-size optimization__:
+
+Every operator is guarded by a compile-time macro, as in conv_op.h (besides conv_op.h, the same guard is needed in conv_op.cpp, conv_kernel.h and conv_kernel.cpp)
+
+```c++
+
+#ifdef CONV_OP  // this macro controls whether conv_op is compiled; conv_op.cpp, conv_kernel.h and conv_kernel.cpp need the same guard
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/conv_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+template <typename DeviceType, typename T>
+class ConvOp {
+  //impl
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
+
+```
+The point is to compile only the ops that a particular kind of network needs. The per-network macros are already configured in cmake; to build with support for yolo models, just run:
+
+```sh
+cd tools
+sh build.sh android yolo
+
+```
+This compiles only the four ops yolo uses, which greatly reduces package size and build time.
+
+### 5. kernel
+kernel is the low-level computation of an op. It has two functions, Init and Compute, used respectively for initialization/preprocessing and for the computation itself. Notably, kernels specialize by template to different platforms, as shown:
+
+![device specialization](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191401976.png)
+
+The kernel implementations on different platforms are template specializations of the same kernel class. There are three platforms at present: arm, mali and fpga. The central-arm-func\ directory in the figure holds the arm implementations of the op kernels and backs the kernels under arm\; since the arm processor is the central processor, central-arm-func\ can also serve as the fallback implementation for other co-processors - e.g. if some fpga op kernel has no fpga implementation yet, it can directly call the arm implementation here.
+
+__If you are interested in adding a co-processor, you can add a kernel directory here with your co-processor implementation; for kernels you have not finished, you can simply use the arm implementation.__
+
+### 6. scope, variable, Tensor
+* scope stores and manages all the variables needed (variables store objects of various types, chiefly Tensor matrices; that is, scope manages all the parameter matrices and input/output matrices used during op execution). Think of scope as a map; wrapping a scope concept around the map makes memory management easier
+* variable can store objects of different types; paddle-mobile mainly uses it to store Tensor matrices
+* tensor represents a matrix. Through templates it can store matrices of different element types, but the type written and the type read must match: on a mismatch, inline const T \*data() const fails the type check, and inline T \*mutable_data() reallocates memory. Some small Tensor concepts:
+  1. DDim: stores the matrix's dimension information.
+  2. Slice(): takes a slice along N (the N of NCHW)
+  3. When an instance has not yet allocated memory, calling inline T *mutable_data() allocates it.
+
+
+
+
+
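+A toy illustration of the Tensor access contract above (not paddle-mobile's
+real Tensor): data<T>() must match the stored type, while mutable_data<T>()
+reallocates when the requested type or element count changes.
+
+```c++
+#include <cassert>
+#include <cstdlib>
+#include <typeindex>
+
+class ToyTensor {
+ public:
+  template <typename T>
+  T *mutable_data(std::size_t count) {
+    if (type_ != std::type_index(typeid(T)) || count_ != count) {
+      std::free(buf_);                        // drop the old buffer
+      buf_ = std::malloc(sizeof(T) * count);  // reallocate for the new type
+      type_ = std::type_index(typeid(T));
+      count_ = count;
+    }
+    return static_cast<T *>(buf_);
+  }
+
+  template <typename T>
+  const T *data() const {
+    assert(type_ == std::type_index(typeid(T)) && "type mismatch");
+    return static_cast<const T *>(buf_);
+  }
+
+  ~ToyTensor() { std::free(buf_); }
+
+ private:
+  void *buf_ = nullptr;
+  std::size_t count_ = 0;
+  std::type_index type_ = std::type_index(typeid(void));
+};
+```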
+
+#### Using the Objective-C interface
+Drag the Xcode build outputs
+```
+libPaddleMobile.a
+PaddleMobile.h
+```
+into your project. The interface is as follows:
+
+```
+/*
+ create the singleton
+*/
++ (instancetype)sharedInstance;
+
+/*
+ load the model and allocate memory
+*/
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+
+/*
+ run prediction; means and scale are the preprocessing parameters used when the model was trained. If no such preprocessing was done at training time, call the plain predict instead
+*/
+- (NSArray *)predict:(CGImageRef)image means:(NSArray *)means scale:(float)scale;
+
+/*
+ run prediction
+*/
+- (NSArray *)predict:(CGImageRef)image;
+
+/*
+ free memory
+*/
+- (void)clear;
+
+```
+
+
+# Android Development Documentation
+The paddle-mobile library for Android can be cross-compiled in either of two ways:
+
+- building inside a Docker container
+- cross-compiling on Linux
+
+
+## Building inside a Docker container
+### 1. Install Docker
+To install Docker, follow the official documentation: [https://docs.docker.com/install/](https://docs.docker.com/install/)
+### 2. Set up the build environment with Docker
+First cd into the paddle-mobile directory and run `docker build`.
+Using Linux/Mac as an example (on Windows, run it inside the 'Docker Quickstart Terminal'):
+
+```
+$ docker build -t paddle-mobile:dev - < Dockerfile
+```
+`docker images` now shows the newly built image:
+
+```
+$ docker images
+REPOSITORY TAG IMAGE ID CREATED SIZE
+paddle-mobile dev 33b146787711 45 hours ago 372MB
+```
+### 3. Build inside Docker
+cd into the paddle-mobile directory and run docker run:
+
+```
+$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
+root@5affd29d4fc5:/ # cd /paddle-mobile
+# generate the Makefiles for the android build
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
+# generate the Makefiles for the linux build
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
+```
+### 4. Configure build options
+Build options can be set with ccmake:
+
+```
+root@5affd29d4fc5:/ # ccmake .
+ Page 1 of 1
+ CMAKE_ASM_FLAGS
+ CMAKE_ASM_FLAGS_DEBUG
+ CMAKE_ASM_FLAGS_RELEASE
+ CMAKE_BUILD_TYPE
+ CMAKE_INSTALL_PREFIX /usr/local
+ CMAKE_TOOLCHAIN_FILE /paddle-mobile/tools/toolchains/arm-android-neon.cmake
+ CPU ON
+ DEBUGING ON
+ FPGA OFF
+ LOG_PROFILE ON
+ MALI_GPU OFF
+ NET googlenet
+ USE_EXCEPTION ON
+ USE_OPENMP OFF
+```
+After changing options, press `c`, then `g` to regenerate the Makefile.
+### 5. Build
+Build with make:
+
+```
+root@5affd29d4fc5:/ # make
+```
+### 6. Inspect the build artifacts
+The build artifacts are visible on the host machine under the paddle-mobile directory, in build and in test/build; use adb or scp to transfer them to the device for execution.
+
+## Cross-compiling on Linux
+### Preparing the cross-compilation environment
+##### Download the Android NDK
+
+To cross-compile paddle-mobile from source, the cross-compilation environment must be prepared first. The C/C++ cross toolchain for Android is the [Android NDK](https://developer.android.com/ndk/); you can download it yourself, or fetch it with:
+
+```
+wget https://dl.google.com/android/repository/android-ndk-r17b-darwin-x86_64.zip
+unzip android-ndk-r17b-darwin-x86_64.zip
+
+```
+
+##### Set the environment variable
+The standalone toolchain shipped with the project locates the NDK through the NDK_ROOT environment variable, so it must be configured:
+
+```
+export NDK_ROOT="path to ndk"
+```
+### Build
+From the paddle-mobile root directory, run:
+
+```
+cd tools
+sh build.sh android
+
+```
+When it finishes, the generated .so is in the build directory and the unit-test executables are in test/build.
+##### Tips:
+For a smaller library, you can build one that supports only a specific model structure, e.g.:
+
+```
+sh build.sh android googlenet
+```
+This produces a smaller library that supports googlenet.
+
+## Testing
+After the build, an automated test script is provided that pushes the models and library files needed by the unit tests to the Android device. Run:
+
+```
+cd tools/android-debug-script
+sh run_on_android.sh (npm)  # optional argument npm: choose whether to push the model files to the phone
+```
+When this prompt appears:
+
+```
+**** choose OP or NET to test ****
+which to test :
+```
+enter a name to run the corresponding test.
+
+## Deployment
+An Android app calls the underlying C/C++ through JNI; paddle-mobile exposes the following JNI interface:
+
+##### 1. load: load the model and parameters
+
+```
+/*
+*@param modelPath path to the model file
+*@return jboolean
+*/
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
+ jclass thiz,
+ jstring modelPath);
+```
+
+##### 2. predict: run prediction
+
+```
+/**
+*@param buf input data
+*@return output data
+*/
+JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predict(
+ JNIEnv *env, jclass thiz, jfloatArray buf);
+```
+##### 3. clear: destroy the instance and clean up memory
+
+```
+JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env,
+ jclass thiz);
+```
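+
+These JNI entry points are thin wrappers around the C++ `PaddleMobile` class in src/io/paddle_mobile.h. A rough native-side equivalent of the load/predict/clear flow looks like this (a sketch only: the model directory and input shape are hypothetical):
+
+```c++
+#include <vector>
+#include "io/paddle_mobile.h"
+
+int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> pm;
+  // separate-format model directory (hypothetical path)
+  if (!pm.Load("models/mobilenet", /*optimize=*/true)) return -1;
+
+  std::vector<float> input(1 * 3 * 224 * 224, 0.0f);  // dummy NCHW image buffer
+  std::vector<int64_t> dims = {1, 3, 224, 224};
+  std::vector<float> output = pm.Predict(input, dims);
+
+  pm.Clear();
+  return 0;
+}
+```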
diff --git a/doc/images/devices.png b/doc/images/devices.png
new file mode 100644
index 0000000000000000000000000000000000000000..413d32c249972ee96f678d50a5cd0b36a2a03e29
Binary files /dev/null and b/doc/images/devices.png differ
diff --git a/doc/images/flow_chart.png b/doc/images/flow_chart.png
new file mode 100644
index 0000000000000000000000000000000000000000..c747230da43e2e688d7460704268631758d34596
Binary files /dev/null and b/doc/images/flow_chart.png differ
diff --git a/doc/images/model_desc.png b/doc/images/model_desc.png
new file mode 100644
index 0000000000000000000000000000000000000000..3c026b6192c8e1d84b3a82c3db91e022f35358c2
Binary files /dev/null and b/doc/images/model_desc.png differ
diff --git a/doc/images/model_desc_combined.png b/doc/images/model_desc_combined.png
new file mode 100644
index 0000000000000000000000000000000000000000..38e7388efcfdcad53f4e80ce0ac5d3b993eb986c
Binary files /dev/null and b/doc/images/model_desc_combined.png differ
diff --git a/src/common/common.h b/src/common/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..12157b5e946490d041f0cc0d235142a13a3a2527
--- /dev/null
+++ b/src/common/common.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <chrono>
+
+using Time = decltype(std::chrono::high_resolution_clock::now());
+
+inline Time time() { return std::chrono::high_resolution_clock::now(); }
+
+// elapsed time between t1 and t2, in milliseconds
+inline double time_diff(Time t1, Time t2) {
+  typedef std::chrono::microseconds ms;
+  auto diff = t2 - t1;
+  ms counter = std::chrono::duration_cast<ms>(diff);
+  return counter.count() / 1000.0;
+}
diff --git a/src/common/depCore.h b/src/common/dep_core.h
similarity index 100%
rename from src/common/depCore.h
rename to src/common/dep_core.h
diff --git a/src/common/log.h b/src/common/log.h
index 07afdb39d04f2bf3ba083f79e812fb951a6194be..d964d9c1b39a7e72e3d757ef2be0737fd1d25f94 100644
--- a/src/common/log.h
+++ b/src/common/log.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include
 #ifdef PADDLE_MOBILE_DEBUG
+#include
 #include
 #include
 #include
@@ -115,26 +116,29 @@ struct ToLog {
   Print printer_;
 };
 
-#define LOG(level) \
-  if (level > paddle_mobile::log_level) { \
-  } else \
-    paddle_mobile::ToLog( \
-        level, \
-        (std::stringstream() \
-         << "[file: " \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
-         << "] [line: " << __LINE__ << "] ") \
-            .str())
-
-#define DLOG \
-  if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \
-  } else \
-    paddle_mobile::ToLog( \
-        paddle_mobile::kLOG_DEBUG, \
-        (std::stringstream() \
-         << "[file: " \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
-         << "] [line: " << __LINE__ << "] ") \
+#define LOG(level) \
+  if (level > paddle_mobile::log_level) { \
+  } else \
+    paddle_mobile::ToLog( \
+        level, static_cast( \
+                   std::stringstream() \
+                   << "[file: " \
+                   << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \
+                                              : __FILE__) \
+                   << "] [line: " << __LINE__ << "] ") \
+                   .str())
+
+#define DLOG \
+  if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \
+  } else \
+    paddle_mobile::ToLog( \
+        paddle_mobile::kLOG_DEBUG, \
+        static_cast( \
+            std::stringstream() \
+            << "[file: " \
+            << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \
+                                       : __FILE__) \
+            << "] [line: " << __LINE__ << "] ") \
             .str())
 
 #define LOGF(level, format, ...) \
@@ -170,7 +174,10 @@ struct ToLog;
 struct Print {
   friend struct ToLog;
   template <typename T>
-  Print &operator<<(T const &value) {}
+  Print &operator<<(T const &value) {
+    Print p = Print();
+    return p;
+  }
 
  private:
 };
diff --git a/src/common/type_define.h b/src/common/type_define.h
index c26cdd91e0694d44cca9443503d3e263ee21f201..389f9a715f8cec3f0b494ae3b43b3952e49677f8 100644
--- a/src/common/type_define.h
+++ b/src/common/type_define.h
@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ -#pragma once; +#pragma once +#include #include #include #include diff --git a/src/common/types.cpp b/src/common/types.cpp index a6f32762d3c8a492c3347ebfe65cb50f39425976..cea42171f0205e0d40b2703d5c90f0b9fc253e68 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -23,8 +23,9 @@ const std::string G_OP_TYPE_BOX_CODER = "box_coder"; const std::string G_OP_TYPE_CONCAT = "concat"; const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu"; -const std::string G_OP_TYPE_FC = "fc"; -const std::string G_OP_TYPE_CONV_ADD = "conv_add"; +const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu"; +const std::string G_OP_TYPE_FC = "fusion_fc"; +const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add"; const std::string G_OP_TYPE_LRN = "lrn"; const std::string G_OP_TYPE_MUL = "mul"; const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms"; @@ -39,12 +40,14 @@ const std::string G_OP_TYPE_SPLIT = "split"; const std::string G_OP_TYPE_FEED = "feed"; const std::string G_OP_TYPE_FETCH = "fetch"; const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d"; +const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence"; +const std::string G_OP_TYPE_DROPOUT = "dropout"; std::unordered_map< std::string, std::pair, std::vector>> op_input_output_key = { {G_OP_TYPE_CONV, {{"Input"}, {"Output"}}}, - {G_OP_TYPE_CONV_ADD, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}}, {G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}}, {G_OP_TYPE_MUL, {{"X"}, {"Out"}}}, @@ -59,11 +62,14 @@ std::unordered_map< {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}}, {G_OP_TYPE_BOX_CODER, {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}}, + {G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}}, {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, - {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}}; + {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}}; } // namespace paddle_mobile diff --git a/src/common/types.h b/src/common/types.h index 30a0663eeef899e3b8ff35bcb062824417362efc..ec428b9911f64d7ccc8c6f5dc4be7f970e855d3c 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once; +#pragma once #include #include +#include namespace paddle_mobile { enum class Precision : int { FP32 = 0 }; @@ -78,7 +79,9 @@ extern const std::string G_OP_TYPE_CONCAT; extern const std::string G_OP_TYPE_ELEMENTWISE_ADD; extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU; extern const std::string G_OP_TYPE_FC; -extern const std::string G_OP_TYPE_CONV_ADD; +extern const std::string G_OP_TYPE_FUSION_CONV_ADD; +extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; + extern const std::string G_OP_TYPE_LRN; extern const std::string G_OP_TYPE_MUL; extern const std::string G_OP_TYPE_MULTICLASS_NMS; @@ -93,6 +96,8 @@ extern const std::string G_OP_TYPE_SPLIT; extern const std::string G_OP_TYPE_FEED; extern const std::string G_OP_TYPE_FETCH; extern const std::string G_OP_TYPE_DEPTHWISE_CONV; +extern const std::string G_OP_TYPE_IM2SEQUENCE; +extern const std::string G_OP_TYPE_DROPOUT; extern std::unordered_map< std::string, std::pair, std::vector>> diff --git a/src/common/variant.h b/src/common/variant.h index 7fbf0ec0772f102165770dc9c8e053f469965f10..b87a5e67a76f4c616f2c450ef4527bcf6c16286b 100644 --- a/src/common/variant.h +++ b/src/common/variant.h @@ -83,6 +83,7 @@ struct Variant { return *const_cast(reinterpret_cast(&data)); } else { PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant "); + exit(0); } } diff --git a/src/framework/attribute.h b/src/framework/attribute.h index 3b6608cf03e7f786ad8c087dc869516cb6220edb..f0519a35b3ed2a02e35f1ef0d6a718efb7b76095 100644 --- a/src/framework/attribute.h +++ b/src/framework/attribute.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include @@ -128,6 +129,7 @@ class Attribute { return vistor(attr.variant_.Get()); } else { PADDLE_MOBILE_THROW_EXCEPTION("type not support"); + exit(0); } } diff --git a/src/framework/data_layout.h b/src/framework/data_layout.h index 3b31445707a887a2715afd0b9e7192ad76724351..f1249008f088dce48ed040e47900121c2eb41af1 100644 --- a/src/framework/data_layout.h +++ b/src/framework/data_layout.h @@ -40,6 +40,7 @@ inline DataLayout StringToDataLayout(const std::string &str) { return DataLayout::kAnyLayout; } else { PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str()) + exit(0); } } @@ -52,6 +53,8 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) { case DataLayout::kAnyLayout: return "ANY_LAYOUT"; default: + PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ") + exit(0); break; } } diff --git a/src/framework/ddim.h b/src/framework/ddim.h index c1d917dff612de3a42168c47d0bacd3ac7bdd3ad..833bc2783f855fd9d6df50d21345539fbe2ca6c4 100644 --- a/src/framework/ddim.h +++ b/src/framework/ddim.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include "common/enforce.h" #include "common/variant.h" @@ -57,7 +58,8 @@ struct DDim { } else if (d.var.TypeId() == typeid(Dim<9>).hash_code()) { return vistor(d.var.Get>()); } else { - DLOG << " dim not support"; + PADDLE_MOBILE_ENFORCE(false, " dim not support"); + exit(0); } } diff --git a/src/framework/dim.h b/src/framework/dim.h index 38e62df99519c3e869dc0fd2ae71beed28370122..dd7610de65d4a4c93402cf49b0fdbdc7995610c0 100644 --- a/src/framework/dim.h +++ b/src/framework/dim.h @@ -129,6 +129,7 @@ int64_t &indexer(Dim &dim, int idx) { template <> int64_t &indexer<0>(Dim<0> &dim, int idx) { PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") + exit(0); } template @@ -145,6 +146,7 @@ int64_t indexer(const Dim &dim, int idx) { template <> int64_t indexer<0>(const Dim<0> &dim, int idx) { PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") + exit(0); } } // namespace diff --git a/src/framework/lod_tensor.cpp b/src/framework/lod_tensor.cpp index 0a57d29a0c05c009299d43b3b9f5a59b2c3dc341..e165e55507ed04a9b63e4ad5eb002f206c71d96c 100644 --- a/src/framework/lod_tensor.cpp +++ b/src/framework/lod_tensor.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "lod_tensor.h" +#include namespace paddle_mobile { namespace framework { diff --git a/src/framework/op_registry.h b/src/framework/op_registry.h index 8a7beae993be1a9f2a52fb48d4930754aba784e1..a76cc7e4910d1639a10fcc839e9073a837fd89f5 100644 --- a/src/framework/op_registry.h +++ b/src/framework/op_registry.h @@ -45,6 +45,7 @@ struct OperatorRegistrar : public Registrar { << "OperatorRegistrar should be invoked at least by OpClass"; return; } + printf(" regis ting %s \n", op_type.c_str()); OpInfo info; OperatorRegistrarRecursive(op_type, &info); OpInfoMap::Instance()->Insert(op_type, info); @@ -107,6 +108,7 @@ class OpRegistry { __op_registrar_##op_type##_##device_name(#op_type); \ int TouchOpRegistrar_##op_type##_##device_name() { \ __op_registrar_##op_type##_##device_name.Touch(); \ + printf(" registering !! \n"); \ return 0; \ } diff --git a/src/framework/operator.h b/src/framework/operator.h index c68744a676030413e81570ded0db5671cdf4ba7a..793551b0cd3eea290243c156c27616a34c37a3d2 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -63,7 +63,7 @@ class OperatorBase { std::vector GetOutKeys() const; virtual void RunImpl() const = 0; - virtual void Init() const = 0; + virtual void Init() = 0; /* * @b op 运算所需的输入, 如上一层的输出结果、卷积核 * */ @@ -117,8 +117,8 @@ class OperatorWithKernel : public OperatorBase { virtual void InferShape() const = 0; - void Init() const { - PADDLE_MOBILE_ENFORCE(kernel_.Init(param_), " %s kernel init failed", + void Init() { + PADDLE_MOBILE_ENFORCE(kernel_.Init(¶m_), " %s kernel init failed", this->type_.c_str()); } @@ -146,7 +146,7 @@ class OpKernelBase { } #endif virtual void Compute(const P ¶) const = 0; - virtual bool Init(const P ¶) const { return true; }; + virtual bool Init(P *para) { return true; }; virtual ~OpKernelBase() = default; private: diff --git a/src/framework/program/block_desc.cpp b/src/framework/program/block_desc.cpp index 4b45ab305bf0f353f017674773b5fc51203bfef8..4e3eb79d07d0c8c363a6c3a9556cf718ebdc08f2 100644 --- a/src/framework/program/block_desc.cpp +++ b/src/framework/program/block_desc.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "block_desc.h" +#include namespace paddle_mobile { namespace framework { diff --git a/src/framework/program/program-optimize/fusion_op_register.h b/src/framework/program/program-optimize/fusion_op_register.h index 4cc83f8c80ab86ee6dcc1e3c395f872419da2be7..1cd6b1dd779f9bc9ff0f5be5513c4fa716d80b10 100644 --- a/src/framework/program/program-optimize/fusion_op_register.h +++ b/src/framework/program/program-optimize/fusion_op_register.h @@ -34,6 +34,10 @@ class FusionOpRegister { } void regist(FusionOpMatcher* matcher) { + if (matchers_.find(matcher->Type()) != matchers_.end()) { + return; + } + std::shared_ptr shared_matcher(matcher); matchers_[matcher->Type()] = shared_matcher; } diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp index 4ea45ec0a859ef8aa3ab4e34de8279e732706803..e635e07eaf4484c3e390101c3b43fdaf24bbd2c6 100644 --- a/src/framework/program/program-optimize/node.cpp +++ b/src/framework/program/program-optimize/node.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "framework/program/program-optimize/node.h" +#include #include "framework/operator.h" namespace paddle_mobile { @@ -92,7 +93,8 @@ int Node::Depth(int begin) { Node &Node::Folder( int size, std::string type, - std::map> change, + std::map>> + change, std::vector> *removed_nodes) { std::shared_ptr op_desc = std::make_shared(); @@ -109,12 +111,15 @@ Node &Node::Folder( void Node::Folder( std::shared_ptr op_desc, std::vector> *outputs, int index, - std::map> *change, + std::map>> + *change, Node *begin_node, std::vector> *removed_nodes) { if (change->find(this->type_) != change->end()) { - auto change_pair = (*change)[this->type_]; - op_desc->GetInputs()[change_pair.second] = - this->op_desc_->GetInputs()[change_pair.first]; + auto change_pairs = (*change)[this->type_]; + for (const auto &change_pair : change_pairs) { + op_desc->GetInputs()[change_pair.second] = + this->op_desc_->GetInputs()[change_pair.first]; + } } for (auto &attr_pair : this->op_desc_->attrs_) { diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h index 7236ffdd1782dfb39af73195da9b3756030c9117..88bf1e16ed2a5fb3a038eadd546d63ffb3916f68 100644 --- a/src/framework/program/program-optimize/node.h +++ b/src/framework/program/program-optimize/node.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include +#include #include #include "common/log.h" #include "framework/program/op_desc.h" @@ -43,7 +44,8 @@ class Node { int Depth(int begin = 0); Node &Folder( int size, std::string type, - std::map> change_map, + std::map>> + change, std::vector> *removed_nodes); std::vector> OpDescs(int size); std::shared_ptr OpDescOfNode() { return op_desc_; } @@ -56,7 +58,8 @@ class Node { void Folder( std::shared_ptr op_desc, std::vector> *outputs, int index, - std::map> *change, + std::map>> + *change, Node *begin_node, std::vector> *removed_nodes); std::shared_ptr op_desc_; #ifdef PADDLE_MOBILE_DEBUG diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp index e9b5cc5187bef7c9963e23b05187c09e2c789dc2..3619bc79f576651245aa322992df9d318c810cd4 100644 --- a/src/framework/program/program-optimize/program_optimize.cpp +++ b/src/framework/program/program-optimize/program_optimize.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "framework/program/program-optimize/program_optimize.h" +#include #include "framework/program/program-optimize/fusion_op_register.h" namespace paddle_mobile { diff --git a/src/framework/program/program.h b/src/framework/program/program.h index bb82fa7334a7d1941734dcd846c8e66befdbdd10..5760efc826667d805695118b12e41efa0305553b 100644 --- a/src/framework/program/program.h +++ b/src/framework/program/program.h @@ -29,7 +29,7 @@ class Program { std::shared_ptr scope; std::string model_path; std::string para_path; - bool is_commbine = false; + bool combined = false; private: }; diff --git a/src/framework/scope.cpp b/src/framework/scope.cpp index 2f7ff247b846f0a5f3e59c5c2f317a59598fc643..a1f5789aa52d2a70f54cef5c622c3a15907a4683 100644 --- a/src/framework/scope.cpp +++ b/src/framework/scope.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #include "framework/scope.h" +#include #include #include #include diff --git a/src/framework/scope.h b/src/framework/scope.h index d714f61af3bd443c09fcef7aacee2416b90b5e02..054f141ff68895e0879fd31e15d90c76ea038135 100644 --- a/src/framework/scope.h +++ b/src/framework/scope.h @@ -23,7 +23,17 @@ namespace framework { class Scope { public: Scope() = default; - ~Scope() = default; + + ~Scope() { + for (auto &var : vars_) { + delete var.second; + } + vars_.clear(); + for (auto kid : kids_) { + delete kid; + } + kids_.clear(); + } Scope &NewScope() const; diff --git a/src/framework/tensor.h b/src/framework/tensor.h index a221a26aa1435000646cf7d58321df28f3322834..9bbd81aa30f6fa0188dacd0dce01813e17b9e339 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include #include "common/enforce.h" +#include #include "common/enforce.h" #include "framework/data_layout.h" #include "framework/ddim.h" @@ -131,6 +132,22 @@ class Tensor { return reinterpret_cast(mutable_data(typeid(T))); } +#ifdef PADDLE_MOBILE_DEBUG + template + inline void dump(std::string filename) const { + const T *dataptr = data(); + std::ofstream out(filename.c_str()); + for (int i = 0; i < numel(); ++i) { + out << dataptr[i] << " "; + } + out << "形状:"; + for (int j = 0; j < dims_.size(); ++j) { + out << dims_[j] << " "; + } + out.close(); + } +#endif + inline void *mutable_data(std::type_index type) { if (holder_ != nullptr) { holder_->set_type(type); diff --git a/src/io/io.cpp b/src/io/executor.cpp similarity index 71% rename from src/io/io.cpp rename to src/io/executor.cpp index c7f8f174ac825463f66fd50b72e0c58d50bd537e..82c3eae5d92fac19b2ed94fb587497236afd917d 100644 --- a/src/io/io.cpp +++ b/src/io/executor.cpp @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "io/io.h" +#include "io/executor.h" +#include #include #include "common/enforce.h" #include "common/log.h" @@ -25,7 +26,6 @@ limitations under the License. */ #include "framework/scope.h" #include "framework/tensor.h" #ifdef PADDLE_EXECUTOR_MULTITHREAD -#include #include #include #include "common/threadpool.h" @@ -39,7 +39,7 @@ char *Get_binary_data(std::string filename) { PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", filename.c_str()); fseek(file, 0, SEEK_END); - long size = ftell(file); + int64_t size = ftell(file); PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); rewind(file); char *data = new char[size]; @@ -50,116 +50,6 @@ char *Get_binary_data(std::string filename) { return data; } -static size_t ReadBuffer(const char *file_name, uint8_t **out) { - printf("%s \n", file_name); - FILE *fp; - fp = fopen(file_name, "rb"); - PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name); - - fseek(fp, 0, SEEK_END); - size_t size = ftell(fp); - rewind(fp); - - DLOG << "model size: " << size; - - *out = reinterpret_cast(malloc(size)); - - size_t cur_len = 0; - size_t nread; - while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) { - cur_len += nread; - } - fclose(fp); - return cur_len; -} - -template -const framework::Program Loader::Load( - const std::string &dirname, bool optimize, bool can_add_split) { - auto program = - this->LoadProgram(dirname + "/__model__", optimize, can_add_split); - program.model_path = dirname; - return program; -} - -template -const framework::Program Loader::Load( - const std::string &model_path, const std::string ¶_path, - bool optimize) { - auto program = this->LoadProgram(model_path, optimize); - program.para_path = para_path; - program.is_commbine = true; - return program; -} - -template -const framework::Program Loader::LoadProgram( - const std::string &model_path, bool optimize, bool can_add_split) { - std::string model_filename = model_path; - PaddleMobile__Framework__Proto__ProgramDesc *c_program; - uint8_t *buf = NULL; - size_t read_size = ReadBuffer(model_filename.c_str(), &buf); - - PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null"); - - c_program = paddle_mobile__framework__proto__program_desc__unpack( - NULL, read_size, buf); - // - PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null"); - // - DLOG << "n_ops: " << (*c_program->blocks)->n_ops; - // - auto 
originProgramDesc = std::make_shared(c_program); - - framework::Program program; - program.originProgram = originProgramDesc; - - auto scope = std::make_shared(); - program.scope = scope; - - for (const auto &block : originProgramDesc->Blocks()) { - for (auto var_desc : block->Vars()) { - auto var = scope->Var(var_desc->Name()); - - if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { - if (var_desc->Persistable() && - var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH && - var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) { - auto dim = var_desc->Tensor_desc().Dims(); - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); - } else { - auto dim = var_desc->Tensor_desc().Dims(); - PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); - dim[0] = 1; - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); - } - } else { - // TODO(codeWorm): some. - } - } - } - - if (optimize) { - framework::ProgramOptimize program_optimize; - program.optimizeProgram = - program_optimize.FusionOptimize(originProgramDesc, can_add_split); - } - if (optimize) { - program.optimizeProgram->Description("optimize: "); - } else { - originProgramDesc->Description("program: "); - } - - paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); - return program; -} - -template class Loader; -template class Loader; -template class Loader; - #pragma mark - executor template Executor::Executor(const framework::Program p, int batch_size, @@ -193,7 +83,7 @@ Executor::Executor(const framework::Program p, int batch_size, #endif } } - if (program_.is_commbine) { + if (program_.combined) { InitCombineMemory(); } else { InitMemory(); @@ -209,30 +99,30 @@ Executor::Executor(const framework::Program p, int batch_size, template void Executor::LoadMemory(const framework::VarDesc var_desc, - framework::LoDTensor *tensor, char *&data) { + framework::LoDTensor *tensor, char **data) { // 1. version - uint32_t version = *(uint32_t *)data; - data += sizeof(uint32_t); + uint32_t version = *reinterpret_cast(*data); + + (*data) += sizeof(uint32_t); // 2 Lod information uint64_t *lod_level_ptr = new uint64_t(); - memcpy(lod_level_ptr, data, sizeof(uint64_t)); + memcpy(lod_level_ptr, (*data), sizeof(uint64_t)); uint64_t lod_level = *lod_level_ptr; delete lod_level_ptr; - data += sizeof(uint64_t); + (*data) += sizeof(uint64_t); auto &lod = *tensor->mutable_lod(); lod.resize(lod_level); for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size = *(uint64_t *)data; - data += sizeof(uint64_t); + uint64_t size = *reinterpret_cast(*data); + (*data) += sizeof(uint64_t); DLOG << "lod size: " << i << size; std::vector tmp(size / sizeof(size_t)); for (int k = 0; k < tmp.size(); ++k) { - tmp[k] = *(size_t *)data; - DLOG << "tmp[k]: " << k << *(size_t *)data; - data += sizeof(size_t); + tmp[k] = *reinterpret_cast(*data); + (*data) += sizeof(size_t); } for (auto j : tmp) { @@ -242,18 +132,18 @@ void Executor::LoadMemory(const framework::VarDesc var_desc, } // 3. tensor version - uint32_t tensor_version = *(uint32_t *)data; - data += sizeof(uint32_t); + uint32_t tensor_version = *reinterpret_cast(*data); + (*data) += sizeof(uint32_t); // 4. 
tensor desc - int32_t size = *(int32_t *)data; - data += sizeof(int32_t); + int32_t size = *reinterpret_cast(*data); + (*data) += sizeof(int32_t); std::unique_ptr buf(new char[size]); for (int m = 0; m < size; ++m) { - buf.get()[m] = data[m]; + buf.get()[m] = (*data)[m]; } - data += (sizeof(char) * size); + (*data) += (sizeof(char) * size); const framework::TensorDesc &desc = var_desc.Tensor_desc(); int memory_size = 1; @@ -290,9 +180,9 @@ void Executor::LoadMemory(const framework::VarDesc var_desc, } for (int n = 0; n < memory_size * type_size; ++n) { - static_cast(memory)[n] = data[n]; + static_cast(memory)[n] = (*data)[n]; } - data += (sizeof(char) * memory_size * type_size); + (*data) += (sizeof(char) * memory_size * type_size); } template @@ -309,7 +199,7 @@ void Executor::InitMemory() { char *origin_data = Get_binary_data(program_.model_path + "/" + var_desc->Name()); char *data = origin_data; - LoadMemory(*var_desc, tensor, data); + LoadMemory(*var_desc, tensor, &data); delete origin_data; } else { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { @@ -335,7 +225,7 @@ void Executor::InitCombineMemory() { if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { continue; } - LoadMemory(*var_desc, tensor, data); + LoadMemory(*var_desc, tensor, &data); } else { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { auto tensor = var->template GetMutable(); @@ -442,7 +332,8 @@ std::shared_ptr Executor::Predict( *(program_.scope)); #ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_EXECUTOR_MULTITHREAD - // TODO expose profile info as an interface, user can get them to analysis + // TODO(haipeng): expose profile info as an interface, user can get them to + // analysis // the performance of their deepnet. FILE *df = fopen("net.dot", "w"); fprintf(df, "digraph {\n"); @@ -457,16 +348,19 @@ std::shared_ptr Executor::Predict( fprintf(df, "}\n"); fclose(df); #endif - FILE *pf = fopen("profile.out", "w"); + + // FILE *pf = fopen("profile.out", "w"); std::unordered_map _tp; for (int i = 0; i < profile.size(); i++) { const auto &pInfo = profile[i]; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; _tp[ops[i]->Type()] += timeCost; - fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, ops[i]->Type().c_str(), - pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost); + // fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, + // ops[i]->Type().c_str(), + // pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost); } - fclose(pf); + // fclose(pf); + printf("====================[ profile ]======================\n"); using prof_t = std::pair; std::vector _tv(_tp.begin(), _tp.end()); @@ -480,8 +374,9 @@ std::shared_ptr Executor::Predict( std::sort(_tv.begin(), _tv.end(), compf); _tv.push_back(std::make_pair("total", _ptotal)); for (auto const &p : _tv) { - printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), (float)p.second, - (float)p.second / _ptotal * 100.0); + printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), + static_cast(p.second), + static_cast(p.second) / _ptotal * 100.0); } printf("====================[---------]======================\n"); #endif diff --git a/src/io/io.h b/src/io/executor.h similarity index 70% rename from src/io/io.h rename to src/io/executor.h index ff520dd628406eae47f76196dbe66a0992dfe735..f8f2a8ad5657fdb3cf6cb249e32537bd5e866913 100644 --- a/src/io/io.h +++ b/src/io/executor.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include #include + #include "common/types.h" #include "framework/lod_tensor.h" #include "framework/operator.h" @@ -27,36 +28,11 @@ limitations under the License. */ #include #include #include -#include "common/depCore.h" +#include "common/dep_core.h" #endif namespace paddle_mobile { -template -class Loader { - public: - /* - * @b load separate format fluid model - * @b 加载分开形式的 fluid 模型 - * */ - const framework::Program Load(const std::string &dirname, - bool optimize = false, - bool can_add_split = false); - - /* - * @b load combine format fluid mode - * @b 加载结合在一起格式的模型 - * */ - const framework::Program Load(const std::string &model_path, - const std::string ¶_path, - bool optimize = false); - - private: - const framework::Program LoadProgram(const std::string &model_path, - bool optimize = false, - bool can_add_split = false); -}; - template class Executor { public: @@ -86,7 +62,7 @@ class Executor { Executor() = default; void InitMemory(); void LoadMemory(const framework::VarDesc var_desc, - framework::LoDTensor *tensor, char *&data); + framework::LoDTensor *tensor, char **data); void InitCombineMemory(); framework::Program program_; int batch_size_ = 1; diff --git a/src/io/loader.cpp b/src/io/loader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f3b19abbe17c3d5e8b5cc082a115c05058aa0219 --- /dev/null +++ b/src/io/loader.cpp @@ -0,0 +1,133 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "io/loader.h" + +#include "framework/lod_tensor.h" +#include "framework/program/program-optimize/program_optimize.h" + +namespace paddle_mobile { +using framework::Variable; + +static size_t ReadBuffer(const char *file_name, uint8_t **out) { + printf("%s \n", file_name); + FILE *fp; + fp = fopen(file_name, "rb"); + PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name); + + fseek(fp, 0, SEEK_END); + size_t size = ftell(fp); + rewind(fp); + + DLOG << "model size: " << size; + + *out = reinterpret_cast(malloc(size)); + + size_t cur_len = 0; + size_t nread; + while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) { + cur_len += nread; + } + fclose(fp); + return cur_len; +} + +template +const framework::Program Loader::Load( + const std::string &dirname, bool optimize, bool can_add_split) { + auto program = + this->LoadProgram(dirname + "/__model__", optimize, can_add_split); + program.model_path = dirname; + return program; +} + +template +const framework::Program Loader::Load( + const std::string &model_path, const std::string ¶_path, + bool optimize) { + auto program = this->LoadProgram(model_path, optimize); + program.para_path = para_path; + program.combined = true; + return program; +} + +template +const framework::Program Loader::LoadProgram( + const std::string &model_path, bool optimize, bool can_add_split) { + std::string model_filename = model_path; + PaddleMobile__Framework__Proto__ProgramDesc *c_program; + uint8_t *buf = NULL; + size_t read_size = ReadBuffer(model_filename.c_str(), &buf); + + PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null"); + + c_program = paddle_mobile__framework__proto__program_desc__unpack( + NULL, read_size, buf); + // + PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null"); + // + DLOG << "n_ops: " << (*c_program->blocks)->n_ops; + // + auto originProgramDesc = std::make_shared(c_program); + + framework::Program program; + program.originProgram = originProgramDesc; + + auto scope = std::make_shared(); + program.scope = scope; + + for (const auto &block : originProgramDesc->Blocks()) { + for (auto var_desc : block->Vars()) { + auto var = scope->Var(var_desc->Name()); + + if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + if (var_desc->Persistable() && + var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH && + var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) { + auto dim = var_desc->Tensor_desc().Dims(); + auto tensor = var->GetMutable(); + tensor->Resize(framework::make_ddim(dim)); + } else { + auto dim = var_desc->Tensor_desc().Dims(); + PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); + dim[0] = 1; + auto tensor = var->GetMutable(); + tensor->Resize(framework::make_ddim(dim)); + } + } else { + // TODO(codeWorm): some. 
+ } + } + } + + if (optimize) { + framework::ProgramOptimize program_optimize; + program.optimizeProgram = + program_optimize.FusionOptimize(originProgramDesc, can_add_split); + } + if (optimize) { + program.optimizeProgram->Description("optimize: "); + } else { + originProgramDesc->Description("program: "); + } + + paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); + return program; +} + +template class Loader; +template class Loader; +template class Loader; + +} // namespace paddle_mobile diff --git a/src/io/loader.h b/src/io/loader.h new file mode 100644 index 0000000000000000000000000000000000000000..5e3c53dc9db858f506a13d2105339038340344a6 --- /dev/null +++ b/src/io/loader.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "common/types.h" +#include "framework/program/program.h" + +namespace paddle_mobile { + +template +class Loader { + public: + /* + * @b load separate format fluid model + * @b 加载分开形式的 fluid 模型 + * */ + const framework::Program Load(const std::string &dirname, + bool optimize = false, + bool can_add_split = false); + + /* + * @b load combine format fluid mode + * @b 加载结合在一起格式的模型 + * */ + const framework::Program Load(const std::string &model_path, + const std::string ¶_path, + bool optimize = false); + + private: + const framework::Program LoadProgram(const std::string &model_path, + bool optimize = false, + bool can_add_split = false); +}; + +} // namespace paddle_mobile diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3d5735f8da66db6f4b5f139f8261a4cd9cf0f796 --- /dev/null +++ b/src/io/paddle_mobile.cpp @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "io/paddle_mobile.h" + +namespace paddle_mobile { + +template +bool PaddleMobile::Load(const std::string &dirname, bool optimize, + int batch_size) { + if (loader_.get() == nullptr) { + loader_ = std::make_shared>(); + } else { + LOG(kLOG_INFO) << "loader inited"; + } + + if (executor_.get() == nullptr) { + executor_ = std::make_shared>( + loader_->Load(dirname, optimize), batch_size, optimize); + } else { + LOG(kLOG_INFO) << "executor inited"; + } + + return true; +} + +template +bool PaddleMobile::Load(const std::string &model_path, + const std::string ¶_path, bool optimize, + int batch_size) { + if (loader_.get() == nullptr) { + loader_ = std::make_shared>(); + } else { + LOG(kLOG_INFO) << "loader inited"; + } + + if (executor_.get() == nullptr) { + executor_ = std::make_shared>( + loader_->Load(model_path, para_path, optimize), batch_size, optimize); + } else { + LOG(kLOG_INFO) << "executor inited"; + } + + return true; +} + +template +std::shared_ptr PaddleMobile::Predict( + const framework::Tensor &t) { + return executor_->Predict(t); +} + +template +std::vector::Ptype> +PaddleMobile::Predict(const std::vector &input, + const std::vector &dims) { + return executor_->Predict(input, dims); +} + +template +void PaddleMobile::Clear() { + executor_ = nullptr; + loader_ = nullptr; +} + +template +PaddleMobile::~PaddleMobile() { + executor_ = nullptr; + loader_ = nullptr; +} + +template class PaddleMobile; +template class PaddleMobile; +template class PaddleMobile; + +} // namespace paddle_mobile diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h new file mode 100644 index 0000000000000000000000000000000000000000..3ce39e0ae1ffc7e193f6f4308a911875fdf95076 --- /dev/null +++ b/src/io/paddle_mobile.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "common/types.h" +#include "framework/tensor.h" +#include "io/executor.h" +#include "io/loader.h" + +namespace paddle_mobile { + +template +class PaddleMobile { + typedef typename PrecisionTrait
<P>
::ptype Ptype; + + public: + PaddleMobile() {} + /* + * @b load separate format fluid model + * @b 加载分开形式的 fluid 模型 + * */ + bool Load(const std::string &dirname, bool optimize = false, + int batch_size = 1); + + /* + * @b load combine format fluid mode + * @b 加载结合在一起格式的模型 + * */ + bool Load(const std::string &model_path, const std::string ¶_path, + bool optimize = false, int batch_size = 1); + + /* + * @b to predict + * */ + std::shared_ptr Predict(const framework::Tensor &t); + + /* + * @b to predict with vector and dim + * + * @b 使用 输入 和 输入的维度信息 进行预测 + * */ + std::vector Predict(const std::vector &input, + const std::vector &dims); + + void Clear(); + + ~PaddleMobile(); + + private: + std::shared_ptr> loader_; + std::shared_ptr> executor_; +}; + +} // namespace paddle_mobile diff --git a/src/ios_io/PaddleMobile.h b/src/ios_io/PaddleMobile.h new file mode 100644 index 0000000000000000000000000000000000000000..863c0d004440dc6098eb7dc1ed490fde20f237c9 --- /dev/null +++ b/src/ios_io/PaddleMobile.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#import +#import + +@interface PaddleMobile : NSObject + +- (instancetype)init; +- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath; +- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale; +- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim; +- (void)clear; + +@end diff --git a/src/ios_io/PaddleMobile.mm b/src/ios_io/PaddleMobile.mm new file mode 100644 index 0000000000000000000000000000000000000000..f5ec2afb2a996ec4932d99ea93362e06ddf28a14 --- /dev/null +++ b/src/ios_io/PaddleMobile.mm @@ -0,0 +1,170 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/
+
+#import "PaddleMobile.h"
+#import "op_symbols.h"
+#import "io/paddle_mobile.h"
+
+#import
+#import
+
+@interface PaddleMobile()
+{
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32> *pam_;
+  BOOL loaded_;
+}
+@end
+
+@implementation PaddleMobile
+
+static std::mutex shared_mutex;
+
+- (instancetype)init {
+  if (self = [super init]) {
+    pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32>();
+  }
+  return self;
+}
+
+- (void)dealloc {
+  if (pam_) {
+    delete pam_;
+  }
+}
+
++ (instancetype)sharedInstance{
+  static dispatch_once_t onceToken;
+  static id sharedManager = nil;
+  dispatch_once(&onceToken, ^{
+    sharedManager = [[[self class] alloc] init];
+  });
+  return sharedManager;
+}
+
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{
+  std::string model_path_str = std::string([modelPath UTF8String]);
+  std::string weights_path_str = std::string([weighsPath UTF8String]);
+  if (loaded_ = pam_->Load(model_path_str, weights_path_str, false)) {
+    return YES;
+  } else {
+    return NO;
+  }
+}
+
+-(void)preprocess:(const UInt8 *)input output:(float *)output imageWidth:(int)imageWidth imageHeight:(int)imageHeight imageChannels:(int)imageChannels means:(NSArray *)means scale:(float)scale dim:(std::vector<int64_t>)dim{
+  if (means == nil) {
+    means = @[@0, @0, @0];
+  }
+
+  int wanted_input_width = dim[3];
+  int wanted_input_height = dim[2];
+  int wanted_input_channels = dim[1];
+
+  for (int c = 0; c < wanted_input_channels; ++c) {
+    float *out_channel = output + c * wanted_input_height * wanted_input_width;
+    for (int y = 0; y < wanted_input_height; ++y) {
+      float *out_row = out_channel + y * wanted_input_width;
+      for (int x = 0; x < wanted_input_width; ++x) {
+        int in_row = (y * imageHeight) / wanted_input_height;
+        int in_col = (x * imageWidth) / wanted_input_width;
+        const UInt8 *in_pixel = input + (in_row * imageWidth * imageChannels) + (in_col * imageChannels);
+        float *out_pos = out_row + x;
+        // subtract the per-channel mean, then apply the scale
+        if (c == 0) {
+          *out_pos = (in_pixel[c] - means[c].floatValue) * scale;
+        }else if (c == 1){
+          *out_pos = (in_pixel[c] - means[c].floatValue) * scale;
+        }else if (c == 2){
+          *out_pos = (in_pixel[c] - means[c].floatValue) * scale;
+        }
+      }
+    }
+  }
+}
+
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale{
+  std::lock_guard<std::mutex> lock(shared_mutex);
+  if (!loaded_) {
+    printf("PaddleMobile has not been loaded yet");
+    return nil;
+  }
+
+  if (dim.count != 4) {
+    printf("dim must have 4 elements");
+    return nil;
+  }
+
+  // dim to c++ vector, get numel
+  std::vector<int64_t> dim_vec;
+  int numel = 1;
+  for (int k = 0; k < dim.count; ++k) {
+    int d = dim[k].intValue;
+    numel *= d;
+    dim_vec.push_back(d);
+  }
+
+  const int sourceRowBytes = CGImageGetBytesPerRow(image);
+  const int image_width = CGImageGetWidth(image);
+  const int image_height = CGImageGetHeight(image);
+  const int image_channels = 4;
+  CGDataProviderRef provider = CGImageGetDataProvider(image);
+  CFDataRef cfData = CGDataProviderCopyData(provider);
+  const UInt8 *input = CFDataGetBytePtr(cfData);
+
+  // sample image
+  float *output = (float *)malloc(numel*sizeof(float));
+  [self preprocess:input output:output imageWidth:image_width imageHeight:image_height imageChannels:image_channels means:means scale:scale dim:dim_vec];
+  float *dataPointer = nullptr;
+  if (nullptr != output) {
+    dataPointer = output;
+  } else {
+    return nil;
+  }
+
+  // input
+  std::vector<float> predict_input;
+  for (int j = 0; j < numel; ++j) {
+    predict_input.push_back(dataPointer[j]);
+  }
+
+  // predict
+  std::vector<float> cpp_result = pam_->Predict(predict_input, dim_vec);
+
+  // result
+  long count = 0;
+  count = cpp_result.size();
+  NSMutableArray *result = [[NSMutableArray alloc] init];
+  for (int i = 0; i < count; i++) {
+    [result addObject:[NSNumber numberWithFloat:cpp_result[i]]];
+  }
+
+  free(output);
+
+  // to be verified
+  // if ([UIDevice currentDevice].systemVersion.doubleValue < 11.0) {
+  CFRelease(cfData);
+  cfData = NULL;
+  // }
+
+  return result;
+}
+
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim {
+  return [self predict:image dim:dim means:nil scale:1];
+}
+
+- (void)clear{
+  pam_->Clear();
+}
+
+@end
diff --git a/src/ios_io/op_symbols.h b/src/ios_io/op_symbols.h
new file mode 100644
index 0000000000000000000000000000000000000000..b2825b90e67c4e20030509358f468c9c0190f727
--- /dev/null
+++ b/src/ios_io/op_symbols.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include "operators/batchnorm_op.h"
+#include "operators/box_coder_op.h"
+#include "operators/concat_op.h"
+#include "operators/conv_op.h"
+#include "operators/depthwise_conv_op.h"
+#include "operators/dropout_op.h"
+#include "operators/elementwise_add_op.h"
+#include "operators/feed_op.h"
+#include "operators/fetch_op.h"
+#include "operators/fusion_conv_add.h"
+#include "operators/fusion_conv_add_bn_relu_op.h"
+#include "operators/fusion_fc_op.h"
+#include "operators/im2sequence_op.h"
+#include "operators/lrn_op.h"
+#include "operators/mul_op.h"
+#include "operators/multiclass_nms_op.h"
+#include "operators/pool_op.h"
+#include "operators/prior_box_op.h"
+#include "operators/relu_op.h"
+#include "operators/reshape_op.h"
+#include "operators/sigmoid_op.h"
+#include "operators/softmax_op.h"
+#include "operators/transpose_op.h"
diff --git a/src/jni/paddle_mobile_jni.cpp b/src/jni/paddle_mobile_jni.cpp
index f663b78fd490f2c9f0af525c7dabd2cc513c3a53..01d4e52a4b1308a7ff97bc672d1a15d329dbf318 100644
--- a/src/jni/paddle_mobile_jni.cpp
+++ b/src/jni/paddle_mobile_jni.cpp
@@ -15,6 +15,10 @@ limitations under the License.
*/ #ifdef ANDROID #include "paddle_mobile_jni.h" +#include "common/log.h" +#include "framework/tensor.h" +#include "io/paddle_mobile.h" + #ifdef __cplusplus extern "C" { #endif @@ -28,17 +32,16 @@ using std::string; extern const char *ANDROID_LOG_TAG = "paddle_mobile LOG built on " __DATE__ " " __TIME__; -static Executor *shared_executor_instance = nullptr; +static PaddleMobile *shared_paddle_mobile_instance = nullptr; // toDo mutex lock // static std::mutex shared_mutex; -Executor *getExecutorInstance(const Program p, int batch_size, - bool use_optimize) { - if (nullptr == shared_executor_instance) { - shared_executor_instance = new Executor(p, batch_size, use_optimize); +PaddleMobile *getPaddleMobileInstance() { + if (nullptr == shared_paddle_mobile_instance) { + shared_paddle_mobile_instance = new PaddleMobile(); } - return shared_executor_instance; + return shared_paddle_mobile_instance; } string jstring2cppstring(JNIEnv *env, jstring jstr) { @@ -51,15 +54,14 @@ string jstring2cppstring(JNIEnv *env, jstring jstr) { JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, jclass thiz, jstring modelPath) { - paddle_mobile::Loader loader; + ANDROIDLOGI("load invoked"); bool optimize = true; - auto program = loader.Load(jstring2cppstring(env, modelPath), optimize); - shared_executor_instance = getExecutorInstance(program, 1, optimize); - return shared_executor_instance != nullptr ? JNI_TRUE : JNI_FALSE; + return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + optimize); } -JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage( - JNIEnv *env, jclass thiz, jfloatArray buf) { +JNIEXPORT jfloatArray JNICALL +Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf) { jfloatArray result = NULL; int count = 0; float *dataPointer = nullptr; @@ -73,15 +75,18 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage( for (int i = 0; i < framework::product(ddim); i++) { input_ptr[i] = dataPointer[i]; } - auto output = shared_executor_instance->Predict(input); + auto output = shared_paddle_mobile_instance->Predict(input); count = output->numel(); result = env->NewFloatArray(count); env->SetFloatArrayRegion(result, 0, count, output->data()); + ANDROIDLOGI("predict finished"); return result; } JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env, - jclass thiz) {} + jclass thiz) { + getPaddleMobileInstance()->Clear(); +} } // namespace jni } // namespace paddle_mobile diff --git a/src/jni/paddle_mobile_jni.h b/src/jni/paddle_mobile_jni.h index a262d4070c37013977e869fa816d52d78fbfa485..86caa9a273ab11124f6ea67efe27dc3529cea69f 100644 --- a/src/jni/paddle_mobile_jni.h +++ b/src/jni/paddle_mobile_jni.h @@ -15,9 +15,6 @@ limitations under the License. 
*/ #pragma once #ifdef ANDROID #include -#include "common/log.h" -#include "framework/tensor.h" -#include "io/io.h" #ifdef __cplusplus extern "C" { @@ -34,8 +31,8 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, /** * object detection for anroid */ -JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage( - JNIEnv *env, jclass thiz, jfloatArray buf); +JNIEXPORT jfloatArray JNICALL +Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf); /** * clear data of the net when destroy for android diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp index 62e929024d7232ba4bee6b9e95ee895c2badb95e..0252f3c07c06487720586b0f650e2179d247234f 100644 --- a/src/memory/t_malloc.cpp +++ b/src/memory/t_malloc.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - #include "memory/t_malloc.h" #include #include diff --git a/src/operators/batchnorm_op.cpp b/src/operators/batchnorm_op.cpp index 5d94d54f88e33b168739b1bbdf9af0bea9fe1b4f..d2fbd9fb6b0192b3728678ae92de7bf8e44e3620 100644 --- a/src/operators/batchnorm_op.cpp +++ b/src/operators/batchnorm_op.cpp @@ -32,7 +32,6 @@ template class BatchNormOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(batch_norm); REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/batchnorm_op.h b/src/operators/batchnorm_op.h index 9ee0b2dcf6b6ec46fcb08cac88d3df275d33f7d6..2b2795b64fddfbcd1000088dbab18e54a017b459 100644 --- a/src/operators/batchnorm_op.h +++ b/src/operators/batchnorm_op.h @@ -45,4 +45,13 @@ class BatchNormOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(batch_norm); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(batch_norm); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/box_coder_op.cpp b/src/operators/box_coder_op.cpp index 31891ed74266d599898dd7426eed5cd28f320ab6..dece07d5efcfae9629842aead04d0274b9d82c93 100644 --- a/src/operators/box_coder_op.cpp +++ b/src/operators/box_coder_op.cpp @@ -53,7 +53,6 @@ template class BoxCoderOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(box_coder); REGISTER_OPERATOR_CPU(box_coder, ops::BoxCoderOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/box_coder_op.h b/src/operators/box_coder_op.h index 33ff2358bc8285a026c217ed11c2250769395567..5a75cacaf27f20e69b5e427421bd3dd8f43e8556 100644 --- a/src/operators/box_coder_op.h +++ b/src/operators/box_coder_op.h @@ -51,4 +51,12 @@ class BoxCoderOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(box_coder); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/concat_op.cpp b/src/operators/concat_op.cpp index fe0507dc812a3ddafcc0433c2659c3b49ea87f6e..9c524df351549fd0141294be805d77b3f1057362 100644 --- a/src/operators/concat_op.cpp +++ b/src/operators/concat_op.cpp @@ -63,11 +63,9 @@ template class ConcatOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(concat); REGISTER_OPERATOR_CPU(concat, ops::ConcatOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(concat); REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp); #endif #ifdef PADDLE_MOBILE_FPGA diff --git a/src/operators/concat_op.h 
b/src/operators/concat_op.h index 93612c6b1b6d1f6aa992773ef5cccc0c93f1b6e8..7aedaab4b1fa00707661ada428c7c1dc27f124cd 100644 --- a/src/operators/concat_op.h +++ b/src/operators/concat_op.h @@ -46,4 +46,13 @@ class ConcatOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(concat); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(concat); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp index 01d284a06ed33142a8d16cdc32f304c3d1a75e28..1b00ed06eee2b1676667b9c54b8601c8872b6699 100644 --- a/src/operators/conv_op.cpp +++ b/src/operators/conv_op.cpp @@ -55,15 +55,12 @@ template class ConvOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(conv2d); REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(conv2d); REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp); #endif #ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(conv2d); REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp); #endif diff --git a/src/operators/conv_op.h b/src/operators/conv_op.h index f8e8952d47fd726c712c0f7817606d959095b65b..d36fa47f4a0b37c467eb2101e2e930fe54a0e28b 100644 --- a/src/operators/conv_op.h +++ b/src/operators/conv_op.h @@ -46,4 +46,14 @@ class ConvOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(conv2d); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(conv2d); +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(conv2d); +#endif + #endif diff --git a/src/operators/depthwise_conv_op.cpp b/src/operators/depthwise_conv_op.cpp index 46f2db30ba2fbff5839d6a737dda12fa6cd10b43..bee90781cd2de9d65bbbee3193cc922e743706de 100644 --- a/src/operators/depthwise_conv_op.cpp +++ b/src/operators/depthwise_conv_op.cpp @@ -56,7 +56,6 @@ template class DepthwiseConvOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(depthwise_conv2d); REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/depthwise_conv_op.h b/src/operators/depthwise_conv_op.h index 75bcf44cb8790365e7f33719c481354c1a57c80a..9d7cbcfa2f2924db040cdc5f38ca6bb7ad8074b5 100644 --- a/src/operators/depthwise_conv_op.h +++ b/src/operators/depthwise_conv_op.h @@ -48,4 +48,12 @@ class DepthwiseConvOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(depthwise_conv2d); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/dropout_op.cpp b/src/operators/dropout_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f7f5ca2475171f5756ee8cf4f13754d07df8fe01 --- /dev/null +++ b/src/operators/dropout_op.cpp @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef DROPOUT_OP +#include "operators/dropout_op.h" +namespace paddle_mobile { +namespace operators { + +template +void DropoutOp::InferShape() const { + auto input_dims = this->param_.InputX()->dims(); + this->param_.Out()->Resize(input_dims); +} +template class DropoutOp; +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/dropout_op.h b/src/operators/dropout_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d9d5e173a8427d24097a627946dce4e14253fbe7 --- /dev/null +++ b/src/operators/dropout_op.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef DROPOUT_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/dropout_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using paddle_mobile::framework::Tensor; + +template +class DropoutOp + : public framework::OperatorWithKernel< + DeviceType, DropoutParam, operators::DropoutKernel> { + public: + DropoutOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const framework::AttributeMap attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} + + // using framework::OperatorWithKernel>; + void InferShape() const override; + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(dropout); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/elementwise_add_op.cpp b/src/operators/elementwise_add_op.cpp index 12c59da6452992e3dd73b985db685a651df02250..369589574139c7bc68debb7c55836926a3d5f6b2 100644 --- a/src/operators/elementwise_add_op.cpp +++ b/src/operators/elementwise_add_op.cpp @@ -30,11 +30,9 @@ template class ElementwiseAddOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(elementwise_add); REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(elementwise_add); REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); #endif #ifdef PADDLE_MOBILE_FPGA diff --git a/src/operators/elementwise_add_op.h b/src/operators/elementwise_add_op.h index 6cb80d06d0a4d66935c77a3c23a6264d0be53ecc..761a5d35459558d1ca5673757fae13147b7f6a6f 100644 --- a/src/operators/elementwise_add_op.h +++ b/src/operators/elementwise_add_op.h @@ -48,4 +48,13 @@ class ElementwiseAddOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(elementwise_add); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(elementwise_add); +#endif +#ifdef PADDLE_MOBILE_FPGA 
+#endif + #endif diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp index a40eac098c7bef442befa1758b21904269cc22d5..c4357d7993cd91a306fec5856eaa6839e9ab6a6e 100644 --- a/src/operators/feed_op.cpp +++ b/src/operators/feed_op.cpp @@ -19,3 +19,14 @@ namespace operators { template class FeedOp; } } // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(feed, ops::FeedOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp); +#endif +#ifdef PADDLE_MOBILE_FPGA + +#endif diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index bd5fd8cb32d484b7f76652139603f6b0f1b4b5d7..723747874da8fc8ee2c02eb1be4c89189c2af746 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -32,7 +32,7 @@ class FeedOp : public framework::OperatorBase { param_(inputs, outputs, attrs, *scope) {} void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } - void Init() const {} + void Init() {} void InferShape() const { auto out_dims = param_.Out()->dims(); @@ -44,17 +44,14 @@ class FeedOp : public framework::OperatorBase { FeedParam param_; }; -namespace ops = paddle_mobile::operators; +} // namespace operators +} // namespace paddle_mobile + #ifdef PADDLE_MOBILE_CPU USE_OP_CPU(feed); -REGISTER_OPERATOR_CPU(feed, ops::FeedOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU USE_OP_MALI_GPU(feed); -REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/src/operators/fetch_op.cpp b/src/operators/fetch_op.cpp index 45d6afc07b597156a746b7cd6657c3b58f1b9950..cdbe413c955b931a16e716aa2e18d2a018a53bab 100644 --- a/src/operators/fetch_op.cpp +++ b/src/operators/fetch_op.cpp @@ -19,3 +19,13 @@ namespace operators { template class FetchOp; } } // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fetch, ops::FetchOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif diff --git a/src/operators/fetch_op.h b/src/operators/fetch_op.h index 4b3680b58357d8295b1b6acf111d3573d4e4d1bd..5614fef8fe1a5b2e234b29e6d7b52cc4c2719008 100644 --- a/src/operators/fetch_op.h +++ b/src/operators/fetch_op.h @@ -33,7 +33,7 @@ class FetchOp : public framework::OperatorBase { param_(inputs, outputs, attrs, *scope) {} void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } - void Init() const {} + void Init() {} void InferShape() const { auto x_dims = param_.InputX()->dims(); @@ -44,17 +44,14 @@ class FetchOp : public framework::OperatorBase { FetchParam param_; }; -namespace ops = paddle_mobile::operators; +} // namespace operators +} // namespace paddle_mobile + #ifdef PADDLE_MOBILE_CPU USE_OP_CPU(fetch); -REGISTER_OPERATOR_CPU(fetch, ops::FetchOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU USE_OP_MALI_GPU(fetch); -REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/src/operators/fusion_conv_add.cpp b/src/operators/fusion_conv_add.cpp index 4c01603509b0a1d9da2c2dc31a38719d5117e05c..be70370f9de0963bbe6625513257be890e36dacb 100644 --- a/src/operators/fusion_conv_add.cpp +++ b/src/operators/fusion_conv_add.cpp @@ -44,17 +44,38 @@ void FusionConvAddOp::InferShape() const { framework::DDim ddim = framework::make_ddim(output_shape); 
this->param_.Output()->Resize(ddim); } + +#ifdef PADDLE_MOBILE_CPU + +#ifndef CONV_ADD_REGISTER +framework::FusionOpRegistrar convadd_registrar(new FusionConvAddMatcher()); +#define CONV_ADD_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU + +#ifndef CONV_ADD_REGISTER +static framework::FusionOpRegistrar convadd_registrar( + new FusionConvAddMatcher()); +#define CONV_ADD_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_FPGA +#endif + template class FusionConvAddOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(conv_add); -REGISTER_OPERATOR_CPU(conv_add, ops::FusionConvAddOp); +REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(conv_add); REGISTER_OPERATOR_MALI_GPU(conv_add, ops::FusionConvAddOp); #endif #ifdef PADDLE_MOBILE_FPGA diff --git a/src/operators/fusion_conv_add.h b/src/operators/fusion_conv_add.h index 24f1d3f63b3300db9b60a595466a0ced3b9e996b..02c9d910b955fd9398df9406a5f730c4a7abbfee 100644 --- a/src/operators/fusion_conv_add.h +++ b/src/operators/fusion_conv_add.h @@ -11,9 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define FUSION_CONVADD_OP -#ifdef FUSION_CONVADD_OP +#ifdef FUSION_CONVADD_OP #pragma once #include @@ -40,10 +39,10 @@ class FusionConvAddMatcher : public framework::FusionOpMatcher { vector> origin_descs = node->OpDescs(node_.Depth()); node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes); + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes); } - std::string Type() { return G_OP_TYPE_CONV_ADD; } + std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD; } }; template @@ -68,11 +67,13 @@ class FusionConvAddOp : public framework::OperatorWithKernel< }; #ifdef PADDLE_MOBILE_CPU + #ifndef CONV_ADD_REGISTER -static framework::FusionOpRegistrar convadd_registrar( +extern framework::FusionOpRegistrar convadd_registrar( new FusionConvAddMatcher()); #define CONV_ADD_REGISTER #endif + #endif #ifdef PADDLE_MOBILE_MALI_GPU @@ -91,4 +92,13 @@ static framework::FusionOpRegistrar convadd_registrar( } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(conv_add); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/fusion_conv_add_bn_relu_op.cpp b/src/operators/fusion_conv_add_bn_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..62839c1a5acaf89a3efef39bbe4a67c675da393b --- /dev/null +++ b/src/operators/fusion_conv_add_bn_relu_op.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
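The fusion_conv_add hunks above (and the fusion_fc ones later in this diff) fix a related duplication bug: the header used to define `static framework::FusionOpRegistrar convadd_registrar(...)`, so every translation unit including it constructed its own registrar and the matcher was registered once per includer. The definition now lives in the .cpp, with the header downgraded to a declaration — although, as committed, the header line still carries its `new FusionConvAddMatcher()` initializer, which makes the `extern` a definition again. The clean split looks like this (FusionOpRegistrar here is a stand-in for the real framework class):

#include <cstdio>

// Stand-in: registers a fusion pattern from its constructor, before main().
struct FusionOpRegistrar {
  explicit FusionOpRegistrar(const char *pattern) {
    std::printf("registering fusion pattern: %s\n", pattern);
  }
};

// Before (header): static FusionOpRegistrar convadd_registrar{"conv_add"};
//   -> one registrar per translation unit that includes the header.
// After:
extern FusionOpRegistrar convadd_registrar;       // header: declaration only
FusionOpRegistrar convadd_registrar{"conv_add"};  // .cpp: single definition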
*/ + +#ifdef FUSION_CONVADDBNRELU_OP + +#include "operators/fusion_conv_add_bn_relu_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template +void FusionConvAddBNReluOp::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector &strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} +template class FusionConvAddBNReluOp; +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/fusion_conv_add_bn_relu_op.h b/src/operators/fusion_conv_add_bn_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..753ce395980c9186c948bc1ae4c89c3d2c417fdc --- /dev/null +++ b/src/operators/fusion_conv_add_bn_relu_op.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
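FusionConvAddBNReluOp::InferShape above delegates the spatial arithmetic to math::ConvOutputSize. The formula it has to implement, consistent with the strides, paddings, and dilations gathered above, is the standard one; restated as a self-contained sketch (the real helper lives in operators/math/conv_func.h):

// Output extent of one spatial dimension of a convolution.
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
                          int padding, int stride) {
  const int dkernel = dilation * (filter_size - 1) + 1;  // dilated kernel extent
  return (input_size + 2 * padding - dkernel) / stride + 1;
}
// Example: 224x224 input, 3x3 filter, stride 2, padding 1, dilation 1
//   -> (224 + 2 - 3) / 2 + 1 = 112.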
*/ + +#ifdef FUSION_CONVADDBNRELU_OP + +#pragma once + +#include +#include +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "op_param.h" +#include "operators/kernel/conv_add_bn_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionConvAddBNReluMatcher : public framework::FusionOpMatcher { + public: + FusionConvAddBNReluMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_BATCHNORM) > + std::make_shared(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + vector> origin_descs = + node->OpDescs(node_.Depth()); + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, + {G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; } +}; + +template +class FusionConvAddBNReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionConvAddBNReluParam, + operators::ConvAddBNReluKernel> { + public: + FusionConvAddBNReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionConvAddBNReluParam, + operators::ConvAddBNReluKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionConvAddBNReluParam, + operators::ConvAddBNReluKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +//#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER +// static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( +// new FusionConvAddBNReluMatcher()); +//#define FUSION_CONV_ADD_BN_RELU_REGISTER +//#endif + +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU + +#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( + new FusionConvAddBNReluMatcher()); +#define FUSION_CONV_ADD_BN_RELU_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_FPGA +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_bn_relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp index 694e46af1f8dec3513c5a6d2ff26e3676e9204e4..5575b52ce9866901a13c630a7509c7e5ec5401cb 100644 --- a/src/operators/fusion_conv_add_relu_op.cpp +++ b/src/operators/fusion_conv_add_relu_op.cpp @@ -49,7 +49,6 @@ void FusionConvAddReluOp::InferShape() const { namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_add_relu); REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h index fd27005c8bef8f8cb91fbf5b6e5a852306c28a9b..cf68fac8cf6dad4eb8469a543656311e5cedc9e7 100644 --- a/src/operators/fusion_conv_add_relu_op.h +++ b/src/operators/fusion_conv_add_relu_op.h @@ -36,7 +36,7 @@ class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher { framework::Node *node, std::vector> *removed_nodes) { node->Folder(node_.Depth(), Type(), - 
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes); + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes); } std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; } }; @@ -65,11 +65,11 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< #ifdef PADDLE_MOBILE_CPU -#ifndef CONV_ADD_RELU_REGISTER -#define CONV_ADD_RELU_REGISTER +//#ifndef CONV_ADD_RELU_REGISTER +//#define CONV_ADD_RELU_REGISTER // static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new // FusionConvAddReluOpMatcher()); -#endif +//#endif #endif #ifdef PADDLE_MOBILE_MALI_GPU @@ -80,4 +80,12 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/fusion_fc_op.cpp b/src/operators/fusion_fc_op.cpp index fae561348899dadc4c25f84ec3a0993d9ae693f9..1b2a46defc520519e0fb61779cf45059f0a54913 100644 --- a/src/operators/fusion_fc_op.cpp +++ b/src/operators/fusion_fc_op.cpp @@ -49,17 +49,37 @@ void FusionFcOp::InferShape() const { framework::DDim ddim = framework::make_ddim(output_dims); this->param_.Out()->Resize(ddim); } + +#ifdef PADDLE_MOBILE_CPU + +#ifndef CONV_CPU_REGISTER +#define CONV_CPU_REGISTER +framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); +#endif + +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU + +#ifndef CONV_CPU_REGISTER +#define CONV_CPU_REGISTER +framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); +#endif + +#endif + +#ifdef PADDLE_MOBILE_FPGA +#endif + template class FusionFcOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fc); -REGISTER_OPERATOR_CPU(fc, ops::FusionFcOp); +REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(fc); REGISTER_OPERATOR_MALI_GPU(fc, ops::FusionFcOp); #endif #ifdef PADDLE_MOBILE_FPGA diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h index 0ca4d2b27ad46b77ddba55b6b377e741c97bdc9e..c07d59e31e8124325d48a5b9ff208e1e425146a8 100644 --- a/src/operators/fusion_fc_op.h +++ b/src/operators/fusion_fc_op.h @@ -38,7 +38,7 @@ class FusionFcMatcher : public framework::FusionOpMatcher { framework::Node *node, std::vector> *removed_nodes) { node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}, removed_nodes); + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Z"}}}}, removed_nodes); } std::string Type() { return G_OP_TYPE_FC; } @@ -66,17 +66,21 @@ class FusionFcOp }; #ifdef PADDLE_MOBILE_CPU + #ifndef CONV_CPU_REGISTER #define CONV_CPU_REGISTER -static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); +extern framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); #endif + #endif #ifdef PADDLE_MOBILE_MALI_GPU + #ifndef CONV_CPU_REGISTER #define CONV_CPU_REGISTER static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); #endif + #endif #ifdef PADDLE_MOBILE_FPGA @@ -85,4 +89,13 @@ static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_fc); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(fc); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/im2sequence_op.cpp b/src/operators/im2sequence_op.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..273ce462d0aa423a6bf023186c6a579e975dfb11 --- /dev/null +++ b/src/operators/im2sequence_op.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef IM2SEQUENCE_OP + +#include "operators/im2sequence_op.h" + +namespace paddle_mobile { +namespace operators { + +int Im2SequenceOutputSize(int input_size, int kernel, int padding_1, + int padding_2, int stride) { + int output_size = + 1 + (padding_1 + padding_2 + input_size - kernel + stride - 1) / stride; + return output_size; +} + +template +void Im2SequenceOp::InferShape() const { + auto in_x_dims = this->param_.Input()->dims(); + + const std::vector &kernels = this->param_.Kernels(); + + const std::vector &strides = this->param_.Strides(); + + std::vector paddings = this->param_.Paddings(); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back(Im2SequenceOutputSize(in_x_dims[i + 2], kernels[i], + paddings[i], paddings[i + 2], + strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +template class Im2SequenceOp; + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(im2sequence, ops::Im2SequenceOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/im2sequence_op.h b/src/operators/im2sequence_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0695da9308d33ca2b86a5e052210507beb9a82d3 --- /dev/null +++ b/src/operators/im2sequence_op.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
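Im2SequenceOutputSize above counts sliding-window positions with a ceiling division: the `+ stride - 1` rounds the quotient up, i.e. out = ceil((padding_1 + padding_2 + input_size - kernel) / stride) + 1. Note that the kernel added later in this diff (Im2SeqOutputSize in im2sequence_kernel.cpp) uses the floor variant, so the two can disagree whenever the window does not divide the padded extent evenly. Restated with a worked value:

// Same arithmetic as Im2SequenceOutputSize above (ceiling form).
inline int SlidingWindowCount(int input_size, int kernel, int padding_1,
                              int padding_2, int stride) {
  return 1 +
         (padding_1 + padding_2 + input_size - kernel + stride - 1) / stride;
}
// Example: input 7, kernel 3, no padding, stride 2 -> 1 + ceil(4 / 2) = 3.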
*/ + +#ifdef IM2SEQUENCE_OP + +#pragma once + +#include +#include "framework/operator.h" +#include "operators/kernel/im2sequence_kernel.h" + +namespace paddle_mobile { +namespace operators { + +using namespace framework; + +template +class Im2SequenceOp : public framework::OperatorWithKernel< + DeviceType, Im2SequenceParam, + operators::Im2SequenceKernel> { + public: + Im2SequenceOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, Im2SequenceParam, + operators::Im2SequenceKernel>(type, inputs, outputs, + attrs, scope) {} + + // using framework::OperatorWithKernel< + // DeviceType, Im2SequenceParam, + // operators::Im2SequenceKernel>::OperatorWithKernel; + void InferShape() const override; + + private: +}; + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(im2sequence); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/kernel/arm/batchnorm_kernel.cpp b/src/operators/kernel/arm/batchnorm_kernel.cpp index ecebc009bc36542f54578c881716d5fa92c04b7b..f78d1fdc95ac9e10619dbf32fdc84d01a370f315 100644 --- a/src/operators/kernel/arm/batchnorm_kernel.cpp +++ b/src/operators/kernel/arm/batchnorm_kernel.cpp @@ -14,8 +14,6 @@ limitations under the License. */ #ifdef BATCHNORM_OP -#pragma once - #include "operators/kernel/batchnorm_kernel.h" #include "operators/kernel/central-arm-func/batchnorm_arm_func.h" @@ -23,7 +21,7 @@ namespace paddle_mobile { namespace operators { template <> -bool BatchNormKernel::Init(const BatchNormParam ¶) const { +bool BatchNormKernel::Init(BatchNormParam *param) { return true; } diff --git a/src/operators/kernel/arm/box_coder_kernel.cpp b/src/operators/kernel/arm/box_coder_kernel.cpp index 02ec85a444c380ba76d64b0d15d35ce27b9eaf44..d2a479391fbbb416eea7d19ae64125cac4637ef1 100644 --- a/src/operators/kernel/arm/box_coder_kernel.cpp +++ b/src/operators/kernel/arm/box_coder_kernel.cpp @@ -15,129 +15,21 @@ limitations under the License. 
*/ #ifdef BOXCODER_OP #include "operators/kernel/box_coder_kernel.h" +#include "operators/kernel/central-arm-func/box_coder_arm_func.h" namespace paddle_mobile { namespace operators { -template -void EncodeCenterSize(const framework::Tensor& target_box, - const framework::Tensor& prior_box, - const framework::Tensor& prior_box_var, T* output) { - int64_t row = target_box.dims()[0]; - int64_t col = prior_box.dims()[0]; - int64_t len = prior_box.dims()[1]; - auto* target_box_data = target_box.data(); - auto* prior_box_data = prior_box.data(); - auto* prior_box_var_data = prior_box_var.data(); - - for (int64_t i = 0; i < row; ++i) { - for (int64_t j = 0; j < col; ++j) { - T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len]; - T prior_box_height = - prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; - T prior_box_center_x = - (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; - T prior_box_center_y = - (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; - - T target_box_center_x = - (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; - T target_box_center_y = - (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; - T target_box_width = - target_box_data[i * len + 2] - target_box_data[i * len]; - T target_box_height = - target_box_data[i * len + 3] - target_box_data[i * len + 1]; - - size_t offset = i * col * len + j * len; - output[offset] = (target_box_center_x - prior_box_center_x) / - prior_box_width / prior_box_var_data[j * len]; - output[offset + 1] = (target_box_center_y - prior_box_center_y) / - prior_box_height / prior_box_var_data[j * len + 1]; - output[offset + 2] = - std::log(std::fabs(target_box_width / prior_box_width)) / - prior_box_var_data[j * len + 2]; - output[offset + 3] = - std::log(std::fabs(target_box_height / prior_box_height)) / - prior_box_var_data[j * len + 3]; - } - } -} - -template -void DecodeCenterSize(const framework::Tensor& target_box, - const framework::Tensor& prior_box, - const framework::Tensor& prior_box_var, T* output) { - int64_t row = target_box.dims()[0]; - int64_t col = prior_box.dims()[0]; - int64_t len = prior_box.dims()[1]; - - auto* target_box_data = target_box.data(); - auto* prior_box_data = prior_box.data(); - auto* prior_box_var_data = prior_box_var.data(); - - for (int64_t i = 0; i < row; ++i) { - for (int64_t j = 0; j < col; ++j) { - size_t offset = i * col * len + j * len; - T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len]; - T prior_box_height = - prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; - T prior_box_center_x = - (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; - T prior_box_center_y = - (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; - - T target_box_center_x = prior_box_var_data[j * len] * - target_box_data[offset] * prior_box_width + - prior_box_center_x; - T target_box_center_y = prior_box_var_data[j * len + 1] * - target_box_data[offset + 1] * - prior_box_height + - prior_box_center_y; - T target_box_width = std::exp(prior_box_var_data[j * len + 2] * - target_box_data[offset + 2]) * - prior_box_width; - T target_box_height = std::exp(prior_box_var_data[j * len + 3] * - target_box_data[offset + 3]) * - prior_box_height; - - output[offset] = target_box_center_x - target_box_width / 2; - output[offset + 1] = target_box_center_y - target_box_height / 2; - output[offset + 2] = target_box_center_x + target_box_width / 2; - output[offset + 3] = target_box_center_y + target_box_height / 2; - } 
- } -} - template <> -bool BoxCoderKernel::Init(const BoxCoderParam& para) const { +bool BoxCoderKernel::Init(BoxCoderParam *param) { return true; } template <> -void BoxCoderKernel::Compute(const BoxCoderParam& param) const { - const auto* input_priorbox = param.InputPriorBox(); - const auto* input_priorboxvar = param.InputPriorBoxVar(); - const auto* input_targetbox = param.InputTargetBox(); - - const auto& code_type = param.CodeType(); - - auto row = input_targetbox->dims()[0]; - auto col = input_priorbox->dims()[0]; - auto len = input_priorbox->dims()[1]; - - Tensor* output_box = param.OutputBox(); - auto* output_box_dataptr = output_box->mutable_data({row, col, len}); - - if (code_type == "encode_center_size") { - EncodeCenterSize(*input_targetbox, *input_priorbox, - *input_priorboxvar, output_box_dataptr); - } - if (code_type == "decode_center_size") { - DecodeCenterSize(*input_targetbox, *input_priorbox, - *input_priorboxvar, output_box_dataptr); - } +void BoxCoderKernel::Compute(const BoxCoderParam ¶m) const { + BoxCoderCompute(param); } + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/concat_kernel.cpp b/src/operators/kernel/arm/concat_kernel.cpp index 0312047b8e8af1eb9dad57c751e392e8a5054878..b6810bf76946bfb8151f3001b76fcbaa5e99e5fc 100644 --- a/src/operators/kernel/arm/concat_kernel.cpp +++ b/src/operators/kernel/arm/concat_kernel.cpp @@ -15,77 +15,19 @@ limitations under the License. */ #ifdef CONCAT_OP #include "operators/kernel/concat_kernel.h" +#include "operators/kernel/central-arm-func/concat_arm_func.h" namespace paddle_mobile { namespace operators { -template -class ConcatFunctor { - public: - void operator()(const std::vector &input, const int axis, - framework::Tensor *output) { - size_t num = input.size(); - int rows = 1; - auto dim_0 = input[0].dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int out_rows = rows, out_cols = 0; - - std::vector input_cols(input.size()); - for (int i = 0; i < num; ++i) { - int t_cols = input[i].numel() / rows; - out_cols += t_cols; - input_cols[i] = t_cols; - } - - // computation - for (int k = 0; k < out_rows; ++k) { - T *dst_ptr = output->data() + k * out_cols; - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = input_cols[j]; - const T *src_prt = input[j].data() + k * col_len; - memory::Copy(dst_ptr + col_idx, src_prt, sizeof(T) * col_len); - col_idx += col_len; - } - } - } -}; template <> -bool ConcatKernel::Init(const ConcatParam ¶) const { +bool ConcatKernel::Init(ConcatParam *param) { return true; } template <> void ConcatKernel::Compute(const ConcatParam ¶m) const { - auto inputs = param.Inputs(); - auto *out = param.Out(); - int64_t axis = param.Axis(); - out->mutable_data(); - - /// Sometimes direct copies will be faster, this maybe need deeply analysis. 
- if (axis == 0 && inputs.size() < 10) { - size_t output_offset = 0; - for (auto *in : inputs) { - auto in_stride = framework::stride_numel(in->dims()); - auto out_stride = framework::stride_numel(out->dims()); - auto dst = out->data() + output_offset; - auto src = in->data(); - PADDLE_MOBILE_ENFORCE( - in_stride.size() == out_stride.size(), - "src and dst tensor should have the same dims size."); - memory::Copy(dst, src, sizeof(float) * in_stride[0]); - output_offset += in_stride[0]; - } - } else { - std::vector inputs_concat(inputs.size()); - for (int j = 0; j < inputs.size(); ++j) { - inputs_concat[j] = *inputs[j]; - } - ConcatFunctor concat_functor; - concat_functor(inputs_concat, static_cast(axis), out); - } + ConcatCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1fd1c66d4dc92a9918243b23e400ef5309422050 --- /dev/null +++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDBNRELU_OP + +#include "operators/kernel/conv_add_bn_relu_kernel.h" +#include "operators/kernel/central-arm-func/conv_add_bn_relu_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddBNReluKernel::Init(FusionConvAddBNReluParam *param) { + const Tensor *mean = param->InputMean(); + const Tensor *variance = param->InputVariance(); + const Tensor *scale = param->InputScale(); + const Tensor *bias = param->InputBias(); + const float epsilon = param->Epsilon(); + + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + const int C = mean->numel(); + float inv_std_ptr[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + Tensor *new_scale = new Tensor(); + Tensor *new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({C}); + auto new_bias_ptr = new_bias->mutable_data({C}); + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + return true; +} + +template <> +void ConvAddBNReluKernel::Compute( + const FusionConvAddBNReluParam ¶m) const { + ConvAddBNReluCompute(param); +} +template class ConvAddBNReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/conv_add_kernel.cpp b/src/operators/kernel/arm/conv_add_kernel.cpp index 4bde8289007415dccbc7a630c7646ac718087c55..88f839f611f1ed7f46c11a1b24feb6e29ff07ec7 100644 --- a/src/operators/kernel/arm/conv_add_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_kernel.cpp @@ -14,104 +14,21 @@ limitations under the License. 
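ConvAddBNReluKernel::Init above is the reason the kernel Init signature changes from `const Param &` to a mutable `Param *` throughout this diff: Init now precomputes per-channel tensors and stores them back into the param, folding batch norm into a single affine transform so Compute only evaluates new_scale * conv(x) + new_bias before the ReLU. (Its `float inv_std_ptr[C]` is a variable-length array — a GCC/Clang extension rather than standard C++.) The fold, restated:

#include <cmath>
#include <vector>

// bn(x) = scale * (x - mean) / sqrt(var + eps) + bias
//       = new_scale * x + new_bias
void FoldBatchNorm(const std::vector<float> &scale,
                   const std::vector<float> &bias,
                   const std::vector<float> &mean,
                   const std::vector<float> &var, float epsilon,
                   std::vector<float> *new_scale,
                   std::vector<float> *new_bias) {
  const size_t C = mean.size();
  new_scale->resize(C);
  new_bias->resize(C);
  for (size_t c = 0; c < C; ++c) {
    const float inv_std = 1.0f / std::sqrt(var[c] + epsilon);
    (*new_scale)[c] = scale[c] * inv_std;
    (*new_bias)[c] = bias[c] - mean[c] * scale[c] * inv_std;
  }
}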
*/ #ifdef FUSION_CONVADD_OP #include "operators/kernel/conv_add_kernel.h" +#include "../central-arm-func/conv_add_arm_func.h" namespace paddle_mobile { namespace operators { template <> -bool ConvAddKernel::Init(const FusionConvAddParam ¶) const { +bool ConvAddKernel::Init(FusionConvAddParam *param) { return true; } template <> void ConvAddKernel::Compute(const FusionConvAddParam ¶m) const { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor bias = *param.Bias(); - int axis = param.Axis(); - Tensor *output = param.Output(); - math::expand_bias(bias, axis, output->dims()); - output->ShareDataWith(bias); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(1)); - } - } + ConvAddCompute(param); } + template class ConvAddKernel; } // namespace operators diff --git a/src/operators/kernel/arm/conv_add_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_relu_kernel.cpp index d3c04179b37014adc6c81f32dd6c08f697283671..356dd191e761afc5d5b6bfacd250f90ae31017b2 
100644 --- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_relu_kernel.cpp @@ -21,8 +21,7 @@ namespace paddle_mobile { namespace operators { template <> -bool ConvAddReluKernel::Init( - const FusionConvAddReluParam ¶) const { +bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { return true; } diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index 049425d88f96a322a0b4cb47c18d85f2df03d577..ca8aeff0dd3db5fe7b625bdeb947b2927eb619ce 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -21,7 +21,7 @@ namespace paddle_mobile { namespace operators { template <> -bool ConvKernel::Init(const ConvParam ¶) const { +bool ConvKernel::Init(ConvParam *param) { return true; } diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp index 4cbfa23248e87e2bf3a8d97330fa19f92985a9d0..6ede0e2bef2383df8aa0593a07297f2f6233acaf 100644 --- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp @@ -21,7 +21,7 @@ namespace paddle_mobile { namespace operators { template <> -bool DepthwiseConvKernel::Init(const ConvParam ¶) const { +bool DepthwiseConvKernel::Init(ConvParam *param) { return true; } diff --git a/src/operators/kernel/arm/dropout_kernel.cpp b/src/operators/kernel/arm/dropout_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..af16048a1b4eba2ff36f842b6cf968031989576e --- /dev/null +++ b/src/operators/kernel/arm/dropout_kernel.cpp @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef DROPOUT_OP + +#pragma once + +#include "operators/kernel/dropout_kernel.h" +#include + +namespace paddle_mobile { +namespace operators { + +template <> +bool DropoutKernel::Init(DropoutParam *para) { + return true; +} + +template +struct DropoutFunctor { + inline T operator()(T in) const { return in; } +}; + +template <> +void DropoutKernel::Compute(const DropoutParam ¶m) const { + const auto *input_x = param.InputX(); + auto *input_x_ptr = input_x->data(); + auto *out = param.Out(); + auto *out_ptr = out->mutable_data(); + + DropoutFunctor func_; + math::Transform trans; + trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/elementwise_add_kernel.cpp b/src/operators/kernel/arm/elementwise_add_kernel.cpp index 2f5e26a37e4f2c1d370805ee7b565a60f4748b0a..fdab1c60a310480d8e59f3f84802001ea592433a 100644 --- a/src/operators/kernel/arm/elementwise_add_kernel.cpp +++ b/src/operators/kernel/arm/elementwise_add_kernel.cpp @@ -14,38 +14,23 @@ limitations under the License. 
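The new DropoutKernel above treats dropout as a pure identity at inference: DropoutFunctor returns its input unchanged and Compute streams the tensor through math::Transform. That matches the inverted-dropout convention, in which the 1/(1-p) rescaling already happened at training time; under the alternative "downgrade_in_infer" convention, inference would scale by (1 - p) instead. A minimal sketch, assuming the exported model uses the inverted convention. (Also note the stray `#pragma once` in this new .cpp — the very thing other hunks in this diff remove from .cpp files.)

#include <algorithm>
#include <vector>

// Inference-time dropout under inverted dropout: a straight copy.
void DropoutInference(const std::vector<float> &in, std::vector<float> *out) {
  out->resize(in.size());
  std::copy(in.begin(), in.end(), out->begin());
  // "downgrade_in_infer" models would instead do:
  //   out[i] = in[i] * (1.0f - dropout_prob);
}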
*/ #ifdef ELEMENTWISEADD_OP -#pragma once - #include "operators/kernel/elementwise_add_kernel.h" +#include "operators/kernel/central-arm-func/elementwise_add_arm_func.h" namespace paddle_mobile { namespace operators { -template -struct AddFunctor { - inline T operator()(T a, T b) const { return a + b; } -}; - template <> -bool ElementwiseAddKernel::Init( - const ElementwiseAddParam ¶) const { +bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { return true; } template <> void ElementwiseAddKernel::Compute( const ElementwiseAddParam ¶m) const { - const Tensor *input_x = param.InputX(); - const Tensor *input_y = param.InputY(); - Tensor *Out = param.Out(); - Out->mutable_data(); - int axis = param.Axis(); - ElementwiseComputeEx, float>(input_x, input_y, axis, - AddFunctor(), Out); + ElementwiseAddCompute(param); } -template class ElementwiseAddKernel; - } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/fusion_fc_kernel.cpp b/src/operators/kernel/arm/fusion_fc_kernel.cpp index 5fac70e40781593669abd15b8f28ff6272f7133c..c72960e67f19c601e6f27a3bedf7123c80875e0c 100644 --- a/src/operators/kernel/arm/fusion_fc_kernel.cpp +++ b/src/operators/kernel/arm/fusion_fc_kernel.cpp @@ -14,60 +14,20 @@ limitations under the License. */ #ifdef FUSION_FC_OP -#pragma once - #include "operators/kernel/fusion_fc_kernel.h" +#include "operators/kernel/central-arm-func/fusion_fc_arm_func.h" namespace paddle_mobile { namespace operators { template <> -bool FusionFcKernel::Init(const FusionFcParam ¶) const { +bool FusionFcKernel::Init(FusionFcParam *param) { return true; } template <> void FusionFcKernel::Compute(const FusionFcParam ¶m) const { - const Tensor *input_x = param.InputX(); - const Tensor *input_y = param.InputY(); - const Tensor *input_z = param.InputZ(); - auto *input_z_data = input_z->data(); - int axis = param.Axis(); - Tensor *out = param.Out(); - auto *out_data = out->mutable_data(); - const Tensor x_matrix = - input_x->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) - : *input_x; - const Tensor y_matrix = - input_y->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_y, param.YNumColDims()) - : *input_y; - auto out_dim = out->dims(); - if (out_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); - PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "inpu_z size must be 1"); - PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0], - " out_dim.size must be 2."); - axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis); - PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. 
") - - int64_t classes = input_z->numel(); - for (int i = 0; i < out_dim[0]; i++) { - memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes); - } - - for (int i = 0; i < out->numel(); i++) { - DLOG << out_data[i]; - } - math::matmul(x_matrix, false, y_matrix, false, static_cast(1), - out, static_cast(1)); - PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); - // if (out_dim.size() != 2) { - // out->Resize(out_dim); - // } + FusionFcCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/im2sequence_kernel.cpp b/src/operators/kernel/arm/im2sequence_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..709fa30a23d4efba3531d9bc567c99f53875bc12 --- /dev/null +++ b/src/operators/kernel/arm/im2sequence_kernel.cpp @@ -0,0 +1,78 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef IM2SEQUENCE_OP + +#include "operators/kernel/im2sequence_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool Im2SequenceKernel::Init(Im2SequenceParam *para) { + return true; +} + +inline int Im2SeqOutputSize(int input_size, int filter_size, int padding_0, + int padding_1, int stride) { + const int output_size = + (input_size + padding_0 + padding_1 - filter_size) / stride + 1; + return output_size; +} + +template <> +void Im2SequenceKernel::Compute( + const Im2SequenceParam ¶m) const { + const Tensor *in_x = param.Input(); + Tensor *out = param.Output(); + out->mutable_data(); + + std::vector kernels = param.Kernels(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + + auto in_x_dim = in_x->dims(); + const int batch_size = static_cast(in_x_dim[0]); + const int img_channels = static_cast(in_x_dim[1]); + const int img_height = static_cast(in_x_dim[2]); + const int img_width = static_cast(in_x_dim[3]); + + int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0], + paddings[2], strides[0]); + int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], + paddings[3], strides[1]); + const std::vector dilations({1, 1}); + + // TODO: verify + auto out_dims = out->dims(); + out->Resize({batch_size, out->numel() / batch_size}); + + for (int i = 0; i < batch_size; i++) { + const Tensor src = + in_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); + Tensor dst = out->Slice(i, i + 1).Resize( + {output_height, output_width, img_channels, kernels[0], kernels[1]}); + + math::Im2ColFunctor f; + f(src, dilations, strides, paddings, &dst); + } + out->Resize(out_dims); +} + +template class Im2SequenceKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/lrn_kernel.cpp b/src/operators/kernel/arm/lrn_kernel.cpp index 839c5ee95bd4d1e9d3fd80af3df0f8a45797434e..0c20c5167adee5165067cc5ab4935df255751755 100644 --- a/src/operators/kernel/arm/lrn_kernel.cpp +++ b/src/operators/kernel/arm/lrn_kernel.cpp @@ -14,40 +14,22 @@ limitations 
under the License. */ #ifdef LRN_OP -#pragma once - #include "operators/kernel/lrn_kernel.h" +#include "operators/kernel/central-arm-func/lrn_arm_func.h" namespace paddle_mobile { namespace operators { template <> -bool LrnKernel::Init(const LrnParam ¶) const { +bool LrnKernel::Init(LrnParam *param) { return true; } template <> void LrnKernel::Compute(const LrnParam ¶m) const { - const Tensor *input_x = param.InputX(); - auto x_dims = input_x->dims(); - Tensor *out = param.Out(); - out->mutable_data(); - /// data_format = NCHW - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; - - const int n = param.N(); - const float alpha = param.Alpha(); - const float beta = param.Beta(); - const float k = param.K(); - LRNFunctor lrnFunctor; - lrnFunctor(*input_x, out, N, C, H, W, n, k, alpha, beta); + LrnCompute(param); } -template class LrnKernel; - } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/mul_kernel.cpp b/src/operators/kernel/arm/mul_kernel.cpp index b3bb2b8075fdf306d47640c2bee3f2fc00ef0bc0..ac5010ce5492ae1d99e59bfa761e22bb3aa5d1c9 100644 --- a/src/operators/kernel/arm/mul_kernel.cpp +++ b/src/operators/kernel/arm/mul_kernel.cpp @@ -14,45 +14,22 @@ limitations under the License. */ #ifdef MUL_OP -#pragma once - #include "operators/kernel/mul_kernel.h" +#include "operators/kernel/central-arm-func/mul_arm_func.h" namespace paddle_mobile { namespace operators { template <> -bool MulKernel::Init(const MulParam ¶) const { +bool MulKernel::Init(MulParam *param) { return true; } template <> void MulKernel::Compute(const MulParam ¶m) const { - const Tensor *input_x = param.InputX(); - const Tensor *input_y = param.InputY(); - Tensor *out = param.Out(); - out->mutable_data(); - const Tensor x_matrix = - input_x->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) - : *input_x; - const Tensor y_matrix = - input_y->dims().size() > 2 - ? framework::ReshapeToMatrix(*input_y, param.YNumColDims()) - : *input_y; - auto out_dim = out->dims(); - if (out_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - math::matmul(x_matrix, false, y_matrix, false, static_cast(1), - out, static_cast(0)); - if (out_dim.size() != 2) { - out->Resize(out_dim); - } + MulCompute(param); } -template class MulKernel; - } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/multiclass_nms_kernel.cpp b/src/operators/kernel/arm/multiclass_nms_kernel.cpp index 58f224ab536173fff46bb1739b27590382fbab6d..9ed8f1731afe2bab723c66ea1e2e8c5042f6ce28 100644 --- a/src/operators/kernel/arm/multiclass_nms_kernel.cpp +++ b/src/operators/kernel/arm/multiclass_nms_kernel.cpp @@ -14,269 +14,21 @@ limitations under the License. 
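The deleted MulKernel body above and the FusionFcKernel body a little earlier (both now in central-arm-func headers) share one skeleton: flatten any operand with more than two dimensions into a matrix around the x_num_col_dims / y_num_col_dims split point, then run a single GEMM. The FC variant first tiles the bias z across the M output rows and calls matmul with beta = 1, so the product accumulates onto the bias. A sketch with a naive GEMM standing in for math::matmul:

#include <cstring>
#include <functional>
#include <numeric>
#include <utility>
#include <vector>

// Flattening rule behind framework::ReshapeToMatrix: dims [0, num_col_dims)
// fold into rows, dims [num_col_dims, rank) into columns.
std::pair<int, int> FlattenToMatrix(const std::vector<int> &dims,
                                    int num_col_dims) {
  const int rows = std::accumulate(dims.begin(), dims.begin() + num_col_dims,
                                   1, std::multiplies<int>());
  const int cols = std::accumulate(dims.begin() + num_col_dims, dims.end(), 1,
                                   std::multiplies<int>());
  return {rows, cols};
}

// Fused FC: out(MxN) = X(MxK) * Y(KxN) + z(N). Plain mul_op is the same GEMM
// with beta = 0 and no bias tile.
void FusionFc(const float *X, const float *Y, const float *z, float *out,
              int M, int K, int N) {
  for (int i = 0; i < M; ++i) std::memcpy(out + i * N, z, sizeof(float) * N);
  for (int i = 0; i < M; ++i)
    for (int k = 0; k < K; ++k)
      for (int j = 0; j < N; ++j)
        out[i * N + j] += X[i * K + k] * Y[k * N + j];
}
// Example: FlattenToMatrix({2, 3, 4, 5}, 2) yields a 6 x 20 matrix.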
*/ #ifdef MULTICLASSNMS_OP -#pragma once - #include "operators/kernel/multiclass_nms_kernel.h" +#include "operators/kernel/central-arm-func/multiclass_nms_arm_func.h" namespace paddle_mobile { namespace operators { -constexpr int kOutputDim = 6; -constexpr int kBBoxSize = 4; - -template -bool SortScorePairDescend(const std::pair& pair1, - const std::pair& pair2) { - return pair1.first > pair2.first; -} - -template -static inline void GetMaxScoreIndex( - const std::vector& scores, const T threshold, int top_k, - std::vector>* sorted_indices) { - for (size_t i = 0; i < scores.size(); ++i) { - if (scores[i] > threshold) { - sorted_indices->push_back(std::make_pair(scores[i], i)); - } - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices->begin(), sorted_indices->end(), - SortScorePairDescend); - // Keep top_k scores if needed. - if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { - sorted_indices->resize(top_k); - } -} - -template -static inline T BBoxArea(const T* box, const bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - // If coordinate values are is invalid - // (e.g. xmax < xmin or ymax < ymin), return 0. - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { - // If coordinate values are not within range [0, 1]. - return (w + 1) * (h + 1); - } - } -} - -template -static inline T JaccardOverlap(const T* box1, const T* box2, - const bool normalized) { - if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || - box2[3] < box1[1]) { - return static_cast(0.); - } else { - const T inter_xmin = std::max(box1[0], box2[0]); - const T inter_ymin = std::max(box1[1], box2[1]); - const T inter_xmax = std::min(box1[2], box2[2]); - const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = inter_xmax - inter_xmin; - const T inter_h = inter_ymax - inter_ymin; - const T inter_area = inter_w * inter_h; - const T bbox1_area = BBoxArea(box1, normalized); - const T bbox2_area = BBoxArea(box2, normalized); - return inter_area / (bbox1_area + bbox2_area - inter_area); - } -} - -template -static inline void NMSFast(const Tensor& bbox, const Tensor& scores, - const T score_threshold, const T nms_threshold, - const T eta, const int64_t top_k, - std::vector* selected_indices) { - // The total boxes for each instance. 
- int64_t num_boxes = bbox.dims()[0]; - // 4: [xmin ymin xmax ymax] - int64_t box_size = bbox.dims()[1]; - - std::vector scores_data(num_boxes); - std::copy_n(scores.data(), num_boxes, scores_data.begin()); - std::vector> sorted_indices; - GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); - - selected_indices->clear(); - T adaptive_threshold = nms_threshold; - const T* bbox_data = bbox.data(); - - while (sorted_indices.size() != 0) { - const int idx = sorted_indices.front().second; - bool keep = true; - for (size_t k = 0; k < selected_indices->size(); ++k) { - if (keep) { - const int kept_idx = (*selected_indices)[k]; - T overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, true); - keep = overlap <= adaptive_threshold; - } else { - break; - } - } - if (keep) { - selected_indices->push_back(idx); - } - sorted_indices.erase(sorted_indices.begin()); - if (keep && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; - } - } -} - -template -void MultiClassNMS(const Tensor& scores, const Tensor& bboxes, - std::map>* indices, int* num_nmsed_out, - const int& background_label, const int& nms_top_k, - const int& keep_top_k, const T& nms_threshold, - const T& nms_eta, const T& score_threshold) { - int64_t class_num = scores.dims()[0]; - int64_t predict_dim = scores.dims()[1]; - int num_det = 0; - for (int64_t c = 0; c < class_num; ++c) { - if (c == background_label) continue; - Tensor score = scores.Slice(c, c + 1); - /// [c] is key - NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, - nms_top_k, &((*indices)[c])); - num_det += (*indices)[c].size(); - } - - *num_nmsed_out = num_det; - const T* scores_data = scores.data(); - if (keep_top_k > -1 && num_det > keep_top_k) { - std::vector>> score_index_pairs; - for (const auto& it : *indices) { - int label = it.first; - const T* sdata = scores_data + label * predict_dim; - const std::vector& label_indices = it.second; - for (size_t j = 0; j < label_indices.size(); ++j) { - int idx = label_indices[j]; - // PADDLE_ENFORCE_LT(idx, predict_dim); - score_index_pairs.push_back( - std::make_pair(sdata[idx], std::make_pair(label, idx))); - } - } - // Keep top k results per image. - std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), - SortScorePairDescend>); - score_index_pairs.resize(keep_top_k); - - // Store the new indices. 
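The NMSFast / MultiClassNMS block being relocated here is standard greedy non-maximum suppression: visit candidates in descending score order and keep a box only if its Jaccard (IoU) overlap with every already-kept box stays at or below the threshold; `eta < 1` additionally decays the threshold as boxes accumulate, and keep_top_k then re-sorts the kept detections across classes. The core loop, as a compact sketch without the top_k cap and eta decay:

#include <algorithm>
#include <array>
#include <utility>
#include <vector>

// Boxes are {xmin, ymin, xmax, ymax} in normalized coordinates.
float IoU(const std::array<float, 4> &a, const std::array<float, 4> &b) {
  const float iw = std::max(0.0f, std::min(a[2], b[2]) - std::max(a[0], b[0]));
  const float ih = std::max(0.0f, std::min(a[3], b[3]) - std::max(a[1], b[1]));
  const float inter = iw * ih;
  const float sum = (a[2] - a[0]) * (a[3] - a[1]) +
                    (b[2] - b[0]) * (b[3] - b[1]) - inter;
  return sum > 0.0f ? inter / sum : 0.0f;
}

std::vector<int> GreedyNMS(const std::vector<std::array<float, 4>> &boxes,
                           std::vector<std::pair<float, int>> scored,  // {score, index}
                           float nms_threshold) {
  std::stable_sort(
      scored.begin(), scored.end(),
      [](const auto &l, const auto &r) { return l.first > r.first; });
  std::vector<int> kept;
  for (const auto &s : scored) {
    bool keep = true;
    for (int k : kept) {
      if (IoU(boxes[s.second], boxes[k]) > nms_threshold) {
        keep = false;
        break;
      }
    }
    if (keep) kept.push_back(s.second);
  }
  return kept;
}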
- std::map> new_indices; - for (size_t j = 0; j < score_index_pairs.size(); ++j) { - int label = score_index_pairs[j].second.first; - int idx = score_index_pairs[j].second.second; - new_indices[label].push_back(idx); - } - new_indices.swap(*indices); - *num_nmsed_out = keep_top_k; - } -} - -template -void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, - const std::map>& selected_indices, - Tensor* outs) { - int predict_dim = scores.dims()[1]; - auto* scores_data = scores.data(); - auto* bboxes_data = bboxes.data(); - auto* odata = outs->data(); - - int count = 0; - for (const auto& it : selected_indices) { - /// one batch - int label = it.first; - const T* sdata = scores_data + label * predict_dim; - const std::vector& indices = it.second; - for (size_t j = 0; j < indices.size(); ++j) { - int idx = indices[j]; - const T* bdata = bboxes_data + idx * kBBoxSize; - odata[count * kOutputDim] = label; // label - odata[count * kOutputDim + 1] = sdata[idx]; // score - // xmin, ymin, xmax, ymax - std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T)); - count++; - } - } -} - template <> -bool MultiClassNMSKernel::Init( - const MultiClassNMSParam& para) const { +bool MultiClassNMSKernel::Init(MultiClassNMSParam *param) { return true; } template <> void MultiClassNMSKernel::Compute( - const MultiClassNMSParam& param) const { - const auto* input_bboxes = param.InputBBoxes(); - const auto& input_bboxes_dims = input_bboxes->dims(); - - const auto* input_scores = param.InputScores(); - const auto& input_scores_dims = input_scores->dims(); - - auto* outs = param.Out(); - auto background_label = param.BackGroundLabel(); - auto nms_top_k = param.NMSTopK(); - auto keep_top_k = param.KeepTopK(); - auto nms_threshold = param.NMSThreshold(); - auto nms_eta = param.NMSEta(); - auto score_threshold = param.ScoreThreshold(); - - int64_t batch_size = input_scores_dims[0]; - int64_t class_num = input_scores_dims[1]; - int64_t predict_dim = input_scores_dims[2]; - int64_t box_dim = input_bboxes_dims[2]; - - std::vector>> all_indices; - std::vector batch_starts = {0}; - for (int64_t i = 0; i < batch_size; ++i) { - Tensor ins_score = input_scores->Slice(i, i + 1); - ins_score.Resize({class_num, predict_dim}); - - Tensor ins_boxes = input_bboxes->Slice(i, i + 1); - ins_boxes.Resize({predict_dim, box_dim}); - - std::map> indices; - int num_nmsed_out = 0; - MultiClassNMS(ins_score, ins_boxes, &indices, &num_nmsed_out, - background_label, nms_top_k, keep_top_k, nms_threshold, - nms_eta, score_threshold); - all_indices.push_back(indices); - batch_starts.push_back(batch_starts.back() + num_nmsed_out); - } - - int num_kept = batch_starts.back(); - if (num_kept == 0) { - float* od = outs->mutable_data({1}); - od[0] = -1; - } else { - outs->mutable_data({num_kept, kOutputDim}); - for (int64_t i = 0; i < batch_size; ++i) { - Tensor ins_score = input_scores->Slice(i, i + 1); - ins_score.Resize({class_num, predict_dim}); - - Tensor ins_boxes = input_bboxes->Slice(i, i + 1); - ins_boxes.Resize({predict_dim, box_dim}); - - int64_t s = batch_starts[i]; - int64_t e = batch_starts[i + 1]; - if (e > s) { - Tensor out = outs->Slice(s, e); - MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out); - } - } - } - - // framework::LoD lod; - // lod.emplace_back(batch_starts); - // - // outs->set_lod(lod); + const MultiClassNMSParam ¶m) const { + MultiClassNMSCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/pool_kernel.cpp b/src/operators/kernel/arm/pool_kernel.cpp index 
09162a13a4d0c59220cc25a02d06369c3f21ed32..be2189340f480bef80fd00a612cf32e71ea10a1c 100644 --- a/src/operators/kernel/arm/pool_kernel.cpp +++ b/src/operators/kernel/arm/pool_kernel.cpp @@ -14,70 +14,19 @@ limitations under the License. */ #ifdef POOL_OP -#include -#include "common/log.h" - +#include "operators/kernel/pool_kernel.h" +#include "../central-arm-func/pool_arm_func.h" namespace paddle_mobile { namespace operators { -inline void PoolBasic(std::string pooling_type, std::vector ksize, - std::vector strides, std::vector paddings, - const Tensor *in_x, Tensor *out) { - if (pooling_type == "max") { - math::PoolFunctor, float> pool2d_forward; - math::MaxPool pool_process; - pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out); - - } else if (pooling_type == "avg") { - math::PoolFunctor, float> pool2d_forward; - math::AvgPool pool_process; - pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out); - } -} - template <> -bool PoolKernel::Init(const PoolParam ¶) const { +bool PoolKernel::Init(PoolParam *param) { return true; } template <> void PoolKernel::Compute(const PoolParam ¶m) const { - const Tensor *in_x = param.Input(); - Tensor *out = param.Output(); - std::string pooling_type = param.PoolingType(); - - std::vector ksize = param.Ksize(); - - std::vector strides = param.Strides(); - - std::vector paddings = param.Paddings(); - if (ksize.size() != 2) { - LOG(paddle_mobile::LogLevel::kLOG_ERROR) - << "Pool op only supports 2D and 3D input."; - } - - if (param.isGlobalPooling()) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x->dims()[i + 2]); - } - } else if (ksize[0] == 3 && ksize[0] == ksize[1]) { - if (pooling_type == "max") { - math::Pool3x3Max(strides, paddings, in_x, out); - } else if (pooling_type == "avg") { - math::Pool3x3Avg(strides, paddings, in_x, out); - } - - } else if (ksize[0] == 2 && ksize[0] == ksize[1]) { - if (pooling_type == "max") { - math::Pool2x2Max(strides, paddings, in_x, out); - } else if (pooling_type == "avg") { - math::Pool2x2Avg(strides, paddings, in_x, out); - } - - } else { - PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); - } + PoolCompute(param); } } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/prior_box_kernel.cpp b/src/operators/kernel/arm/prior_box_kernel.cpp index c14a143ce5d98749c95c780941966944e9ca1249..217d4b83cb1156a0e942c5ced5917546250e8bb1 100644 --- a/src/operators/kernel/arm/prior_box_kernel.cpp +++ b/src/operators/kernel/arm/prior_box_kernel.cpp @@ -14,138 +14,20 @@ limitations under the License. 
*/ #ifdef PRIORBOX_OP -#pragma once - #include "operators/kernel/prior_box_kernel.h" +#include "operators/kernel/central-arm-func/prior_box_arm_func.h" namespace paddle_mobile { namespace operators { -template -struct ClipFunctor { - inline T operator()(T in) const { - return std::min(std::max(in, 0.), 1.); - } -}; - template <> -bool PriorBoxKernel::Init(const PriorBoxParam ¶) const { +bool PriorBoxKernel::Init(PriorBoxParam *param) { return true; } template <> void PriorBoxKernel::Compute(const PriorBoxParam ¶m) const { - const auto *input_ = param.Input(); - const auto &input_dims = input_->dims(); - - const auto *input_image = param.InputImage(); - const auto &input_image_dims = input_image->dims(); - - const auto &min_sizes = param.MinSizes(); - const auto &max_sizes = param.MaxSizes(); - const auto &variances = param.Variances(); - const auto &input_aspect_ratio = param.AspectRatios(); - const bool &flip = param.Flip(); - const bool &clip = param.Clip(); - const float &step_w = param.StepW(); - const float &step_h = param.StepH(); - const float &offset = param.Offset(); - - Tensor *output_boxes = param.OutputBoxes(); - auto output_boxes_dataptr = output_boxes->mutable_data(); - Tensor *output_variances = param.OutputVariances(); - auto output_variances_dataptr = output_variances->mutable_data(); - - std::vector aspect_ratios; - ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); - - auto img_width = input_image_dims[3]; - auto img_height = input_image_dims[2]; - - auto feature_width = input_dims[3]; - auto feature_height = input_dims[2]; - - auto stride0 = output_boxes->dims()[1] * output_boxes->dims()[2] * - output_boxes->dims()[3]; - auto stride1 = output_boxes->dims()[2] * output_boxes->dims()[3]; - auto stride2 = output_boxes->dims()[3]; - - float step_width, step_height; - /// 300 / 19 - if (step_w == 0 || step_h == 0) { - step_width = static_cast(img_width) / feature_width; - step_height = static_cast(img_height) / feature_height; - } else { - step_width = step_w; - step_height = step_h; - } - - int num_priors = aspect_ratios.size() * min_sizes.size(); - if (!max_sizes.empty()) { - num_priors += max_sizes.size(); - } - - for (int h = 0; h < feature_height; ++h) { - for (int w = 0; w < feature_width; ++w) { - /// map origin image - float center_x = (w + offset) * step_width; - float center_y = (h + offset) * step_height; - float box_width, box_height; - int idx = 0; - for (size_t s = 0; s < min_sizes.size(); ++s) { - auto min_size = min_sizes[s]; - // priors with different aspect ratios - for (float ar : aspect_ratios) { - box_width = min_size * sqrt(ar) / 2.; - box_height = min_size / sqrt(ar) / 2.; - /// box_width/2 , / img_width 为了得到feature map 相对于 - /// 原图的归一化位置的比例。 - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] = - (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] = - (center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] = - (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] = - (center_y + box_height) / img_height; - idx++; - } - if (!max_sizes.empty()) { - auto max_size = max_sizes[s]; - // square prior with size sqrt(minSize * maxSize) - box_width = box_height = sqrt(min_size * max_size) / 2.; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] = - (center_x - box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] = - 
(center_y - box_height) / img_height; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] = - (center_x + box_width) / img_width; - output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] = - (center_y + box_height) / img_height; - idx++; - } - } - } - } - if (clip) { - math::Transform trans; - ClipFunctor clip_func; - trans(output_boxes_dataptr, output_boxes_dataptr + output_boxes->numel(), - output_boxes_dataptr, clip_func); - } - - if ((variances.size() != 4)) { - LOG(kLOG_ERROR) << " variances.size() must be 4."; - } - - int64_t box_num = feature_height * feature_width * num_priors; - - for (int i = 0; i < box_num; i++) { - output_variances_dataptr[4 * i] = variances[0]; - output_variances_dataptr[4 * i + 1] = variances[1]; - output_variances_dataptr[4 * i + 2] = variances[2]; - output_variances_dataptr[4 * i + 3] = variances[3]; - } + PriorBoxCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/relu_kernel.cpp b/src/operators/kernel/arm/relu_kernel.cpp index 8ec0bfc4fa0c5762418efbd18cda664a1ec43271..63259a0c303f5e186f9eb90b98f2a8685f8ba5ca 100644 --- a/src/operators/kernel/arm/relu_kernel.cpp +++ b/src/operators/kernel/arm/relu_kernel.cpp @@ -14,101 +14,22 @@ limitations under the License. */ #ifdef RELU_OP -#pragma once - #include "operators/kernel/relu_kernel.h" -#include +#include "operators/kernel/central-arm-func/relu_arm_func.h" namespace paddle_mobile { namespace operators { -template -struct ReluFunctor { - inline T operator()(T in) const { return in > 0 ? in : 0; } -}; - template <> -bool ReluKernel::Init(const ReluParam ¶) const { +bool ReluKernel::Init(ReluParam *param) { return true; } -/* - * @b 特化到具体平台的实现, param 从 op 层传入 - * */ template <> void ReluKernel::Compute(const ReluParam ¶m) const { - const auto *input_x = param.InputX(); - auto *input_x_ptr = input_x->data(); - auto *out = param.Out(); - auto *out_ptr = out->mutable_data(); - - int numel = input_x->numel(); - // if (numel > 64) { - // asm volatile( - // "pld [%[input_x_ptr], #0] \n\t" - // "vmov.f32 q8, #0.0 \n\t" - // "subs %[num], %[num], #32 \n\t" - // "blt end_num_%= \n\t" - // "loop_num_%=: \n\t" - // "pld [%[input_x_ptr], #1024] \n\t" - // - // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" - // - // "vmax.f32 q0, q0, q8 \n\t" - // "vmax.f32 q1, q1, q8 \n\t" - // "vmax.f32 q2, q2, q8 \n\t" - // "vmax.f32 q3, q3, q8 \n\t" - // "vmax.f32 q4, q4, q8 \n\t" - // "vmax.f32 q5, q5, q8 \n\t" - // "vmax.f32 q6, q6, q8 \n\t" - // "vmax.f32 q7, q7, q8 \n\t" - // - // "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" - // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" - // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" - // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" - // - // "subs %[num], %[num], #32 \n\t" - // "bge loop_num_%= \n\t" - // "end_num_%=: \n\t" - // "cmp %[num], #0 \n\t" - // "bge end_%= \n\t" - // "mov r6, #4 \n\t" - // "mul r5, %[num], r6 \n\t" - // "add %[input_x_ptr], %[input_x_ptr], r5 \n\t" - // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q6, q7}, [%[input_x_ptr]]! 
\n\t" - // "vmax.f32 q0, q0, q8 \n\t" - // "vmax.f32 q1, q1, q8 \n\t" - // "vmax.f32 q2, q2, q8 \n\t" - // "vmax.f32 q3, q3, q8 \n\t" - // "vmax.f32 q4, q4, q8 \n\t" - // "vmax.f32 q5, q5, q8 \n\t" - // "vmax.f32 q6, q6, q8 \n\t" - // "vmax.f32 q7, q7, q8 \n\t" - // "add %[out_ptr], %[out_ptr], r5 \n\t" - // "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" - // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" - // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" - // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" - // "end_%=: \n\t" - // : - // : - // [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] - // "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - // "q7", "q8", "r5", - // "r6"); - // } else { - ReluFunctor func_; - math::Transform trans; - trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_); - // } + ReluCompute(param); } + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/reshape_kernel.cpp b/src/operators/kernel/arm/reshape_kernel.cpp index 8e40dcc987f52265824325af94aa45a1dbdbdb74..5ae8e5e3f945d115215652ded58dc8571868fcd7 100644 --- a/src/operators/kernel/arm/reshape_kernel.cpp +++ b/src/operators/kernel/arm/reshape_kernel.cpp @@ -14,44 +14,20 @@ limitations under the License. */ #ifdef RESHAPE_OP -#pragma once - #include "operators/kernel/reshape_kernel.h" +#include "operators/kernel/central-arm-func/reshape_arm_func.h" namespace paddle_mobile { namespace operators { template <> -bool ReshapeKernel::Init(const ReshapeParam ¶) const { +bool ReshapeKernel::Init(ReshapeParam *param) { return true; } template <> void ReshapeKernel::Compute(const ReshapeParam ¶m) const { - const auto *input_x = param.InputX(); - const auto &input_x_dims = input_x->dims(); - auto *out = param.Out(); - framework::DDim out_dims = out->dims(); - const auto *input_shape = param.InputShape(); - - if (input_shape) { - auto *shape_data = input_shape->data(); - framework::Tensor cpu_shape_tensor; - auto shape = - std::vector(shape_data, shape_data + input_shape->numel()); - out_dims = ValidateShape(shape, input_x->dims()); - } - - bool inplace = param.Inplace(); - out->Resize(out_dims); - if (!inplace) { - out->mutable_data(); - framework::TensorCopy(*input_x, out); - out->Resize(out_dims); - } else { - out->ShareDataWith(*input_x); - out->Resize(out_dims); - } + ReshapeCompute(param); } } // namespace operators diff --git a/src/operators/kernel/arm/sigmoid_kernel.cpp b/src/operators/kernel/arm/sigmoid_kernel.cpp index 3f256c69c2ba04de97bb443770c2b460046028e9..eb67de153ddb13fb48e42c28d6ec2270b0bc59b4 100644 --- a/src/operators/kernel/arm/sigmoid_kernel.cpp +++ b/src/operators/kernel/arm/sigmoid_kernel.cpp @@ -15,74 +15,25 @@ limitations under the License. 
*/ #ifdef SIGMOID_OP #include "../sigmoid_kernel.h" +#include "../central-arm-func/sigmoid_arm_func.h" #if __ARM_NEON #include "../../math/math_func_neon.h" #endif - +#include namespace paddle_mobile { namespace operators { using framework::DDim; using framework::Tensor; -void sigmoid(const Tensor *X, Tensor *Y) { -#if __ARM_NEON - const float *input = X->data(); - float *output = Y->mutable_data(); - const DDim &dDim = X->dims(); - int axis_index = 1; - if (dDim.size() < 4) { - axis_index = 0; - } - DDim outer_ddim = - paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1); - DDim inner_ddim = - paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size()); - int out_size = paddle_mobile::framework::product(outer_ddim); - int inner_size = paddle_mobile::framework::product(inner_ddim); - - DLOG << "outsize=" << out_size; - DLOG << "innersize=" << inner_size; - #pragma omp parallel for - for (int i = 0; i < out_size; ++i) { - const float *input_outer_ptr = input + i * inner_size; - float *output_outer_ptr = output + i * inner_size; - int nn = inner_size >> 2; - int remain = inner_size - (nn << 2); - float32x4_t _one = vdupq_n_f32(1.f); - for (; nn > 0; nn--) { - float32x4_t data = vld1q_f32(input_outer_ptr); - data = vnegq_f32(data); - data = exp_ps(data); - data = vaddq_f32(data, _one); - float32x4_t out_data = vrecpeq_f32(data); - out_data = vmulq_f32(vrecpsq_f32(data, out_data), out_data); - vst1q_f32(output_outer_ptr, out_data); - - input_outer_ptr += 4; - output_outer_ptr += 4; - } - for (; remain > 0; remain--) { - *output_outer_ptr = 1.f / (1.f + exp(-*input_outer_ptr)); - output_outer_ptr++; - input_outer_ptr++; - } - } -#endif -} - template <> -bool SigmoidKernel::Init(const SigmoidParam ¶) const { +bool SigmoidKernel::Init(SigmoidParam *param) { return true; } template <> void SigmoidKernel::Compute(const SigmoidParam ¶m) const { - const Tensor *in_x = param.InputX(); - Tensor *out = param.Out(); - auto x_dims = in_x->dims(); - out->Resize(x_dims); - sigmoid(in_x, out); + SigmoidCompute(param); } template class SigmoidKernel; diff --git a/src/operators/kernel/arm/softmax_kernel.cpp b/src/operators/kernel/arm/softmax_kernel.cpp index 8e966aa0af9ac84b70b154b33bad7dad9e79121d..3ce763be38678319cfc23be83180450e5d3b209c 100644 --- a/src/operators/kernel/arm/softmax_kernel.cpp +++ b/src/operators/kernel/arm/softmax_kernel.cpp @@ -15,22 +15,19 @@ limitations under the License. */ #ifdef SOFTMAX_OP #include "../softmax_kernel.h" -#include "../../math/softmax.h" +#include "../central-arm-func/softmax_arm_func.h" +#include "operators/math/softmax.h" namespace paddle_mobile { namespace operators { template <> -bool SoftmaxKernel::Init(const SoftmaxParam ¶) const { +bool SoftmaxKernel::Init(SoftmaxParam *param) { return true; } template <> void SoftmaxKernel::Compute(const SoftmaxParam ¶m) const { - const Tensor *in_x = param.InputX(); - Tensor *out = param.Out(); - auto x_dims = in_x->dims(); - out->Resize(x_dims); - math::SoftmaxFuntor()(in_x, out); + SoftmaxCompute(param); } template class SoftmaxKernel; diff --git a/src/operators/kernel/arm/transpose_kernel.cpp b/src/operators/kernel/arm/transpose_kernel.cpp index a44ff22a2f228cc357c066a01e142de7cc4f2083..c358edd76e93cee3f8be6086a70c34671c87d383 100644 --- a/src/operators/kernel/arm/transpose_kernel.cpp +++ b/src/operators/kernel/arm/transpose_kernel.cpp @@ -14,72 +14,19 @@ limitations under the License. 
*/ #ifdef TRANSPOSE_OP #include "operators/kernel/transpose_kernel.h" +#include "operators/kernel/central-arm-func/transpose_arm_func.h" + namespace paddle_mobile { namespace operators { -// vector pos; -// template -// void TransposeFunc(const int numel, const T* input, const vector axis, -// const vector old_strides, const vector -// new_strides, T* output) { -// for (int i = 0; i < numel; ++i) { -// int old_idx = 0; -// int idx = i; -// for (int j = 0; j < axis.size(); ++j) { -// int order = axis[j]; -// old_idx += (idx / new_strides[j]) * old_strides[order]; -// idx %= new_strides[j]; -// } -// output[i] = input[old_idx]; -// } -// } - template <> -bool TransposeKernel::Init(const TransposeParam& para) const { +bool TransposeKernel::Init(TransposeParam *param) { return true; } template <> -void TransposeKernel::Compute(const TransposeParam& param) const { - const auto* input_x = param.InputX(); - const auto input_x_dims = input_x->dims(); - auto* out = param.Out(); - const auto axis = param.Axis(); - const auto* input_x_data = input_x->data(); - auto* out_data = out->mutable_data(); - - size_t ndim = axis.size(); - std::vector xdim(ndim); - std::vector xstride(ndim); - std::vector xout(ndim); - for (int i = 0; i < ndim; i++) { - int j = ndim - 1 - i; - xdim[j] = input_x_dims[axis[i]]; - xstride[j] = 1; - for (int k = axis[i] + 1; k < ndim; k++) { - xstride[j] *= input_x_dims[k]; - } - xout[j] = xstride[j] * xdim[j]; - } - - auto numel = input_x->numel(); - size_t pind = 0; - std::vector ind(ndim); - for (int i = 0; i < numel; i++) { - out_data[i] = input_x_data[pind]; - ind[0]++; - pind += xstride[0]; - for (int j = 0; j < ndim - 1; j++) { - if (ind[j] == xdim[j]) { - ind[j + 1]++; - ind[j] = 0; - pind += xstride[j + 1]; - pind -= xout[j]; - } else { - break; - } - } - } +void TransposeKernel::Compute(const TransposeParam ¶m) const { + TransposeCompute(param); } } // namespace operators diff --git a/src/operators/kernel/batchnorm_kernel.h b/src/operators/kernel/batchnorm_kernel.h index 6ef5329bc58fea8bfc17d9115b7004fed2bc4ed7..367dd0996c0df5fba7c3570285cf5e2cfd3fac99 100644 --- a/src/operators/kernel/batchnorm_kernel.h +++ b/src/operators/kernel/batchnorm_kernel.h @@ -29,7 +29,7 @@ class BatchNormKernel : public framework::OpKernelBase { public: void Compute(const BatchNormParam ¶m) const; - bool Init(const BatchNormParam ¶) const; + bool Init(BatchNormParam *param); }; } // namespace operators diff --git a/src/operators/kernel/box_coder_kernel.h b/src/operators/kernel/box_coder_kernel.h index 4c4206f52b3ffc5e60983bf1d6adb25292d01ac4..2ad63ecd90a07d955c3e239277ac1bd60f3510bb 100644 --- a/src/operators/kernel/box_coder_kernel.h +++ b/src/operators/kernel/box_coder_kernel.h @@ -30,7 +30,7 @@ class BoxCoderKernel : public framework::OpKernelBase { public: void Compute(const BoxCoderParam& param) const; - bool Init(const BoxCoderParam& para) const; + bool Init(BoxCoderParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/central-arm-func/batchnorm_arm_func.h b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h index 7f02d768b790b5f496ab0eac369fa3a4100ee733..2845e5f8b7d18e1c446e0eac73e730b59f9e4550 100644 --- a/src/operators/kernel/central-arm-func/batchnorm_arm_func.h +++ b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h @@ -16,6 +16,7 @@ limitations under the License. 
 */
 
 #pragma once
+#include <cmath>
 #include "operators/op_param.h"
 
 namespace paddle_mobile {
@@ -52,6 +53,8 @@ void BatchnormCompute(const BatchNormParam &param) {
                         "C must equal to variance.numel()");
 
   int HXW = H * W;
+
+#ifdef ARMV7
   if (HXW > 32) {
     int NXC = N * C;
     float *inv_std_ptr = new float[NXC * 4];
@@ -226,6 +229,37 @@ void BatchnormCompute(const BatchNormParam &param) {
 
     delete[] inv_std_ptr;
   }
+#endif
+  float *inv_std_ptr = new float[C];
+  for (int i = 0; i < C; i++) {
+    inv_std_ptr[i] =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+  }
+
+  Tensor new_scale;
+  auto new_scale_ptr =
+      new_scale.mutable_data<float>(framework::make_ddim({C}));
+  Tensor new_bias;
+  auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
+
+  /// (x - est_mean) * inv_var * scale + bias is equal to
+  /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
+  for (int i = 0; i < C; i++) {
+    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+    {
+      for (int n = 0; n < N; n++) {
+        for (int h = 0; h < H; h++) {
+          int tmp_index = n * stride0 + i * stride1 + h * stride2;
+          for (int w = 0; w < W; w++) {
+            int index = tmp_index + w;
+            out_ptr[index] =
+                input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
+          }
+        }
+      }
+    }
+  }
+  delete[] inv_std_ptr;
 }
 
 }  // namespace operators
diff --git a/src/operators/kernel/central-arm-func/box_coder_arm_func.h b/src/operators/kernel/central-arm-func/box_coder_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..eeb05f31b744c9e55e78375a495c5a5debf095c2
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/box_coder_arm_func.h
@@ -0,0 +1,140 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#ifdef BOXCODER_OP +#pragma once + +#include + +namespace paddle_mobile { +namespace operators { + +template +void EncodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, T* output) { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = + (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; + T target_box_center_y = + (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; + T target_box_width = + target_box_data[i * len + 2] - target_box_data[i * len]; + T target_box_height = + target_box_data[i * len + 3] - target_box_data[i * len + 1]; + + size_t offset = i * col * len + j * len; + output[offset] = (target_box_center_x - prior_box_center_x) / + prior_box_width / prior_box_var_data[j * len]; + output[offset + 1] = (target_box_center_y - prior_box_center_y) / + prior_box_height / prior_box_var_data[j * len + 1]; + output[offset + 2] = + std::log(std::fabs(target_box_width / prior_box_width)) / + prior_box_var_data[j * len + 2]; + output[offset + 3] = + std::log(std::fabs(target_box_height / prior_box_height)) / + prior_box_var_data[j * len + 3]; + } + } +} + +template +void DecodeCenterSize(const framework::Tensor& target_box, + const framework::Tensor& prior_box, + const framework::Tensor& prior_box_var, T* output) { + int64_t row = target_box.dims()[0]; + int64_t col = prior_box.dims()[0]; + int64_t len = prior_box.dims()[1]; + + auto* target_box_data = target_box.data(); + auto* prior_box_data = prior_box.data(); + auto* prior_box_var_data = prior_box_var.data(); + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + size_t offset = i * col * len + j * len; + T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len]; + T prior_box_height = + prior_box_data[j * len + 3] - prior_box_data[j * len + 1]; + T prior_box_center_x = + (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2; + T prior_box_center_y = + (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2; + + T target_box_center_x = prior_box_var_data[j * len] * + target_box_data[offset] * prior_box_width + + prior_box_center_x; + T target_box_center_y = prior_box_var_data[j * len + 1] * + target_box_data[offset + 1] * + prior_box_height + + prior_box_center_y; + T target_box_width = std::exp(prior_box_var_data[j * len + 2] * + target_box_data[offset + 2]) * + prior_box_width; + T target_box_height = std::exp(prior_box_var_data[j * len + 3] * + target_box_data[offset + 3]) * + prior_box_height; + + output[offset] = target_box_center_x - target_box_width / 2; + output[offset + 1] = target_box_center_y - target_box_height / 2; + output[offset + 2] = target_box_center_x + target_box_width / 2; + output[offset + 3] = target_box_center_y + target_box_height / 2; + } + } +} + +template +void BoxCoderCompute(const BoxCoderParam& param) { + const 
auto* input_priorbox = param.InputPriorBox();
+  const auto* input_priorboxvar = param.InputPriorBoxVar();
+  const auto* input_targetbox = param.InputTargetBox();
+
+  const auto& code_type = param.CodeType();
+
+  auto row = input_targetbox->dims()[0];
+  auto col = input_priorbox->dims()[0];
+  auto len = input_priorbox->dims()[1];
+
+  Tensor* output_box = param.OutputBox();
+  auto* output_box_dataptr = output_box->mutable_data<float>({row, col, len});
+
+  if (code_type == "encode_center_size") {
+    EncodeCenterSize<float>(*input_targetbox, *input_priorbox,
+                            *input_priorboxvar, output_box_dataptr);
+  }
+  if (code_type == "decode_center_size") {
+    DecodeCenterSize<float>(*input_targetbox, *input_priorbox,
+                            *input_priorboxvar, output_box_dataptr);
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/concat_arm_func.h b/src/operators/kernel/central-arm-func/concat_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9926505b33b32ee83a16f882cc0f775797f154a
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/concat_arm_func.h
@@ -0,0 +1,90 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONCAT_OP
+#pragma once
+
+#include <vector>
+
+namespace paddle_mobile {
+namespace operators {
+template <typename T>
+class ConcatFunctor {
+ public:
+  void operator()(const std::vector<framework::Tensor> &input, const int axis,
+                  framework::Tensor *output) {
+    size_t num = input.size();
+    int rows = 1;
+    auto dim_0 = input[0].dims();
+    for (int i = 0; i < axis; ++i) {
+      rows *= dim_0[i];
+    }
+    int out_rows = rows, out_cols = 0;
+
+    std::vector<int> input_cols(input.size());
+    for (int i = 0; i < num; ++i) {
+      int t_cols = input[i].numel() / rows;
+      out_cols += t_cols;
+      input_cols[i] = t_cols;
+    }
+
+    // computation
+    for (int k = 0; k < out_rows; ++k) {
+      T *dst_ptr = output->data<T>() + k * out_cols;
+      int col_idx = 0;
+      for (int j = 0; j < num; ++j) {
+        int col_len = input_cols[j];
+        const T *src_ptr = input[j].data<T>() + k * col_len;
+        memory::Copy(dst_ptr + col_idx, src_ptr, sizeof(T) * col_len);
+        col_idx += col_len;
+      }
+    }
+  }
+};
+
+template <typename P>
+void ConcatCompute(const ConcatParam &param) {
+  auto inputs = param.Inputs();
+  auto *out = param.Out();
+  int64_t axis = param.Axis();
+  out->mutable_data<float>();
+
+  /// Sometimes direct copies will be faster; whether that holds in general
+  /// still needs deeper analysis.
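// To make the fast path below concrete: when axis == 0 the inputs are simply
// stacked head-to-tail in memory, so one bulk copy per input suffices. A
// hedged, standalone sketch (plain float buffers and std::memcpy standing in
// for Tensor and memory::Copy; ConcatAxis0 is an illustrative name):
#include <cstddef>
#include <cstring>
#include <vector>

void ConcatAxis0(const std::vector<const float *> &srcs,
                 const std::vector<size_t> &sizes, float *dst) {
  size_t offset = 0;
  for (size_t i = 0; i < srcs.size(); ++i) {
    // each input begins where the previous one ended
    std::memcpy(dst + offset, srcs[i], sizes[i] * sizeof(float));
    offset += sizes[i];
  }
}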
+ if (axis == 0 && inputs.size() < 10) { + size_t output_offset = 0; + for (auto *in : inputs) { + auto in_stride = framework::stride_numel(in->dims()); + auto out_stride = framework::stride_numel(out->dims()); + auto dst = out->data() + output_offset; + auto src = in->data(); + PADDLE_MOBILE_ENFORCE( + in_stride.size() == out_stride.size(), + "src and dst tensor should have the same dims size."); + memory::Copy(dst, src, sizeof(float) * in_stride[0]); + output_offset += in_stride[0]; + } + } else { + std::vector inputs_concat(inputs.size()); + for (int j = 0; j < inputs.size(); ++j) { + inputs_concat[j] = *inputs[j]; + } + ConcatFunctor concat_functor; + concat_functor(inputs_concat, static_cast(axis), out); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..ed6dc46a90f2b6fa73555b3575f24103a34d1dda --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADD_OP +#pragma once + +#include +#include "operators/math/conv_func.h" +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +void ConvAddBasic(const FusionConvAddParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + int axis = param.Axis(); + Tensor *output = param.Output(); + math::expand_bias(bias, axis, output->dims()); + output->ShareDataWith(bias); + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); 
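// Shape bookkeeping for the im2col + GEMM scheme set up above, as a hedged
// sketch: with NCHW input and an [out_c, in_c/groups, kh, kw] filter, each
// group's GEMM multiplies an [out_c/groups x K] filter slice by a
// [K x (oh * ow)] col matrix, where K = (in_c/groups) * kh * kw. The spatial
// extents follow the standard convolution formula below (ConvOutSize is an
// illustrative helper, not a function of this codebase).

// output extent along one spatial axis
inline int ConvOutSize(int in, int kernel, int stride, int pad, int dilation) {
  return (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
}
// e.g. ConvOutSize(224, 3, 2, 1, 1) == 112, the usual stride-2 halving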
+ + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(1)); + } + } +} + +template +void ConvAddCompute(const FusionConvAddParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + param.Bias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3) { + math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), + param.Filter(), param.Bias(), param.Output(), true); + } else { + ConvAddBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h b/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h new file mode 100644 index 0000000000000000000000000000000000000000..13fe50bf74ee164c2cc663f5a6a9eeddbfa3804b --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVADDBNRELU_OP + +#pragma once +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +void ConvAddBNReluBasic(const FusionConvAddBNReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + auto new_bias_ptr = new_bias.data(); + auto new_scale_ptr = new_scale.data(); + int axis = param.Axis(); + Tensor *output = param.Output(); + math::expand_bias(bias, axis, output->dims()); + output->ShareDataWith(bias); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(1)); + } + } + /// todo : use neon in special case instead of 2for(300ms) + auto output_ptr = output->data(); + for (int c = 0; c < output_matrix_shape[0]; c++) { + int start = c * output_matrix_shape[1]; + for (int j = 0; j < output_matrix_shape[1]; j++) { + output_ptr[start + j] = + 
output_ptr[start + j] * new_scale_ptr[c] + new_bias_ptr[c]; + output_ptr[start + j] = + output_ptr[start + j] < 0 ? 0 : output_ptr[start + j]; + } + } +} +template +void ConvAddBNReluCompute(const FusionConvAddBNReluParam ¶m) { + Tensor Bias; + Bias.mutable_data({param.Groups()}); + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConvAddBNRelu3x3s1p1( + param.Input(), param.Filter(), param.Output(), &Bias, 1, + param.NewScale(), param.NewBias(), 1, 1); + } else if (0 && param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), + param.Filter(), &Bias, param.Output(), false); + } else { + ConvAddBNReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h index d08eebe5493bd9026073c3349631a42024579b95..33caded3afaaf125bac9108f2fafeda3d3c2049f 100644 --- a/src/operators/kernel/central-arm-func/conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_arm_func.h @@ -17,13 +17,15 @@ limitations under the License. */ #pragma once #include #include "operators/math/conv_func.h" +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { - -template -void ConvCompute(const ConvParam ¶m) { +inline void ConvBasic(const ConvParam ¶m) { const Tensor *input = param.Input(); Tensor filter = *param.Filter(); Tensor *output = param.Output(); @@ -109,6 +111,25 @@ void ConvCompute(const ConvParam ¶m) { } } +template +void ConvCompute(const ConvParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + nullptr, false); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3) { + math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), + param.Filter(), nullptr, param.Output(), false); + } else { + ConvBasic(param); + } +} + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h index e43e3664cb005bab4d3c5ec8b5b35bd6925c982d..885f2051f645546c2585caa72aa9c80f8d352e6c 100644 --- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h @@ -15,8 +15,10 @@ limitations under the License. 
*/ #ifdef DEPTHWISECONV_OP #pragma once +#include #include -#include "operators/math/conv_func.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" + #include "operators/op_param.h" namespace paddle_mobile { @@ -24,89 +26,21 @@ namespace operators { template void DepthwiseConvCompute(const ConvParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor *output = param.Output(); - output->mutable_data(); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - // DLOG << " compute end get Attrs " << strides[0]; - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0)); - } + Tensor Bias; + Bias.mutable_data({param.Groups()}); + if (param.Groups() == param.Input()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + &Bias, false); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == 
param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), + param.Filter(), &Bias, param.Output(), false); + } else { + ConvBasic(param); } } diff --git a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..8b3f5d0a8083b63334319b2054f9bf463efa66c7 --- /dev/null +++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ELEMENTWISEADD_OP + +#pragma once + +namespace paddle_mobile { +namespace operators { + +template +struct AddFunctor { + inline T operator()(T a, T b) const { return a + b; } +}; + +template +void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + Tensor *Out = param.Out(); + Out->mutable_data(); + int axis = param.Axis(); + ElementwiseComputeEx, float>(input_x, input_y, axis, + AddFunctor(), Out); +} + +template class ElementwiseAddKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..8a01f554140712c6a941b40372cbcfe35a951ce7 --- /dev/null +++ b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_FC_OP + +#pragma once + +namespace paddle_mobile { +namespace operators { + +template +void FusionFcCompute(const FusionFcParam ¶m) { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + const Tensor *input_z = param.InputZ(); + auto *input_z_data = input_z->data(); + int axis = param.Axis(); + Tensor *out = param.Out(); + auto *out_data = out->mutable_data(); + const Tensor x_matrix = + input_x->dims().size() > 2 + ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) + : *input_x; + const Tensor y_matrix = + input_y->dims().size() > 2 + ? 
framework::ReshapeToMatrix(*input_y, param.YNumColDims())
+          : *input_y;
+  auto out_dim = out->dims();
+  if (out_dim.size() != 2) {
+    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+  }
+  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
+  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "input_z size must be 1");
+  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
+                        " out_dim[1] must equal input_z's numel.");
+  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
+  PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
+
+  // broadcast the bias input_z into every output row, then accumulate the
+  // matmul on top of it (beta = 1)
+  int64_t classes = input_z->numel();
+  for (int i = 0; i < out_dim[0]; i++) {
+    memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
+  }
+
+  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+                      out, static_cast<float>(1));
+  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
+  // if (out_dim.size() != 2) {
+  //   out->Resize(out_dim);
+  // }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/lrn_arm_func.h b/src/operators/kernel/central-arm-func/lrn_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..52bb1b67dee83c28f513649a8763034a8d538d73
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/lrn_arm_func.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef LRN_OP
+
+#pragma once
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename P>
+void LrnCompute(const LrnParam &param) {
+  const Tensor *input_x = param.InputX();
+  auto x_dims = input_x->dims();
+  Tensor *out = param.Out();
+  out->mutable_data<float>();
+  /// data_format = NCHW
+  const int N = x_dims[0];
+  const int C = x_dims[1];
+  const int H = x_dims[2];
+  const int W = x_dims[3];
+
+  const int n = param.N();
+  const float alpha = param.Alpha();
+  const float beta = param.Beta();
+  const float k = param.K();
+  LRNFunctor<float> lrnFunctor;
+  lrnFunctor(*input_x, out, N, C, H, W, n, k, alpha, beta);
+}
+
+template class LrnKernel<CPU, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/mul_arm_func.h b/src/operators/kernel/central-arm-func/mul_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..9dfb1f48a574156f1b026fc6af3a03d77b81263f
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/mul_arm_func.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef MUL_OP
+
+#pragma once
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename P>
+void MulCompute(const MulParam &param) {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  Tensor *out = param.Out();
+  out->mutable_data<float>();
+  const Tensor x_matrix =
+      input_x->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
+          : *input_x;
+  const Tensor y_matrix =
+      input_y->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
+          : *input_y;
+  auto out_dim = out->dims();
+  if (out_dim.size() != 2) {
+    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+  }
+  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+                      out, static_cast<float>(0));
+  if (out_dim.size() != 2) {
+    out->Resize(out_dim);
+  }
+}
+
+template class MulKernel<CPU, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h b/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..8833f012d97390e758ac6fc394ef237cb86632b1
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h
@@ -0,0 +1,280 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef MULTICLASSNMS_OP
+#pragma once
+
+#include <algorithm>
+#include <map>
+#include <utility>
+#include <vector>
+
+namespace paddle_mobile {
+namespace operators {
+
+constexpr int kOutputDim = 6;
+constexpr int kBBoxSize = 4;
+
+template <typename T>
+bool SortScorePairDescend(const std::pair<float, T>& pair1,
+                          const std::pair<float, T>& pair2) {
+  return pair1.first > pair2.first;
+}
+
+template <typename T>
+static inline void GetMaxScoreIndex(
+    const std::vector<T>& scores, const T threshold, int top_k,
+    std::vector<std::pair<T, int>>* sorted_indices) {
+  for (size_t i = 0; i < scores.size(); ++i) {
+    if (scores[i] > threshold) {
+      sorted_indices->push_back(std::make_pair(scores[i], i));
+    }
+  }
+  // Sort the score pair according to the scores in descending order
+  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
+                   SortScorePairDescend<int>);
+  // Keep top_k scores if needed.
+  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
+    sorted_indices->resize(top_k);
+  }
+}
+
+template <typename T>
+static inline T BBoxArea(const T* box, const bool normalized) {
+  if (box[2] < box[0] || box[3] < box[1]) {
+    // If coordinate values are invalid
+    // (e.g. xmax < xmin or ymax < ymin), return 0.
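+    // (BBoxArea treats such degenerate boxes as empty. For valid boxes the
+    // unnormalized branch below adds 1 to width and height because integer
+    // pixel coordinates are inclusive, and JaccardOverlap then combines the
+    // two areas into IoU = inter_area / (area1 + area2 - inter_area).)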
+ return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T* box1, const T* box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline void NMSFast(const Tensor& bbox, const Tensor& scores, + const T score_threshold, const T nms_threshold, + const T eta, const int64_t top_k, + std::vector* selected_indices) { + // The total boxes for each instance. + int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void MultiClassNMS(const Tensor& scores, const Tensor& bboxes, + std::map>* indices, int* num_nmsed_out, + const int& background_label, const int& nms_top_k, + const int& keep_top_k, const T& nms_threshold, + const T& nms_eta, const T& score_threshold) { + int64_t class_num = scores.dims()[0]; + int64_t predict_dim = scores.dims()[1]; + int num_det = 0; + for (int64_t c = 0; c < class_num; ++c) { + if (c == background_label) continue; + Tensor score = scores.Slice(c, c + 1); + /// [c] is key + NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, + nms_top_k, &((*indices)[c])); + num_det += (*indices)[c].size(); + } + + *num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > keep_top_k) { + std::vector>> score_index_pairs; + for (const auto& it : *indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + // PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. 
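+      // (All (score, (label, index)) pairs are pooled across classes, stably
+      // sorted by score, and truncated to keep_top_k; the surviving pairs are
+      // regrouped per label into new_indices below.)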
+
+template <class T>
+void MultiClassNMS(const Tensor& scores, const Tensor& bboxes,
+                   std::map<int, std::vector<int>>* indices,
+                   int* num_nmsed_out, const int& background_label,
+                   const int& nms_top_k, const int& keep_top_k,
+                   const T& nms_threshold, const T& nms_eta,
+                   const T& score_threshold) {
+  int64_t class_num = scores.dims()[0];
+  int64_t predict_dim = scores.dims()[1];
+  int num_det = 0;
+  for (int64_t c = 0; c < class_num; ++c) {
+    if (c == background_label) continue;
+    Tensor score = scores.Slice(c, c + 1);
+    /// [c] is key
+    NMSFast<T>(bboxes, score, score_threshold, nms_threshold, nms_eta,
+               nms_top_k, &((*indices)[c]));
+    num_det += (*indices)[c].size();
+  }
+
+  *num_nmsed_out = num_det;
+  const T* scores_data = scores.data<T>();
+  if (keep_top_k > -1 && num_det > keep_top_k) {
+    std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+    for (const auto& it : *indices) {
+      int label = it.first;
+      const T* sdata = scores_data + label * predict_dim;
+      const std::vector<int>& label_indices = it.second;
+      for (size_t j = 0; j < label_indices.size(); ++j) {
+        int idx = label_indices[j];
+        //        PADDLE_ENFORCE_LT(idx, predict_dim);
+        score_index_pairs.push_back(
+            std::make_pair(sdata[idx], std::make_pair(label, idx)));
+      }
+    }
+    // Keep top k results per image.
+    std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
+                     SortScorePairDescend<std::pair<int, int>>);
+    score_index_pairs.resize(keep_top_k);
+
+    // Store the new indices.
+    std::map<int, std::vector<int>> new_indices;
+    for (size_t j = 0; j < score_index_pairs.size(); ++j) {
+      int label = score_index_pairs[j].second.first;
+      int idx = score_index_pairs[j].second.second;
+      new_indices[label].push_back(idx);
+    }
+    new_indices.swap(*indices);
+    *num_nmsed_out = keep_top_k;
+  }
+}
+
+template <class T>
+void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
+                      const std::map<int, std::vector<int>>& selected_indices,
+                      Tensor* outs) {
+  int predict_dim = scores.dims()[1];
+  auto* scores_data = scores.data<T>();
+  auto* bboxes_data = bboxes.data<T>();
+  auto* odata = outs->data<T>();
+
+  int count = 0;
+  for (const auto& it : selected_indices) {
+    /// one batch
+    int label = it.first;
+    const T* sdata = scores_data + label * predict_dim;
+    const std::vector<int>& indices = it.second;
+    for (size_t j = 0; j < indices.size(); ++j) {
+      int idx = indices[j];
+      const T* bdata = bboxes_data + idx * kBBoxSize;
+      odata[count * kOutputDim] = label;           // label
+      odata[count * kOutputDim + 1] = sdata[idx];  // score
+      // xmin, ymin, xmax, ymax
+      std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
+      count++;
+    }
+  }
+}
+
+template <typename P>
+void MultiClassNMSCompute(const MultiClassNMSParam& param) {
+  const auto* input_bboxes = param.InputBBoxes();
+  const auto& input_bboxes_dims = input_bboxes->dims();
+
+  const auto* input_scores = param.InputScores();
+  const auto& input_scores_dims = input_scores->dims();
+
+  auto* outs = param.Out();
+  auto background_label = param.BackGroundLabel();
+  auto nms_top_k = param.NMSTopK();
+  auto keep_top_k = param.KeepTopK();
+  auto nms_threshold = param.NMSThreshold();
+  auto nms_eta = param.NMSEta();
+  auto score_threshold = param.ScoreThreshold();
+
+  int64_t batch_size = input_scores_dims[0];
+  int64_t class_num = input_scores_dims[1];
+  int64_t predict_dim = input_scores_dims[2];
+  int64_t box_dim = input_bboxes_dims[2];
+
+  std::vector<std::map<int, std::vector<int>>> all_indices;
+  std::vector<size_t> batch_starts = {0};
+  for (int64_t i = 0; i < batch_size; ++i) {
+    Tensor ins_score = input_scores->Slice(i, i + 1);
+    ins_score.Resize({class_num, predict_dim});
+
+    Tensor ins_boxes = input_bboxes->Slice(i, i + 1);
+    ins_boxes.Resize({predict_dim, box_dim});
+
+    std::map<int, std::vector<int>> indices;
+    int num_nmsed_out = 0;
+    MultiClassNMS<float>(ins_score, ins_boxes, &indices, &num_nmsed_out,
+                         background_label, nms_top_k, keep_top_k,
+                         nms_threshold, nms_eta, score_threshold);
+    all_indices.push_back(indices);
+    batch_starts.push_back(batch_starts.back() + num_nmsed_out);
+  }
+
+  int num_kept = batch_starts.back();
+  if (num_kept == 0) {
+    float* od = outs->mutable_data<float>({1});
+    od[0] = -1;
+  } else {
+    outs->mutable_data<float>({num_kept, kOutputDim});
+    for (int64_t i = 0; i < batch_size; ++i) {
+      Tensor ins_score = input_scores->Slice(i, i + 1);
+      ins_score.Resize({class_num, predict_dim});
+
+      Tensor ins_boxes = input_bboxes->Slice(i, i + 1);
+      ins_boxes.Resize({predict_dim, box_dim});
+
+      int64_t s = batch_starts[i];
+      int64_t e = batch_starts[i + 1];
+      if (e > s) {
+        Tensor out = outs->Slice(s, e);
+        MultiClassOutput<float>(ins_score, ins_boxes, all_indices[i], &out);
+      }
+    }
+  }
+
+  //  framework::LoD lod;
+  //  lod.emplace_back(batch_starts);
+  //
+  //  outs->set_lod(lod);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
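The output tensor built by `MultiClassOutput` is a flat `[num_kept, 6]` buffer of `[label, score, xmin, ymin, xmax, ymax]` rows. A hedged sketch of a consumer (`dump_detections` is hypothetical, not a paddle-mobile API):

```cpp
#include <cstdio>

// Each detection row produced by MultiClassOutput has kOutputDim = 6 floats.
void dump_detections(const float* out, int num_kept) {
  for (int i = 0; i < num_kept; ++i) {
    const float* row = out + i * 6;
    std::printf("label=%d score=%.3f box=[%.3f, %.3f, %.3f, %.3f]\n",
                static_cast<int>(row[0]), row[1], row[2], row[3], row[4],
                row[5]);
  }
}

int main() {
  // One fake detection: label 1, score 0.9, unit box.
  const float out[6] = {1.f, 0.9f, 0.f, 0.f, 1.f, 1.f};
  dump_detections(out, 1);
  return 0;
}
```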
diff --git a/src/operators/kernel/central-arm-func/pool_arm_func.h b/src/operators/kernel/central-arm-func/pool_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..892dca2ea40d40484b4c32a57f8633849cc9d038
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/pool_arm_func.h
@@ -0,0 +1,95 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef POOL_OP
+#pragma once
+
+#include <string>
+#include <vector>
+#include "operators/math/pooling.h"
+
+namespace paddle_mobile {
+namespace operators {
+using framework::Tensor;
+
+inline void PoolBasic(std::string pooling_type, std::vector<int> ksize,
+                      std::vector<int> strides, std::vector<int> paddings,
+                      const Tensor *in_x, Tensor *out) {
+  if (pooling_type == "max") {
+    math::PoolFunctor<CPU, math::MaxPool<float>, float> pool2d_forward;
+    math::MaxPool<float> pool_process;
+    pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
+
+  } else if (pooling_type == "avg") {
+    math::PoolFunctor<CPU, math::AvgPool<float>, float> pool2d_forward;
+    math::AvgPool<float> pool_process;
+    pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
+  }
+}
+
+template <typename P>
+void PoolCompute(const PoolParam &param) {
+  const Tensor *in_x = param.Input();
+  Tensor *out = param.Output();
+  std::string pooling_type = param.PoolingType();
+
+  std::vector<int> ksize = param.Ksize();
+
+  std::vector<int> strides = param.Strides();
+
+  std::vector<int> paddings = param.Paddings();
+  if (ksize.size() != 2) {
+    LOG(paddle_mobile::LogLevel::kLOG_ERROR)
+        << "Pool op only supports 2D and 3D input.";
+  }
+
+  if (param.isGlobalPooling()) {
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      paddings[i] = 0;
+      ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+    }
+  } else if (ksize[0] == 3 && ksize[0] == ksize[1]) {
+    if (pooling_type == "max") {
+      if (strides[0] == strides[1] && strides[0] == 1 &&
+          paddings[0] == paddings[1] && paddings[1] == 1) {
+        math::Pool3x3Maxs1p1(in_x, out);
+      } else {
+        math::Pool3x3Max(strides, paddings, in_x, out);
+      }
+    } else if (pooling_type == "avg") {
+      if (strides[0] == strides[1] && strides[0] == 1 &&
+          paddings[0] == paddings[1] && paddings[1] == 1) {
+        math::Pool3x3Avgs1p1(in_x, out);
+      } else {
+        math::Pool3x3Avg(strides, paddings, in_x, out);
+      }
+    }
+
+  } else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
+#ifndef IOS
+    if (pooling_type == "max") {
+      math::Pool2x2Max(strides, paddings, in_x, out);
+    } else if (pooling_type == "avg") {
+      math::Pool2x2Avg(strides, paddings, in_x, out);
+    }
+#else
+    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+#endif
+  } else {
+    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
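To make the dispatch in `PoolCompute` easier to follow, this aside restates its branch conditions as a standalone function (illustrative only; the real kernel calls the `math::Pool*` routines directly):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Returns which implementation a given pooling config would take,
// mirroring the if/else ladder in PoolCompute above.
std::string pick_pool_path(const std::vector<int>& ksize,
                           const std::vector<int>& strides,
                           const std::vector<int>& paddings) {
  if (ksize[0] == 3 && ksize[0] == ksize[1]) {
    if (strides[0] == strides[1] && strides[0] == 1 &&
        paddings[0] == paddings[1] && paddings[1] == 1) {
      return "3x3, stride 1, pad 1 (fast path)";
    }
    return "generic 3x3";
  }
  if (ksize[0] == 2 && ksize[0] == ksize[1]) return "2x2";
  return "PoolBasic fallback";
}

int main() {
  assert(pick_pool_path({3, 3}, {1, 1}, {1, 1}) ==
         "3x3, stride 1, pad 1 (fast path)");
  assert(pick_pool_path({5, 5}, {1, 1}, {0, 0}) == "PoolBasic fallback");
  return 0;
}
```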
diff --git a/src/operators/kernel/central-arm-func/prior_box_arm_func.h b/src/operators/kernel/central-arm-func/prior_box_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..892dceb9254ac423d3591a0fc9e9347bc375831b
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/prior_box_arm_func.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PRIORBOX_OP
+#pragma once
+
+#include <cmath>
+#include <vector>
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename T>
+struct ClipFunctor {
+  inline T operator()(T in) const {
+    return std::min<T>(std::max<T>(in, 0.), 1.);
+  }
+};
+
+template <typename P>
+void PriorBoxCompute(const PriorBoxParam &param) {
+  const auto *input_ = param.Input();
+  const auto &input_dims = input_->dims();
+
+  const auto *input_image = param.InputImage();
+  const auto &input_image_dims = input_image->dims();
+
+  const auto &min_sizes = param.MinSizes();
+  const auto &max_sizes = param.MaxSizes();
+  const auto &variances = param.Variances();
+  const auto &input_aspect_ratio = param.AspectRatios();
+  const bool &flip = param.Flip();
+  const bool &clip = param.Clip();
+  const float &step_w = param.StepW();
+  const float &step_h = param.StepH();
+  const float &offset = param.Offset();
+
+  Tensor *output_boxes = param.OutputBoxes();
+  auto output_boxes_dataptr = output_boxes->mutable_data<float>();
+  Tensor *output_variances = param.OutputVariances();
+  auto output_variances_dataptr = output_variances->mutable_data<float>();
+
+  std::vector<float> aspect_ratios;
+  ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
+
+  auto img_width = input_image_dims[3];
+  auto img_height = input_image_dims[2];
+
+  auto feature_width = input_dims[3];
+  auto feature_height = input_dims[2];
+
+  auto stride0 = output_boxes->dims()[1] * output_boxes->dims()[2] *
+                 output_boxes->dims()[3];
+  auto stride1 = output_boxes->dims()[2] * output_boxes->dims()[3];
+  auto stride2 = output_boxes->dims()[3];
+
+  float step_width, step_height;
+  /// e.g. 300 / 19
+  if (step_w == 0 || step_h == 0) {
+    step_width = static_cast<float>(img_width) / feature_width;
+    step_height = static_cast<float>(img_height) / feature_height;
+  } else {
+    step_width = step_w;
+    step_height = step_h;
+  }
+
+  int num_priors = aspect_ratios.size() * min_sizes.size();
+  if (!max_sizes.empty()) {
+    num_priors += max_sizes.size();
+  }
+
+  for (int h = 0; h < feature_height; ++h) {
+    for (int w = 0; w < feature_width; ++w) {
+      /// map back to the original image
+      float center_x = (w + offset) * step_width;
+      float center_y = (h + offset) * step_height;
+      float box_width, box_height;
+      int idx = 0;
+      for (size_t s = 0; s < min_sizes.size(); ++s) {
+        auto min_size = min_sizes[s];
+        // priors with different aspect ratios
+        for (float ar : aspect_ratios) {
+          box_width = min_size * sqrt(ar) / 2.;
+          box_height = min_size / sqrt(ar) / 2.;
+          /// halving box_width and dividing by img_width yield the box's
+          /// normalized position relative to the original image.
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] =
+              (center_x - box_width) / img_width;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] =
+              (center_y - box_height) / img_height;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] =
+              (center_x + box_width) / img_width;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] =
+              (center_y + box_height) / img_height;
+          idx++;
+        }
+        if (!max_sizes.empty()) {
+          auto max_size = max_sizes[s];
+          // square prior with size sqrt(minSize * maxSize)
+          box_width = box_height = sqrt(min_size * max_size) / 2.;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] =
+              (center_x - box_width) / img_width;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] =
+              (center_y - box_height) / img_height;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] =
+              (center_x + box_width) / img_width;
+          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] =
+              (center_y + box_height) / img_height;
+          idx++;
+        }
+      }
+    }
+  }
+  if (clip) {
+    math::Transform trans;
+    ClipFunctor<float> clip_func;
+    trans(output_boxes_dataptr, output_boxes_dataptr + output_boxes->numel(),
+          output_boxes_dataptr, clip_func);
+  }
+
+  if ((variances.size() != 4)) {
+    LOG(kLOG_ERROR) << " variances.size() must be 4.";
+  }
+
+  int64_t box_num = feature_height * feature_width * num_priors;
+
+  for (int i = 0; i < box_num; i++) {
+    output_variances_dataptr[4 * i] = variances[0];
+    output_variances_dataptr[4 * i + 1] = variances[1];
+    output_variances_dataptr[4 * i + 2] = variances[2];
+    output_variances_dataptr[4 * i + 3] = variances[3];
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
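The prior count bookkeeping above is simple arithmetic; a small self-check (the layer sizes below are hypothetical, not taken from the patch):

```cpp
#include <cassert>
#include <cstddef>

// Priors per cell = |aspect_ratios| * |min_sizes| + |max_sizes|,
// matching the num_priors computation in PriorBoxCompute.
size_t priors_per_cell(size_t num_aspect_ratios, size_t num_min_sizes,
                       size_t num_max_sizes) {
  return num_aspect_ratios * num_min_sizes + num_max_sizes;
}

int main() {
  // A hypothetical layer: 4 expanded aspect ratios, 1 min size, 1 max size.
  assert(priors_per_cell(4, 1, 1) == 5);
  // Total boxes for a 19x19 feature map:
  assert(19 * 19 * priors_per_cell(4, 1, 1) == 1805);
  return 0;
}
```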
\n\t" + // + // "subs %[num], %[num], #32 \n\t" + // "bge loop_num_%= \n\t" + // "end_num_%=: \n\t" + // "cmp %[num], #0 \n\t" + // "bge end_%= \n\t" + // "mov r6, #4 \n\t" + // "mul r5, %[num], r6 \n\t" + // "add %[input_x_ptr], %[input_x_ptr], r5 \n\t" + // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" + // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" + // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" + // "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" + // "vmax.f32 q0, q0, q8 \n\t" + // "vmax.f32 q1, q1, q8 \n\t" + // "vmax.f32 q2, q2, q8 \n\t" + // "vmax.f32 q3, q3, q8 \n\t" + // "vmax.f32 q4, q4, q8 \n\t" + // "vmax.f32 q5, q5, q8 \n\t" + // "vmax.f32 q6, q6, q8 \n\t" + // "vmax.f32 q7, q7, q8 \n\t" + // "add %[out_ptr], %[out_ptr], r5 \n\t" + // "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" + // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" + // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" + // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" + // "end_%=: \n\t" + // : + // : + // [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] + // "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + // "q7", "q8", "r5", + // "r6"); + // } else { + ReluFunctor func_; + math::Transform trans; + trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_); + // } +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/reshape_arm_func.h b/src/operators/kernel/central-arm-func/reshape_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..a2fb836257418923f41e94ceaf499e38033c6b4c --- /dev/null +++ b/src/operators/kernel/central-arm-func/reshape_arm_func.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RESHAPE_OP +#pragma once + +#include + +namespace paddle_mobile { +namespace operators { + +template +void ReshapeCompute(const ReshapeParam ¶m) { + const auto *input_x = param.InputX(); + const auto &input_x_dims = input_x->dims(); + auto *out = param.Out(); + framework::DDim out_dims = out->dims(); + const auto *input_shape = param.InputShape(); + + if (input_shape) { + auto *shape_data = input_shape->data(); + framework::Tensor cpu_shape_tensor; + auto shape = + std::vector(shape_data, shape_data + input_shape->numel()); + out_dims = ValidateShape(shape, input_x->dims()); + } + + bool inplace = param.Inplace(); + out->Resize(out_dims); + if (!inplace) { + out->mutable_data(); + framework::TensorCopy(*input_x, out); + out->Resize(out_dims); + } else { + out->ShareDataWith(*input_x); + out->Resize(out_dims); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/sigmoid_arm_func.h b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..eb0e4ab7e4b4f18f8ede4d85b859e68f7d58bda2 --- /dev/null +++ b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
diff --git a/src/operators/kernel/central-arm-func/sigmoid_arm_func.h b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb0e4ab7e4b4f18f8ede4d85b859e68f7d58bda2
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SIGMOID_OP
+#pragma once
+
+#include "operators/op_param.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#include "operators/math/math_func_neon.h"
+#endif
+
+namespace paddle_mobile {
+namespace operators {
+using framework::DDim;
+void sigmoid(const Tensor *X, Tensor *Y) {
+#if __ARM_NEON
+  const float *input = X->data<float>();
+  float *output = Y->mutable_data<float>();
+  const DDim &dDim = X->dims();
+  int axis_index = 1;
+  if (dDim.size() < 4) {
+    axis_index = 0;
+  }
+  DDim outer_ddim =
+      paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1);
+  DDim inner_ddim =
+      paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size());
+  int out_size = paddle_mobile::framework::product(outer_ddim);
+  int inner_size = paddle_mobile::framework::product(inner_ddim);
+
+  DLOG << "outsize=" << out_size;
+  DLOG << "innersize=" << inner_size;
+#pragma omp parallel for
+  for (int i = 0; i < out_size; ++i) {
+    const float *input_outer_ptr = input + i * inner_size;
+    float *output_outer_ptr = output + i * inner_size;
+    int nn = inner_size >> 2;
+    int remain = inner_size - (nn << 2);
+    float32x4_t _one = vdupq_n_f32(1.f);
+    for (; nn > 0; nn--) {
+      float32x4_t data = vld1q_f32(input_outer_ptr);
+      data = vnegq_f32(data);
+      data = exp_ps(data);
+      data = vaddq_f32(data, _one);
+      float32x4_t out_data = vrecpeq_f32(data);
+      out_data = vmulq_f32(vrecpsq_f32(data, out_data), out_data);
+      vst1q_f32(output_outer_ptr, out_data);
+
+      input_outer_ptr += 4;
+      output_outer_ptr += 4;
+    }
+    for (; remain > 0; remain--) {
+      *output_outer_ptr = 1.f / (1.f + exp(-*input_outer_ptr));
+      output_outer_ptr++;
+      input_outer_ptr++;
+    }
+  }
+#endif
+}
+
+template <typename P>
+void SigmoidCompute(const SigmoidParam &param) {
+  const Tensor *in_x = param.InputX();
+  Tensor *out = param.Out();
+  auto x_dims = in_x->dims();
+  out->Resize(x_dims);
+  sigmoid(in_x, out);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
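The `vrecpeq_f32`/`vrecpsq_f32` pair above computes 1/x without a NEON divide: `vrecpsq_f32(a, b)` returns `2 - a*b`, so multiplying it by the estimate performs one Newton-Raphson step. A scalar model of that refinement (an aside, not part of the patch):

```cpp
#include <cassert>
#include <cmath>

// Scalar model of the NEON reciprocal idiom used in sigmoid() above:
// vrecpeq_f32 gives a rough estimate e ~ 1/x; one vrecpsq_f32 step
// refines it via Newton-Raphson: e' = e * (2 - x * e).
float neon_style_recip(float x, float rough_estimate) {
  return rough_estimate * (2.0f - x * rough_estimate);
}

int main() {
  float x = 1.0f + std::exp(-0.5f);  // a denominator sigmoid would see
  float e = 1.0f / x + 0.01f;        // pretend the estimate is slightly off
  float refined = neon_style_recip(x, e);
  // The refined value is strictly closer to the true reciprocal.
  assert(std::fabs(refined - 1.0f / x) < std::fabs(e - 1.0f / x));
  return 0;
}
```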
diff --git a/src/common/openmp-fix.cpp b/src/operators/kernel/central-arm-func/softmax_arm_func.h
similarity index 51%
rename from src/common/openmp-fix.cpp
rename to src/operators/kernel/central-arm-func/softmax_arm_func.h
index 8c31ef45c68227c612155e826e664367a7917501..5a60bf88ae5d936567dc096c1f4bb31a73f0ef34 100644
--- a/src/common/openmp-fix.cpp
+++ b/src/operators/kernel/central-arm-func/softmax_arm_func.h
@@ -12,16 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_MOBILE_USE_OPENMP
-/**
- * android-ndk-r17 has a problem when linking with openmp.
- * if paddle-mobile enables -fopenmp, but didn't use those omp_* functions,
- * then after linking another binary with libpaddle-mobile.so, the
- * omp_get_thread_num will not work. see test/common/test_openmp.cc. the
- * detailed reason is still unclear, but this trick works. a better solution
- * is hacking the linker, trying some flags to make it link the omp_*
- * functions, but I didn't find out how to make that work.
- */
-#include <omp.h>
-static int _ = omp_get_num_procs();
+#ifdef SOFTMAX_OP
+#pragma once
+#include "../../math/softmax.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename P>
+void SoftmaxCompute(const SoftmaxParam &param) {
+  const Tensor *in_x = param.InputX();
+  Tensor *out = param.Out();
+  auto x_dims = in_x->dims();
+  out->Resize(x_dims);
+  math::SoftmaxFuntor<CPU, float>()(in_x, out);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
 #endif
diff --git a/src/operators/kernel/central-arm-func/transpose_arm_func.h b/src/operators/kernel/central-arm-func/transpose_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..1cbebc4525113374061541518775a94c6a64401f
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/transpose_arm_func.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TRANSPOSE_OP
+#pragma once
+
+#include <vector>
+
+namespace paddle_mobile {
+namespace operators {
+
+// vector<int> pos;
+// template <typename T>
+// void TransposeFunc(const int numel, const T* input, const vector<int> axis,
+//                    const vector<int> old_strides, const vector<int>
+//                    new_strides, T* output) {
+//   for (int i = 0; i < numel; ++i) {
+//     int old_idx = 0;
+//     int idx = i;
+//     for (int j = 0; j < axis.size(); ++j) {
+//       int order = axis[j];
+//       old_idx += (idx / new_strides[j]) * old_strides[order];
+//       idx %= new_strides[j];
+//     }
+//     output[i] = input[old_idx];
+//   }
+// }
+
+template <typename P>
+void TransposeCompute(const TransposeParam& param) {
+  const auto* input_x = param.InputX();
+  const auto input_x_dims = input_x->dims();
+  auto* out = param.Out();
+  const auto axis = param.Axis();
+  const auto* input_x_data = input_x->data<float>();
+  auto* out_data = out->mutable_data<float>();
+
+  size_t ndim = axis.size();
+  std::vector<int> xdim(ndim);
+  std::vector<int> xstride(ndim);
+  std::vector<int> xout(ndim);
+  for (int i = 0; i < ndim; i++) {
+    int j = ndim - 1 - i;
+    xdim[j] = input_x_dims[axis[i]];
+    xstride[j] = 1;
+    for (int k = axis[i] + 1; k < ndim; k++) {
+      xstride[j] *= input_x_dims[k];
+    }
+    xout[j] = xstride[j] * xdim[j];
+  }
+
+  auto numel = input_x->numel();
+  size_t pind = 0;
+  std::vector<int> ind(ndim);
+  for (int i = 0; i < numel; i++) {
+    out_data[i] = input_x_data[pind];
+    ind[0]++;
+    pind += xstride[0];
+    for (int j = 0; j < ndim - 1; j++) {
+      if (ind[j] == xdim[j]) {
+        ind[j + 1]++;
+        ind[j] = 0;
+        pind += xstride[j + 1];
+        pind -= xout[j];
+      } else {
+        break;
+      }
+    }
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
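`TransposeCompute` walks the input like an odometer over permuted strides. For intuition, the 2-D case reduces to the familiar index swap (plain loop shown below; the kernel's stride bookkeeping generalizes this to N dimensions):

```cpp
#include <cassert>

int main() {
  // Transposing a 2x3 row-major matrix with axis = {1, 0}:
  // the fastest-varying output index advances the input by one row (3).
  const float in[6] = {0, 1, 2, 3, 4, 5};  // [[0,1,2],[3,4,5]]
  float out[6];
  int k = 0;
  for (int col = 0; col < 3; ++col) {
    for (int row = 0; row < 2; ++row) {
      out[k++] = in[row * 3 + col];
    }
  }
  const float expect[6] = {0, 3, 1, 4, 2, 5};
  for (int i = 0; i < 6; ++i) assert(out[i] == expect[i]);
  return 0;
}
```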
diff --git a/src/operators/kernel/concat_kernel.h b/src/operators/kernel/concat_kernel.h
index 6a7b7c6005b6e85e5b1ccfee713672b6e333b98a..adba64391e3e79569030c95e2d2681a31187f03a 100644
--- a/src/operators/kernel/concat_kernel.h
+++ b/src/operators/kernel/concat_kernel.h
@@ -27,7 +27,7 @@ template <typename DeviceType, typename T>
 class ConcatKernel : public framework::OpKernelBase<DeviceType, ConcatParam> {
  public:
   void Compute(const ConcatParam &param) const;
-  bool Init(const ConcatParam &para) const;
+  bool Init(ConcatParam *param);
 };
 
 }  // namespace operators
diff --git a/src/operators/kernel/conv_add_bn_relu_kernel.h b/src/operators/kernel/conv_add_bn_relu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..73aaf4c900393b9cbee4682fc67147d9ef0853fc
--- /dev/null
+++ b/src/operators/kernel/conv_add_bn_relu_kernel.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef FUSION_CONVADDBNRELU_OP
+
+#include <vector>
+#include "framework/ddim.h"
+#include "framework/operator.h"
+#include "operators/math/conv_func.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::DDim;
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class ConvAddBNReluKernel
+    : public OpKernelBase<DeviceType, FusionConvAddBNReluParam> {
+ public:
+  void Compute(const FusionConvAddBNReluParam &param) const;
+  bool Init(FusionConvAddBNReluParam *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
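The signature change recurring throughout this patch, `bool Init(const Param &) const` -> `bool Init(Param *)`, lets a kernel mutate its param once at init time (e.g. to cache transformed weights for later `Compute` calls). A minimal model of the pattern, with purely illustrative types:

```cpp
#include <cassert>
#include <vector>

// Hypothetical param object; the real ones are the op_param.h classes.
struct DemoParam {
  std::vector<float> weights{1.0f, 2.0f};
  std::vector<float> folded;  // filled once by Init, reused by Compute
};

struct DemoKernel {
  // The new non-const, pointer-taking Init may mutate the param, e.g. to
  // pre-fold batch-norm scales into conv weights exactly once.
  bool Init(DemoParam *param) {
    param->folded.resize(param->weights.size());
    for (size_t i = 0; i < param->weights.size(); ++i) {
      param->folded[i] = param->weights[i] * 0.5f;  // stand-in transform
    }
    return true;
  }
  void Compute(const DemoParam &param) const {
    assert(!param.folded.empty());  // Init must have run first
  }
};

int main() {
  DemoParam p;
  DemoKernel k;
  k.Init(&p);
  k.Compute(p);
  return 0;
}
```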
diff --git a/src/operators/kernel/conv_add_kernel.h b/src/operators/kernel/conv_add_kernel.h
index 8f733f245dc26664ce38413a09fc5404029cdd2f..465d8bdd8cfd71d678eb2816cae10ea6a06cec35 100644
--- a/src/operators/kernel/conv_add_kernel.h
+++ b/src/operators/kernel/conv_add_kernel.h
@@ -20,9 +20,11 @@ limitations under the License. */
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif
+#include "common/common.h"
 #include "framework/ddim.h"
 #include "framework/operator.h"
 #include "operators/math/conv_func.h"
+#include "operators/math/depthwise_conv_3x3.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
@@ -38,7 +40,7 @@ template <typename DeviceType, typename T>
 class ConvAddKernel : public OpKernelBase<DeviceType, FusionConvAddParam> {
  public:
   void Compute(const FusionConvAddParam &param) const;
-  bool Init(const FusionConvAddParam &para) const;
+  bool Init(FusionConvAddParam *param);
 };
 
 }  // namespace operators
diff --git a/src/operators/kernel/conv_add_relu_kernel.h b/src/operators/kernel/conv_add_relu_kernel.h
index 9b86cd22e82e641ee6cb0a15bd25c8a1c6cbe8cb..3f36d80c4781aebea756b04e340d056a79cfd7d7 100644
--- a/src/operators/kernel/conv_add_relu_kernel.h
+++ b/src/operators/kernel/conv_add_relu_kernel.h
@@ -36,7 +36,7 @@ class ConvAddReluKernel
     : public OpKernelBase<DeviceType, FusionConvAddReluParam> {
  public:
   void Compute(const FusionConvAddReluParam &param) const;
-  bool Init(const FusionConvAddReluParam &para) const;
+  bool Init(FusionConvAddReluParam *param);
 };
 
 }  // namespace operators
diff --git a/src/operators/kernel/conv_kernel.h b/src/operators/kernel/conv_kernel.h
index 812ddd5a441f3a24c557546c1780248a557a6eb0..fedbee32a006f263fd3de25064496dad1a23177b 100644
--- a/src/operators/kernel/conv_kernel.h
+++ b/src/operators/kernel/conv_kernel.h
@@ -32,7 +32,7 @@ template <typename DeviceType, typename T>
 class ConvKernel : public OpKernelBase<DeviceType, ConvParam> {
  public:
   void Compute(const ConvParam &param) const;
-  bool Init(const ConvParam &para) const;
+  bool Init(ConvParam *param);
 };
 
 }  // namespace operators
diff --git a/src/operators/kernel/depthwise_conv_kernel.h b/src/operators/kernel/depthwise_conv_kernel.h
index a8a8fb338620477670477703018bf9e6e9a8a604..b74a58a649bd9fa27e941e2cd5ea50b30c0218cb 100644
--- a/src/operators/kernel/depthwise_conv_kernel.h
+++ b/src/operators/kernel/depthwise_conv_kernel.h
@@ -31,7 +31,7 @@ template <typename DeviceType, typename T>
 class DepthwiseConvKernel : public OpKernelBase<DeviceType, ConvParam> {
  public:
   void Compute(const ConvParam &param) const;
-  bool Init(const ConvParam &para) const;
+  bool Init(ConvParam *param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/dropout_kernel.h b/src/operators/kernel/dropout_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..5a3783971959db8fba9ca6b701fb6eb6340fcb3f
--- /dev/null
+++ b/src/operators/kernel/dropout_kernel.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef DROPOUT_OP
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+#pragma once
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class DropoutKernel
+    : public framework::OpKernelBase<DeviceType, DropoutParam> {
+ public:
+  void Compute(const DropoutParam& param) const;
+  bool Init(DropoutParam* para);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/elementwise_add_kernel.h b/src/operators/kernel/elementwise_add_kernel.h
index fe6a0238dcd5249e822de3b5930438df808bf853..70334c1d3f788f60e974da74133823f82ab05765 100644
--- a/src/operators/kernel/elementwise_add_kernel.h
+++ b/src/operators/kernel/elementwise_add_kernel.h
@@ -30,7 +30,7 @@ class ElementwiseAddKernel
     : public framework::OpKernelBase<DeviceType, ElementwiseAddParam> {
  public:
   void Compute(const ElementwiseAddParam &param) const;
-  bool Init(const ElementwiseAddParam &para) const;
+  bool Init(ElementwiseAddParam *param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/conv_kernel.cpp b/src/operators/kernel/fpga/conv_kernel.cpp
index 30dd64fd1466902036a72faa4be5d359d2bdb0bf..dc537362a216983974bea325433c456136356fc8 100644
--- a/src/operators/kernel/fpga/conv_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_kernel.cpp
@@ -20,7 +20,7 @@ namespace paddle_mobile {
 namespace operators {
 
 template <>
-bool ConvKernel<FPGA, float>::Init(const ConvParam &para) const {
+bool ConvKernel<FPGA, float>::Init(ConvParam *param) {
   return true;
 }
diff --git a/src/operators/kernel/fusion_fc_kernel.h b/src/operators/kernel/fusion_fc_kernel.h
index c4e2b30176fb904d7fb906c5efc5137a5dcb8d59..0e31134ba5a18405a5855db1e85b3885608c4071 100644
--- a/src/operators/kernel/fusion_fc_kernel.h
+++ b/src/operators/kernel/fusion_fc_kernel.h
@@ -28,7 +28,7 @@ class FusionFcKernel
     : public framework::OpKernelBase<DeviceType, FusionFcParam> {
  public:
   void Compute(const FusionFcParam& param) const;
-  bool Init(const FusionFcParam& para) const;
+  bool Init(FusionFcParam* param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/im2sequence_kernel.h b/src/operators/kernel/im2sequence_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb592613f73d90dae5a7d6e515f8bc091981776e
--- /dev/null
+++ b/src/operators/kernel/im2sequence_kernel.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef IM2SEQUENCE_OP
+
+#include "framework/operator.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
+
+#pragma once
+
+namespace paddle_mobile {
+namespace operators {
+
+using namespace framework;
+
+template <typename DeviceType, typename T>
+class Im2SequenceKernel
+    : public framework::OpKernelBase<DeviceType, Im2SequenceParam> {
+ public:
+  void Compute(const Im2SequenceParam& param) const;
+  bool Init(Im2SequenceParam* para);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/lrn_kernel.h b/src/operators/kernel/lrn_kernel.h
index 3b4b34250ef376032299bf14b2d4e2074e46245d..7327451a0aa21b7bcf9ae111f63c19f2b6bb2d3a 100644
--- a/src/operators/kernel/lrn_kernel.h
+++ b/src/operators/kernel/lrn_kernel.h
@@ -14,10 +14,11 @@ limitations under the License. */
 
 #ifdef LRN_OP
 
-#pragma once
 #include "framework/operator.h"
 #include "operators/op_param.h"
 
+#include <cmath>
+
 #ifdef __ARM_NEON
 #include "arm_neon.h"
 #include "operators/math/math_func_neon.h"
@@ -169,7 +170,7 @@ template <typename DeviceType, typename T>
 class LrnKernel : public framework::OpKernelBase<DeviceType, LrnParam> {
  public:
   void Compute(const LrnParam &param) const;
-  bool Init(const LrnParam &para) const;
+  bool Init(LrnParam *param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/mali/batchnorm_kernel.cpp b/src/operators/kernel/mali/batchnorm_kernel.cpp
index 3f20a4a66096f83decfe81b176a7563f6caa07d1..e749f4223e54988c0db54ac739a9f73bcd8c6240 100644
--- a/src/operators/kernel/mali/batchnorm_kernel.cpp
+++ b/src/operators/kernel/mali/batchnorm_kernel.cpp
@@ -129,7 +129,7 @@ class AclBatchNormOp : public acl::ACLOperator {
 };
 
 template <>
-bool BatchNormKernel<GPU_MALI, float>::Init(const BatchNormParam& param) const {
+bool BatchNormKernel<GPU_MALI, float>::Init(BatchNormParam* param) {
   AclBatchNormOp<GPU_MALI, float>* acl_op =
       reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
   if (acl_op == nullptr) {
diff --git a/src/operators/kernel/mali/conv_kernel.cpp b/src/operators/kernel/mali/conv_kernel.cpp
index 635a7381581376417b2e860834ed5246ede99b89..30bb763728763785b2f66d0283743a91463d7f9a 100644
--- a/src/operators/kernel/mali/conv_kernel.cpp
+++ b/src/operators/kernel/mali/conv_kernel.cpp
@@ -196,7 +196,7 @@ class AclConvOp : public acl::ACLOperator {
 };
 
 template <>
-bool ConvKernel<GPU_MALI, float>::Init(const ConvParam& param) const {
+bool ConvKernel<GPU_MALI, float>::Init(ConvParam* param) {
   AclConvOp<GPU_MALI, float>* acl_op =
       reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
   if (acl_op == nullptr) {
diff --git a/src/operators/kernel/mali/elementwise_add_kernel.cpp b/src/operators/kernel/mali/elementwise_add_kernel.cpp
index 43d33b3fd2b2cc747ae8c943437e675c84a4cdc6..9748bbbb5454f10ad9ea83e37d599fb1c6cdb53e 100644
--- a/src/operators/kernel/mali/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp
@@ -27,8 +27,7 @@ struct AddFunctor {
 };
 
 template <>
-bool ElementwiseAddKernel<GPU_MALI, float>::Init(
-    const ElementwiseAddParam &para) const {
+bool ElementwiseAddKernel<GPU_MALI, float>::Init(ElementwiseAddParam *param) {
   return true;
 }
diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp
index 64ab07a9b955893c01e2684cba0a14fa25d032ed..a76c3c46012a758a05cf8f846a15376ad1b9f33c 100644
--- a/src/operators/kernel/mali/fushion_fc_kernel.cpp
+++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp
@@ -22,7 +22,7 @@ namespace paddle_mobile {
 namespace operators {
 
 template <>
-bool FusionFcKernel<GPU_MALI, float>::Init(const FusionFcParam &para) const {
+bool FusionFcKernel<GPU_MALI, float>::Init(FusionFcParam *param) {
   return true;
 }
diff --git a/src/operators/kernel/mali/mul_kernel.cpp b/src/operators/kernel/mali/mul_kernel.cpp
index f2a84deaa1de999e94e335de6d4f40981bded5a8..3a9ec4ebb319d9e521240ad987a49549c22c1ff2 100644
--- a/src/operators/kernel/mali/mul_kernel.cpp
+++ b/src/operators/kernel/mali/mul_kernel.cpp
@@ -22,7 +22,7 @@ namespace paddle_mobile {
 namespace operators {
 
 template <>
-bool MulKernel<GPU_MALI, float>::Init(const MulParam &para) const {
+bool MulKernel<GPU_MALI, float>::Init(MulParam *param) {
   return true;
 }
diff --git a/src/operators/kernel/mali/reshape_kernel.cpp b/src/operators/kernel/mali/reshape_kernel.cpp
index d7521454d46dfc82064930971d2b996b542af54a..57837a677033590e92a307bd69a77c076c5ba805 100644
--- a/src/operators/kernel/mali/reshape_kernel.cpp
+++ b/src/operators/kernel/mali/reshape_kernel.cpp
@@ -22,7 +22,7 @@ namespace paddle_mobile {
 namespace operators {
 
 template <>
-bool ReshapeKernel<GPU_MALI, float>::Init(const ReshapeParam &para) const {
+bool ReshapeKernel<GPU_MALI, float>::Init(ReshapeParam *param) {
   return true;
 }
diff --git a/src/operators/kernel/mul_kernel.h b/src/operators/kernel/mul_kernel.h
index 81db202c2d26fae9abb971a2cafe32f9b20dfe22..f7dcb738b38448fe38eb60dcbbd4a2abda7a858a 100644
--- a/src/operators/kernel/mul_kernel.h
+++ b/src/operators/kernel/mul_kernel.h
@@ -29,7 +29,7 @@ template <typename DeviceType, typename T>
 class MulKernel : public framework::OpKernelBase<DeviceType, MulParam> {
  public:
   void Compute(const MulParam &param) const;
-  bool Init(const MulParam &para) const;
+  bool Init(MulParam *param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/multiclass_nms_kernel.h b/src/operators/kernel/multiclass_nms_kernel.h
index ca86604f2c6e550c219e54b6533c1500fb2912c4..9bd00b874a1140373decca582f793febf0e941ec 100644
--- a/src/operators/kernel/multiclass_nms_kernel.h
+++ b/src/operators/kernel/multiclass_nms_kernel.h
@@ -28,7 +28,7 @@ class MultiClassNMSKernel
     : public framework::OpKernelBase<DeviceType, MultiClassNMSParam> {
  public:
   void Compute(const MultiClassNMSParam& param) const;
-  bool Init(const MultiClassNMSParam& para) const;
+  bool Init(MultiClassNMSParam* param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/pool_kernel.h b/src/operators/kernel/pool_kernel.h
index 3285f56cc01fad554bff7e6a4d25769f8ef56d24..fd9faa3d5a508084924e080f5c5ed7e7b454b5f2 100644
--- a/src/operators/kernel/pool_kernel.h
+++ b/src/operators/kernel/pool_kernel.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #pragma once
 
 #include "framework/operator.h"
-#include "operators/math/pooling.h"
 #include "operators/op_param.h"
 
 namespace paddle_mobile {
@@ -28,7 +27,7 @@ template <typename DeviceType, typename T>
 class PoolKernel : public OpKernelBase<DeviceType, PoolParam> {
  public:
   void Compute(const PoolParam &param) const override;
-  bool Init(const PoolParam &para) const;
+  bool Init(PoolParam *param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/prior_box_kernel.h b/src/operators/kernel/prior_box_kernel.h
index be13f0fc16c0c0c0b16117a46b80a9f9acdeffae..d169a01d7f45f7dbdcc02be0e1e71690b8550af8 100644
--- a/src/operators/kernel/prior_box_kernel.h
+++ b/src/operators/kernel/prior_box_kernel.h
@@ -16,8 +16,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
+#include <vector>
 #include <cmath>
-
 #include "framework/operator.h"
 #include "operators/math/transform.h"
 #include "operators/op_param.h"
@@ -54,7 +55,7 @@ class PriorBoxKernel
     : public framework::OpKernelBase<DeviceType, PriorBoxParam> {
  public:
   void Compute(const PriorBoxParam& param) const;
-  bool Init(const PriorBoxParam& para) const;
+  bool Init(PriorBoxParam* param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/relu_kernel.h b/src/operators/kernel/relu_kernel.h
index 2155c33811f553435e4a89b5b23533e2bd42db5d..64016656b20b0fdb08f1342f7853e2e727a6bb81 100644
--- a/src/operators/kernel/relu_kernel.h
+++ b/src/operators/kernel/relu_kernel.h
@@ -27,7 +27,7 @@ template <typename DeviceType, typename T>
 class ReluKernel : public framework::OpKernelBase<DeviceType, ReluParam> {
  public:
   void Compute(const ReluParam& param) const;
-  bool Init(const ReluParam& para) const;
+  bool Init(ReluParam* param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/reshape_kernel.h b/src/operators/kernel/reshape_kernel.h
index 364f5b0902c2661017f2e72520849836f64dd0bb..47eba531b9f36d83d44588d9cdfb162519c24180 100644
--- a/src/operators/kernel/reshape_kernel.h
+++ b/src/operators/kernel/reshape_kernel.h
@@ -71,7 +71,7 @@ template <typename DeviceType, typename T>
 class ReshapeKernel
     : public framework::OpKernelBase<DeviceType, ReshapeParam> {
  public:
   void Compute(const ReshapeParam& param) const;
-  bool Init(const ReshapeParam& para) const;
+  bool Init(ReshapeParam* param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/sigmoid_kernel.h b/src/operators/kernel/sigmoid_kernel.h
index e9eaae5ad867c6880db7346f9632ff37a92aaf66..fc3eb5e1bf158c541b2f00d9e57ddd4699344006 100644
--- a/src/operators/kernel/sigmoid_kernel.h
+++ b/src/operators/kernel/sigmoid_kernel.h
@@ -26,7 +26,7 @@ template <typename DeviceType, typename T>
 class SigmoidKernel : public OpKernelBase<DeviceType, SigmoidParam> {
  public:
   void Compute(const SigmoidParam& param) const override;
-  bool Init(const SigmoidParam& para) const;
+  bool Init(SigmoidParam* param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/softmax_kernel.h b/src/operators/kernel/softmax_kernel.h
index a7a7666e32ef1923a47d71d94c93e813a23028c5..a500d9c81cce96b0f1db6d45981ad9aa02ea7c0b 100644
--- a/src/operators/kernel/softmax_kernel.h
+++ b/src/operators/kernel/softmax_kernel.h
@@ -23,13 +23,11 @@ namespace paddle_mobile {
 namespace operators {
 using framework::OpKernelBase;
 
-void simoid(Tensor *X, Tensor *Y);
-
 template <typename DeviceType, typename T>
 class SoftmaxKernel : public OpKernelBase<DeviceType, SoftmaxParam> {
  public:
   void Compute(const SoftmaxParam &param) const override;
-  bool Init(const SoftmaxParam &para) const;
+  bool Init(SoftmaxParam *param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/transpose_kernel.h b/src/operators/kernel/transpose_kernel.h
index 6526d97df9863392f783841a784cb5df4e45f218..f1a21ebbb28c2acdb905ce9f09c28f0d47e17294 100644
--- a/src/operators/kernel/transpose_kernel.h
+++ b/src/operators/kernel/transpose_kernel.h
@@ -29,7 +29,7 @@ class TransposeKernel
     : public framework::OpKernelBase<DeviceType, TransposeParam> {
  public:
   void Compute(const TransposeParam& param) const;
-  bool Init(const TransposeParam& para) const;
+  bool Init(TransposeParam* param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/lrn_op.cpp b/src/operators/lrn_op.cpp
index dc43cb022ac9d7435654cbc565c81c57ba80b350..1a5a8eccc1fc314d27517db8bc286035e573c9be 100644
--- a/src/operators/lrn_op.cpp
+++ b/src/operators/lrn_op.cpp
@@ -30,11 +30,9 @@ template class LrnOp<CPU, float>;
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(lrn);
 REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(lrn);
 REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
diff --git a/src/operators/lrn_op.h b/src/operators/lrn_op.h
index d67b9f6be741581918b09d19a8a8b26c28ceed1c..0d756a14f4d935fd59ac2bfc7c811c674b1587fe 100644
--- a/src/operators/lrn_op.h
+++ b/src/operators/lrn_op.h
@@ -46,4 +46,13 @@ class LrnOp : public framework::OperatorWithKernel<
 }  // namespace operators
 }  // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(lrn);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(lrn);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
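Moving `USE_OP_CPU(lrn)` from the .cpp into `lrn_op.h` means every translation unit that includes the header references the registration symbol, so the linker cannot strip the operator's object file. Assuming the usual extern-touch-variable implementation of such macros (the real ones live in the framework headers; names below are illustrative), the mechanism looks roughly like:

```cpp
// Registration side (normally in lrn_op.cpp): a global whose initializer
// performs the operator registration as a side effect.
int lrn_op_touch = []() {
  // ... operator registration would happen here ...
  return 1;
}();

// Consumer side (what a USE_OP-style macro expands to, illustratively):
// an extern reference plus a dummy use, forcing the linker to keep the
// registration object file even if nothing else refers to it.
extern int lrn_op_touch;
static int lrn_op_use __attribute__((unused)) = lrn_op_touch;

int main() { return lrn_op_use - 1; }
```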
diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f23affb45107b0d2414c49843cdfbd70c953c95c
--- /dev/null
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -0,0 +1,845 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/math/depthwise_conv_3x3.h"
+#include <arm_neon.h>
+#include <vector>
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
+                      vector<int> paddings, const Tensor *filter, Tensor *bias,
+                      Tensor *output, bool if_bias) {
+#if __ARM_NEON
+  const int batch_size = input->dims()[0];
+
+  const int input_height = input->dims()[2];
+
+  const int input_width = input->dims()[3];
+
+  const int output_channels = output->dims()[1];
+
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  const int _kernel_size = 3;
+  const int stride_height = strides[0];
+  const int stride_width = strides[1];
+  const int padding_height = paddings[0];
+  const int padding_width = paddings[1];
+  const float zero = 0;
+  const int input_channel_stride = input_height * input_width;
+  const int output_channel_stride = output_height * output_width;
+  const int filter_channel_stride = 9;
+
+  const float *input_data = input->data<float>();
+  const float *filter_data = filter->data<float>();
+  if (if_bias) {
+    math::expand_bias(*bias, 1, output->dims());
+    output->ShareDataWith(*bias);
+  }
+  float *output_data = output->mutable_data<float>();
+
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+  const int filter_batch_stride = output_channels * output_channel_stride;
+  const float *pos1, *pos2, *pos3, *filter1, *filter2, *filter3, *output_ptr;
+  int hstart, wstart, hend, wend;
+  float result;
+  for (int i = 0; i < batch_size; ++i) {
+    for (int c = 0; c < output_channels; ++c) {
+      filter1 = filter_data;
+      filter2 = filter1 + 3;
+      filter3 = filter2 + 3;
+
+      for (int ph = 0; ph < output_height; ph++) {
+        for (int pw = 0; pw < output_width; pw++) {
+          hstart = ph * stride_height - padding_height;
+          wstart = pw * stride_width - padding_width;
+          hend = min(hstart +
_kernel_size, input_height + padding_height); + wend = min(wstart + _kernel_size, input_width + padding_width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, input_height); + wend = min(wend, input_width); + pos1 = input_data + hstart * input_width + wstart; + pos2 = input_data + (hstart + 1) * input_width + wstart; + pos3 = input_data + (hstart + 2) * input_width + wstart; + output_ptr = output_data + ph * output_width + pw; + + if (hend - hstart != 3 || wend - wstart != 3) { + result = 0; + float fake_input[9] = {0}; + if (hstart == 0 && wstart == 0) { + // 左上角 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j >= 3 - hend && k >= 3 - wend) { + fake_input[3 * j + k] = + input_data[(j - (3 - hend)) * input_width + k - + (3 - wend)]; + } + } + } + } else if (hstart == 0 && wend == input_width) { + // 右上角 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j >= 3 - hend && k <= input_width - wstart - 1) { + fake_input[3 * j + k] = + input_data[(j - (3 - hend)) * input_width + k + wstart]; + } + } + } + + } else if (hend == input_height && wstart == 0) { + // 左下角 + + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j <= input_height - 1 - hstart && k >= 3 - wend) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + k - (3 - wend)]; + } + } + } + } else if (hend == input_height && wend == input_width) { + // 右下角 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j <= input_height - hstart - 1 && + k <= input_width - wstart - 1) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + k + wstart]; + } + } + } + } else if (hstart == 0) { + // 顶部 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j >= 3 - hend) { + fake_input[3 * j + k] = + input_data[(j - (3 - hend)) * input_width + k + wstart]; + } + } + } + + } else if (hend == input_height) { + // 底部 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j <= input_height - hstart - 1) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + k + wstart]; + } + } + } + + } else if (wstart == 0) { + // 左侧 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (k >= 3 - wend) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + + (k - (3 - wend))]; + } + } + } + + } else if (wend == input_width) { + // 右侧 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (k <= input_width - wstart - 1) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + k + wstart]; + } + } + } + } + for (int l = 0; l < 9; ++l) { + result += fake_input[l] * filter1[l]; + } + if (if_bias) { + output_data[ph * output_width + pw] += result; + } else { + output_data[ph * output_width + pw] = result; + } + + } else { +#if defined(ARMV17) + asm volatile( + + "vld1.32 {q1}, [%[pos1]] \n\t" + "vld1.32 {q4}, [%[filter1]] \n\t" + "vmov.f32 q0, #0.0 \n\t" + + "vld1.32 {q2}, [%[pos2]] \n\t" + "vld1.32 {q5}, [%[filter2]] \n\t" + "vmla.f32 q0, q1, q4 \n\t" + + "vld1.32 {q3}, [%[pos3]] \n\t" + "vld1.32 {q6}, [%[filter3]] \n\t" + + "vmla.f32 q0, q2, q5 \n\t" + "vmla.f32 q0, q3, q6 \n\t" + + "vmov.f32 d1[1], %[zero] \n\t" + + "vadd.f32 d4, d0, d1 \n\t" + "vadd.f32 s10, s8, s9 \n\t" + "vst1.32 {d5[0]},[%[output_ptr]] \n\t" + : + : [input_data] "r"(input_data), [pos1] "r"(pos1), + [pos2] "r"(pos2), [pos3] "r"(pos3), [filter1] "r"(filter1), + [filter2] "r"(filter2), [filter3] "r"(filter3), + [output_ptr] "r"(output_ptr), [zero] "r"(zero) + : "memory", 
"q0", "q1", "q2", "q3", "q4", "q5", "q6"); +#else + const float32x4_t data1 = vld1q_f32(pos1); + const float32x4_t data2 = vld1q_f32(pos2); + const float32x4_t data3 = vld1q_f32(pos3); + + const float32x4_t v_filter1 = vld1q_f32(filter1); + const float32x4_t v_filter2 = vld1q_f32(filter2); + const float32x4_t v_filter3 = vld1q_f32(filter3); + float32x4_t mula = vmulq_f32(data1, v_filter1); + mula = vmlaq_f32(mula, data2, v_filter2); + mula = vmlaq_f32(mula, data3, v_filter3); + float32x2_t res = vpadd_f32( + vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula)); + res = vpadd_f32(res, res); + if (if_bias) { + output_data[ph * output_width + pw] += vget_lane_f32(res, 0); + } else { + output_data[ph * output_width + pw] = vget_lane_f32(res, 0); + } +#endif + } + } + } + input_data += input_channel_stride; + output_data += output_channel_stride; + filter_data += filter_channel_stride; + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +#endif +} + +void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, + Tensor *output, Tensor *bias, bool if_bias) { + const float *input_data = input->data(); + const float *filter_data = filter->data(); + float *output_data = output->data(); + const float *bias_data; + if (if_bias) { + bias_data = bias->data(); + } + + const int h = static_cast(input->dims()[2]); + const int w = static_cast(input->dims()[3]); + const int l = h; + + const int batch_size = static_cast(input->dims()[0]); + const int c = static_cast(input->dims()[1]); + const int hxw = h * w; + float32x4_t vbias = vdupq_n_f32(0.0); + for (int b = 0; b < batch_size; ++b) { + const float *filter_data_tmp = filter_data; + + for (int j = 0; j < c; ++j) { + if (if_bias) { + vbias = vdupq_n_f32(bias_data[j]); + } + + int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0 + float w00 = filter_data_tmp[0]; + float w01 = filter_data_tmp[1]; + float w02 = filter_data_tmp[2]; + float w10 = filter_data_tmp[3]; + float w11 = filter_data_tmp[4]; + float w12 = filter_data_tmp[5]; + float w20 = filter_data_tmp[6]; + float w21 = filter_data_tmp[7]; + float w22 = filter_data_tmp[8]; + + output_data[0] = w11 * input_data[0] + w12 * input_data[1] + + w21 * input_data[l] + w22 * input_data[l + 1]; + output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] + + w20 * input_data[2 * l - 2] + + w21 * input_data[2 * l - 1]; + output_data[(l - 1) * l] = + w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] + + w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; + output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + + w01 * input_data[(l - 2) * (l + 1) + 1] + + w10 * input_data[l * l - 2] + + w11 * input_data[l * l - 1]; + if (if_bias) { + output_data[0] += bias_data[j]; + output_data[l - 1] += bias_data[j]; + output_data[(l - 1) * l] += bias_data[j]; + output_data[l * l - 1] += bias_data[j]; + } + + for (int i = 1; i < l - 1; ++i) { + output_data[i * l] = + w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] + + w11 * input_data[i * l] + w12 * input_data[i * l + 1] + + w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1]; + + output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + + w01 * input_data[i * l + l - 1 - l] + + w10 * input_data[i * l + l - 1 - 1] + + w11 * input_data[i * l + l - 1] + + w20 * input_data[i * l + l - 1 + l - 1] + + w21 * input_data[i * l + l - 1 + l]; + if (if_bias) { + output_data[i * l] += bias_data[j]; + output_data[i * l + l - 1] += bias_data[j]; + } + } + + // 
top 1 row and bottom 1 row + const float *input_tmp = input_data; + + float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, + tmp3, tmp4, tmp5, out0; + in0 = vld1q_f32(input_tmp); + in2 = vld1q_f32(input_tmp + l); + const float *input_tmp_end = input_tmp + (l - 2) * l; + in4 = vld1q_f32(input_tmp_end); + in6 = vld1q_f32(input_tmp_end + l); + int c_mid = l_mid; + auto output_ptr = output_data + 1; + for (; c_mid > 3; c_mid -= 4) { + in1 = vld1q_f32(input_tmp + 4); + in3 = vld1q_f32(input_tmp + l + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vaddq_f32(out0, vbias); + + vst1q_f32(output_ptr, out0); + + in5 = vld1q_f32(input_tmp_end + 4); + in7 = vld1q_f32(input_tmp_end + l + 4); + + tmp0 = vextq_f32(in4, in5, 1); + tmp1 = vextq_f32(in4, in5, 2); + tmp2 = vextq_f32(in6, in7, 1); + tmp3 = vextq_f32(in6, in7, 2); + + out0 = vmulq_n_f32(in4, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vaddq_f32(out0, vbias); + + vst1q_f32(output_ptr + (l - 1) * l, out0); + + // can optimize to each 8 stride. + input_tmp += 4; + input_tmp_end += 4; + output_ptr += 4; + in0 = in1; + in2 = in3; + in4 = in5; + in6 = in7; + } + + // top right pad + float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vaddq_f32(out0, vbias); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + + // bottom right pad + float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]); + float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]); + + tmp0 = vextq_f32(in4, pad2, 1); + tmp1 = vextq_f32(in4, pad2, 2); + tmp2 = vextq_f32(in6, pad3, 1); + tmp3 = vextq_f32(in6, pad3, 2); + + out0 = vmulq_n_f32(in4, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vaddq_f32(out0, vbias); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2); + } + } + // mid + + for (int i = 0; i < l - 2; ++i) { + auto output_ptr = output_data + (i + 1) * l + 1; + input_tmp = input_data + i * l; + auto in0_tmp = vld1q_f32(input_tmp); + auto in2_tmp = vld1q_f32(input_tmp + l); + auto in4_tmp = vld1q_f32(input_tmp + l + l); + c_mid = l_mid; + for (; c_mid > 3; c_mid -= 4) { + auto in1_tmp = vld1q_f32(input_tmp + 
4); + auto in3_tmp = vld1q_f32(input_tmp + l + 4); + auto in5_tmp = vld1q_f32(input_tmp + l + l + 4); + + tmp0 = vextq_f32(in0_tmp, in1_tmp, 1); + tmp1 = vextq_f32(in0_tmp, in1_tmp, 2); + tmp2 = vextq_f32(in2_tmp, in3_tmp, 1); + tmp3 = vextq_f32(in2_tmp, in3_tmp, 2); + tmp4 = vextq_f32(in4_tmp, in5_tmp, 1); + tmp5 = vextq_f32(in4_tmp, in5_tmp, 2); + + out0 = vmulq_n_f32(in0_tmp, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2_tmp, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4_tmp, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vaddq_f32(out0, vbias); + + vst1q_f32(output_ptr, out0); + + output_ptr += 4; + input_tmp += 4; + in0_tmp = in1_tmp; + in2_tmp = in3_tmp; + in4_tmp = in5_tmp; + } + + float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]); + float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]); + + tmp0 = vextq_f32(in0_tmp, pad0, 1); + tmp1 = vextq_f32(in0_tmp, pad0, 2); + tmp2 = vextq_f32(in2_tmp, pad1, 1); + tmp3 = vextq_f32(in2_tmp, pad1, 2); + tmp4 = vextq_f32(in4_tmp, pad2, 1); + tmp5 = vextq_f32(in4_tmp, pad2, 2); + + out0 = vmulq_n_f32(in0_tmp, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2_tmp, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4_tmp, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vaddq_f32(out0, vbias); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + } + output_data += hxw; + input_data += hxw; + filter_data_tmp += 9; + } + } +} + +void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, + Tensor *output, Tensor *bias, bool if_bias, + const Tensor *new_scale, + const Tensor *new_bias, bool if_bn, + bool if_relu) { + const float *input_data = input->data(); + const float *filter_data = filter->data(); + float *output_data = output->data(); + const float *bias_data = bias->data(); + const float *newscale_data = new_scale->data(); + const float *newbias_data = new_bias->data(); + + const int h = static_cast(input->dims()[2]); + const int w = static_cast(input->dims()[3]); + const int l = h; + + const int batch_size = static_cast(input->dims()[0]); + const int c = static_cast(input->dims()[1]); + const int hxw = h * w; + float32x4_t vbias = vdupq_n_f32(0.0); + float32x4_t vnewbias = vdupq_n_f32(0.0); + float32x4_t vnewscale = vdupq_n_f32(1.0); + float32x4_t vzero = vdupq_n_f32(0); + + for (int b = 0; b < batch_size; ++b) { + const float *filter_data_tmp = filter_data; + + for (int j = 0; j < c; ++j) { + if (if_bias) { + vbias = vdupq_n_f32(bias_data[j]); + } + if (if_bn) { + vnewbias = vdupq_n_f32(newbias_data[j]); + vnewscale = vdupq_n_f32(newscale_data[j]); + } + int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0 + float w00 = filter_data_tmp[0]; + float w01 = filter_data_tmp[1]; + float w02 = filter_data_tmp[2]; + float w10 = filter_data_tmp[3]; + float w11 = filter_data_tmp[4]; + float w12 = filter_data_tmp[5]; + float w20 = filter_data_tmp[6]; + float w21 = filter_data_tmp[7]; + float w22 = filter_data_tmp[8]; + + 
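// Note on the BN fusion in this kernel: vnewscale and vnewbias fold the
// batch-norm transform y = scale * (x - mean) / sqrt(var + eps) + beta into
// a single per-channel affine y = new_scale * x + new_bias, which the NEON
// path applies with one vmlaq_f32(vnewbias, vnewscale, out0) and the scalar
// border path applies as output * newscale_data[j] + newbias_data[j].
// A minimal sketch of that folding, assuming the caller precomputes the two
// tensors per channel (array names here are illustrative, not this
// library's API):
//   for (int ch = 0; ch < channels; ++ch) {
//     float inv_std = 1.0f / std::sqrt(variance[ch] + epsilon);
//     new_scale[ch] = scale[ch] * inv_std;
//     new_bias[ch] = beta[ch] - mean[ch] * new_scale[ch];
//   }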
output_data[0] = w11 * input_data[0] + w12 * input_data[1] + + w21 * input_data[l] + w22 * input_data[l + 1]; + + output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] + + w20 * input_data[2 * l - 2] + + w21 * input_data[2 * l - 1]; + + output_data[(l - 1) * l] = + w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] + + w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; + output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + + w01 * input_data[(l - 2) * (l + 1) + 1] + + w10 * input_data[l * l - 2] + + w11 * input_data[l * l - 1]; + if (if_bias) { + output_data[0] += bias_data[j]; + output_data[l - 1] += bias_data[j]; + output_data[(l - 1) * l] += bias_data[j]; + output_data[l * l - 1] += bias_data[j]; + } + if (if_bn) { + output_data[0] = output_data[0] * newscale_data[j] + newbias_data[j]; + output_data[l - 1] = + output_data[l - 1] * newscale_data[j] + newbias_data[j]; + output_data[(l - 1) * l] = + output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j]; + output_data[l * l - 1] = + output_data[l * l - 1] * newscale_data[j] + newbias_data[j]; + } + if (if_relu) { + output_data[0] = output_data[0] < 0 ? 0 : output_data[0]; + output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1]; + output_data[(l - 1) * l] = + output_data[(l - 1) * l] < 0 ? 0 : output_data[(l - 1) * l]; + output_data[l * l - 1] = + output_data[l * l - 1] < 0 ? 0 : output_data[l * l - 1]; + } + for (int i = 1; i < l - 1; ++i) { + output_data[i * l] = + w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] + + w11 * input_data[i * l] + w12 * input_data[i * l + 1] + + w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1]; + output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + + w01 * input_data[i * l + l - 1 - l] + + w10 * input_data[i * l + l - 1 - 1] + + w11 * input_data[i * l + l - 1] + + w20 * input_data[i * l + l - 1 + l - 1] + + w21 * input_data[i * l + l - 1 + l]; + if (if_bias) { + output_data[i * l] += bias_data[j]; + output_data[i * l + l - 1] += bias_data[j]; + } + if (if_bn) { + output_data[i * l] = + output_data[i * l] * newscale_data[j] + newbias_data[j]; + output_data[i * l + l - 1] = + output_data[i * l + l - 1] * newscale_data[j] + newbias_data[j]; + } + if (if_relu) { + output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i * l]; + output_data[i * l + l - 1] = + output_data[i * l + l - 1] < 0 ? 
0 : output_data[i * l + l - 1]; + } + } + + // top 1 row and bottom 1 row + const float *input_tmp = input_data; + + float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, + tmp3, tmp4, tmp5, out0; + in0 = vld1q_f32(input_tmp); + in2 = vld1q_f32(input_tmp + l); + const float *input_tmp_end = input_tmp + (l - 2) * l; + in4 = vld1q_f32(input_tmp_end); + in6 = vld1q_f32(input_tmp_end + l); + int c_mid = l_mid; + auto output_ptr = output_data + 1; + for (; c_mid > 3; c_mid -= 4) { + in1 = vld1q_f32(input_tmp + 4); + in3 = vld1q_f32(input_tmp + l + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vaddq_f32(out0, vbias); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + vst1q_f32(output_ptr, out0); + + in5 = vld1q_f32(input_tmp_end + 4); + in7 = vld1q_f32(input_tmp_end + l + 4); + + tmp0 = vextq_f32(in4, in5, 1); + tmp1 = vextq_f32(in4, in5, 2); + tmp2 = vextq_f32(in6, in7, 1); + tmp3 = vextq_f32(in6, in7, 2); + + out0 = vmulq_n_f32(in4, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vaddq_f32(out0, vbias); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + vst1q_f32(output_ptr + (l - 1) * l, out0); + + // can optimize to each 8 stride. + input_tmp += 4; + input_tmp_end += 4; + output_ptr += 4; + in0 = in1; + in2 = in3; + in4 = in5; + in6 = in7; + } + + // top right pad + float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vaddq_f32(out0, vbias); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + + // bottom right pad + float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]); + float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]); + + tmp0 = vextq_f32(in4, pad2, 1); + tmp1 = vextq_f32(in4, pad2, 2); + tmp2 = vextq_f32(in6, pad3, 1); + tmp3 = vextq_f32(in6, pad3, 2); + + out0 = vmulq_n_f32(in4, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vaddq_f32(out0, vbias); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, 
out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2); + } + } + // mid + + for (int i = 0; i < l - 2; ++i) { + auto output_ptr = output_data + (i + 1) * l + 1; + input_tmp = input_data + i * l; + auto in0_tmp = vld1q_f32(input_tmp); + auto in2_tmp = vld1q_f32(input_tmp + l); + auto in4_tmp = vld1q_f32(input_tmp + l + l); + c_mid = l_mid; + for (; c_mid > 3; c_mid -= 4) { + auto in1_tmp = vld1q_f32(input_tmp + 4); + auto in3_tmp = vld1q_f32(input_tmp + l + 4); + auto in5_tmp = vld1q_f32(input_tmp + l + l + 4); + + tmp0 = vextq_f32(in0_tmp, in1_tmp, 1); + tmp1 = vextq_f32(in0_tmp, in1_tmp, 2); + tmp2 = vextq_f32(in2_tmp, in3_tmp, 1); + tmp3 = vextq_f32(in2_tmp, in3_tmp, 2); + tmp4 = vextq_f32(in4_tmp, in5_tmp, 1); + tmp5 = vextq_f32(in4_tmp, in5_tmp, 2); + + out0 = vmulq_n_f32(in0_tmp, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2_tmp, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4_tmp, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vaddq_f32(out0, vbias); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + vst1q_f32(output_ptr, out0); + + output_ptr += 4; + input_tmp += 4; + in0_tmp = in1_tmp; + in2_tmp = in3_tmp; + in4_tmp = in5_tmp; + } + + float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]); + float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]); + + tmp0 = vextq_f32(in0_tmp, pad0, 1); + tmp1 = vextq_f32(in0_tmp, pad0, 2); + tmp2 = vextq_f32(in2_tmp, pad1, 1); + tmp3 = vextq_f32(in2_tmp, pad1, 2); + tmp4 = vextq_f32(in4_tmp, pad2, 1); + tmp5 = vextq_f32(in4_tmp, pad2, 2); + + out0 = vmulq_n_f32(in0_tmp, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2_tmp, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4_tmp, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vaddq_f32(out0, vbias); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + } + output_data += hxw; + input_data += hxw; + filter_data_tmp += 9; + } + } +} +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/depthwise_conv_3x3.h b/src/operators/math/depthwise_conv_3x3.h new file mode 100644 index 0000000000000000000000000000000000000000..a0beb479926902a71b7e06128aa8cecdd5443196 --- /dev/null +++ b/src/operators/math/depthwise_conv_3x3.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "framework/tensor.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { +namespace math { +using framework::Tensor; +using std::max; +using std::min; +using std::vector; + +void DepthwiseConv3x3(const Tensor *input, vector strides, + vector paddings, const Tensor *filter, Tensor *bias, + Tensor *output, bool if_bias); +void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, + Tensor *output, Tensor *bias, bool if_bias); +void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, + Tensor *output, Tensor *bias, bool if_bias, + const Tensor *new_scale, + const Tensor *new_bias, bool if_bn, + bool if_relu); +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index c35a14bf508835b120e1a4108cba0945208867dc..e9974df967b293317c3014803bec27d2da73fca3 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -116,6 +116,8 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, int i, j; const float *Bij; for (j = 0; j < n - n_tail; j += NR) { +#ifdef ARMV7 + for (i = 0; i < k; ++i) { Bij = &B(i, j); asm volatile( @@ -125,6 +127,15 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, : [Bij] "r"(Bij) : "memory", "q0"); } +#else + for (i = 0; i < k; ++i) { + Bij = &B(i, j); + *buffer++ = *Bij; + *buffer++ = *(Bij + 1); + *buffer++ = *(Bij + 2); + *buffer++ = *(Bij + 3); + } +#endif } if (n_tail != 0) { for (i = 0; i < k; ++i) { @@ -747,10 +758,14 @@ void sgemm(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { int i, j, p, mc, nc, kc; float beta_; + +#ifdef ARMV7 if (m == 1) { VectorKernel(1, n, k, alpha, A, lda, B, ldb, beta, C, ldc); return; } +#endif + for (j = 0; j < n; j += NC) { nc = s_min(n - j, NC); for (p = 0; p < k; p += KC) { @@ -797,6 +812,7 @@ void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda, } } +#ifdef ARMV7 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); @@ -1010,6 +1026,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, } } } +#endif } // namespace math } // namespace operators diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp index 96d277c136b4656dbb1fd682489bd7dee5c3af0e..e0fd5da57cc91d4f1c55c560134398126517db29 100644 --- a/src/operators/math/pool_2x2.cpp +++ b/src/operators/math/pool_2x2.cpp @@ -22,6 +22,9 @@ namespace math { void Pool2x2Max(vector strides, vector paddings, const Tensor *input, Tensor *output) { #if __ARM_NEON + +#ifdef ARMV7 + const int batch_size = input->dims()[0]; const int input_height = input->dims()[2]; @@ -90,11 +93,15 @@ void Pool2x2Max(vector strides, vector paddings, const Tensor *input, output_data += output_batch_stride; } #endif + +#endif } void Pool2x2Avg(vector strides, vector paddings, const Tensor *input, Tensor *output) { #if __ARM_NEON + +#ifdef ARMV7 const int batch_size = input->dims()[0]; const int input_height = input->dims()[2]; @@ -164,6 +171,12 @@ void Pool2x2Avg(vector strides, vector paddings, const Tensor *input, input_data += input_batch_stride; output_data += 
output_batch_stride; } +#else + +// TODO(): to imp other asm + +#endif + #endif } diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp index f404b644d78cb1b94eb96a2d587fead2575b3814..83d0bcb699f82b9c290080982ba6750a64d74e53 100644 --- a/src/operators/math/pool_3x3.cpp +++ b/src/operators/math/pool_3x3.cpp @@ -13,13 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef POOL_OP -#define __ARM_NEON true #include "pool_3x3.h" #include "framework/tensor.h" #if __ARM_NEON #include #endif // __ARM_NEON - +#include namespace paddle_mobile { namespace operators { namespace math { @@ -27,6 +26,481 @@ using framework::Tensor; using std::max; using std::min; using std::vector; +void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { +#if __ARM_NEON + const int batch_size = input->dims()[0]; + + const int h_in = input->dims()[2]; + + const int w_in = input->dims()[3]; + + const int output_channels = output->dims()[1]; + + const int h_out = output->dims()[2]; + const int w_out = output->dims()[3]; + const int outputdata_channel_stride = h_out * w_out; + const int inputdata_channel_stride = h_in * w_in; + float *out_data = output->data(); + const float *input_data = input->data(); + const float coef = 1.0 / 9.0; + for (int k = 0; k < batch_size; ++k) { + for (int c = 0; c < output_channels; ++c) { + // four corner point + out_data[0] = (input_data[0] + input_data[1] + input_data[w_in] + + input_data[w_in + 1]) * + coef; + out_data[w_out - 1] = + (input_data[w_in - 2] + input_data[w_in - 1] + + input_data[w_in * 2 - 2] + input_data[2 * w_in - 1]) * + coef; + out_data[(h_out - 1) * w_out] = + (input_data[(h_in - 2) * w_in] + input_data[(h_in - 2) * w_in + 1] + + input_data[(h_in - 1) * w_in] + input_data[(h_in - 1) * w_in + 1]) * + coef; + out_data[h_out * w_out - 1] = + (input_data[h_in * w_in - 1] + input_data[h_in * w_in - 2] + + input_data[(h_in - 1) * w_in - 1] + + input_data[(h_in - 1) * w_in - 2]) * + coef; + // left side & right side + for (int i = 1; i < h_in - 1; ++i) { + out_data[i * w_out] = + (input_data[i * w_in - w_in] + input_data[i * w_in - w_in + 1] + + input_data[i * w_in] + input_data[i * w_in + 1] + + input_data[i * w_in + w_in] + input_data[i * w_in + w_in + 1]) * + coef; + out_data[i * w_out + w_out - 1] = + (input_data[i * w_in - w_in + w_in - 2] + + input_data[i * w_in - w_in + 1 + w_in - 2] + + input_data[i * w_in + w_in - 2] + + input_data[i * w_in + 1 + w_in - 2] + + input_data[i * w_in + w_in + w_in - 2] + + input_data[i * w_in + w_in + 1 + w_in - 2]) * + coef; + } + // top 1 row & bottom 1 row + const float *input_tmp = input_data; + + float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, + tmp3, tmp4, tmp5, sum, out0; + float32x4_t v_coef = vdupq_n_f32(coef); + in0 = vld1q_f32(input_tmp); + in2 = vld1q_f32(input_tmp + w_in); + const float *input_tmp_end = input_tmp + (h_in - 2) * w_in; + in4 = vld1q_f32(input_tmp_end); + in6 = vld1q_f32(input_tmp_end + w_in); + int c_mid = w_out - 2; + auto output_ptr = out_data + 1; + for (; c_mid > 3; c_mid -= 4) { + in1 = vld1q_f32(input_tmp + 4); + in3 = vld1q_f32(input_tmp + w_in + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + + sum = vaddq_f32(in0, tmp0); + sum = vaddq_f32(sum, tmp1); + sum = vaddq_f32(sum, in2); + sum = vaddq_f32(sum, tmp2); + sum = vaddq_f32(sum, tmp3); + + vst1q_f32(output_ptr, vmulq_f32(sum, v_coef)); + + in5 = 
vld1q_f32(input_tmp_end + 4); + in7 = vld1q_f32(input_tmp_end + w_in + 4); + + tmp0 = vextq_f32(in4, in5, 1); + tmp1 = vextq_f32(in4, in5, 2); + tmp2 = vextq_f32(in6, in7, 1); + tmp3 = vextq_f32(in6, in7, 2); + + sum = vaddq_f32(in0, tmp0); + sum = vaddq_f32(sum, tmp1); + sum = vaddq_f32(sum, in2); + sum = vaddq_f32(sum, tmp2); + sum = vaddq_f32(sum, tmp3); + + vst1q_f32(output_ptr + (h_out - 1) * w_out, vmulq_f32(sum, v_coef)); + + // can optimize to each 8 stride. + input_tmp += 4; + input_tmp_end += 4; + output_ptr += 4; + in0 = in1; + in2 = in3; + in4 = in5; + in6 = in7; + } + // top right remain + float32x4_t pad0 = vdupq_n_f32(input_data[w_in - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[2 * w_in - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 2); + tmp3 = vextq_f32(in2, pad1, 2); + + sum = vaddq_f32(in0, tmp0); + sum = vaddq_f32(sum, tmp1); + sum = vaddq_f32(sum, in2); + sum = vaddq_f32(sum, tmp2); + sum = vaddq_f32(sum, tmp3); + out0 = vmulq_f32(sum, v_coef); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + + // bottom_right remain + float32x4_t pad2 = vdupq_n_f32(input_data[(h_in - 1) * w_in - 1]); + float32x4_t pad3 = vdupq_n_f32(input_data[h_in * w_in - 1]); + + tmp0 = vextq_f32(in4, pad2, 1); + tmp1 = vextq_f32(in4, pad2, 2); + tmp2 = vextq_f32(in6, pad3, 2); + tmp3 = vextq_f32(in6, pad3, 2); + + sum = vaddq_f32(in4, tmp0); + sum = vaddq_f32(sum, tmp1); + sum = vaddq_f32(sum, in6); + sum = vaddq_f32(sum, tmp2); + sum = vaddq_f32(sum, tmp3); + out0 = vmulq_f32(sum, v_coef); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 2); + } + } + // mid + for (int j = 0; j < h_out - 2; ++j) { + output_ptr = out_data + w_out * (j + 1) + 1; + input_tmp = input_data + j * w_in; + + in0 = vld1q_f32(input_tmp); + in2 = vld1q_f32(input_tmp + w_in); + in4 = vld1q_f32(input_tmp + 2 * w_in); + c_mid = w_out - 2; + for (; c_mid > 3; c_mid -= 4) { + in1 = vld1q_f32(input_tmp + 4); + in3 = vld1q_f32(input_tmp + w_in + 4); + in5 = vld1q_f32(input_tmp + 2 * w_in + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + tmp4 = vextq_f32(in4, in5, 1); + tmp5 = vextq_f32(in4, in5, 2); + + sum = vaddq_f32(in0, tmp0); + sum = vaddq_f32(sum, tmp1); + sum = vaddq_f32(sum, in2); + sum = vaddq_f32(sum, tmp2); + sum = vaddq_f32(sum, tmp3); + sum = vaddq_f32(sum, in4); + sum = vaddq_f32(sum, tmp4); + sum = vaddq_f32(sum, tmp5); + + out0 = vmulq_f32(sum, v_coef); + vst1q_f32(output_ptr, out0); + output_ptr += 4; + input_tmp += 4; + in0 = in1; + in2 = in3; + in4 = in5; + } + // mid remain + float32x4_t pad0 = vdupq_n_f32(input_data[(j + 1) * w_in - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[(j + 2) * w_in - 1]); + float32x4_t pad2 = vdupq_n_f32(input_data[(j + 2) * w_in - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + tmp4 = vextq_f32(in4, pad2, 1); + tmp5 = vextq_f32(in4, pad2, 2); + + sum = vaddq_f32(in0, tmp0); + sum = vaddq_f32(sum, tmp1); + sum = 
vaddq_f32(sum, in2); + sum = vaddq_f32(sum, tmp2); + sum = vaddq_f32(sum, tmp3); + sum = vaddq_f32(sum, in4); + sum = vaddq_f32(sum, tmp4); + sum = vaddq_f32(sum, tmp5); + out0 = vmulq_f32(sum, v_coef); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + } + input_data += inputdata_channel_stride; + out_data += outputdata_channel_stride; + } + } +#endif +} + +void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) { +#if __ARM_NEON + const int batch_size = input->dims()[0]; + + const int h_in = input->dims()[2]; + + const int w_in = input->dims()[3]; + + const int output_channels = output->dims()[1]; + + const int h_out = output->dims()[2]; + const int w_out = output->dims()[3]; + const int outputdata_channel_stride = h_out * w_out; + const int inputdata_channel_stride = h_in * w_in; + float *out_data = output->data(); + const float *input_data = input->data(); + for (int k = 0; k < batch_size; ++k) { + for (int c = 0; c < output_channels; ++c) { + // four corner point + out_data[0] = std::max(std::max(input_data[0], input_data[1]), + std::max(input_data[w_in], input_data[w_in + 1])); + out_data[w_out - 1] = std::max( + std::max(input_data[w_in - 2], input_data[w_in - 1]), + std::max(input_data[w_in * 2 - 2], input_data[2 * w_in - 1])); + out_data[(h_out - 1) * w_out] = + std::max(std::max(input_data[(h_in - 2) * w_in], + input_data[(h_in - 2) * w_in + 1]), + std::max(input_data[(h_in - 1) * w_in], + input_data[(h_in - 1) * w_in + 1])); + out_data[h_out * w_out - 1] = std::max( + std::max(input_data[(h_in - 1) * w_in - 1], + input_data[(h_in - 1) * w_in - 2]), + std::max(input_data[h_in * w_in - 1], input_data[h_in * w_in - 2])); + // left side & right side + for (int i = 1; i < h_in - 1; ++i) { + float max1 = std::max(input_data[i * w_in - w_in], + input_data[i * w_in - w_in + 1]); + float max2 = std::max(input_data[i * w_in], input_data[i * w_in + 1]); + float max3 = std::max(input_data[i * w_in + w_in], + input_data[i * w_in + w_in + 1]); + out_data[i * w_out] = std::max(std::max(max1, max2), max3); + + max1 = std::max(input_data[i * w_in - w_in + w_in - 2], + input_data[i * w_in - w_in + 1 + w_in - 2]); + max2 = std::max(input_data[i * w_in + w_in - 2], + input_data[i * w_in + 1 + w_in - 2]); + max3 = std::max(input_data[i * w_in + w_in + w_in - 2], + input_data[i * w_in + w_in + 1 + w_in - 2]); + out_data[i * w_out + w_out - 1] = std::max(std::max(max1, max2), max3); + } + // top 1 row & bottom 1 row + const float *input_tmp = input_data; + + float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, + tmp3, tmp4, tmp5, max; + in0 = vld1q_f32(input_tmp); + in2 = vld1q_f32(input_tmp + w_in); + const float *input_tmp_end = input_tmp + (h_in - 2) * w_in; + in4 = vld1q_f32(input_tmp_end); + in6 = vld1q_f32(input_tmp_end + w_in); + int c_mid = w_out - 2; + auto output_ptr = out_data + 1; + for (; c_mid > 3; c_mid -= 4) { + in1 = vld1q_f32(input_tmp + 4); + in3 = vld1q_f32(input_tmp + w_in + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + + max = vmaxq_f32(in0, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in2); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + + vst1q_f32(output_ptr, max); + + in5 = vld1q_f32(input_tmp_end + 4); + in7 = vld1q_f32(input_tmp_end + w_in + 4); + + tmp0 
= vextq_f32(in4, in5, 1); + tmp1 = vextq_f32(in4, in5, 2); + tmp2 = vextq_f32(in6, in7, 1); + tmp3 = vextq_f32(in6, in7, 2); + + max = vmaxq_f32(in4, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in6); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + + vst1q_f32(output_ptr + (h_out - 1) * w_out, max); + + input_tmp += 4; + input_tmp_end += 4; + output_ptr += 4; + in0 = in1; + in2 = in3; + in4 = in5; + in6 = in7; + } + // top right remain + float32x4_t pad0 = vdupq_n_f32(input_data[w_in - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[2 * w_in - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + + max = vmaxq_f32(in0, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in2); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, max, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, max, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, max, 2); + } + } + + // bottom_right remain + float32x4_t pad2 = vdupq_n_f32(input_data[(h_in - 1) * w_in - 1]); + float32x4_t pad3 = vdupq_n_f32(input_data[h_in * w_in - 1]); + + tmp0 = vextq_f32(in4, pad2, 1); + tmp1 = vextq_f32(in4, pad2, 2); + tmp2 = vextq_f32(in6, pad3, 1); + tmp3 = vextq_f32(in6, pad3, 2); + + max = vmaxq_f32(in4, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in6); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 2); + } + } + // mid + for (int j = 0; j < h_out - 2; ++j) { + output_ptr = out_data + (j + 1) * w_out + 1; + input_tmp = input_data + j * w_in; + + in0 = vld1q_f32(input_tmp); + in2 = vld1q_f32(input_tmp + w_in); + in4 = vld1q_f32(input_tmp + 2 * w_in); + c_mid = w_out - 2; + for (; c_mid > 3; c_mid -= 4) { + in1 = vld1q_f32(input_tmp + 4); + in3 = vld1q_f32(input_tmp + w_in + 4); + in5 = vld1q_f32(input_tmp + 2 * w_in + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + tmp4 = vextq_f32(in4, in5, 1); + tmp5 = vextq_f32(in4, in5, 2); + + max = vmaxq_f32(in0, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in2); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + max = vmaxq_f32(max, in4); + max = vmaxq_f32(max, tmp4); + max = vmaxq_f32(max, tmp5); + + vst1q_f32(output_ptr, max); + output_ptr += 4; + input_tmp += 4; + in0 = in1; + in2 = in3; + in4 = in5; + } + // mid remain + float32x4_t pad0 = vdupq_n_f32(input_data[(j + 1) * w_in - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[(j + 2) * w_in - 1]); + float32x4_t pad2 = vdupq_n_f32(input_data[(j + 3) * w_in - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + tmp4 = vextq_f32(in4, pad2, 1); + tmp5 = vextq_f32(in4, pad2, 2); + + max = vmaxq_f32(in0, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in2); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + max = vmaxq_f32(max, in4); + max = vmaxq_f32(max, tmp4); + max = vmaxq_f32(max, tmp5); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + 
vst1q_lane_f32(output_ptr + i, max, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, max, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, max, 2); + } + } + } + input_data += inputdata_channel_stride; + out_data += outputdata_channel_stride; + } + } +#endif +} void Pool3x3Max(vector strides, vector paddings, const Tensor *input, Tensor *output) { diff --git a/src/operators/math/pool_3x3.h b/src/operators/math/pool_3x3.h index 22a398084390701aefc8815c9aa93b82b4c4ec7b..53d39b81cc158f02601a352f0ec2996f1d444304 100644 --- a/src/operators/math/pool_3x3.h +++ b/src/operators/math/pool_3x3.h @@ -15,7 +15,8 @@ limitations under the License. */ #ifdef POOL_OP #pragma once - +#include +#include #include "framework/tensor.h" #if __ARM_NEON #include @@ -26,7 +27,8 @@ namespace operators { namespace math { using framework::Tensor; using std::vector; - +void Pool3x3Avgs1p1(const Tensor *input, Tensor *output); +void Pool3x3Maxs1p1(const Tensor *input, Tensor *output); void Pool3x3Max(vector strides, vector paddings, const Tensor *input, Tensor *output); diff --git a/src/operators/math/pooling.cpp b/src/operators/math/pooling.cpp index 4287408394f1a7f407154938f3e83e9fac3543a2..0252eae845c06da454cfcd65b54982cb0506acb9 100644 --- a/src/operators/math/pooling.cpp +++ b/src/operators/math/pooling.cpp @@ -57,7 +57,7 @@ class PoolFunctor { T *output_data = output->mutable_data(); for (int i = 0; i < batch_size; i++) { - #pragma omp parallel for + // #pragma omp parallel for for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { int hstart = ph * stride_height - padding_height; diff --git a/src/operators/mul_op.cpp b/src/operators/mul_op.cpp index 49ae3a5e8484cb2f6628eb53cabd9321ae5705b8..60e0c087383388c83ca1711c057af822a6e2a730 100644 --- a/src/operators/mul_op.cpp +++ b/src/operators/mul_op.cpp @@ -56,11 +56,9 @@ template class MulOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(mul); REGISTER_OPERATOR_CPU(mul, ops::MulOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(mul); REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp); #endif #ifdef PADDLE_MOBILE_FPGA diff --git a/src/operators/mul_op.h b/src/operators/mul_op.h index ad5c9a3702348455cb559c28453df82d81e1c4c8..64b811b01091418c9febdfb8d03bacd77421dcf5 100644 --- a/src/operators/mul_op.h +++ b/src/operators/mul_op.h @@ -46,4 +46,13 @@ class MulOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(mul); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(mul); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/multiclass_nms_op.cpp b/src/operators/multiclass_nms_op.cpp index 52adf6cc627d76b18b3b48928c344545327ca99e..eea625469ec030e0c7d62baea8312e11f1308ce2 100644 --- a/src/operators/multiclass_nms_op.cpp +++ b/src/operators/multiclass_nms_op.cpp @@ -40,7 +40,6 @@ template class MultiClassNMSOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(multiclass_nms); REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/multiclass_nms_op.h b/src/operators/multiclass_nms_op.h index 30cf8f67942f7888599e8f0057baff1ddd5d6cea..425f7d33e35e0864b5f5a7739dbfa18bc8eb0c30 100644 --- a/src/operators/multiclass_nms_op.h +++ b/src/operators/multiclass_nms_op.h @@ -52,4 +52,12 @@ class MultiClassNMSOp : public framework::OperatorWithKernel< } // namespace operators } // namespace 
paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(multiclass_nms); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/op_param.h b/src/operators/op_param.h index ad7de0ee44db3a727ec06d5fabfca203226215f4..892b08e6da0ce92df95e81dd9896df3ee8899fb9 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -195,8 +195,7 @@ class OpParam { class ConvParam : OpParam { public: ConvParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { + const AttributeMap &attrs, const Scope &scope) { filter_ = FilterFrom(inputs, scope); input_ = InputFrom(inputs, scope); output_ = OutputFrom(outputs, scope); @@ -237,12 +236,11 @@ Print &operator<<(Print &printer, const ConvParam &conv_param); class ElementwiseAddParam : OpParam { public: ElementwiseAddParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_y_ = InputYFrom(inputs, scope); + out_ = OutFrom(outputs, scope); axis_ = GetAttr("axis", attrs); } @@ -267,11 +265,10 @@ class ElementwiseAddParam : OpParam { class MulParam : OpParam { public: MulParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_y_ = InputYFrom(inputs, scope); + out_ = OutFrom(outputs, scope); x_num_col_dims_ = GetAttr("x_num_col_dims", attrs); y_num_col_dims_ = GetAttr("y_num_col_dims", attrs); } @@ -299,10 +296,9 @@ class MulParam : OpParam { class ConcatParam : public OpParam { public: ConcatParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { + const AttributeMap &attrs, const Scope &scope) { inputs_ = InputMultiFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + out_ = OutFrom(outputs, scope); axis_ = GetAttr("axis", attrs); } @@ -323,11 +319,10 @@ class ConcatParam : public OpParam { class LrnParam : public OpParam { public: LrnParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); - mid_out_ = MidOutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + mid_out_ = MidOutFrom(outputs, scope); n_ = GetAttr("n", attrs); alpha_ = GetAttr("alpha", attrs); beta_ = GetAttr("beta", attrs); @@ -367,14 +362,13 @@ class LrnParam : public OpParam { class BatchNormParam : OpParam { public: BatchNormParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - output_y_ = OutputYFrom(outputs, scope); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, 
scope); - input_variance_ = InputVarianceFrom(inputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + output_y_ = OutputYFrom(outputs, scope); + input_bias_ = InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); epsilon_ = GetAttr("epsilon", attrs); momentum_ = GetAttr("momentum", attrs); is_test_ = GetAttr("is_test", attrs); @@ -418,11 +412,10 @@ class BatchNormParam : OpParam { class PoolParam : public OpParam { public: PoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_ = InputXFrom(inputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_ = InputXFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + output_ = OutFrom(outputs, scope); pooling_type_ = GetAttr("pooling_type", attrs); ksize_ = GetAttr>("ksize", attrs); strides_ = GetAttr>("strides", attrs); @@ -464,13 +457,11 @@ class PoolParam : public OpParam { class PriorBoxParam : public OpParam { public: PriorBoxParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_ = InputFrom(inputs, scope); - input_image_ = InputImageFrom(inputs, scope); - output_boxes_ = OutputBoxesFrom(outputs, scope); - output_variances_ = - OutputVariancesFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_ = InputFrom(inputs, scope); + input_image_ = InputImageFrom(inputs, scope); + output_boxes_ = OutputBoxesFrom(outputs, scope); + output_variances_ = OutputVariancesFrom(outputs, scope); min_sizes_ = GetAttr>("min_sizes", attrs); max_sizes_ = GetAttr>("max_sizes", attrs); aspect_ratios_ = GetAttr>("aspect_ratios", attrs); @@ -528,13 +519,11 @@ class PriorBoxParam : public OpParam { class BoxCoderParam : public OpParam { public: BoxCoderParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_priorbox_ = InputPriorBoxFrom(inputs, scope); - input_priorboxvar_ = - InputPriorBoxVarFrom(inputs, scope); - input_targetbox_ = InputTargetBoxFrom(inputs, scope); - output_box_ = OutputBoxFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_priorbox_ = InputPriorBoxFrom(inputs, scope); + input_priorboxvar_ = InputPriorBoxVarFrom(inputs, scope); + input_targetbox_ = InputTargetBoxFrom(inputs, scope); + output_box_ = OutputBoxFrom(outputs, scope); code_type_ = GetAttr("code_type", attrs); } const Tensor *InputPriorBox() const { return input_priorbox_; } @@ -560,10 +549,9 @@ class BoxCoderParam : public OpParam { class SoftmaxParam : public OpParam { public: SoftmaxParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); } const Tensor *InputX() const { return input_x_; } Tensor *Out() const { return out_; } @@ -578,10 +566,9 @@ class SoftmaxParam : public OpParam { class SigmoidParam : public OpParam { public: SigmoidParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const 
framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); } const Tensor *InputX() const { return input_x_; } Tensor *Out() const { return out_; } @@ -643,9 +630,9 @@ class MultiClassNMSParam : public OpParam { class FeedParam : public OpParam { public: FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); auto var = scope.Var("batch_size"); batch_size = var->GetValue(); } @@ -662,10 +649,9 @@ class FeedParam : public OpParam { class FetchParam : public OpParam { public: FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); } const Tensor *InputX() const { return input_x_; } Tensor *Out() const { return out_; } @@ -848,5 +834,143 @@ class FusionConvAddReluParam : public FusionConvAddParam { }; #endif +#ifdef FUSION_CONVADDBNRELU_OP +class FusionConvAddBNReluParam : public OpParam { + public: + FusionConvAddBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + bias_ = InputYFrom(inputs, scope); + axis_ = GetAttr("axis", attrs); + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_ = OutFrom(outputs, scope); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + input_bias_ = InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); + epsilon_ = GetAttr("epsilon", attrs); + momentum_ = GetAttr("momentum", attrs); + is_test_ = GetAttr("is_test", attrs); + } + Tensor *Bias() const { return bias_; } + + const int &Axis() const { return axis_; } + + const Tensor *Input() const { return input_; } + + const Tensor *Filter() const { return filter_; } + + Tensor *Output() const { return output_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() const { return groups; } + + const Tensor *InputBias() const { return input_bias_; } + + const Tensor *InputMean() const { return input_mean_; } + + const Tensor *InputScale() const { return input_scale_; } + + const Tensor *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; } + + const Tensor *NewScale() const { return new_scale_; } + + const Tensor *NewBias() const { return new_bias_; } + + protected: + Tensor *bias_; + int axis_; + Tensor *input_; + Tensor *output_; 
+ Tensor *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; + Tensor *input_bias_; + Tensor *input_mean_; + Tensor *input_scale_; + Tensor *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + Tensor *new_bias_; + Tensor *new_scale_; +}; + +Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); +#endif + +#ifdef IM2SEQUENCE_OP +class Im2SequenceParam : public OpParam { + public: + Im2SequenceParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + kernels_ = GetAttr>("kernels", attrs); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + } + + const Tensor *Input() const { return input_x_; } + + Tensor *Output() const { return out_; } + + const vector &Kernels() const { return kernels_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + private: + Tensor *input_x_; + Tensor *out_; + vector kernels_; + vector strides_; + vector paddings_; +}; +#endif + +#ifdef DROPOUT_OP +class DropoutParam : public OpParam { + public: + DropoutParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + } + + const Tensor *InputX() const { return input_x_; } + + Tensor *Out() const { return out_; } + + private: + Tensor *input_x_; + Tensor *out_; +}; +#endif + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/pool_op.cpp b/src/operators/pool_op.cpp index 62eaf6b5f8105c4d2ab63f2f883445705b815860..41016d74deb5bcd7d3679b1c762467e2dc65de34 100644 --- a/src/operators/pool_op.cpp +++ b/src/operators/pool_op.cpp @@ -60,11 +60,9 @@ template class PoolOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(pool2d); REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(pool2d); REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp); #endif #ifdef PADDLE_MOBILE_FPGA diff --git a/src/operators/pool_op.h b/src/operators/pool_op.h index 5b436fb18bdc055add21acd37e5a1a9c7b6e5b02..4c48efdc53af7eb75f694d4b5a0a7ce5078d2e25 100644 --- a/src/operators/pool_op.h +++ b/src/operators/pool_op.h @@ -48,4 +48,13 @@ class PoolOp : public OperatorWithKernel; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(prior_box); REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/prior_box_op.h b/src/operators/prior_box_op.h index 5b3e3fffd6787360b69ff3af2d19bc8e05549c04..6fcaa07c74f0e005fd5b91ae04ec7219e0394064 100644 --- a/src/operators/prior_box_op.h +++ b/src/operators/prior_box_op.h @@ -51,4 +51,12 @@ class PriorBoxOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(prior_box); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp index 877dcee1a7f4a5a75d013031235d3a216c35f854..b80a56f38aec4bf1bf625d54f4115626447a654a 100644 --- a/src/operators/relu_op.cpp +++ b/src/operators/relu_op.cpp @@ -34,11 +34,9 @@ template class ReluOp; * */ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(relu); REGISTER_OPERATOR_CPU(relu, ops::ReluOp); #endif #ifdef 
PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(relu); REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp); #endif #ifdef PADDLE_MOBILE_FPGA diff --git a/src/operators/relu_op.h b/src/operators/relu_op.h index 8f9e55cf8a2d5bb58e85c21cd2cee3647b00fa22..204ec3d29c147d0d52b9b05d16de6807211a5e57 100644 --- a/src/operators/relu_op.h +++ b/src/operators/relu_op.h @@ -53,4 +53,13 @@ class ReluOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(relu); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/reshape_op.cpp b/src/operators/reshape_op.cpp index c7294079b26250770006aeb1b79c15469489b988..193678613cc8dd2b8f9b8ae1654b0adacea09505 100644 --- a/src/operators/reshape_op.cpp +++ b/src/operators/reshape_op.cpp @@ -33,11 +33,9 @@ template class ReshapeOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(reshape); REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(reshape); REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp); #endif #ifdef PADDLE_MOBILE_FPGA diff --git a/src/operators/reshape_op.h b/src/operators/reshape_op.h index 90d31153135f629585d56eb89ae12830215900d8..da2328ec3570359ccdb45ce1511c02f322498aa1 100644 --- a/src/operators/reshape_op.h +++ b/src/operators/reshape_op.h @@ -51,4 +51,14 @@ class ReshapeOp } // namespace operators } // namespace paddle_mobile +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(reshape); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(reshape); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/sigmoid_op.cpp b/src/operators/sigmoid_op.cpp index 79190e6c3368b9d375770062d948580779393f04..c83738b2c88c3c51ebc0d649fe134da9e44f30ea 100644 --- a/src/operators/sigmoid_op.cpp +++ b/src/operators/sigmoid_op.cpp @@ -28,7 +28,6 @@ template class SigmoidOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(sigmoid); REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/sigmoid_op.h b/src/operators/sigmoid_op.h index bd914a63783f65c7b55d783f2bbcdf19c303c00f..bffef7880b2bb6057f5d489eaac6dea7a3fb3ab5 100644 --- a/src/operators/sigmoid_op.h +++ b/src/operators/sigmoid_op.h @@ -46,4 +46,12 @@ class SigmoidOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(sigmoid); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp index 296e3ef30f7c0260cca169bcfe2f6b445493792a..db8fe1d94363c1db578a369d9eca00dde17d30af 100644 --- a/src/operators/softmax_op.cpp +++ b/src/operators/softmax_op.cpp @@ -28,11 +28,9 @@ template class SoftmaxOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(softmax); REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(softmax); REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp); #endif #ifdef PADDLE_MOBILE_FPGA diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h index 1445ca055ea0472cdaa02d7496ff895feb9174bc..f645d7edf7a3b9f7a92cf286feec58e960a5e3b7 100644 --- a/src/operators/softmax_op.h +++ b/src/operators/softmax_op.h @@ -48,4 +48,13 @@ class SoftmaxOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(softmax); +#endif 
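// Note on the USE_OP_* relocation in this and the neighboring op headers
// (mul, multiclass_nms, pool2d, prior_box, relu, reshape, sigmoid, softmax,
// transpose): these macros reference the operator's static registrar so the
// linker cannot strip it as dead code. Moving them out of the .cpp files and
// into the headers means any translation unit that includes an op header
// keeps that operator registered, which matters when consumers link the
// static library. A hedged sketch of how such a macro is commonly
// implemented (the real definition lives in the op registry header and may
// differ):
//   #define USE_OP_CPU(op_type)                                  \
//     extern int TouchOpRegistrar_##op_type##_cpu();             \
//     static int use_op_##op_type##_cpu __attribute__((unused)) = \
//         TouchOpRegistrar_##op_type##_cpu()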
+#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(softmax); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/transpose_op.cpp b/src/operators/transpose_op.cpp index 989b277b9d58a8c029e041a89a1982f8994bae44..7e578b290174734ba8c210a354c9e56fde364858 100644 --- a/src/operators/transpose_op.cpp +++ b/src/operators/transpose_op.cpp @@ -53,7 +53,6 @@ template class TransposeOp; namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(transpose); REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/transpose_op.h b/src/operators/transpose_op.h index 349220b58ff3e0daec8c7dc2e2dec969ced8b289..25cf07c4c9253736d513505e5f8eba6147f3740c 100644 --- a/src/operators/transpose_op.h +++ b/src/operators/transpose_op.h @@ -50,4 +50,12 @@ class TransposeOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(transpose); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9bfc55c93daa2f69200941bfb49a8a6312fa9eb1..09d1ff031f2d29eb64c83d43724b1039fce9385f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,23 +1,23 @@ set(dir ${CMAKE_CURRENT_SOURCE_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build") -if (googlenet) +if (NET STREQUAL "googlenet") # gen test ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-googlenet paddle-mobile) -elseif (mobilenet) +elseif (NET STREQUAL "mobilenet") # gen test ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-mobilenet paddle-mobile) -elseif (yolo) +elseif (NET STREQUAL "yolo") # gen test ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-yolo paddle-mobile) -elseif (squeezenet) +elseif (NET STREQUAL "squeezenet") # gen test ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-squeezenet paddle-mobile) -elseif(resnet) +elseif(NET STREQUAL "resnet") # gen test ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet paddle-mobile) diff --git a/test/executor_for_test.h b/test/executor_for_test.h index 0d3051327a57202e2b8d1dcbdda571fd244de108..c9ab4783d6826992ee81ffd63b0391169645576c 100644 --- a/test/executor_for_test.h +++ b/test/executor_for_test.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "common/log.h" #include "framework/op_registry.h" -#include "io/io.h" +#include "io/executor.h" #include "operators/conv_op.h" #include "operators/elementwise_add_op.h" #include "operators/pool_op.h" diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index 8c76eb1dde3ef39a342d19e7f3d4e26fc1be2b2f..f4215de46c2bafd732b0092b58c25bf6fcefdf7a 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -13,13 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../test_helper.h" -#include "io/io.h" +#include "io/loader.h" int main() { paddle_mobile::Loader loader; // ../../../test/models/googlenet // ../../../test/models/mobilenet - auto program = loader.Load(g_mobilenet_ssd, false, false); + auto program = loader.Load(g_googlenet, true); // auto program = loader.Load(g_googlenet_combine + "/model", // g_googlenet_combine + // "/params", true); diff --git a/test/framework/test_optimize.cpp b/test/framework/test_optimize.cpp index 32574764e1ba538ab0bea31d1e238096e7098dfc..3cae963eca048da221d69c4c336dd4fdfecbb584 100644 --- a/test/framework/test_optimize.cpp +++ b/test/framework/test_optimize.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "../test_helper.h" #include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/program_optimize.h" -#include "io/io.h" +#include "io/loader.h" int main() { paddle_mobile::Loader loader; diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp index 097d03ad710468a881050ff729e8352f029d664f..1a7c4cd49cb1707b9c7783cf74e87e74da39732e 100644 --- a/test/net/test_mobilenet+ssd.cpp +++ b/test/net/test_mobilenet+ssd.cpp @@ -17,23 +17,23 @@ limitations under the License. */ #include "../test_include.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::PaddleMobile paddle_mobile; auto time1 = time(); - auto program = loader.Load(g_mobilenet_ssd, true); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor executor(program, 1, true); + if (paddle_mobile.Load(g_mobilenet_ssd, true)) { + auto time2 = time(); + DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - std::vector dims{1, 3, 300, 300}; - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 300, 300}, static_cast(0), - static_cast(1)); + std::vector dims{1, 3, 300, 300}; + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 300, 300}, static_cast(0), + static_cast(1)); - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - auto time3 = time(); - executor.Predict(input, dims); - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + std::vector input(input_tensor.data(), + input_tensor.data() + input_tensor.numel()); + auto time3 = time(); + paddle_mobile.Predict(input, dims); + auto time4 = time(); + DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + } return 0; } diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 8400b08f2292bb5655e2d85298acce603e1ce603..1f38dc5d19d0e7bb54faf75a41419941e8b1f412 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -17,28 +17,25 @@ limitations under the License. 
diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp
index 8400b08f2292bb5655e2d85298acce603e1ce603..1f38dc5d19d0e7bb54faf75a41419941e8b1f412 100644
--- a/test/net/test_mobilenet.cpp
+++ b/test/net/test_mobilenet.cpp
@@ -17,28 +17,25 @@ limitations under the License. */
 #include "../test_include.h"
 
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
   auto time1 = time();
-  auto program = loader.Load(g_mobilenet, true);
-  auto time2 = time();
-  DLOG << "load cost :" << time_diff(time1, time1) << "ms";
-  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, true);
-
-  std::vector<int64_t> dims{1, 3, 224, 224};
-  Tensor input_tensor;
-  SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
-                     static_cast<float>(1));
-
-  std::vector<float> input(input_tensor.data<float>(),
-                           input_tensor.data<float>() + input_tensor.numel());
-  auto time3 = time();
-  auto vec_result = executor.Predict(input, dims);
-  float sum = 0;
-  for (const auto item : vec_result) {
-    sum += item;
+  if (paddle_mobile.Load(g_mobilenet, true)) {
+    auto time2 = time();
+    DLOG << "load cost :" << time_diff(time1, time2) << "ms";
+
+    std::vector<int64_t> dims{1, 3, 224, 224};
+    Tensor input_tensor;
+    SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
+                       static_cast<float>(1));
+
+    std::vector<float> input(input_tensor.data<float>(),
+                             input_tensor.data<float>() + input_tensor.numel());
+    auto time3 = time();
+    auto vec_result = paddle_mobile.Predict(input, dims);
+    auto time4 = time();
+
+    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
   }
-  DLOG << "mobilenet output sum =" << sum;
-  auto time4 = time();
-  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+
   return 0;
 }
diff --git a/test/net/test_resnet.cpp b/test/net/test_resnet.cpp
index 55f4c5efef209c421fc550c1f17422acd64b11b9..883ad95392ad351a2634e1a56ac050f02d8767e6 100644
--- a/test/net/test_resnet.cpp
+++ b/test/net/test_resnet.cpp
@@ -17,23 +17,23 @@ limitations under the License. */
 #include "../test_include.h"
 
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
   auto time1 = time();
-  auto program = loader.Load(g_resnet, false);
-  auto time2 = time();
-  DLOG << "load cost :" << time_diff(time1, time1) << "ms";
-  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
+  if (paddle_mobile.Load(g_resnet, false)) {
+    auto time2 = time();
+    DLOG << "load cost :" << time_diff(time1, time2) << "ms";
+    std::vector<int64_t> dims{1, 3, 32, 32};
+    Tensor input_tensor;
+    SetupTensor<float>(&input_tensor, {1, 3, 32, 32}, static_cast<float>(0),
+                       static_cast<float>(1));
 
-  std::vector<int64_t> dims{1, 3, 32, 32};
-  Tensor input_tensor;
-  SetupTensor<float>(&input_tensor, {1, 3, 32, 32}, static_cast<float>(0),
-                     static_cast<float>(1));
+    std::vector<float> input(input_tensor.data<float>(),
+                             input_tensor.data<float>() + input_tensor.numel());
+    auto time3 = time();
+    paddle_mobile.Predict(input, dims);
+    auto time4 = time();
+    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+  }
 
-  std::vector<float> input(input_tensor.data<float>(),
-                           input_tensor.data<float>() + input_tensor.numel());
-  auto time3 = time();
-  executor.Predict(input, dims);
-  auto time4 = time();
-  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
   return 0;
 }
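One behavioral note on the mobilenet rewrite above: the old test summed the output vector and logged it as a crude correctness check, while the new version only times the call. If that sanity check is still wanted, it can be reinstated inside the if-block, reusing the names from the diff (a sketch built from the removed lines):

    auto vec_result = paddle_mobile.Predict(input, dims);
    float sum = 0;
    for (const auto item : vec_result) {
      sum += item;  // accumulate all outputs as a cheap fingerprint
    }
    DLOG << "mobilenet output sum =" << sum;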
diff --git a/test/net/test_squeezenet.cpp b/test/net/test_squeezenet.cpp
index 30460018fe8cc008e0031c1c713150745767fa28..39d4687ff3de37c571ee89213485fb0b6bc939df 100644
--- a/test/net/test_squeezenet.cpp
+++ b/test/net/test_squeezenet.cpp
@@ -17,25 +17,25 @@ limitations under the License. */
 #include "../test_include.h"
 
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
   auto time1 = time();
-  auto program = loader.Load(g_squeezenet, false);
-  auto time2 = time();
-  DLOG << "load cost :" << time_diff(time1, time1) << "ms";
-  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
+  if (paddle_mobile.Load(g_squeezenet, false)) {
+    auto time2 = time();
+    DLOG << "load cost :" << time_diff(time1, time2) << "ms";
+    std::vector<int64_t> dims{1, 3, 227, 227};
+    Tensor input_tensor;
+    SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
+                       static_cast<float>(1));
 
-  std::vector<int64_t> dims{1, 3, 227, 227};
-  Tensor input_tensor;
-  SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
-                     static_cast<float>(1));
+    std::vector<float> input(input_tensor.data<float>(),
+                             input_tensor.data<float>() + input_tensor.numel());
+    auto time3 = time();
+    paddle_mobile.Predict(input, dims);
+    auto time4 = time();
+    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+  }
 
-  std::vector<float> input(input_tensor.data<float>(),
-                           input_tensor.data<float>() + input_tensor.numel());
-  auto time3 = time();
-  executor.Predict(input, dims);
-  auto time4 = time();
-  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
   return 0;
 }
diff --git a/test/net/test_yolo.cpp b/test/net/test_yolo.cpp
index c82443e23953def917826fe4ec3b2c484b588f59..65dec59ad0579d362c75ae6ec1d362fb957d4fc5 100644
--- a/test/net/test_yolo.cpp
+++ b/test/net/test_yolo.cpp
@@ -17,25 +17,25 @@ limitations under the License. */
 #include "../test_include.h"
 
 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
   auto time1 = time();
-  auto program = loader.Load(g_yolo, false);
-  auto time2 = time();
-  DLOG << "load cost :" << time_diff(time1, time1) << "ms";
-  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false);
+  if (paddle_mobile.Load(g_yolo, false)) {
+    auto time2 = time();
+    DLOG << "load cost :" << time_diff(time1, time2) << "ms";
 
-  std::vector<int64_t> dims{1, 3, 227, 227};
-  Tensor input_tensor;
-  SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
-                     static_cast<float>(1));
+    std::vector<int64_t> dims{1, 3, 227, 227};
+    Tensor input_tensor;
+    SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
+                       static_cast<float>(1));
 
-  std::vector<float> input(input_tensor.data<float>(),
-                           input_tensor.data<float>() + input_tensor.numel());
-  auto time3 = time();
-  executor.Predict(input, dims);
-  auto time4 = time();
-  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+    std::vector<float> input(input_tensor.data<float>(),
+                             input_tensor.data<float>() + input_tensor.numel());
+    auto time3 = time();
+    paddle_mobile.Predict(input, dims);
+    auto time4 = time();
+    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+  }
   return 0;
 }
diff --git a/test/operators/test_im2sequence_op.cpp b/test/operators/test_im2sequence_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a7512d3bf3cffcb100fe292e50fc7b7b23fa0aa0
--- /dev/null
+++ b/test/operators/test_im2sequence_op.cpp
@@ -0,0 +1,62 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../executor_for_test.h"
+#include "../test_include.h"
+#include "operators/im2sequence_op.h"
+
+int main() {
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto program = loader.Load(g_ocr_recg);
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::Im2SequenceOp<
+                                        paddle_mobile::CPU, float>>
+      executor(program, "im2sequence");
+
+  // 1. input_tensors;
+  vector<Tensor> input_tensors;
+
+  Tensor input1;
+  auto input1_data = CreateInput<float>(&input1, {2, 2, 3, 3}, -1, 1);
+  input_tensors.push_back(input1);
+
+  // 2. input_names
+  vector<string> input_names({
+      "conv2d_19.tmp_1",
+  });
+
+  // 3. output_names
+  vector<string> output_names({"im2sequence_0.tmp_0"});
+
+  // 4. out_dims;
+  vector<DDim> out_ddims;
+  auto out_ddim = paddle_mobile::framework::make_ddim({8, 9});
+  out_ddims.push_back(out_ddim);
+
+  auto output = executor.Predict(input_tensors, input_names,
+                                 output_names, out_ddims);
+
+  auto output0_data = output[0]->data<float>();
+
+  for (int j = 0; j < input_tensors[0].numel(); ++j) {
+    DLOG << " value of input: " << input1_data[j];
+  }
+
+  for (int j = 0; j < output[0]->numel(); ++j) {
+    DLOG << " value of output: " << output0_data[j];
+  }
+  return 0;
+}
diff --git a/test/operators/test_sigmoid_op.cpp b/test/operators/test_sigmoid_op.cpp
index 4ed3efaf28aa986f0b679729c46cb386150583e3..c8fac6b9eee5c5777ddb0147bc81d361d4dd09f5 100644
--- a/test/operators/test_sigmoid_op.cpp
+++ b/test/operators/test_sigmoid_op.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "../../src/operators/kernel/sigmoid_kernel.h"
 #include "../test_helper.h"
-#include "io/io.h"
+#include "io/executor.h"
 
 int main() {
   paddle_mobile::framework::Tensor input;
diff --git a/test/test_helper.h b/test/test_helper.h
index fe720ded8270f2bc02a4f1e72625954962184069..81ad23ff3b4e53db0225630eebaa34878ad4c139 100644
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include <chrono>
 #include <random>
 #include <string>
 
+#include "common/common.h"
 #include "common/log.h"
 #include "framework/ddim.h"
 #include "framework/tensor.h"
@@ -35,17 +35,6 @@ static const std::string g_test_image_1x3x224x224 =
 using paddle_mobile::framework::DDim;
 using paddle_mobile::framework::Tensor;
 
-using Time = decltype(std::chrono::high_resolution_clock::now());
-
-Time time() { return std::chrono::high_resolution_clock::now(); }
-
-double time_diff(Time t1, Time t2) {
-  typedef std::chrono::microseconds ms;
-  auto diff = t2 - t1;
-  ms counter = std::chrono::duration_cast<ms>(diff);
-  return counter.count() / 1000.0;
-}
-
 template <typename T>
 void SetupTensor(paddle_mobile::framework::Tensor *input,
                  paddle_mobile::framework::DDim dims, T lower, T upper) {
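The time()/time_diff() helpers deleted from test_helper.h above are used by every net test, so they presumably move into the newly included common/common.h. For reference, the relocated definitions are (verbatim from the removed lines, with inline added since they live in a header):

    #include <chrono>

    using Time = decltype(std::chrono::high_resolution_clock::now());

    inline Time time() { return std::chrono::high_resolution_clock::now(); }

    // Elapsed time between t1 and t2 in milliseconds.
    inline double time_diff(Time t1, Time t2) {
      typedef std::chrono::microseconds ms;
      auto diff = t2 - t1;
      ms counter = std::chrono::duration_cast<ms>(diff);
      return counter.count() / 1000.0;
    }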
*/ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" -#include "io/io.h" +#include "io/paddle_mobile.h" diff --git a/tools/android-cmake/android.toolchain.cmake b/tools/android-cmake/android.toolchain.cmake index 4db5cd41b46246f92882f1548290fb87fc915aae..a57d9c102ff65d4c10cc9bd3773ffa4c87e482fa 100644 --- a/tools/android-cmake/android.toolchain.cmake +++ b/tools/android-cmake/android.toolchain.cmake @@ -37,7 +37,7 @@ # ANDROID_DISABLE_FORMAT_STRING_CHECKS # ANDROID_CCACHE -cmake_minimum_required(VERSION 3.6.0) +# cmake_minimum_required(VERSION 3.6.0) # Inhibit all of CMake's own NDK handling code. set(CMAKE_SYSTEM_VERSION 1) diff --git a/tools/arm-platform.cmake b/tools/arm-platform.cmake new file mode 100644 index 0000000000000000000000000000000000000000..9f2b6d5e89d92255848af54321ea09ebdb058691 --- /dev/null +++ b/tools/arm-platform.cmake @@ -0,0 +1,9 @@ + +set(ARCH "armv7-a") + +set(FLOAT_ABI "softfp" CACHE STRING "-mfloat-api chosen") +set_property(CACHE FLOAT_ABI PROPERTY STRINGS "softfp" "soft" "hard") + +set(FPU "neon") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${ARCH} -mfloat-abi=${FLOAT_ABI} -mfpu=${FPU}") diff --git a/tools/build.sh b/tools/build.sh index 42e872c580cffef3bd904dc9cc575e9961ef4257..0b891e56a44e06a53f01b792304d33a49c760f68 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -31,9 +31,9 @@ build_for_mac() { } build_for_android() { - rm -rf "../build" - if [ -z "${ANDROID_NDK}" ]; then - echo "ANDROID_NDK not found!" + #rm -rf "../build" + if [ -z "${NDK_ROOT}" ]; then + echo "NDK_ROOT not found!" exit -1 fi @@ -60,7 +60,6 @@ build_for_android() { TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" ANDROID_ARM_MODE="arm" if [ $# -eq 1 ]; then - NET=$1 cmake .. \ -B"../build/release/${PLATFORM}" \ -DANDROID_ABI="${ABI}" \ @@ -70,7 +69,7 @@ build_for_android() { -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ -DANDROID_STL=c++_static \ -DANDROID=true \ - -D"${NET}=true" \ + -DNET=$1 \ -D"${ARM_PLATFORM}"=true else @@ -90,7 +89,7 @@ build_for_android() { } build_for_ios() { - rm -rf "../build" +# rm -rf "../build" PLATFORM="ios" MODE="Release" BUILD_DIR=../build/release/"${PLATFORM}" @@ -99,7 +98,6 @@ build_for_ios() { CXX_FLAGS="-fobjc-abi-version=2 -fobjc-arc -std=gnu++14 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT}" mkdir -p "${BUILD_DIR}" if [ $# -eq 1 ]; then - NET=$1 cmake .. \ -B"${BUILD_DIR}" \ -DCMAKE_BUILD_TYPE="${MODE}" \ @@ -107,7 +105,7 @@ build_for_ios() { -DIOS_PLATFORM=OS \ -DCMAKE_C_FLAGS="${C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ - -D"${NET}"=true \ + -DNET=$1 \ -DIS_IOS="true" else cmake .. \ @@ -121,6 +119,9 @@ build_for_ios() { fi cd "${BUILD_DIR}" make -j 8 + cd ./build + # 生成符号表 + ranlib *.a } build_error() { @@ -129,16 +130,12 @@ build_error() { if [ $# -lt 1 ]; then echo "error: target missing!" 
- echo "available targets: mac|linux|ios|android" - echo "sample usage: ./build.sh mac" + echo "available targets: ios|android" + echo "sample usage: ./build.sh android" else if [ $# -eq 2 ]; then if [ $2 != "googlenet" -a $2 != "mobilenet" -a $2 != "yolo" -a $2 != "squeezenet" -a $2 != "resnet" ]; then - if [ $1 = "mac" ]; then - build_for_mac - elif [ $1 = "linux" ]; then - build_for_linux - elif [ $1 = "android" ]; then + if [ $1 = "android" ]; then build_for_android elif [ $1 = "ios" ]; then build_for_ios @@ -146,11 +143,7 @@ else build_error fi else - if [ $1 = "mac" ]; then - build_for_mac $2 - elif [ $1 = "linux" ]; then - build_for_linux $2 - elif [ $1 = "android" ]; then + if [ $1 = "android" ]; then build_for_android $2 elif [ $1 = "ios" ]; then build_for_ios $2 @@ -159,11 +152,7 @@ else fi fi else - if [ $1 = "mac" ]; then - build_for_mac - elif [ $1 = "linux" ]; then - build_for_linux - elif [ $1 = "android" ]; then + if [ $1 = "android" ]; then build_for_android elif [ $1 = "ios" ]; then build_for_ios diff --git a/tools/op.cmake b/tools/op.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e5b006ee0e890307275dcd472a2e31f51c3fb891 --- /dev/null +++ b/tools/op.cmake @@ -0,0 +1,158 @@ +if (NET STREQUAL "googlenet") + set(CONCAT_OP ON) + set(CONV_OP ON) + set(LRN_OP ON) + set(MUL_OP ON) + set(ELEMENTWISEADD_OP ON) + set(FUSION_FC_OP ON) + set(POOL_OP ON) + set(RELU_OP ON) + set(FUSION_CONVADD_OP ON) + set(FUSION_CONVADD_RELU_OP ON) +elseif (NET STREQUAL "mobilenet") + set(CONV_OP ON) + set(ELEMENTWISEADD_OP ON) + set(RELU_OP ON) + set(SOFTMAX_OP ON) + set(SOFTMAX_OP ON) + set(DEPTHWISECONV_OP ON) + set(BATCHNORM_OP ON) + set(POOL_OP ON) + set(RESHAPE_OP ON) + set(FUSION_CONVADDBNRELU_OP) +elseif (NET STREQUAL "yolo") + set(BATCHNORM_OP ON) + set(CONV_OP ON) + set(RELU_OP ON) + set(ELEMENTWISEADD_OP ON) +elseif (NET STREQUAL "squeezenet") + set(CONCAT_OP ON) + set(CONV_OP ON) + set(RELU_OP ON) + set(ELEMENTWISEADD_OP ON) + set(POOL_OP ON) + set(RESHAPE_OP ON) + set(SOFTMAX_OP ON) +elseif (NET STREQUAL "resnet") + set(CONV_OP ON) + set(BATCHNORM_OP ON) + set(ELEMENTWISEADD_OP ON) + set(SOFTMAX_OP ON) + set(MUL_OP ON) + set(POOL_OP ON) + set(RELU_OP ON) +else () + set(BATCHNORM_OP ON) + set(BOXCODER_OP ON) + set(CONCAT_OP ON) + set(CONV_OP ON) + set(DEPTHWISECONV_OP ON) + set(ELEMENTWISEADD_OP ON) + set(FUSION_CONVADD_OP ON) + set(CONVADDRELU_OP ON) + set(FUSION_FC_OP ON) + set(LRN_OP ON) + set(MUL_OP ON) + set(MULTICLASSNMS_OP ON) + set(POOL_OP ON) + set(PRIORBOX_OP ON) + set(RELU_OP ON) + set(RESHAPE_OP ON) + set(SIGMOID_OP ON) + set(SOFTMAX_OP ON) + set(TRANSPOSE_OP ON) + set(FUSION_CONVADD_RELU_OP ON) + set(FUSION_CONVADDBNRELU_OP ON) + set(DROPOUT_OP ON) + set(IM2SEQUENCE_OP ON) + # option(BATCHNORM_OP "" ON) + # option(BOXCODER_OP "" ON) + # option(CONCAT_OP "" ON) + # option(CONV_OP "" ON) + # option(DEPTHWISECONV_OP "" ON) + # option(ELEMENTWISEADD_OP "" ON) + # option(FUSION_CONVADD_OP "" ON) + # option(CONVADDRELU_OP "" ON) + # option(FUSION_FC_OP "" ON) + # option(LRN_OP "" ON) + # option(MUL_OP "" ON) + # option(MULTICLASSNMS_OP "" ON) + # option(POOL_OP "" ON) + # option(PRIORBOX_OP "" ON) + # option(RELU_OP "" ON) + # option(RESHAPE_OP "" ON) + # option(SIGMOID_OP "" ON) + # option(SOFTMAX_OP "" ON) + # option(TRANSPOSE_OP "" ON) + # option(FUSION_CONVADD_RELU_OP "" ON) +endif () + +if (BATCHNORM_OP) + add_definitions(-DBATCHNORM_OP) +endif() +if (BOXCODER_OP) + add_definitions(-DBOXCODER_OP) +endif() +if (CONCAT_OP) + 
+    add_definitions(-DCONCAT_OP)
+endif()
+if (CONV_OP)
+    add_definitions(-DCONV_OP)
+endif()
+if (DEPTHWISECONV_OP)
+    add_definitions(-DDEPTHWISECONV_OP)
+endif()
+if (ELEMENTWISEADD_OP)
+    add_definitions(-DELEMENTWISEADD_OP)
+endif()
+if (FUSION_CONVADD_OP)
+    add_definitions(-DFUSION_CONVADD_OP)
+endif()
+if (CONVADDRELU_OP)
+    add_definitions(-DCONVADDRELU_OP)
+endif()
+if (FUSION_FC_OP)
+    add_definitions(-DFUSION_FC_OP)
+endif()
+if (LRN_OP)
+    add_definitions(-DLRN_OP)
+endif()
+if (MUL_OP)
+    add_definitions(-DMUL_OP)
+endif()
+if (MULTICLASSNMS_OP)
+    add_definitions(-DMULTICLASSNMS_OP)
+endif()
+if (POOL_OP)
+    add_definitions(-DPOOL_OP)
+endif()
+if (PRIORBOX_OP)
+    add_definitions(-DPRIORBOX_OP)
+endif()
+if (RELU_OP)
+    add_definitions(-DRELU_OP)
+endif()
+if (RESHAPE_OP)
+    add_definitions(-DRESHAPE_OP)
+endif()
+if (SIGMOID_OP)
+    add_definitions(-DSIGMOID_OP)
+endif()
+if (SOFTMAX_OP)
+    add_definitions(-DSOFTMAX_OP)
+endif()
+if (TRANSPOSE_OP)
+    add_definitions(-DTRANSPOSE_OP)
+endif()
+if (FUSION_CONVADD_RELU_OP)
+    add_definitions(-DFUSION_CONVADD_RELU_OP)
+endif()
+if (FUSION_CONVADDBNRELU_OP)
+    add_definitions(-DFUSION_CONVADDBNRELU_OP)
+endif()
+if (DROPOUT_OP)
+    add_definitions(-DDROPOUT_OP)
+endif()
+if (IM2SEQUENCE_OP)
+    add_definitions(-DIM2SEQUENCE_OP)
+endif()
diff --git a/tools/pre-commit.hooks/clang-format.hook b/tools/pre-commit.hooks/clang-format.hook
index 4fa4253bad78fe287fb92863a684a5d7def71061..ece9ebc598e3fa63d1d76409dc0068854aaec851 100644
--- a/tools/pre-commit.hooks/clang-format.hook
+++ b/tools/pre-commit.hooks/clang-format.hook
@@ -14,6 +14,10 @@ fi
 # https://medicineyeh.wordpress.com/2017/07/13/clang-format-with-pragma/
 shift
-perl -i -pe 's|#pragma\s+omp|// #pragma omp|' "$@"
-clang-format -i $@
+perl -i -pe 's|^\s+#pragma\s+omp|// #pragma omp|' "$@"
+(
+# skip clang-format for files under src/ios_io
+flist=$(echo "$@" | perl -pe 's|src/ios_io/[^ ]*||')
+clang-format -i $flist
+)
 perl -i -pe 's|// ||' "$@"
 
diff --git a/tools/toolchains/arm-android-neon.cmake b/tools/toolchains/arm-android-neon.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..f2fa600b90fb54886838e953e61c1e940569dee6
--- /dev/null
+++ b/tools/toolchains/arm-android-neon.cmake
@@ -0,0 +1,2 @@
+set(ANDROID_ARM_NEON ON)
+include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
\ No newline at end of file
diff --git a/tools/toolchains/arm-linux-gnueabi.cmake b/tools/toolchains/arm-linux-gnueabi.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c2b1b853def5f470565e670751708f76c59e16c4
--- /dev/null
+++ b/tools/toolchains/arm-linux-gnueabi.cmake
@@ -0,0 +1,16 @@
+# CMake toolchain file for building ARM software in a Linux environment
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_VERSION 1)
+
+set(CMAKE_C_COMPILER /usr/bin/arm-linux-gnueabi-gcc)
+set(CMAKE_CXX_COMPILER /usr/bin/arm-linux-gnueabi-g++)
+set(CMAKE_STRIP /usr/bin/arm-linux-gnueabi-strip)
+
+set(CMAKE_FIND_ROOT_PATH /usr/arm-linux-gnueabi)
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+set(ARM_LINUX 1)
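tools/op.cmake turns the NET selection into one preprocessor define per operator; on the C++ side each operator's translation unit is guarded by the matching define, so ops outside the selected net compile to an empty file and stay out of the binary. A sketch of that guard as it would appear in an op source file (MY_OP/MyOp are placeholders; transpose_op.cpp earlier in this patch is a real instance of the pattern):

    // my_op.cpp -- compiles to an empty TU unless tools/op.cmake emitted -DMY_OP
    #ifdef MY_OP

    #include "operators/my_op.h"

    namespace ops = paddle_mobile::operators;
    #ifdef PADDLE_MOBILE_CPU
    REGISTER_OPERATOR_CPU(my_op, ops::MyOp);  // registration stays in the .cpp
    #endif

    #endif  // MY_OP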