optimize pool3x3 kernel

d4e21584 · wangliu · 25c10df2 · 4d035b1d · d4e21584 · d4e21584
104 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,8 +40,6 @@ else()
    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/*.h)
    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/*.cc)
    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/*.cpp)
 endif()
 if(FPGA)
@@ -103,6 +101,10 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
+# NET: user-selectable cache entry naming the net type to build for.
+# The initial value is the literal string "defult"; it only takes effect when
+# NET is not already in the cache (for example, not passed with -DNET=...).
+# NOTE(review): "defult" looks like a misspelling of "default" -- anything that
+# compares against NET must use the same spelling; confirm before renaming.
+set(NET "defult" CACHE STRING "select net type")
+# Advertise the known values so cmake-gui/ccmake can offer them as a drop-down.
+# The STRINGS property does not validate NET; other values are still accepted.
+set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet")
 include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
 # if (IS_IOS)
@@ -118,4 +120,3 @@ if(DEBUGING)
    add_subdirectory(test)
 endif()
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -183,6 +183,10 @@ upstream
 接下来等待 review，如果有需要修改的地方，参照上述步骤更新 origin 中的对应分支即可。
+![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294877166787.jpg)
+之后就可以提交代码了
 ## 删除远程分支
 在 PR 被 merge 进主仓库后，我们可以在 PR 的页面删除远程仓库的分支。
@@ -219,7 +223,8 @@ upstream
     - 原因：如果仅仅修改一个文件但提交了十几个commit，每个commit只做了少量的修改，这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改，且不排除commit之间的修改存在相互覆盖的情况。
     - 建议：每次提交时，保持尽量少的commit，可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit，可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。
   - 请注意每个commit的名称：应能反映当前commit的内容，不能太随意。
-3. 如果解决了某个Issue的问题，请在该Pull Request的**第一个**评论框中加上：`fix #issue_number`，这样当该PUll Request被合并后，会自动关闭对应的Issue。关键词包括：close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved，请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
+3. 如果解决了某个Issue的问题，请在该Pull Request的**第一个**评论框中加上：`fix #issue_number`，这样当该Pull Request被合并后，会自动关闭对应的Issue。关键词包括：close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved，请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
 此外，在回复评审人意见时，请您遵守以下约定：

--- a/README.md
+++ b/README.md
-# Paddle-Mobile 
+# Paddle-Mobile
 [![Build Status](https://travis-ci.org/PaddlePaddle/paddle-mobile.svg?branch=develop&longCache=true&style=flat-square)](https://travis-ci.org/PaddlePaddle/paddle-mobile)
-[![License](https://img.shields.io/badge/license-Apache%202-brightgreen.svg)](LICENSE)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/doc)
+[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
+<!--[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle-Mobile.svg)](https://github.com/PaddlePaddle/Paddle-Mobile/releases)
+[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)-->
-This project is used to develop the next version deep learning freamwork for mobile device.
-# Development
-[Used model in development](https://mms-mis.cdn.bcebos.com/paddle-mobile/models.zip)
+欢迎来到 Paddle-Mobile GitHub 项目。
-## cross-compilation to android
+Paddle-Mobile是PaddlePaddle组织下的项目，是一个致力于嵌入式平台的深度学习的框架。Paddle-Mobile设计思想和PaddlePaddle的最新版fluid版本保持了高度一致，同时针对嵌入式做了大量优化。设计之初就对嵌入式的性能、体积、能耗、硬件平台覆盖等方面做了考虑。
-* NDK is required
+## Features
-* ANDROID_NDK environment variable is required
-```bash 
+- **ARM CPU**
-sh build.sh android
-```
-## build for x86
+    arm cpu是paddle-mobile的主要支持方向，cpu的通用性一直是其优势。嵌入式深度学习，需要大量的cpu汇编实现。我们正在紧锣密鼓地编码，为的是能充分利用硬件的每一点加速能力。
-paddle-mobile is to run on arm platform. x86 only used to test not arm assembly code. So do not recommend compiling x86.
+    arm cpu的优化工作还在进行中，现在使用了常规的cpu优化。在arm a73上paddle-mobile现在单核运行一次mobilenet 1.0是160+ms，显然这不是我们的最终目标，我们正在用大量的汇编改写，后续性能仍会有巨大提升空间。
+- **Mali GPU**
-Now only support osx.
+    Mali GPU是百度和ARM合作开发的，双方团队近期都在致力于让paddle的op能无缝运行在ACL(arm compute library)上。目前已经支持squeezenet，googlenet，resnet等几个网络模型，后续会继续加大力度，使全部移动端paddle op能高效运行在mali gpu上。
+    在
+- **苹果设备的GPU Metal实现**
+    基于Metal实现的苹果设备的GPU预测库，也已经在实现中，近期也会有相应可运行版本。
+- **FPGA**
+    FPGA实现正在进行中，是基于Xilinx的ZU5目标开发板。
+- **灵活性**
+    * paddle-mobile cpu版不依赖任何第三方库, 可进行快速集成。
+    * 使用泛型特化进行平台切换, 可灵活切换 cpu、gpu 和其他协处理器。
+    * 可根据特定的常见网络, 进行编译特定的 op, 降低编译时间, 减小包大小。
+    * 使用 docker 编译, 提供统一的编译环境。
+    * 高可拓展性, 方便拓展其他协处理器, 提供高性能 arm 算子实现, 方便其他协处理器开发者集成开发。
+    * 直接兼容 paddle-fluid 模型, 不需要额外的转换操作。
+- **体积**
+    paddle-mobile从设计之初就深入考虑到移动端的包体积的问题，cpu实现中没有外部依赖。在编译过程中，如果该网络不需要的op是完全不会被打入的。同时编译选项优化也为体积压缩提供了帮助。
+    除了二进制体积，我们对代码体积极力避免过大。整个仓库不到5m的代码体积。
+## 文档
+### 设计文档
+关于paddle-mobile的设计文档在下面链接中。如果想了解更多内容，[issue](https://github.com/PaddlePaddle/paddle-mobile/issues)中会有很多早期的设计和讨论过程。
+[设计文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/design_doc.md)
+### 开发文档
+开发文档主要是关于编译、运行等问题。做为开发者，它可以和贡献文档共同结合使用。
+[开发文档](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_doc.md)
+### 贡献文档
+- [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
+- 上面文档中涵盖了主要的贡献代码流程，如果在实践中您还遇到了其他问题，可以发[issue](https://github.com/PaddlePaddle/paddle-mobile/issues)。我们看到后会尽快处理。
+## 模型获得
+目前Paddle-Mobile仅支持Paddle fluid训练的模型。如果你手中的模型是不同种类的模型，需要进行模型转换才可以运行。
+### 1. 直接使用Paddle Fluid训练
+该方式最为可靠，推荐方式
+### 2. caffe转为Paddle Fluid模型
+[链接](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
+### 3. ONNX
+ONNX全称为“Open Neural Network Exchange”，即“开放神经网络交换”。该项目的目的是让不同的神经网络开发框架做到互通互用。
+除直接使用PaddlePaddle训练fluid版本的模型外，还可以通过onnx转换得到个别Paddle fluid模型。
+目前，百度也在做onnx支持工作。相关转换项目在这里：[paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)。
+```flow
+st=>start: 其他模型
+op1=>operation: onnx模型
+op2=>operation: paddle-onnx
+op3=>operation: paddle fluid模型
+e=>end: paddle-mobile运行
+st->op1->op2->op3->e
 ```
-sh build.sh mac
-```
-## Old Version of Mobile-Deep-Learning
+### 4. 部分测试模型下载
-The old version of MDL was I moved to here [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning) 
+[下载链接](https://mms-mis.cdn.bcebos.com/paddle-mobile/models.zip)
+## 问题解决
+欢迎提出或解决我们的问题，有疑问可以发issue. [Github Issues](https://github.com/PaddlePaddle/paddle-mobile/issues).
+## Copyright and License
+Paddle-Mobile 提供相对宽松的Apache-2.0开源协议 [Apache-2.0 license](LICENSE).
+## 旧版 Mobile-Deep-Learning
+原MDL(Mobile-Deep-Learning)工程被迁移到了这里 [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning) 
--- a/doc/design_doc.md
+++ b/doc/design_doc.md
+# paddle-mobile 设计文档
+#### 以下是 paddle-mobile 代码的执行流程图:
+![执行流程图](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305189473720.png)
+#### 主要分为: Loader 模块、 Program 模块、 Executor 模块、 op 模块、 kernel 模块、scope variable Tensor 模块
+#### 下面展开说一下各个模块的作用以及设计思路
+### 一. Loader
+先来看一下模型, 模型分为两种结构:
+ 一种为参数文件是散开的, 如下图, 红框为模型结构的 protobuf 文件, 其余为参数文件
+![模型描述](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305190629577.png)
+另一种为参数文件结合在一起的, 如下图, 红框内为模型结构描述的 protobuf 文件, 另一个文件为结合在一起的参数文件
+![模型描述combined](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191057130.png)
+loader 模块的作用是将模型结构信息 load 进内存, 将红框内的 protobuf 文件 load 进内存, 并对模型结构进行优化(如将几个细粒度的 op 融合成 粗粒度的 op, 如将 conv、 add、 batchnorm、 relu 融合为 conv\_add\_batchnorm\_relu).
+方便进行算法优化.
+__那么为什么融合在一起能够做算法优化 ?__
+如果未融合的 conv add batchnorm relu 运算是这样的
+```
+[n]
+[conv_res] = conv([n])
+for &res in conv_res {
+	res = add_biase(res)
+}
+for &res in conv_res {
+	res = batchnorm(res)
+}
+for &res in conv_res {
+	res = relu(res)
+}
+```
+融合后的 conv\_add\_batchnorm\_relu 运算是这样的:
+```
+[n]
+[conv_res] = conv([n])
+for &res in conv_res {
+	res = relu(batchnorm(add_biase(res)))
+}
+```
+由于 conv 可以转换为两个大矩阵相乘, 更进一步可以分为若干个一行一列的小矩阵相乘, 那最终的运算是这样的:
+```
+[n]
+for &res in [res] {
+	res = relu(batchnorm(add_biase(A * B)))
+}
+其中 A 和 B 为 1 * k 和 k * 1 矩阵
+```
+### 二. Program
+program 为 loader 模块的结果, 包含了优化前的模型结构对象, 以及优化后的模型结构对象, 此模块基本对应着 paddle 模型的结构, 关于paddle 模型的一些概念的定义, 详细设计可以参考 [program.md](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), 以下是一个简单的概况: 
+* programDesc 中包含着若干个(googlenet mobilenet yolo squeezenet resnet 常见的模型只有一个)可以嵌套的 block, blocks中的第一个block中的某个 op 可能会执行 blocks 中后边 block 中的一系列 op 运算(只有多个block才会有此概念)
+* block 包含着 ops 和 vars
+* ops 为一系列 op 的描述, 描述着每个 op 的类型, 输入输出, 所需参数
+* vars 里包含的为所有 op 运算所需的参数描述
+### 三. Executor
+executor 主要是用于 op 运算的上层调度操作, 主要有两个操作,  executor 实例化 和 暴露给上层的 predict 方法
+* executor 实例化过程中, 主要进行了这几个操作 
+	1. 根据 loader 产出的 program 初始化 operator 对象 
+	2. 分配所有需要用到的内存, 包括每个op 的输入输出, 权重参数, 目前模型的权重参数文件的内存格式为 NCHW, op 的输入输出中间矩阵参数也是 NCHW 格式
+	3. 调用每个 op 的 init 方法, init 方法是每个 op 实现者进行参数预处理的地方, 有助于减少 predict 的耗时
+* predict, 主要用于拿到外部的输入, 顺序调用 op 的 run 方法进行运算, 并返回最终的结果.
+### 四. op
+关于 op 模块代码的详细设计可以参考 [operator部分代码设计](https://github.com/PaddlePaddle/paddle-mobile/issues/300), operator主要包含一个kernel用于运算、一个 param 用于存储属性, operator 主要有三个操作, Init、RunImp、InferShape
+* Init: Init 函数主要用于参数预处理, 如对 batchNorm 参数进行预处理, 可以将 batchNorm 运算转化为 a * x + b 形式的运算, 这个函数也会调用 kernel 的 Init 函数对 kernel 进行初始化
+* RunImp: RunImp 函数会调用自己的kernel 的 compute 方法进行运算
+* InferShape: InferShape 函数会根据输入和参数得出输出的形状, 这个函数会在 executor 实例化时, 内存初始化前调用
+每个 operator 都需要进行注册才可以被使用, 以 conv 为例, 需在 conv_op.cpp 底部这样写: 
+```c++
+// 三个平台都注册了 conv op
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(conv2d);
+REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(conv2d);
+REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(conv2d);
+REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
+#endif
+```
+__一个关于包大小的优化__:
+每个 operator 都由一个宏控制编译, 如 conv_op.h(除了 conv_op.h ,  conv_op.cpp、conv_kernel.h、conv_kernel.cpp 也都需要加此宏控制)
+```c++
+#ifdef CONV_OP    //这个宏控制着 conv_op 是否被编译, 除了 conv_op.h ,  conv_op.cpp、conv_kernel.h、conv_kernel.cpp 也都需要加此宏控制
+#pragma once
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/conv_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+template <typename DeviceType, typename T>
+class ConvOp {
+	//impl  
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+```
+这样做的目的是为了根据不同类型的网络编译特定的op, 在 cmake 中已经配置好不同网络编译的宏, 如果你要进行编译支持 yolo 的模型, 仅需执行:
+```sh
+cd tools
+sh build.sh android yolo
+```
+这样只会编译 yolo 所包含的四种 op, 极大的减小了包体积和编译时间
+### 五. kernel
+kernel 为 op 的底层运算实现, 主要有两个函数, Init 和 Compute, 分别用来初始化、预处理 和 运算操作, 值得提出的是, kernel 会根据泛型特化到不同的平台, 如图所示:
+![设备特化](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191401976.png)
+不同平台的 kernel 实现, 为同一个 kernel 类不同泛型的特化实现, 目前有三个平台, arm、mali、fpga, 图中的 central-arm-func\ 目录为 op kernel 的 arm 实现, 它承担了 arm\ 目录下 kernel 的底层实现, 同时 arm 处理器作为中央处理器, central-arm-func\ 也可以作为其他协处理器的底层实现, 如: fpga 的某一个 op kernel 还没有 fpga 协处理器的实现, 就可以直接调用使用这里的 arm 实现.
+__如果你有兴趣新增一个协处理器实现, 就可以在此添加一个 kernel 目录, 提供协处理器实现, 如果某个 kernel 你没有实现完, 你也可以直接使用 arm 实现__
+### 六. scope variable Tensor
+* scope 用来存储管理所需用到的所有 variable(用来存储不同类型的对象, 主要是矩阵Tensor, 也就是说 scope 管理着 op 运算过程中所有参数矩阵, 输入输出矩阵), 可以将 scope 理解为一个 map, 这里在 map 上封了一层 scope 的概念是为了方便内存管理
+* variable 可以用来存储不同类型的对象, paddle-mobile 里主要用它来存储矩阵 Tensor
+* tensor 代表着矩阵, 通过泛型可以用来存储不同类型的矩阵, 但需要注意的是, 存入和取出时的类型必须保持一致, 如果类型不一致,  使用 inline const T \*data() const 获取指针会不能通过类型检查, 通过  inline T \*mutable_data() 获取指针会重新分配内存, 以下是关于 Tensor 的一些小概念:
+	1. DDim: 用来存储矩阵的维度信息.
+	2. Slice(): 这个函数用来获取 N 维 (NCHW中的 N) 上切片
+	3. 当实例化未分配内存时, 调用 inline T *mutable_data() 会分配内存
--- a/doc/development_doc.md
+++ b/doc/development_doc.md
+# iOS开发文档
+## 编译
+### 一. 使用 build.sh 编译
+```sh 
+sh build.sh ios
+# 如果只想编译某个特定模型的 op, 则需执行以下命令
+sh build.sh ios googlenet
+# 在这个文件夹下, 你可以拿到生成的 .a 库
+cd ../build/release/ios/build
+```
+### 二. 使用 xcode 编译
+我们提供了 ios 开发更为熟悉的 xcode 编译环境:
+在 ios/ 目录下打开 PaddleMobile.xcworkspace 即可编译 PaddleMobile 或者 运行 Demo
+### 三. 集成
+#### 如使用 c++ 接口
+将 
+```
+libpaddle-mobile.a 
+io.h  
+program.h 
+types.h 
+lod_tensor.h 
+tensor.h
+```
+拖入工程, io.h 为接口文件, 可在 [github](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/io.h)上查看接口注释
+#### 如使用 oc 接口
+将在xcode 编译生成的
+```
+libPaddleMobile.a 
+PaddleMobile.h
+```
+拖入工程, 接口如下:
+```
+/*
+	创建单例对象
+*/
++ (instancetype)sharedInstance;
+/*
+	load 模型, 开辟内存
+*/
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+/*
+	进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
+*/
+- (NSArray *)predict:(CGImageRef)image means:(NSArray<NSNumber *> *)means scale:(float)scale;
+/*
+	进行预测
+*/
+- (NSArray *)predict:(CGImageRef)image;
+/*
+	清理内存
+*/
+- (void)clear;
+```
--- a/doc/images/devices.png
+++ b/doc/images/devices.png
--- a/doc/images/flow_chart.png
+++ b/doc/images/flow_chart.png
--- a/doc/images/model_desc.png
+++ b/doc/images/model_desc.png
--- a/doc/images/model_desc_combined.png
+++ b/doc/images/model_desc_combined.png
--- a/ios/PaddleMobile.xcworkspace/contents.xcworkspacedata
+++ b/ios/PaddleMobile.xcworkspace/contents.xcworkspacedata
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "group:PaddleMobileDemo/PaddleMobileDemo.xcodeproj">
+   </FileRef>
+   <FileRef
+      location = "group:PaddleMobile/PaddleMobile.xcodeproj">
+   </FileRef>
+</Workspace>
--- a/ios/PaddleMobile.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+++ b/ios/PaddleMobile.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>IDEDidComputeMac32BitWarning</key>
+	<true/>
+</dict>
+</plist>
--- a/ios/PaddleMobile.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
+++ b/ios/PaddleMobile.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
--- a/ios/PaddleMobile.xcworkspace/xcuserdata/liuruilong.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist
+++ b/ios/PaddleMobile.xcworkspace/xcuserdata/liuruilong.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist
+<?xml version="1.0" encoding="UTF-8"?>
+<Bucket
+   type = "0"
+   version = "2.0">
+</Bucket>
--- a/ios/PaddleMobile/PaddleMobile.xcodeproj/project.pbxproj
+++ b/ios/PaddleMobile/PaddleMobile.xcodeproj/project.pbxproj
--- a/ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+++ b/ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "self:PaddleMobile.xcodeproj">
+   </FileRef>
+</Workspace>
--- a/ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+++ b/ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>IDEDidComputeMac32BitWarning</key>
+	<true/>
+</dict>
+</plist>
--- a/ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
+++ b/ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
--- a/ios/PaddleMobile/PaddleMobile.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
+++ b/ios/PaddleMobile/PaddleMobile.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>SchemeUserState</key>
+	<dict>
+		<key>PaddleMobile.xcscheme</key>
+		<dict>
+			<key>orderHint</key>
+			<integer>1</integer>
+		</dict>
+	</dict>
+</dict>
+</plist>
--- a/ios/PaddleMobile/PaddleMobile/MacroDefine.h
+++ b/ios/PaddleMobile/PaddleMobile/MacroDefine.h
+//
+//  MacroDefine.h
+//  PaddleMobile
+//
+//  Created by liuRuiLong on 2018/6/30.
+//  Copyright © 2018年 orange. All rights reserved.
+//
+#ifndef MacroDefine_h
+#define MacroDefine_h
+#endif /* MacroDefine_h */
--- a/ios/PaddleMobile/PaddleMobile/PaddleMobile.h
+++ b/ios/PaddleMobile/PaddleMobile/PaddleMobile.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#import <CoreImage/CoreImage.h>
+#import <Foundation/Foundation.h>
+/**
+ Objective-C interface of PaddleMobile.
+ */
+@interface PaddleMobile : NSObject
+/**
+ Creates the singleton object.
+ */
++ (instancetype)sharedInstance;
+/**
+ Loads the model and allocates memory.
+ */
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+/**
+ Runs prediction. `means` and `scale` are the preprocessing parameters used
+ when the model was trained; if training did no such preprocessing, call
+ -predict: directly.
+ */
+- (NSArray *)predict:(CGImageRef)image means:(NSArray<NSNumber *> *)means scale:(float)scale;
+/**
+ Runs prediction.
+ */
+- (NSArray *)predict:(CGImageRef)image;
+/**
+ Releases memory.
+ */
+- (void)clear;
+@end
--- a/ios/PaddleMobile/PaddleMobile/PaddleMobile.m
+++ b/ios/PaddleMobile/PaddleMobile/PaddleMobile.m
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#import "PaddleMobile.h"
+// NOTE: every method below is an unimplemented placeholder. Each one calls
+// exit(0), which terminates the calling process instead of returning a value.
+@implementation PaddleMobile
+// Class method (declared with a leading "+"); placeholder, never returns.
++ (instancetype)sharedInstance{
+  //TODO: imp
+  exit(0);
+}
+// Placeholder: ignores modelPath/weighsPath and terminates the process.
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{
+  //TODO: imp
+  exit(0);
+}
+// Placeholder: ignores image/means/scale and terminates the process.
+- (NSArray *)predict:(CGImageRef)image means:(NSArray<NSNumber *> *)means scale:(float)scale{
+  //TODO: imp
+  exit(0);
+}
+// Placeholder: ignores image and terminates the process.
+- (NSArray *)predict:(CGImageRef)image{
+  //TODO: imp
+  exit(0);
+}
+// Placeholder: terminates the process.
+- (void)clear{
+  //TODO: imp
+  exit(0);
+}
+@end
--- a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.pbxproj
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.pbxproj
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 50;
+	objects = {
+/* Begin PBXBuildFile section */
+		FC086BC920E783AF00D85EF7 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = FC086BC820E783AF00D85EF7 /* AppDelegate.m */; };
+		FC086BCC20E783AF00D85EF7 /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = FC086BCB20E783AF00D85EF7 /* ViewController.m */; };
+		FC086BCF20E783AF00D85EF7 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC086BCD20E783AF00D85EF7 /* Main.storyboard */; };
+		FC086BD120E783B100D85EF7 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = FC086BD020E783B100D85EF7 /* Assets.xcassets */; };
+		FC086BD420E783B100D85EF7 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC086BD220E783B100D85EF7 /* LaunchScreen.storyboard */; };
+		FC086BD720E783B100D85EF7 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = FC086BD620E783B100D85EF7 /* main.m */; };
+/* End PBXBuildFile section */
+/* Begin PBXFileReference section */
+		FC086BC420E783AF00D85EF7 /* PaddleMobileDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = PaddleMobileDemo.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		FC086BC720E783AF00D85EF7 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
+		FC086BC820E783AF00D85EF7 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
+		FC086BCA20E783AF00D85EF7 /* ViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ViewController.h; sourceTree = "<group>"; };
+		FC086BCB20E783AF00D85EF7 /* ViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ViewController.m; sourceTree = "<group>"; };
+		FC086BCE20E783AF00D85EF7 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = "<group>"; };
+		FC086BD020E783B100D85EF7 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		FC086BD320E783B100D85EF7 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
+		FC086BD520E783B100D85EF7 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+		FC086BD620E783B100D85EF7 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+/* Begin PBXFrameworksBuildPhase section */
+		FC086BC120E783AF00D85EF7 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+/* Begin PBXGroup section */
+		FC086BBB20E783AF00D85EF7 = {
+			isa = PBXGroup;
+			children = (
+				FC086BC620E783AF00D85EF7 /* PaddleMobileDemo */,
+				FC086BC520E783AF00D85EF7 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		FC086BC520E783AF00D85EF7 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				FC086BC420E783AF00D85EF7 /* PaddleMobileDemo.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		FC086BC620E783AF00D85EF7 /* PaddleMobileDemo */ = {
+			isa = PBXGroup;
+			children = (
+				FC086BC720E783AF00D85EF7 /* AppDelegate.h */,
+				FC086BC820E783AF00D85EF7 /* AppDelegate.m */,
+				FC086BCA20E783AF00D85EF7 /* ViewController.h */,
+				FC086BCB20E783AF00D85EF7 /* ViewController.m */,
+				FC086BCD20E783AF00D85EF7 /* Main.storyboard */,
+				FC086BD020E783B100D85EF7 /* Assets.xcassets */,
+				FC086BD220E783B100D85EF7 /* LaunchScreen.storyboard */,
+				FC086BD520E783B100D85EF7 /* Info.plist */,
+				FC086BD620E783B100D85EF7 /* main.m */,
+			);
+			path = PaddleMobileDemo;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+/* Begin PBXNativeTarget section */
+		FC086BC320E783AF00D85EF7 /* PaddleMobileDemo */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = FC086BDA20E783B100D85EF7 /* Build configuration list for PBXNativeTarget "PaddleMobileDemo" */;
+			buildPhases = (
+				FC086BC020E783AF00D85EF7 /* Sources */,
+				FC086BC120E783AF00D85EF7 /* Frameworks */,
+				FC086BC220E783AF00D85EF7 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = PaddleMobileDemo;
+			productName = PaddleMobileDemo;
+			productReference = FC086BC420E783AF00D85EF7 /* PaddleMobileDemo.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+/* Begin PBXProject section */
+		FC086BBC20E783AF00D85EF7 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastUpgradeCheck = 0930;
+				ORGANIZATIONNAME = orange;
+				TargetAttributes = {
+					FC086BC320E783AF00D85EF7 = {
+						CreatedOnToolsVersion = 9.3.1;
+					};
+				};
+			};
+			buildConfigurationList = FC086BBF20E783AF00D85EF7 /* Build configuration list for PBXProject "PaddleMobileDemo" */;
+			compatibilityVersion = "Xcode 9.3";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = FC086BBB20E783AF00D85EF7;
+			productRefGroup = FC086BC520E783AF00D85EF7 /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				FC086BC320E783AF00D85EF7 /* PaddleMobileDemo */,
+			);
+		};
+/* End PBXProject section */
+/* Begin PBXResourcesBuildPhase section */
+		FC086BC220E783AF00D85EF7 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				FC086BD420E783B100D85EF7 /* LaunchScreen.storyboard in Resources */,
+				FC086BD120E783B100D85EF7 /* Assets.xcassets in Resources */,
+				FC086BCF20E783AF00D85EF7 /* Main.storyboard in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+/* Begin PBXSourcesBuildPhase section */
+		FC086BC020E783AF00D85EF7 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				FC086BCC20E783AF00D85EF7 /* ViewController.m in Sources */,
+				FC086BD720E783B100D85EF7 /* main.m in Sources */,
+				FC086BC920E783AF00D85EF7 /* AppDelegate.m in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+/* Begin PBXVariantGroup section */
+		FC086BCD20E783AF00D85EF7 /* Main.storyboard */ = {
+			isa = PBXVariantGroup;
+			children = (
+				FC086BCE20E783AF00D85EF7 /* Base */,
+			);
+			name = Main.storyboard;
+			sourceTree = "<group>";
+		};
+		FC086BD220E783B100D85EF7 /* LaunchScreen.storyboard */ = {
+			isa = PBXVariantGroup;
+			children = (
+				FC086BD320E783B100D85EF7 /* Base */,
+			);
+			name = LaunchScreen.storyboard;
+			sourceTree = "<group>";
+		};
+/* End PBXVariantGroup section */
+/* Begin XCBuildConfiguration section */
+		FC086BD820E783B100D85EF7 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu11;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 11.3;
+				MTL_ENABLE_DEBUG_INFO = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = iphoneos;
+			};
+			name = Debug;
+		};
+		FC086BD920E783B100D85EF7 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu11;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 11.3;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				SDKROOT = iphoneos;
+				VALIDATE_PRODUCT = YES;
+			};
+			name = Release;
+		};
+		FC086BDB20E783B100D85EF7 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				DEVELOPMENT_TEAM = Z5M2UUN5YV;
+				INFOPLIST_FILE = PaddleMobileDemo/Info.plist;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = orange.PaddleMobileDemo;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Debug;
+		};
+		FC086BDC20E783B100D85EF7 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				DEVELOPMENT_TEAM = Z5M2UUN5YV;
+				INFOPLIST_FILE = PaddleMobileDemo/Info.plist;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = orange.PaddleMobileDemo;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+/* Begin XCConfigurationList section */
+		FC086BBF20E783AF00D85EF7 /* Build configuration list for PBXProject "PaddleMobileDemo" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				FC086BD820E783B100D85EF7 /* Debug */,
+				FC086BD920E783B100D85EF7 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		FC086BDA20E783B100D85EF7 /* Build configuration list for PBXNativeTarget "PaddleMobileDemo" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				FC086BDB20E783B100D85EF7 /* Debug */,
+				FC086BDC20E783B100D85EF7 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = FC086BBC20E783AF00D85EF7 /* Project object */;
+}
--- a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "self:PaddleMobileDemo.xcodeproj">
+   </FileRef>
+</Workspace>
--- a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>IDEDidComputeMac32BitWarning</key>
+	<true/>
+</dict>
+</plist>
--- a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
--- a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>SchemeUserState</key>
+	<dict>
+		<key>PaddleMobileDemo.xcscheme</key>
+		<dict>
+			<key>orderHint</key>
+			<integer>0</integer>
+		</dict>
+	</dict>
+</dict>
+</plist>
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.h
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#import <UIKit/UIKit.h>
+// Application delegate of the demo app. Adopts UIApplicationDelegate and
+// keeps a strong reference to a UIWindow.
+@interface AppDelegate : UIResponder <UIApplicationDelegate>
+@property (strong, nonatomic) UIWindow *window;
+@end
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.m
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.m
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#import "AppDelegate.h"
+@interface AppDelegate ()
+@end
+@implementation AppDelegate
+- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
+    // Override point for customization after application launch.
+    return YES;
+}
+- (void)applicationWillResignActive:(UIApplication *)application {
+    // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
+    // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
+}
+- (void)applicationDidEnterBackground:(UIApplication *)application {
+    // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
+    // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
+}
+- (void)applicationWillEnterForeground:(UIApplication *)application {
+    // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
+}
+- (void)applicationDidBecomeActive:(UIApplication *)application {
+    // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
+}
+- (void)applicationWillTerminate:(UIApplication *)application {
+    // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
+}
+@end
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
+{
+  "images" : [
+    {
+      "idiom" : "iphone",
+      "size" : "20x20",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "20x20",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "29x29",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "29x29",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "40x40",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "40x40",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "60x60",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "60x60",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "20x20",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "20x20",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "29x29",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "29x29",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "40x40",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "40x40",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "76x76",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "76x76",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "83.5x83.5",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ios-marketing",
+      "size" : "1024x1024",
+      "scale" : "1x"
+    }
+  ],
+  "info" : {
+    "version" : 1,
+    "author" : "xcode"
+  }
+}
\ No newline at end of file
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/Contents.json
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/Contents.json
+{
+  "info" : {
+    "version" : 1,
+    "author" : "xcode"
+  }
+}
\ No newline at end of file
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/LaunchScreen.storyboard
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/LaunchScreen.storyboard
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="13122.16" systemVersion="17A277" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" launchScreen="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="01J-lp-oVM">
+    <dependencies>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="13104.12"/>
+        <capability name="Safe area layout guides" minToolsVersion="9.0"/>
+        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
+    </dependencies>
+    <scenes>
+        <!--View Controller-->
+        <scene sceneID="EHf-IW-A2E">
+            <objects>
+                <viewController id="01J-lp-oVM" sceneMemberID="viewController">
+                    <view key="view" contentMode="scaleToFill" id="Ze5-6b-2t3">
+                        <rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
+                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+                        <color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                        <viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
+                    </view>
+                </viewController>
+                <placeholder placeholderIdentifier="IBFirstResponder" id="iYj-Kq-Ea1" userLabel="First Responder" sceneMemberID="firstResponder"/>
+            </objects>
+            <point key="canvasLocation" x="53" y="375"/>
+        </scene>
+    </scenes>
+</document>
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/Main.storyboard
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/Main.storyboard
+<?xml version="1.0" encoding="UTF-8"?>
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="13122.16" systemVersion="17A277" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
+    <dependencies>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="13104.12"/>
+        <capability name="Safe area layout guides" minToolsVersion="9.0"/>
+        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
+    </dependencies>
+    <scenes>
+        <!--View Controller-->
+        <scene sceneID="tne-QT-ifu">
+            <objects>
+                <viewController id="BYZ-38-t0r" customClass="ViewController" customModuleProvider="" sceneMemberID="viewController">
+                    <view key="view" contentMode="scaleToFill" id="8bC-Xf-vdC">
+                        <rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
+                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+                        <color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                        <viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
+                    </view>
+                </viewController>
+                <placeholder placeholderIdentifier="IBFirstResponder" id="dkx-z0-nzr" sceneMemberID="firstResponder"/>
+            </objects>
+        </scene>
+    </scenes>
+</document>
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/Info.plist
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo/Info.plist
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>$(DEVELOPMENT_LANGUAGE)</string>
+	<key>CFBundleExecutable</key>
+	<string>$(EXECUTABLE_NAME)</string>
+	<key>CFBundleIdentifier</key>
+	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>$(PRODUCT_NAME)</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleVersion</key>
+	<string>1</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>UILaunchStoryboardName</key>
+	<string>LaunchScreen</string>
+	<key>UIMainStoryboardFile</key>
+	<string>Main</string>
+	<key>UIRequiredDeviceCapabilities</key>
+	<array>
+		<string>armv7</string>
+	</array>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+	<key>UISupportedInterfaceOrientations~ipad</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+</dict>
+</plist>
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.h
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#import <UIKit/UIKit.h>
+@interface ViewController : UIViewController
+@end
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.m
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.m
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#import "ViewController.h"
+@interface ViewController ()
+@end
+// The overrides below only forward to the superclass implementation; no
+// demo-specific behavior is added here.
+@implementation ViewController
+- (void)viewDidLoad {
+    [super viewDidLoad];
+}
+- (void)didReceiveMemoryWarning {
+    [super didReceiveMemoryWarning];
+}
+@end
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/main.m
+++ b/ios/PaddleMobileDemo/PaddleMobileDemo/main.m
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#import <UIKit/UIKit.h>
+#import "AppDelegate.h"
+// Process entry point: forwards argc/argv to UIApplicationMain and names
+// AppDelegate as the application delegate class. The call is wrapped in an
+// autorelease pool.
+int main(int argc, char * argv[]) {
+    @autoreleasepool {
+        return UIApplicationMain(argc, argv, nil, NSStringFromClass([AppDelegate class]));
+    }
+}
--- a/src/common/log.h
+++ b/src/common/log.h
@@ -174,7 +174,10 @@ struct ToLog;
 struct Print {
  friend struct ToLog;
  template <typename T>
-  Print &operator<<(T const &value) {}
+  Print &operator<<(T const &value) {
+    // No-op sink: return *this, not a local, so chained << stays valid.
+    return *this;
+  }
 private:
 };

--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -40,6 +40,8 @@ const std::string G_OP_TYPE_SPLIT = "split";
 const std::string G_OP_TYPE_FEED = "feed";
 const std::string G_OP_TYPE_FETCH = "fetch";
 const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
+const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
+const std::string G_OP_TYPE_DROPOUT = "dropout";
 std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
@@ -66,6 +68,8 @@ std::unordered_map<
        {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
        {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
        {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
-        {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}};
+        {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}};
 }  // namespace paddle_mobile
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -96,6 +96,8 @@ extern const std::string G_OP_TYPE_SPLIT;
 extern const std::string G_OP_TYPE_FEED;
 extern const std::string G_OP_TYPE_FETCH;
 extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
+extern const std::string G_OP_TYPE_IM2SEQUENCE;
+extern const std::string G_OP_TYPE_DROPOUT;
 extern std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>

--- a/src/common/variant.h
+++ b/src/common/variant.h
@@ -83,6 +83,7 @@ struct Variant {
      return *const_cast<T *>(reinterpret_cast<const T *>(&data));
    } else {
      PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant ");
+      exit(0);
    }
  }

--- a/src/framework/attribute.h
+++ b/src/framework/attribute.h
@@ -129,6 +129,7 @@ class Attribute {
      return vistor(attr.variant_.Get<int64_t>());
    } else {
      PADDLE_MOBILE_THROW_EXCEPTION("type not support");
+      exit(0);
    }
  }

--- a/src/framework/data_layout.h
+++ b/src/framework/data_layout.h
@@ -40,6 +40,7 @@ inline DataLayout StringToDataLayout(const std::string &str) {
    return DataLayout::kAnyLayout;
  } else {
    PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
+    exit(0);
  }
 }
@@ -52,6 +53,8 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) {
    case DataLayout::kAnyLayout:
      return "ANY_LAYOUT";
    default:
+      PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ")
+      exit(0);
      break;
  }
 }

--- a/src/framework/ddim.h
+++ b/src/framework/ddim.h
@@ -58,7 +58,8 @@ struct DDim {
    } else if (d.var.TypeId() == typeid(Dim<9>).hash_code()) {
      return vistor(d.var.Get<Dim<9>>());
    } else {
-      DLOG << " dim not support";
+      PADDLE_MOBILE_ENFORCE(false, " dim not support");
+      exit(0);
    }
  }

--- a/src/framework/dim.h
+++ b/src/framework/dim.h
@@ -129,6 +129,7 @@ int64_t &indexer(Dim<D> &dim, int idx) {
 template <>
 int64_t &indexer<0>(Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
+  exit(1);  // non-zero status: an invalid index must not report success
 }
 template <int D>
@@ -145,6 +146,7 @@ int64_t indexer(const Dim<D> &dim, int idx) {
 template <>
 int64_t indexer<0>(const Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
+  exit(1);  // non-zero status: an invalid index must not report success
 }
 }  // namespace

--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "io/io.h"
+#include "io/executor.h"
 #include <algorithm>
 #include <vector>
 #include "common/enforce.h"
@@ -39,7 +39,7 @@ char *Get_binary_data(std::string filename) {
  PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
                        filename.c_str());
  fseek(file, 0, SEEK_END);
-  long size = ftell(file);
+  int64_t size = ftell(file);
  PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
  rewind(file);
  char *data = new char[size];
@@ -50,116 +50,6 @@ char *Get_binary_data(std::string filename) {
  return data;
 }
-static size_t ReadBuffer(const char *file_name, uint8_t **out) {
-  printf("%s \n", file_name);
-  FILE *fp;
-  fp = fopen(file_name, "rb");
-  PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name);
-  fseek(fp, 0, SEEK_END);
-  size_t size = ftell(fp);
-  rewind(fp);
-  DLOG << "model size: " << size;
-  *out = reinterpret_cast<uint8_t *>(malloc(size));
-  size_t cur_len = 0;
-  size_t nread;
-  while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
-    cur_len += nread;
-  }
-  fclose(fp);
-  return cur_len;
-}
-template <typename Dtype, Precision P>
-const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
-    const std::string &dirname, bool optimize, bool can_add_split) {
-  auto program =
-      this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
-  program.model_path = dirname;
-  return program;
-}
-template <typename Dtype, Precision P>
-const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
-    const std::string &model_path, const std::string &para_path,
-    bool optimize) {
-  auto program = this->LoadProgram(model_path, optimize);
-  program.para_path = para_path;
-  program.combined = true;
-  return program;
-}
-template <typename Dtype, Precision P>
-const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
-    const std::string &model_path, bool optimize, bool can_add_split) {
-  std::string model_filename = model_path;
-  PaddleMobile__Framework__Proto__ProgramDesc *c_program;
-  uint8_t *buf = NULL;
-  size_t read_size = ReadBuffer(model_filename.c_str(), &buf);
-  PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null");
-  c_program = paddle_mobile__framework__proto__program_desc__unpack(
-      NULL, read_size, buf);
-  //
-  PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null");
-  //
-  DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
-  //
-  auto originProgramDesc = std::make_shared<framework::ProgramDesc>(c_program);
-  framework::Program<Dtype, P> program;
-  program.originProgram = originProgramDesc;
-  auto scope = std::make_shared<framework::Scope>();
-  program.scope = scope;
-  for (const auto &block : originProgramDesc->Blocks()) {
-    for (auto var_desc : block->Vars()) {
-      auto var = scope->Var(var_desc->Name());
-      if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
-        if (var_desc->Persistable() &&
-            var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH &&
-            var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) {
-          auto dim = var_desc->Tensor_desc().Dims();
-          auto tensor = var->GetMutable<framework::LoDTensor>();
-          tensor->Resize(framework::make_ddim(dim));
-        } else {
-          auto dim = var_desc->Tensor_desc().Dims();
-          PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
-          dim[0] = 1;
-          auto tensor = var->GetMutable<framework::LoDTensor>();
-          tensor->Resize(framework::make_ddim(dim));
-        }
-      } else {
-        // TODO(codeWorm): some.
-      }
-    }
-  }
-  if (optimize) {
-    framework::ProgramOptimize program_optimize;
-    program.optimizeProgram =
-        program_optimize.FusionOptimize(originProgramDesc, can_add_split);
-  }
-  if (optimize) {
-    program.optimizeProgram->Description("optimize: ");
-  } else {
-    originProgramDesc->Description("program: ");
-  }
-  paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
-  return program;
-}
-template class Loader<CPU, Precision::FP32>;
-template class Loader<FPGA, Precision::FP32>;
-template class Loader<GPU_MALI, Precision::FP32>;
 #pragma mark - executor
 template <typename Dtype, Precision P>
 Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
@@ -209,30 +99,30 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
 template <typename Dtype, Precision P>
 void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
-                                    framework::LoDTensor *tensor, char *&data) {
+                                    framework::LoDTensor *tensor, char **data) {
  // 1. version
-  uint32_t version = *(uint32_t *)data;
+  uint32_t version = *reinterpret_cast<uint32_t *>(*data);
-  data += sizeof(uint32_t);
+  (*data) += sizeof(uint32_t);
  // 2 Lod information
  uint64_t *lod_level_ptr = new uint64_t();
-  memcpy(lod_level_ptr, data, sizeof(uint64_t));
+  memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
  uint64_t lod_level = *lod_level_ptr;
  delete lod_level_ptr;
-  data += sizeof(uint64_t);
+  (*data) += sizeof(uint64_t);
  auto &lod = *tensor->mutable_lod();
  lod.resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
-    uint64_t size = *(uint64_t *)data;
+    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
-    data += sizeof(uint64_t);
+    (*data) += sizeof(uint64_t);
    DLOG << "lod size: " << i << size;
    std::vector<size_t> tmp(size / sizeof(size_t));
    for (int k = 0; k < tmp.size(); ++k) {
-      tmp[k] = *(size_t *)data;
+      tmp[k] = *reinterpret_cast<size_t *>(*data);
-      DLOG << "tmp[k]: " << k << *(size_t *)data;
+      (*data) += sizeof(size_t);
-      data += sizeof(size_t);
    }
    for (auto j : tmp) {
@@ -242,18 +132,18 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
  }
  // 3. tensor version
-  uint32_t tensor_version = *(uint32_t *)data;
+  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
-  data += sizeof(uint32_t);
+  (*data) += sizeof(uint32_t);
  // 4. tensor desc
-  int32_t size = *(int32_t *)data;
+  int32_t size = *reinterpret_cast<int32_t *>(*data);
-  data += sizeof(int32_t);
+  (*data) += sizeof(int32_t);
  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
-    buf.get()[m] = data[m];
+    buf.get()[m] = (*data)[m];
  }
-  data += (sizeof(char) * size);
+  (*data) += (sizeof(char) * size);
  const framework::TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
@@ -290,9 +180,9 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
  }
  for (int n = 0; n < memory_size * type_size; ++n) {
-    static_cast<char *>(memory)[n] = data[n];
+    static_cast<char *>(memory)[n] = (*data)[n];
  }
-  data += (sizeof(char) * memory_size * type_size);
+  (*data) += (sizeof(char) * memory_size * type_size);
 }
 template <typename Dtype, Precision P>
@@ -309,7 +199,7 @@ void Executor<Dtype, P>::InitMemory() {
        char *origin_data =
            Get_binary_data(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
-        LoadMemory(*var_desc, tensor, data);
+        LoadMemory(*var_desc, tensor, &data);
        delete origin_data;
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
@@ -335,7 +225,7 @@ void Executor<Dtype, P>::InitCombineMemory() {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
-        LoadMemory(*var_desc, tensor, data);
+        LoadMemory(*var_desc, tensor, &data);
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          auto tensor = var->template GetMutable<framework::LoDTensor>();
@@ -442,7 +332,8 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
                                                   *(program_.scope));
 #ifdef PADDLE_MOBILE_PROFILE
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
-  // TODO expose profile info as an interface, user can get them to analysis
+  // TODO(haipeng): expose profile info as an interface so that users can
+  // analyze
  //      the performance of their deepnet.
  FILE *df = fopen("net.dot", "w");
  fprintf(df, "digraph {\n");
@@ -480,8 +371,9 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
-    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), (float)p.second,
+    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
-           (float)p.second / _ptotal * 100.0);
+           static_cast<float>(p.second),
+           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
 #endif

--- a/src/io/io.h
+++ b/src/io/io.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
 #include "common/types.h"
 #include "framework/lod_tensor.h"
 #include "framework/operator.h"
@@ -32,31 +33,6 @@ limitations under the License. */
 namespace paddle_mobile {
-template <typename Dtype = CPU, Precision P = Precision::FP32>
-class Loader {
- public:
-  /*
-   * @b load separate format fluid model
-   * @b 加载分开形式的 fluid 模型
-   * */
-  const framework::Program<Dtype, P> Load(const std::string &dirname,
-                                          bool optimize = false,
-                                          bool can_add_split = false);
-  /*
-   * @b load combine format fluid mode
-   * @b 加载结合在一起格式的模型
-   * */
-  const framework::Program<Dtype, P> Load(const std::string &model_path,
-                                          const std::string &para_path,
-                                          bool optimize = false);
- private:
-  const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
-                                                 bool optimize = false,
-                                                 bool can_add_split = false);
-};
 template <typename Dtype = CPU, Precision P = Precision::FP32>
 class Executor {
 public:
@@ -86,7 +62,7 @@ class Executor {
  Executor() = default;
  void InitMemory();
  void LoadMemory(const framework::VarDesc var_desc,
-                  framework::LoDTensor *tensor, char *&data);
+                  framework::LoDTensor *tensor, char **data);
  void InitCombineMemory();
  framework::Program<Dtype> program_;
  int batch_size_ = 1;

--- a/src/io/loader.cpp
+++ b/src/io/loader.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "io/loader.h"
+#include "framework/lod_tensor.h"
+#include "framework/program/program-optimize/program_optimize.h"
+namespace paddle_mobile {
+using framework::Variable;
+/**
+ * Reads the whole file `file_name` into a freshly malloc'ed buffer.
+ *
+ * @param file_name path of the file to read (opened in binary mode).
+ * @param out       receives the buffer; the caller releases it with free().
+ *                  Left as NULL when the file cannot be opened, measured or
+ *                  when the allocation fails.
+ * @return number of bytes actually read into *out, 0 on failure.
+ */
+static size_t ReadBuffer(const char *file_name, uint8_t **out) {
+  // Give the caller a well-defined value on every failure path.
+  *out = NULL;
+  printf("%s \n", file_name);
+  FILE *fp = fopen(file_name, "rb");
+  PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name);
+  if (fp == NULL) {
+    // Only reached when the enforce macro above does not stop execution.
+    return 0;
+  }
+  if (fseek(fp, 0, SEEK_END) != 0) {
+    fclose(fp);
+    return 0;
+  }
+  // ftell() signals failure with -1; test the signed value before it is
+  // converted to size_t, where it would turn into a huge allocation size.
+  const auto file_len = ftell(fp);
+  if (file_len < 0) {
+    fclose(fp);
+    return 0;
+  }
+  rewind(fp);
+  const size_t size = static_cast<size_t>(file_len);
+  DLOG << "model size: " << size;
+  *out = reinterpret_cast<uint8_t *>(malloc(size));
+  if (*out == NULL) {
+    fclose(fp);
+    return 0;
+  }
+  // fread() may deliver fewer bytes than requested, so keep reading until it
+  // reports that nothing more was read.
+  size_t cur_len = 0;
+  size_t nread;
+  while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
+    cur_len += nread;
+  }
+  fclose(fp);
+  return cur_len;
+}
+/**
+ * Loads a model stored in separate-files format: the program description is
+ * read from `<dirname>/__model__` and `dirname` is recorded on the returned
+ * program as its model path.
+ */
+template <typename Dtype, Precision P>
+const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
+    const std::string &dirname, bool optimize, bool can_add_split) {
+  const std::string model_file = dirname + "/__model__";
+  auto loaded = this->LoadProgram(model_file, optimize, can_add_split);
+  loaded.model_path = dirname;
+  return loaded;
+}
+/**
+ * Loads a model stored in combined format: the program description comes
+ * from `model_path`, while `para_path` is only recorded on the returned
+ * program, which is also flagged as `combined`.
+ */
+template <typename Dtype, Precision P>
+const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
+    const std::string &model_path, const std::string &para_path,
+    bool optimize) {
+  auto loaded = this->LoadProgram(model_path, optimize);
+  loaded.combined = true;
+  loaded.para_path = para_path;
+  return loaded;
+}
+/**
+ * Reads the serialized program description at `model_path`, unpacks it and
+ * builds a Program whose scope holds one variable per described variable.
+ *
+ * LoD-tensor variables are resized up front: variables that keep their
+ * declared shape use the dims from the description, every other LoD tensor
+ * gets its first dimension forced to 1.
+ *
+ * @param model_path    file containing the serialized program description.
+ * @param optimize      when true, run FusionOptimize and store the result in
+ *                      `optimizeProgram`.
+ * @param can_add_split forwarded to FusionOptimize.
+ * @return the populated program.
+ */
+template <typename Dtype, Precision P>
+const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
+    const std::string &model_path, bool optimize, bool can_add_split) {
+  uint8_t *buf = NULL;
+  size_t read_size = ReadBuffer(model_path.c_str(), &buf);
+  PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null");
+  PaddleMobile__Framework__Proto__ProgramDesc *c_program =
+      paddle_mobile__framework__proto__program_desc__unpack(NULL, read_size,
+                                                            buf);
+  // The raw file buffer was previously never released. protobuf-c unpacking
+  // copies field data into storage owned by the message, so the buffer can
+  // be returned as soon as unpacking is done (free(NULL) is a no-op).
+  free(buf);
+  buf = NULL;
+  PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null");
+  DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
+  auto originProgramDesc = std::make_shared<framework::ProgramDesc>(c_program);
+  framework::Program<Dtype, P> program;
+  program.originProgram = originProgramDesc;
+  auto scope = std::make_shared<framework::Scope>();
+  program.scope = scope;
+  for (const auto &block : originProgramDesc->Blocks()) {
+    for (auto var_desc : block->Vars()) {
+      // Every described variable is created in the scope, whatever its type.
+      auto var = scope->Var(var_desc->Name());
+      if (var_desc->Type() != framework::VARTYPE_TYPE_LOD_TENSOR) {
+        // TODO(codeWorm): some.
+        continue;
+      }
+      const bool keeps_declared_shape =
+          var_desc->Persistable() &&
+          var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH &&
+          var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST;
+      auto dim = var_desc->Tensor_desc().Dims();
+      if (!keeps_declared_shape) {
+        // Non-persistable tensors are sized with a leading dimension of 1.
+        PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
+        dim[0] = 1;
+      }
+      auto tensor = var->GetMutable<framework::LoDTensor>();
+      tensor->Resize(framework::make_ddim(dim));
+    }
+  }
+  if (optimize) {
+    framework::ProgramOptimize program_optimize;
+    program.optimizeProgram =
+        program_optimize.FusionOptimize(originProgramDesc, can_add_split);
+    program.optimizeProgram->Description("optimize: ");
+  } else {
+    originProgramDesc->Description("program: ");
+  }
+  paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL);
+  return program;
+}
+// Explicit instantiations: the member definitions live in this .cpp file, so
+// Loader is instantiated here for the CPU, FPGA and GPU_MALI device types
+// with FP32 precision.
+template class Loader<CPU, Precision::FP32>;
+template class Loader<FPGA, Precision::FP32>;
+template class Loader<GPU_MALI, Precision::FP32>;
+}  // namespace paddle_mobile
--- a/src/io/loader.h
+++ b/src/io/loader.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <string>
+#include "common/types.h"
+#include "framework/program/program.h"
+namespace paddle_mobile {
+/*
+ * Turns fluid model files into framework::Program objects.
+ * Both public overloads delegate to the private LoadProgram().
+ */
+template <typename Dtype = CPU, Precision P = Precision::FP32>
+class Loader {
+ public:
+  /*
+   * @b load a fluid model stored in separate-files format
+   *
+   * dirname:       directory that contains the model files
+   * optimize:      forwarded to LoadProgram()
+   * can_add_split: forwarded to LoadProgram()
+   * */
+  const framework::Program<Dtype, P> Load(const std::string &dirname,
+                                          bool optimize = false,
+                                          bool can_add_split = false);
+  /*
+   * @b load a fluid model stored in combined format
+   *
+   * model_path: file holding the program description
+   * para_path:  parameter file path, recorded on the returned program
+   * optimize:   forwarded to LoadProgram()
+   * */
+  const framework::Program<Dtype, P> Load(const std::string &model_path,
+                                          const std::string &para_path,
+                                          bool optimize = false);
+ private:
+  // Shared implementation behind both Load() overloads.
+  const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
+                                                 bool optimize = false,
+                                                 bool can_add_split = false);
+};
+}  // namespace paddle_mobile
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+//
+// Created by liuRuiLong on 2018/7/2.
+//
+#include "io/paddle_mobile.h"
+namespace paddle_mobile {
+/**
+ * Loads a separate-files model from `dirname`.
+ *
+ * The loader and the executor are each created only if they do not exist
+ * yet; an already existing one is kept and merely logged. Always returns
+ * true.
+ */
+template <typename Dtype, Precision P>
+bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
+                                  int batch_size) {
+  if (loader_) {
+    LOG(kLOG_INFO) << "loader inited";
+  } else {
+    loader_ = std::make_shared<Loader<Dtype, P>>();
+  }
+  if (executor_) {
+    LOG(kLOG_INFO) << "executor inited";
+  } else {
+    executor_ = std::make_shared<Executor<Dtype, P>>(
+        loader_->Load(dirname, optimize), batch_size, optimize);
+  }
+  return true;
+}
+/**
+ * Loads a combined-format model from `model_path` and `para_path`.
+ *
+ * The loader and the executor are each created only if they do not exist
+ * yet; an already existing one is kept and merely logged. Always returns
+ * true.
+ */
+template <typename Dtype, Precision P>
+bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
+                                  const std::string &para_path, bool optimize,
+                                  int batch_size) {
+  if (loader_) {
+    LOG(kLOG_INFO) << "loader inited";
+  } else {
+    loader_ = std::make_shared<Loader<Dtype, P>>();
+  }
+  if (executor_) {
+    LOG(kLOG_INFO) << "executor inited";
+  } else {
+    executor_ = std::make_shared<Executor<Dtype, P>>(
+        loader_->Load(model_path, para_path, optimize), batch_size, optimize);
+  }
+  return true;
+}
+/**
+ * Forwards the input tensor to the executor and hands back its result.
+ * The executor pointer is dereferenced without a null check.
+ */
+template <typename Dtype, Precision P>
+std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::Predict(
+    const framework::Tensor &t) {
+  auto prediction = executor_->Predict(t);
+  return prediction;
+}
+/**
+ * Forwards a flat input vector plus its dimensions to the executor and
+ * hands back its result. The executor pointer is dereferenced without a
+ * null check.
+ */
+template <typename Dtype, Precision P>
+std::vector<typename PaddleMobile<Dtype, P>::Ptype>
+PaddleMobile<Dtype, P>::Predict(const std::vector<Ptype> &input,
+                                const std::vector<int64_t> &dims) {
+  auto prediction = executor_->Predict(input, dims);
+  return prediction;
+}
+/**
+ * Drops this object's references to the executor and the loader, so a
+ * following Load() creates both again.
+ */
+template <typename Dtype, Precision P>
+void PaddleMobile<Dtype, P>::Clear() {
+  executor_.reset();
+  loader_.reset();
+}
+// Explicit instantiations: the member definitions live in this .cpp file, so
+// PaddleMobile is instantiated here for the CPU, FPGA and GPU_MALI device
+// types with FP32 precision.
+template class PaddleMobile<CPU, Precision::FP32>;
+template class PaddleMobile<FPGA, Precision::FP32>;
+template class PaddleMobile<GPU_MALI, Precision::FP32>;
+}  // namespace paddle_mobile
--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "common/types.h"
+#include "framework/tensor.h"
+#include "io/executor.h"
+#include "io/loader.h"
+namespace paddle_mobile {
+/*
+ * Facade that bundles a Loader and an Executor behind one object.
+ */
+template <typename Dtype = CPU, Precision P = Precision::FP32>
+class PaddleMobile {
+  // Element type that corresponds to precision P.
+  typedef typename PrecisionTrait<P>::ptype Ptype;
+ public:
+  PaddleMobile() {}
+  /*
+   * @b load a fluid model stored in separate-files format
+   *
+   * dirname:    directory that contains the model files
+   * optimize:   passed to the loader and to the executor
+   * batch_size: passed to the executor
+   * */
+  bool Load(const std::string &dirname, bool optimize = false,
+            int batch_size = 1);
+  /*
+   * @b load a fluid model stored in combined format
+   *
+   * model_path: file holding the program description
+   * para_path:  file holding the parameters
+   * optimize:   passed to the loader and to the executor
+   * batch_size: passed to the executor
+   * */
+  bool Load(const std::string &model_path, const std::string &para_path,
+            bool optimize = false, int batch_size = 1);
+  /*
+   * @b run a prediction on an input tensor
+   * */
+  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
+  /*
+   * @b run a prediction on a flat input vector
+   *
+   * input: the input values
+   * dims:  the dimensions of the input
+   * */
+  std::vector<Ptype> Predict(const std::vector<Ptype> &input,
+                             const std::vector<int64_t> &dims);
+  // Releases the loader and the executor.
+  void Clear();
+ private:
+  std::shared_ptr<Loader<Dtype, P>> loader_;
+  std::shared_ptr<Executor<Dtype, P>> executor_;
+};
+}  // namespace paddle_mobile
--- a/src/jni/paddle_mobile_jni.cpp
+++ b/src/jni/paddle_mobile_jni.cpp
@@ -15,6 +15,10 @@ limitations under the License. */
 #ifdef ANDROID
 #include "paddle_mobile_jni.h"
+#include "common/log.h"
+#include "framework/tensor.h"
+#include "io/paddle_mobile.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -28,17 +32,16 @@ using std::string;
 extern const char *ANDROID_LOG_TAG =
    "paddle_mobile LOG built on " __DATE__ " " __TIME__;
-static Executor<CPU> *shared_executor_instance = nullptr;
+static PaddleMobile<CPU> *shared_paddle_mobile_instance = nullptr;
 // toDo mutex lock
 // static std::mutex shared_mutex;
-Executor<CPU> *getExecutorInstance(const Program<CPU> p, int batch_size,
+PaddleMobile<CPU> *getPaddleMobileInstance() {
-                                   bool use_optimize) {
+  if (nullptr == shared_paddle_mobile_instance) {
-  if (nullptr == shared_executor_instance) {
+    shared_paddle_mobile_instance = new PaddleMobile<CPU>();
-    shared_executor_instance = new Executor<CPU>(p, batch_size, use_optimize);
  }
-  return shared_executor_instance;
+  return shared_paddle_mobile_instance;
 }
 string jstring2cppstring(JNIEnv *env, jstring jstr) {
@@ -51,11 +54,9 @@ string jstring2cppstring(JNIEnv *env, jstring jstr) {
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
                                                          jclass thiz,
                                                          jstring modelPath) {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  bool optimize = true;
-  auto program = loader.Load(jstring2cppstring(env, modelPath), optimize);
+  return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
-  shared_executor_instance = getExecutorInstance(program, 1, optimize);
+                                         optimize);
-  return shared_executor_instance != nullptr ? JNI_TRUE : JNI_FALSE;
 }
 JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
@@ -73,7 +74,7 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
  for (int i = 0; i < framework::product(ddim); i++) {
    input_ptr[i] = dataPointer[i];
  }
-  auto output = shared_executor_instance->Predict(input);
+  auto output = shared_paddle_mobile_instance->Predict(input);
  count = output->numel();
  result = env->NewFloatArray(count);
  env->SetFloatArrayRegion(result, 0, count, output->data<float>());
@@ -81,7 +82,9 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
 }
 JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env,
-                                                       jclass thiz) {}
+                                                       jclass thiz) {
+  getPaddleMobileInstance()->Clear();
+}
 }  // namespace jni
 }  // namespace paddle_mobile

--- a/src/jni/paddle_mobile_jni.h
+++ b/src/jni/paddle_mobile_jni.h
@@ -15,9 +15,6 @@ limitations under the License. */
 #pragma once
 #ifdef ANDROID
 #include <jni.h>
-#include "common/log.h"
-#include "framework/tensor.h"
-#include "io/io.h"
 #ifdef __cplusplus
 extern "C" {

--- a/src/operators/dropout_op.cpp
+++ b/src/operators/dropout_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef DROPOUT_OP
+#include "operators/dropout_op.h"
+namespace paddle_mobile {
+namespace operators {
+/**
+ * Shape inference for dropout: the output is resized to the dims of input X.
+ */
+template <typename Dtype, typename T>
+void DropoutOp<Dtype, T>::InferShape() const {
+  auto x_dims = this->param_.InputX()->dims();
+  auto out = this->param_.Out();
+  out->Resize(x_dims);
+}
+// Explicit instantiation for the CPU / float combination.
+template class DropoutOp<CPU, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+// Short alias used by the registration macro below.
+namespace ops = paddle_mobile::operators;
+// CPU builds: register the dropout operator through the project's macros.
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(dropout);
+REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp);
+#endif
+// Nothing is registered for Mali GPU builds.
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+// Nothing is registered for FPGA builds.
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+#endif
--- a/src/operators/dropout_op.h
+++ b/src/operators/dropout_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef DROPOUT_OP
+#pragma once
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/dropout_kernel.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+using paddle_mobile::framework::Tensor;
+/**
+ * Dropout operator: an OperatorWithKernel specialised with DropoutParam and
+ * DropoutKernel. Only shape inference is declared here; the computation is
+ * supplied by the kernel type.
+ */
+template <typename DeviceType, typename T>
+class DropoutOp
+    : public framework::OperatorWithKernel<
+          DeviceType, DropoutParam, operators::DropoutKernel<DeviceType, T>> {
+ public:
+  // All arguments are forwarded unchanged to the OperatorWithKernel base.
+  // `attrs` is taken by const reference (as Im2SequenceOp does) so that the
+  // attribute map is not copied for every operator that gets constructed.
+  DropoutOp(const std::string &type, const VariableNameMap &inputs,
+            const VariableNameMap &outputs,
+            const framework::AttributeMap &attrs,
+            std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, DropoutParam,
+                                      operators::DropoutKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+  // Resizes the output according to the input; defined in dropout_op.cpp.
+  void InferShape() const override;
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/im2sequence_op.cpp
+++ b/src/operators/im2sequence_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef IM2SEQUENCE_OP
+#include "operators/im2sequence_op.h"
+namespace paddle_mobile {
+namespace operators {
+/**
+ * Number of window positions along one axis.
+ *
+ * Computes 1 + (padding_1 + padding_2 + input_size - kernel + stride - 1) /
+ * stride with integer division.
+ */
+int Im2SequenceOutputSize(int input_size, int kernel, int padding_1,
+                          int padding_2, int stride) {
+  const int padded_extent = padding_1 + padding_2 + input_size;
+  return 1 + (padded_extent - kernel + stride - 1) / stride;
+}
+/**
+ * Shape inference for im2sequence.
+ *
+ * The output keeps the first two input dims and appends one computed size
+ * per entry of `strides`, each obtained from Im2SequenceOutputSize().
+ */
+template <typename Dtype, typename T>
+void Im2SequenceOp<Dtype, T>::InferShape() const {
+  auto in_x_dims = this->param_.Input()->dims();
+  const std::vector<int> &kernels = this->param_.Kernels();
+  const std::vector<int> &strides = this->param_.Strides();
+  // Bound by const reference like kernels/strides; the values are only read,
+  // so copying the vector was unnecessary.
+  const std::vector<int> &paddings = this->param_.Paddings();
+  std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+  output_shape.reserve(strides.size() + 2);
+  for (size_t i = 0; i < strides.size(); ++i) {
+    // NOTE(review): reads paddings[i] and paddings[i + 2], so paddings is
+    // assumed to hold at least strides.size() + 2 entries - confirm against
+    // the op definition.
+    output_shape.push_back(Im2SequenceOutputSize(in_x_dims[i + 2], kernels[i],
+                                                 paddings[i], paddings[i + 2],
+                                                 strides[i]));
+  }
+  framework::DDim ddim = framework::make_ddim(output_shape);
+  this->param_.Output()->Resize(ddim);
+}
+// Explicit instantiation for the CPU / float combination.
+template class Im2SequenceOp<CPU, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+// Short alias used by the registration macro below.
+namespace ops = paddle_mobile::operators;
+// CPU builds: register the im2sequence operator through the project's macros.
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(im2sequence);
+REGISTER_OPERATOR_CPU(im2sequence, ops::Im2SequenceOp);
+#endif
+// Nothing is registered for Mali GPU builds.
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+// Nothing is registered for FPGA builds.
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+#endif
--- a/src/operators/im2sequence_op.h
+++ b/src/operators/im2sequence_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef IM2SEQUENCE_OP
+#pragma once
+#include <operators/op_param.h>
+#include "framework/operator.h"
+#include "operators/kernel/im2sequence_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+using namespace framework;
+/**
+ * Im2sequence operator: an OperatorWithKernel specialised with
+ * Im2SequenceParam and Im2SequenceKernel. Only shape inference is declared
+ * here; the computation is supplied by the kernel type.
+ */
+template <typename DeviceType, typename T>
+class Im2SequenceOp : public framework::OperatorWithKernel<
+                          DeviceType, Im2SequenceParam,
+                          operators::Im2SequenceKernel<DeviceType, T>> {
+  // Shorthand for the direct base class, used by the constructor below.
+  using KernelOperator = framework::OperatorWithKernel<
+      DeviceType, Im2SequenceParam,
+      operators::Im2SequenceKernel<DeviceType, T>>;
+ public:
+  // All arguments are forwarded unchanged to the base class.
+  Im2SequenceOp(const std::string &type, const VariableNameMap &inputs,
+                const VariableNameMap &outputs,
+                const framework::AttributeMap &attrs,
+                std::shared_ptr<framework::Scope> scope)
+      : KernelOperator(type, inputs, outputs, attrs, scope) {}
+  // Resizes the output tensor; defined in im2sequence_op.cpp.
+  void InferShape() const override;
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/arm/box_coder_kernel.cpp
+++ b/src/operators/kernel/arm/box_coder_kernel.cpp
@@ -15,130 +15,21 @@ limitations under the License. */
 #ifdef BOXCODER_OP
 #include "operators/kernel/box_coder_kernel.h"
-#include <cmath>
+#include "operators/kernel/central-arm-func/box_coder_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
-template <typename T>
-void EncodeCenterSize(const framework::Tensor& target_box,
-                      const framework::Tensor& prior_box,
-                      const framework::Tensor& prior_box_var, T* output) {
-  int64_t row = target_box.dims()[0];
-  int64_t col = prior_box.dims()[0];
-  int64_t len = prior_box.dims()[1];
-  auto* target_box_data = target_box.data<T>();
-  auto* prior_box_data = prior_box.data<T>();
-  auto* prior_box_var_data = prior_box_var.data<T>();
-  for (int64_t i = 0; i < row; ++i) {
-    for (int64_t j = 0; j < col; ++j) {
-      T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len];
-      T prior_box_height =
-          prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
-      T prior_box_center_x =
-          (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
-      T prior_box_center_y =
-          (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
-      T target_box_center_x =
-          (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
-      T target_box_center_y =
-          (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
-      T target_box_width =
-          target_box_data[i * len + 2] - target_box_data[i * len];
-      T target_box_height =
-          target_box_data[i * len + 3] - target_box_data[i * len + 1];
-      size_t offset = i * col * len + j * len;
-      output[offset] = (target_box_center_x - prior_box_center_x) /
-                       prior_box_width / prior_box_var_data[j * len];
-      output[offset + 1] = (target_box_center_y - prior_box_center_y) /
-                           prior_box_height / prior_box_var_data[j * len + 1];
-      output[offset + 2] =
-          std::log(std::fabs(target_box_width / prior_box_width)) /
-          prior_box_var_data[j * len + 2];
-      output[offset + 3] =
-          std::log(std::fabs(target_box_height / prior_box_height)) /
-          prior_box_var_data[j * len + 3];
-    }
-  }
-}
-template <typename T>
-void DecodeCenterSize(const framework::Tensor& target_box,
-                      const framework::Tensor& prior_box,
-                      const framework::Tensor& prior_box_var, T* output) {
-  int64_t row = target_box.dims()[0];
-  int64_t col = prior_box.dims()[0];
-  int64_t len = prior_box.dims()[1];
-  auto* target_box_data = target_box.data<T>();
-  auto* prior_box_data = prior_box.data<T>();
-  auto* prior_box_var_data = prior_box_var.data<T>();
-  for (int64_t i = 0; i < row; ++i) {
-    for (int64_t j = 0; j < col; ++j) {
-      size_t offset = i * col * len + j * len;
-      T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len];
-      T prior_box_height =
-          prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
-      T prior_box_center_x =
-          (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
-      T prior_box_center_y =
-          (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
-      T target_box_center_x = prior_box_var_data[j * len] *
-                                  target_box_data[offset] * prior_box_width +
-                              prior_box_center_x;
-      T target_box_center_y = prior_box_var_data[j * len + 1] *
-                                  target_box_data[offset + 1] *
-                                  prior_box_height +
-                              prior_box_center_y;
-      T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
-                                    target_box_data[offset + 2]) *
-                           prior_box_width;
-      T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
-                                     target_box_data[offset + 3]) *
-                            prior_box_height;
-      output[offset] = target_box_center_x - target_box_width / 2;
-      output[offset + 1] = target_box_center_y - target_box_height / 2;
-      output[offset + 2] = target_box_center_x + target_box_width / 2;
-      output[offset + 3] = target_box_center_y + target_box_height / 2;
-    }
-  }
-}
 template <>
-bool BoxCoderKernel<CPU, float>::Init(BoxCoderParam* param) {
+bool BoxCoderKernel<CPU, float>::Init(BoxCoderParam *param) {
  return true;
 }
 template <>
-void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam& param) const {
+void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam &param) const {
-  const auto* input_priorbox = param.InputPriorBox();
+  BoxCoderCompute<float>(param);
-  const auto* input_priorboxvar = param.InputPriorBoxVar();
-  const auto* input_targetbox = param.InputTargetBox();
-  const auto& code_type = param.CodeType();
-  auto row = input_targetbox->dims()[0];
-  auto col = input_priorbox->dims()[0];
-  auto len = input_priorbox->dims()[1];
-  Tensor* output_box = param.OutputBox();
-  auto* output_box_dataptr = output_box->mutable_data<float>({row, col, len});
-  if (code_type == "encode_center_size") {
-    EncodeCenterSize<float>(*input_targetbox, *input_priorbox,
-                            *input_priorboxvar, output_box_dataptr);
-  }
-  if (code_type == "decode_center_size") {
-    DecodeCenterSize<float>(*input_targetbox, *input_priorbox,
-                            *input_priorboxvar, output_box_dataptr);
-  }
 }
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/arm/concat_kernel.cpp
+++ b/src/operators/kernel/arm/concat_kernel.cpp
@@ -15,42 +15,10 @@ limitations under the License. */
 #ifdef CONCAT_OP
 #include "operators/kernel/concat_kernel.h"
+#include "operators/kernel/central-arm-func/concat_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
-template <typename T>
-class ConcatFunctor {
- public:
-  void operator()(const std::vector<framework::Tensor> &input, const int axis,
-                  framework::Tensor *output) {
-    size_t num = input.size();
-    int rows = 1;
-    auto dim_0 = input[0].dims();
-    for (int i = 0; i < axis; ++i) {
-      rows *= dim_0[i];
-    }
-    int out_rows = rows, out_cols = 0;
-    std::vector<int64_t> input_cols(input.size());
-    for (int i = 0; i < num; ++i) {
-      int t_cols = input[i].numel() / rows;
-      out_cols += t_cols;
-      input_cols[i] = t_cols;
-    }
-    // computation
-    for (int k = 0; k < out_rows; ++k) {
-      T *dst_ptr = output->data<T>() + k * out_cols;
-      int col_idx = 0;
-      for (int j = 0; j < num; ++j) {
-        int col_len = input_cols[j];
-        const T *src_prt = input[j].data<T>() + k * col_len;
-        memory::Copy(dst_ptr + col_idx, src_prt, sizeof(T) * col_len);
-        col_idx += col_len;
-      }
-    }
-  }
-};
 template <>
 bool ConcatKernel<CPU, float>::Init(ConcatParam *param) {
@@ -59,33 +27,7 @@ bool ConcatKernel<CPU, float>::Init(ConcatParam *param) {
 template <>
 void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const {
-  auto inputs = param.Inputs();
+  ConcatCompute<float>(param);
-  auto *out = param.Out();
-  int64_t axis = param.Axis();
-  out->mutable_data<float>();
-  /// Sometimes direct copies will be faster, this maybe need deeply analysis.
-  if (axis == 0 && inputs.size() < 10) {
-    size_t output_offset = 0;
-    for (auto *in : inputs) {
-      auto in_stride = framework::stride_numel(in->dims());
-      auto out_stride = framework::stride_numel(out->dims());
-      auto dst = out->data<float>() + output_offset;
-      auto src = in->data<float>();
-      PADDLE_MOBILE_ENFORCE(
-          in_stride.size() == out_stride.size(),
-          "src and dst tensor should have the same dims size.");
-      memory::Copy(dst, src, sizeof(float) * in_stride[0]);
-      output_offset += in_stride[0];
-    }
-  } else {
-    std::vector<framework::Tensor> inputs_concat(inputs.size());
-    for (int j = 0; j < inputs.size(); ++j) {
-      inputs_concat[j] = *inputs[j];
-    }
-    ConcatFunctor<float> concat_functor;
-    concat_functor(inputs_concat, static_cast<int>(axis), out);
-  }
 }
 }  // namespace operators

--- a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
@@ -22,11 +22,11 @@ namespace operators {
 template <>
 bool ConvAddBNReluKernel<CPU, float>::Init(FusionConvAddBNReluParam *param) {
-  const Tensor *mean = (*param).InputMean();
+  const Tensor *mean = param->InputMean();
-  const Tensor *variance = (*param).InputVariance();
+  const Tensor *variance = param->InputVariance();
-  const Tensor *scale = (*param).InputScale();
+  const Tensor *scale = param->InputScale();
-  const Tensor *bias = (*param).InputBias();
+  const Tensor *bias = param->InputBias();
-  const float epsilon = (*param).Epsilon();
+  const float epsilon = param->Epsilon();
  auto mean_ptr = mean->data<float>();
  auto variance_ptr = variance->data<float>();
@@ -47,8 +47,8 @@ bool ConvAddBNReluKernel<CPU, float>::Init(FusionConvAddBNReluParam *param) {
    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
  }
-  (*param).SetNewScale(new_scale);
+  param->SetNewScale(new_scale);
-  (*param).SetNewBias(new_bias);
+  param->SetNewBias(new_bias);
  return true;
 }

--- a/src/operators/kernel/arm/dropout_kernel.cpp
+++ b/src/operators/kernel/arm/dropout_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef DROPOUT_OP
+// No `#pragma once` here: this is a .cpp translation unit, not a header, and
+// the other ARM kernel sources in this change drop the directive as well.
+#include "operators/kernel/dropout_kernel.h"
+#include <operators/math/transform.h>
+namespace paddle_mobile {
+namespace operators {
+// Nothing to prepare for the CPU dropout kernel; always reports success.
+template <>
+bool DropoutKernel<CPU, float>::Init(DropoutParam *para) {
+  return true;
+}
+// Element-wise functor applied by the kernel: returns its input unchanged.
+// NOTE(review): no scaling by the dropout probability is applied - confirm
+// this matches how the model was trained/exported.
+template <typename T>
+struct DropoutFunctor {
+  inline T operator()(T in) const { return in; }
+};
+// Applies DropoutFunctor to every element of input X via math::Transform and
+// writes the results to the output tensor.
+template <>
+void DropoutKernel<CPU, float>::Compute(const DropoutParam &param) const {
+  const auto *input_x = param.InputX();
+  auto *input_x_ptr = input_x->data<float>();
+  auto *out = param.Out();
+  auto *out_ptr = out->mutable_data<float>();
+  DropoutFunctor<float> func_;
+  math::Transform trans;
+  trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/arm/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/arm/elementwise_add_kernel.cpp
@@ -14,18 +14,12 @@ limitations under the License. */
 #ifdef ELEMENTWISEADD_OP
-#pragma once
 #include "operators/kernel/elementwise_add_kernel.h"
+#include "operators/kernel/central-arm-func/elementwise_add_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
-template <typename T>
-struct AddFunctor {
-  inline T operator()(T a, T b) const { return a + b; }
-};
 template <>
 bool ElementwiseAddKernel<CPU, float>::Init(ElementwiseAddParam *param) {
  return true;
@@ -34,17 +28,9 @@ bool ElementwiseAddKernel<CPU, float>::Init(ElementwiseAddParam *param) {
 template <>
 void ElementwiseAddKernel<CPU, float>::Compute(
    const ElementwiseAddParam &param) const {
-  const Tensor *input_x = param.InputX();
+  ElementwiseAddCompute<float>(param);
-  const Tensor *input_y = param.InputY();
-  Tensor *Out = param.Out();
-  Out->mutable_data<float>();
-  int axis = param.Axis();
-  ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
-                                                 AddFunctor<float>(), Out);
 }
-template class ElementwiseAddKernel<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/arm/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/arm/fusion_fc_kernel.cpp
@@ -14,9 +14,8 @@ limitations under the License. */
 #ifdef FUSION_FC_OP
-#pragma once
 #include "operators/kernel/fusion_fc_kernel.h"
+#include "operators/kernel/central-arm-func/fusion_fc_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -28,46 +27,7 @@ bool FusionFcKernel<CPU, float>::Init(FusionFcParam *param) {
 template <>
 void FusionFcKernel<CPU, float>::Compute(const FusionFcParam &param) const {
-  const Tensor *input_x = param.InputX();
+  FusionFcCompute<float>(param);
-  const Tensor *input_y = param.InputY();
-  const Tensor *input_z = param.InputZ();
-  auto *input_z_data = input_z->data<float>();
-  int axis = param.Axis();
-  Tensor *out = param.Out();
-  auto *out_data = out->mutable_data<float>();
-  const Tensor x_matrix =
-      input_x->dims().size() > 2
-          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
-          : *input_x;
-  const Tensor y_matrix =
-      input_y->dims().size() > 2
-          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
-          : *input_y;
-  auto out_dim = out->dims();
-  if (out_dim.size() != 2) {
-    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
-  }
-  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
-  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "inpu_z size must be 1");
-  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
-                        " out_dim.size must be 2.");
-  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
-  PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ")
-  int64_t classes = input_z->numel();
-  for (int i = 0; i < out_dim[0]; i++) {
-    memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
-  }
-  for (int i = 0; i < out->numel(); i++) {
-    DLOG << out_data[i];
-  }
-  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
-                      out, static_cast<float>(1));
-  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
-  //            if (out_dim.size() != 2) {
-  //                out->Resize(out_dim);
-  //            }
 }
 }  // namespace operators

--- a/src/operators/kernel/arm/im2sequence_kernel.cpp
+++ b/src/operators/kernel/arm/im2sequence_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef IM2SEQUENCE_OP
+#include "operators/kernel/im2sequence_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+// Init hook of the CPU/float im2sequence kernel. It performs no setup and
+// always returns true; the parameter object is not used.
+template <>
+bool Im2SequenceKernel<CPU, float>::Init(Im2SequenceParam *para) {
+  return true;
+}
+// Returns the number of window positions along one axis:
+//   (input_size + padding_0 + padding_1 - filter_size) / stride + 1
+// evaluated with integer division. The caller must pass a non-zero stride.
+inline int Im2SeqOutputSize(int input_size, int filter_size, int padding_0,
+                            int padding_1, int stride) {
+  const int padded_extent = input_size + padding_0 + padding_1;
+  return (padded_extent - filter_size) / stride + 1;
+}
+// Compute step of the CPU/float im2sequence kernel.
+//
+// The input dims are read as [batch, channels, height, width]. For each
+// batch entry the image slice is unfolded by math::Im2ColFunctor (kOCF
+// format) into the matching slice of the output, which is resized to
+// {output_height, output_width, channels, kernels[0], kernels[1]} for the
+// call. Dilation is fixed at {1, 1}.
+//
+// Index usage below requires kernels and strides to hold at least 2 entries
+// and paddings at least 4: paddings[0]/[2] feed the height computation and
+// paddings[1]/[3] the width computation.
+//
+// An empty batch (leading input dim == 0) returns right after the output
+// buffer is requested, so the per-batch element count is never computed
+// with a zero divisor.
+template <>
+void Im2SequenceKernel<CPU, float>::Compute(
+    const Im2SequenceParam &param) const {
+  const Tensor *in_x = param.Input();
+  Tensor *out = param.Output();
+  out->mutable_data<float>();
+  std::vector<int> kernels = param.Kernels();
+  std::vector<int> strides = param.Strides();
+  std::vector<int> paddings = param.Paddings();
+  auto in_x_dim = in_x->dims();
+  const int batch_size = static_cast<int>(in_x_dim[0]);
+  if (batch_size == 0) {
+    // Nothing to unfold, and out->numel() / batch_size below would divide
+    // by zero.
+    return;
+  }
+  const int img_channels = static_cast<int>(in_x_dim[1]);
+  const int img_height = static_cast<int>(in_x_dim[2]);
+  const int img_width = static_cast<int>(in_x_dim[3]);
+  int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
+                                       paddings[2], strides[0]);
+  int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
+                                      paddings[3], strides[1]);
+  const std::vector<int> dilations({1, 1});
+  // TODO: verify
+  // Remember the caller-visible dims; they are restored after the loop.
+  auto out_dims = out->dims();
+  // View the output as one row per batch entry so it can be sliced by batch.
+  out->Resize({batch_size, out->numel() / batch_size});
+  for (int i = 0; i < batch_size; i++) {
+    const Tensor src =
+        in_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+    // NOTE(review): dst is presumably a view sharing storage with `out`, so
+    // the functor's writes land in the output tensor -- confirm Slice().
+    Tensor dst = out->Slice(i, i + 1).Resize(
+        {output_height, output_width, img_channels, kernels[0], kernels[1]});
+    math::Im2ColFunctor<math::ColFormat::kOCF, CPU, float> f;
+    f(src, dilations, strides, paddings, &dst);
+  }
+  out->Resize(out_dims);
+}
+template class Im2SequenceKernel<CPU, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/arm/lrn_kernel.cpp
+++ b/src/operators/kernel/arm/lrn_kernel.cpp
@@ -14,9 +14,8 @@ limitations under the License. */
 #ifdef LRN_OP
-#pragma once
 #include "operators/kernel/lrn_kernel.h"
+#include "operators/kernel/central-arm-func/lrn_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -28,26 +27,9 @@ bool LrnKernel<CPU, float>::Init(LrnParam *param) {
 template <>
 void LrnKernel<CPU, float>::Compute(const LrnParam &param) const {
-  const Tensor *input_x = param.InputX();
+  LrnCompute<float>(param);
-  auto x_dims = input_x->dims();
-  Tensor *out = param.Out();
-  out->mutable_data<float>();
-  /// data_format = NCHW
-  const int N = x_dims[0];
-  const int C = x_dims[1];
-  const int H = x_dims[2];
-  const int W = x_dims[3];
-  const int n = param.N();
-  const float alpha = param.Alpha();
-  const float beta = param.Beta();
-  const float k = param.K();
-  LRNFunctor<float> lrnFunctor;
-  lrnFunctor(*input_x, out, N, C, H, W, n, k, alpha, beta);
 }
-template class LrnKernel<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/arm/mul_kernel.cpp
+++ b/src/operators/kernel/arm/mul_kernel.cpp
@@ -14,9 +14,8 @@ limitations under the License. */
 #ifdef MUL_OP
-#pragma once
 #include "operators/kernel/mul_kernel.h"
+#include "operators/kernel/central-arm-func/mul_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -28,31 +27,9 @@ bool MulKernel<CPU, float>::Init(MulParam *param) {
 template <>
 void MulKernel<CPU, float>::Compute(const MulParam &param) const {
-  const Tensor *input_x = param.InputX();
+  MulCompute<float>(param);
-  const Tensor *input_y = param.InputY();
-  Tensor *out = param.Out();
-  out->mutable_data<float>();
-  const Tensor x_matrix =
-      input_x->dims().size() > 2
-          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
-          : *input_x;
-  const Tensor y_matrix =
-      input_y->dims().size() > 2
-          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
-          : *input_y;
-  auto out_dim = out->dims();
-  if (out_dim.size() != 2) {
-    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
-  }
-  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
-                      out, static_cast<float>(0));
-  if (out_dim.size() != 2) {
-    out->Resize(out_dim);
-  }
 }
-template class MulKernel<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/arm/multiclass_nms_kernel.cpp
+++ b/src/operators/kernel/arm/multiclass_nms_kernel.cpp
@@ -15,265 +15,20 @@ limitations under the License. */
 #ifdef MULTICLASSNMS_OP
 #include "operators/kernel/multiclass_nms_kernel.h"
-#include <algorithm>
+#include "operators/kernel/central-arm-func/multiclass_nms_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
-constexpr int kOutputDim = 6;
-constexpr int kBBoxSize = 4;
-template <class T>
-bool SortScorePairDescend(const std::pair<float, T>& pair1,
-                          const std::pair<float, T>& pair2) {
-  return pair1.first > pair2.first;
-}
-template <class T>
-static inline void GetMaxScoreIndex(
-    const std::vector<T>& scores, const T threshold, int top_k,
-    std::vector<std::pair<T, int>>* sorted_indices) {
-  for (size_t i = 0; i < scores.size(); ++i) {
-    if (scores[i] > threshold) {
-      sorted_indices->push_back(std::make_pair(scores[i], i));
-    }
-  }
-  // Sort the score pair according to the scores in descending order
-  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
-                   SortScorePairDescend<int>);
-  // Keep top_k scores if needed.
-  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
-    sorted_indices->resize(top_k);
-  }
-}
-template <class T>
-static inline T BBoxArea(const T* box, const bool normalized) {
-  if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
-    return static_cast<T>(0.);
-  } else {
-    const T w = box[2] - box[0];
-    const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
-    } else {
-      // If coordinate values are not within range [0, 1].
-      return (w + 1) * (h + 1);
-    }
-  }
-}
-template <class T>
-static inline T JaccardOverlap(const T* box1, const T* box2,
-                               const bool normalized) {
-  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
-      box2[3] < box1[1]) {
-    return static_cast<T>(0.);
-  } else {
-    const T inter_xmin = std::max(box1[0], box2[0]);
-    const T inter_ymin = std::max(box1[1], box2[1]);
-    const T inter_xmax = std::min(box1[2], box2[2]);
-    const T inter_ymax = std::min(box1[3], box2[3]);
-    const T inter_w = inter_xmax - inter_xmin;
-    const T inter_h = inter_ymax - inter_ymin;
-    const T inter_area = inter_w * inter_h;
-    const T bbox1_area = BBoxArea<T>(box1, normalized);
-    const T bbox2_area = BBoxArea<T>(box2, normalized);
-    return inter_area / (bbox1_area + bbox2_area - inter_area);
-  }
-}
-template <typename T>
-static inline void NMSFast(const Tensor& bbox, const Tensor& scores,
-                           const T score_threshold, const T nms_threshold,
-                           const T eta, const int64_t top_k,
-                           std::vector<int>* selected_indices) {
-  // The total boxes for each instance.
-  int64_t num_boxes = bbox.dims()[0];
-  // 4: [xmin ymin xmax ymax]
-  int64_t box_size = bbox.dims()[1];
-  std::vector<T> scores_data(num_boxes);
-  std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
-  std::vector<std::pair<T, int>> sorted_indices;
-  GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
-  selected_indices->clear();
-  T adaptive_threshold = nms_threshold;
-  const T* bbox_data = bbox.data<T>();
-  while (sorted_indices.size() != 0) {
-    const int idx = sorted_indices.front().second;
-    bool keep = true;
-    for (size_t k = 0; k < selected_indices->size(); ++k) {
-      if (keep) {
-        const int kept_idx = (*selected_indices)[k];
-        T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
-                                      bbox_data + kept_idx * box_size, true);
-        keep = overlap <= adaptive_threshold;
-      } else {
-        break;
-      }
-    }
-    if (keep) {
-      selected_indices->push_back(idx);
-    }
-    sorted_indices.erase(sorted_indices.begin());
-    if (keep && eta < 1 && adaptive_threshold > 0.5) {
-      adaptive_threshold *= eta;
-    }
-  }
-}
-template <typename T>
-void MultiClassNMS(const Tensor& scores, const Tensor& bboxes,
-                   std::map<int, std::vector<int>>* indices, int* num_nmsed_out,
-                   const int& background_label, const int& nms_top_k,
-                   const int& keep_top_k, const T& nms_threshold,
-                   const T& nms_eta, const T& score_threshold) {
-  int64_t class_num = scores.dims()[0];
-  int64_t predict_dim = scores.dims()[1];
-  int num_det = 0;
-  for (int64_t c = 0; c < class_num; ++c) {
-    if (c == background_label) continue;
-    Tensor score = scores.Slice(c, c + 1);
-    /// [c] is key
-    NMSFast<float>(bboxes, score, score_threshold, nms_threshold, nms_eta,
-                   nms_top_k, &((*indices)[c]));
-    num_det += (*indices)[c].size();
-  }
-  *num_nmsed_out = num_det;
-  const T* scores_data = scores.data<T>();
-  if (keep_top_k > -1 && num_det > keep_top_k) {
-    std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
-    for (const auto& it : *indices) {
-      int label = it.first;
-      const T* sdata = scores_data + label * predict_dim;
-      const std::vector<int>& label_indices = it.second;
-      for (size_t j = 0; j < label_indices.size(); ++j) {
-        int idx = label_indices[j];
-        // PADDLE_ENFORCE_LT(idx, predict_dim);
-        score_index_pairs.push_back(
-            std::make_pair(sdata[idx], std::make_pair(label, idx)));
-      }
-    }
-    // Keep top k results per image.
-    std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
-                     SortScorePairDescend<std::pair<int, int>>);
-    score_index_pairs.resize(keep_top_k);
-    // Store the new indices.
-    std::map<int, std::vector<int>> new_indices;
-    for (size_t j = 0; j < score_index_pairs.size(); ++j) {
-      int label = score_index_pairs[j].second.first;
-      int idx = score_index_pairs[j].second.second;
-      new_indices[label].push_back(idx);
-    }
-    new_indices.swap(*indices);
-    *num_nmsed_out = keep_top_k;
-  }
-}
-template <typename T>
-void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
-                      const std::map<int, std::vector<int>>& selected_indices,
-                      Tensor* outs) {
-  int predict_dim = scores.dims()[1];
-  auto* scores_data = scores.data<T>();
-  auto* bboxes_data = bboxes.data<T>();
-  auto* odata = outs->data<T>();
-  int count = 0;
-  for (const auto& it : selected_indices) {
-    /// one batch
-    int label = it.first;
-    const T* sdata = scores_data + label * predict_dim;
-    const std::vector<int>& indices = it.second;
-    for (size_t j = 0; j < indices.size(); ++j) {
-      int idx = indices[j];
-      const T* bdata = bboxes_data + idx * kBBoxSize;
-      odata[count * kOutputDim] = label;           // label
-      odata[count * kOutputDim + 1] = sdata[idx];  // score
-      // xmin, ymin, xmax, ymax
-      std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
-      count++;
-    }
-  }
-}
 template <>
-bool MultiClassNMSKernel<CPU, float>::Init(MultiClassNMSParam* param) {
+bool MultiClassNMSKernel<CPU, float>::Init(MultiClassNMSParam *param) {
  return true;
 }
 template <>
 void MultiClassNMSKernel<CPU, float>::Compute(
-    const MultiClassNMSParam& param) const {
+    const MultiClassNMSParam &param) const {
-  const auto* input_bboxes = param.InputBBoxes();
+  MultiClassNMSCompute<float>(param);
-  const auto& input_bboxes_dims = input_bboxes->dims();
-  const auto* input_scores = param.InputScores();
-  const auto& input_scores_dims = input_scores->dims();
-  auto* outs = param.Out();
-  auto background_label = param.BackGroundLabel();
-  auto nms_top_k = param.NMSTopK();
-  auto keep_top_k = param.KeepTopK();
-  auto nms_threshold = param.NMSThreshold();
-  auto nms_eta = param.NMSEta();
-  auto score_threshold = param.ScoreThreshold();
-  int64_t batch_size = input_scores_dims[0];
-  int64_t class_num = input_scores_dims[1];
-  int64_t predict_dim = input_scores_dims[2];
-  int64_t box_dim = input_bboxes_dims[2];
-  std::vector<std::map<int, std::vector<int>>> all_indices;
-  std::vector<size_t> batch_starts = {0};
-  for (int64_t i = 0; i < batch_size; ++i) {
-    Tensor ins_score = input_scores->Slice(i, i + 1);
-    ins_score.Resize({class_num, predict_dim});
-    Tensor ins_boxes = input_bboxes->Slice(i, i + 1);
-    ins_boxes.Resize({predict_dim, box_dim});
-    std::map<int, std::vector<int>> indices;
-    int num_nmsed_out = 0;
-    MultiClassNMS<float>(ins_score, ins_boxes, &indices, &num_nmsed_out,
-                         background_label, nms_top_k, keep_top_k, nms_threshold,
-                         nms_eta, score_threshold);
-    all_indices.push_back(indices);
-    batch_starts.push_back(batch_starts.back() + num_nmsed_out);
-  }
-  int num_kept = batch_starts.back();
-  if (num_kept == 0) {
-    float* od = outs->mutable_data<float>({1});
-    od[0] = -1;
-  } else {
-    outs->mutable_data<float>({num_kept, kOutputDim});
-    for (int64_t i = 0; i < batch_size; ++i) {
-      Tensor ins_score = input_scores->Slice(i, i + 1);
-      ins_score.Resize({class_num, predict_dim});
-      Tensor ins_boxes = input_bboxes->Slice(i, i + 1);
-      ins_boxes.Resize({predict_dim, box_dim});
-      int64_t s = batch_starts[i];
-      int64_t e = batch_starts[i + 1];
-      if (e > s) {
-        Tensor out = outs->Slice(s, e);
-        MultiClassOutput<float>(ins_score, ins_boxes, all_indices[i], &out);
-      }
-    }
-  }
-  //            framework::LoD lod;
-  //            lod.emplace_back(batch_starts);
-  //
-  //            outs->set_lod(lod);
 }
 }  // namespace operators

--- a/src/operators/kernel/arm/prior_box_kernel.cpp
+++ b/src/operators/kernel/arm/prior_box_kernel.cpp
@@ -15,17 +15,11 @@ limitations under the License. */
 #ifdef PRIORBOX_OP
 #include "operators/kernel/prior_box_kernel.h"
+#include "operators/kernel/central-arm-func/prior_box_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
-template <typename T>
-struct ClipFunctor {
-  inline T operator()(T in) const {
-    return std::min<T>(std::max<T>(in, 0.), 1.);
-  }
-};
 template <>
 bool PriorBoxKernel<CPU, float>::Init(PriorBoxParam *param) {
  return true;
@@ -33,117 +27,7 @@ bool PriorBoxKernel<CPU, float>::Init(PriorBoxParam *param) {
 template <>
 void PriorBoxKernel<CPU, float>::Compute(const PriorBoxParam &param) const {
-  const auto *input_ = param.Input();
+  PriorBoxCompute<float>(param);
-  const auto &input_dims = input_->dims();
-  const auto *input_image = param.InputImage();
-  const auto &input_image_dims = input_image->dims();
-  const auto &min_sizes = param.MinSizes();
-  const auto &max_sizes = param.MaxSizes();
-  const auto &variances = param.Variances();
-  const auto &input_aspect_ratio = param.AspectRatios();
-  const bool &flip = param.Flip();
-  const bool &clip = param.Clip();
-  const float &step_w = param.StepW();
-  const float &step_h = param.StepH();
-  const float &offset = param.Offset();
-  Tensor *output_boxes = param.OutputBoxes();
-  auto output_boxes_dataptr = output_boxes->mutable_data<float>();
-  Tensor *output_variances = param.OutputVariances();
-  auto output_variances_dataptr = output_variances->mutable_data<float>();
-  std::vector<float> aspect_ratios;
-  ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
-  auto img_width = input_image_dims[3];
-  auto img_height = input_image_dims[2];
-  auto feature_width = input_dims[3];
-  auto feature_height = input_dims[2];
-  auto stride0 = output_boxes->dims()[1] * output_boxes->dims()[2] *
-                 output_boxes->dims()[3];
-  auto stride1 = output_boxes->dims()[2] * output_boxes->dims()[3];
-  auto stride2 = output_boxes->dims()[3];
-  float step_width, step_height;
-  /// 300 / 19
-  if (step_w == 0 || step_h == 0) {
-    step_width = static_cast<float>(img_width) / feature_width;
-    step_height = static_cast<float>(img_height) / feature_height;
-  } else {
-    step_width = step_w;
-    step_height = step_h;
-  }
-  int num_priors = aspect_ratios.size() * min_sizes.size();
-  if (!max_sizes.empty()) {
-    num_priors += max_sizes.size();
-  }
-  for (int h = 0; h < feature_height; ++h) {
-    for (int w = 0; w < feature_width; ++w) {
-      /// map origin image
-      float center_x = (w + offset) * step_width;
-      float center_y = (h + offset) * step_height;
-      float box_width, box_height;
-      int idx = 0;
-      for (size_t s = 0; s < min_sizes.size(); ++s) {
-        auto min_size = min_sizes[s];
-        // priors with different aspect ratios
-        for (float ar : aspect_ratios) {
-          box_width = min_size * sqrt(ar) / 2.;
-          box_height = min_size / sqrt(ar) / 2.;
-          /// box_width/2 , / img_width 为了得到feature map 相对于
-          /// 原图的归一化位置的比例。
-          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] =
-              (center_x - box_width) / img_width;
-          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] =
-              (center_y - box_height) / img_height;
-          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] =
-              (center_x + box_width) / img_width;
-          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] =
-              (center_y + box_height) / img_height;
-          idx++;
-        }
-        if (!max_sizes.empty()) {
-          auto max_size = max_sizes[s];
-          // square prior with size sqrt(minSize * maxSize)
-          box_width = box_height = sqrt(min_size * max_size) / 2.;
-          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] =
-              (center_x - box_width) / img_width;
-          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] =
-              (center_y - box_height) / img_height;
-          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] =
-              (center_x + box_width) / img_width;
-          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] =
-              (center_y + box_height) / img_height;
-          idx++;
-        }
-      }
-    }
-  }
-  if (clip) {
-    math::Transform trans;
-    ClipFunctor<float> clip_func;
-    trans(output_boxes_dataptr, output_boxes_dataptr + output_boxes->numel(),
-          output_boxes_dataptr, clip_func);
-  }
-  if ((variances.size() != 4)) {
-    LOG(kLOG_ERROR) << " variances.size() must be 4.";
-  }
-  int64_t box_num = feature_height * feature_width * num_priors;
-  for (int i = 0; i < box_num; i++) {
-    output_variances_dataptr[4 * i] = variances[0];
-    output_variances_dataptr[4 * i + 1] = variances[1];
-    output_variances_dataptr[4 * i + 2] = variances[2];
-    output_variances_dataptr[4 * i + 3] = variances[3];
-  }
 }
 }  // namespace operators

--- a/src/operators/kernel/arm/relu_kernel.cpp
+++ b/src/operators/kernel/arm/relu_kernel.cpp
@@ -15,98 +15,21 @@ limitations under the License. */
 #ifdef RELU_OP
 #include "operators/kernel/relu_kernel.h"
-#include <operators/math/transform.h>
+#include "operators/kernel/central-arm-func/relu_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
-template <typename T>
-struct ReluFunctor {
-  inline T operator()(T in) const { return in > 0 ? in : 0; }
-};
 template <>
 bool ReluKernel<CPU, float>::Init(ReluParam *param) {
  return true;
 }
-/*
- * @b 特化到具体平台的实现, param 从 op 层传入
- * */
 template <>
 void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
-  const auto *input_x = param.InputX();
+  ReluCompute<float>(param);
-  auto *input_x_ptr = input_x->data<float>();
-  auto *out = param.Out();
-  auto *out_ptr = out->mutable_data<float>();
-  int numel = input_x->numel();
-  //  if (numel > 64) {
-  //    asm volatile(
-  //        "pld        [%[input_x_ptr], #0]        \n\t"
-  //        "vmov.f32   q8,    #0.0                 \n\t"
-  //        "subs %[num], %[num], #32                \n\t"
-  //        "blt        end_num_%=                  \n\t"
-  //        "loop_num_%=:                           \n\t"
-  //        "pld        [%[input_x_ptr], #1024]      \n\t"
-  //
-  //        "vld1.32 {q0, q1}, [%[input_x_ptr]]!    \n\t"
-  //        "vld1.32 {q2, q3}, [%[input_x_ptr]]!    \n\t"
-  //        "vld1.32 {q4, q5}, [%[input_x_ptr]]!    \n\t"
-  //        "vld1.32 {q6, q7}, [%[input_x_ptr]]!    \n\t"
-  //
-  //        "vmax.f32 q0, q0, q8                   \n\t"
-  //        "vmax.f32 q1, q1, q8                    \n\t"
-  //        "vmax.f32 q2, q2, q8                   \n\t"
-  //        "vmax.f32 q3, q3, q8                   \n\t"
-  //        "vmax.f32 q4, q4, q8                   \n\t"
-  //        "vmax.f32 q5, q5, q8                   \n\t"
-  //        "vmax.f32 q6, q6, q8                   \n\t"
-  //        "vmax.f32 q7, q7, q8                   \n\t"
-  //
-  //        "vst1.32 {q0, q1}, [%[out_ptr]]!        \n\t"
-  //        "vst1.32 {q2, q3}, [%[out_ptr]]!       \n\t"
-  //        "vst1.32 {q4, q5}, [%[out_ptr]]!       \n\t"
-  //        "vst1.32 {q6, q7}, [%[out_ptr]]!       \n\t"
-  //
-  //        "subs %[num], %[num], #32              \n\t"
-  //        "bge        loop_num_%=                \n\t"
-  //        "end_num_%=:                           \n\t"
-  //        "cmp %[num], #0                         \n\t"
-  //        "bge   end_%=                          \n\t"
-  //        "mov r6, #4                             \n\t"
-  //        "mul r5, %[num], r6                     \n\t"
-  //        "add %[input_x_ptr], %[input_x_ptr], r5     \n\t"
-  //        "vld1.32 {q0, q1}, [%[input_x_ptr]]!    \n\t"
-  //        "vld1.32 {q2, q3}, [%[input_x_ptr]]!    \n\t"
-  //        "vld1.32 {q4, q5}, [%[input_x_ptr]]!    \n\t"
-  //        "vld1.32 {q6, q7}, [%[input_x_ptr]]!    \n\t"
-  //        "vmax.f32 q0, q0, q8                   \n\t"
-  //        "vmax.f32 q1, q1, q8                    \n\t"
-  //        "vmax.f32 q2, q2, q8                   \n\t"
-  //        "vmax.f32 q3, q3, q8                   \n\t"
-  //        "vmax.f32 q4, q4, q8                   \n\t"
-  //        "vmax.f32 q5, q5, q8                   \n\t"
-  //        "vmax.f32 q6, q6, q8                   \n\t"
-  //        "vmax.f32 q7, q7, q8                   \n\t"
-  //        "add %[out_ptr], %[out_ptr], r5       \n\t"
-  //        "vst1.32 {q0, q1}, [%[out_ptr]]!        \n\t"
-  //        "vst1.32 {q2, q3}, [%[out_ptr]]!       \n\t"
-  //        "vst1.32 {q4, q5}, [%[out_ptr]]!       \n\t"
-  //        "vst1.32 {q6, q7}, [%[out_ptr]]!       \n\t"
-  //        "end_%=:                                \n\t"
-  //        :
-  //        :
-  //        [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num]
-  //        "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
-  //        "q7", "q8", "r5",
-  //          "r6");
-  //  } else {
-  ReluFunctor<float> func_;
-  math::Transform trans;
-  trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
-  //  }
 }
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/arm/reshape_kernel.cpp
+++ b/src/operators/kernel/arm/reshape_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef RESHAPE_OP
 #include "operators/kernel/reshape_kernel.h"
+#include "operators/kernel/central-arm-func/reshape_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -26,30 +27,7 @@ bool ReshapeKernel<CPU, float>::Init(ReshapeParam *param) {
 template <>
 void ReshapeKernel<CPU, float>::Compute(const ReshapeParam &param) const {
-  const auto *input_x = param.InputX();
+  ReshapeCompute<float>(param);
-  const auto &input_x_dims = input_x->dims();
-  auto *out = param.Out();
-  framework::DDim out_dims = out->dims();
-  const auto *input_shape = param.InputShape();
-  if (input_shape) {
-    auto *shape_data = input_shape->data<int>();
-    framework::Tensor cpu_shape_tensor;
-    auto shape =
-        std::vector<int>(shape_data, shape_data + input_shape->numel());
-    out_dims = ValidateShape(shape, input_x->dims());
-  }
-  bool inplace = param.Inplace();
-  out->Resize(out_dims);
-  if (!inplace) {
-    out->mutable_data<float>();
-    framework::TensorCopy(*input_x, out);
-    out->Resize(out_dims);
-  } else {
-    out->ShareDataWith(*input_x);
-    out->Resize(out_dims);
-  }
 }
 }  // namespace operators

--- a/src/operators/kernel/arm/transpose_kernel.cpp
+++ b/src/operators/kernel/arm/transpose_kernel.cpp
@@ -14,72 +14,19 @@ limitations under the License. */
 #ifdef TRANSPOSE_OP
 #include "operators/kernel/transpose_kernel.h"
+#include "operators/kernel/central-arm-func/transpose_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
-// vector<int> pos;
-// template <typename T>
-// void TransposeFunc(const int numel, const T* input, const vector<int> axis,
-//                    const vector<int> old_strides, const vector<int>
-//                    new_strides, T* output) {
-//   for (int i = 0; i < numel; ++i) {
-//     int old_idx = 0;
-//     int idx = i;
-//     for (int j = 0; j < axis.size(); ++j) {
-//       int order = axis[j];
-//       old_idx += (idx / new_strides[j]) * old_strides[order];
-//       idx %= new_strides[j];
-//     }
-//     output[i] = input[old_idx];
-//   }
-// }
 template <>
-bool TransposeKernel<CPU, float>::Init(TransposeParam* param) {
+bool TransposeKernel<CPU, float>::Init(TransposeParam *param) {  // performs no setup; unconditionally returns true
  return true;
 }
 template <>
-void TransposeKernel<CPU, float>::Compute(const TransposeParam& param) const {
+void TransposeKernel<CPU, float>::Compute(const TransposeParam &param) const {
-  const auto* input_x = param.InputX();
+  TransposeCompute<float>(param);  // whole computation is forwarded to TransposeCompute, replacing the inline loop removed below
-  const auto input_x_dims = input_x->dims();
-  auto* out = param.Out();
-  const auto axis = param.Axis();
-  const auto* input_x_data = input_x->data<float>();
-  auto* out_data = out->mutable_data<float>();
-  size_t ndim = axis.size();
-  std::vector<int> xdim(ndim);
-  std::vector<int> xstride(ndim);
-  std::vector<int> xout(ndim);
-  for (int i = 0; i < ndim; i++) {
-    int j = ndim - 1 - i;
-    xdim[j] = input_x_dims[axis[i]];
-    xstride[j] = 1;
-    for (int k = axis[i] + 1; k < ndim; k++) {
-      xstride[j] *= input_x_dims[k];
-    }
-    xout[j] = xstride[j] * xdim[j];
-  }
-  auto numel = input_x->numel();
-  size_t pind = 0;
-  std::vector<int> ind(ndim);
-  for (int i = 0; i < numel; i++) {
-    out_data[i] = input_x_data[pind];
-    ind[0]++;
-    pind += xstride[0];
-    for (int j = 0; j < ndim - 1; j++) {
-      if (ind[j] == xdim[j]) {
-        ind[j + 1]++;
-        ind[j] = 0;
-        pind += xstride[j + 1];
-        pind -= xout[j];
-      } else {
-        break;
-      }
-    }
-  }
 }
 }  // namespace operators

--- a/src/operators/kernel/central-arm-func/box_coder_arm_func.h
+++ b/src/operators/kernel/central-arm-func/box_coder_arm_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef BOXCODER_OP
+#pragma once
+#include <cmath>
+namespace paddle_mobile {
+namespace operators {
+// Encodes every target box against every prior box in center-size form.
+//
+// For each (target i, prior j) pair, four values are written to `output`
+// starting at index (i * col + j) * len, where col = prior_box.dims()[0] and
+// len = prior_box.dims()[1]:
+//   [0] x-center difference / prior width  / variance[0]
+//   [1] y-center difference / prior height / variance[1]
+//   [2] log(|target width  / prior width|)  / variance[2]
+//   [3] log(|target height / prior height|) / variance[3]
+// Within a box, elements 0 and 2 are combined as the x extent and elements
+// 1 and 3 as the y extent. Target rows are addressed with the prior-box row
+// length `len`. `output` must be writable by the caller for all pairs.
+template <typename T>
+void EncodeCenterSize(const framework::Tensor& target_box,
+                      const framework::Tensor& prior_box,
+                      const framework::Tensor& prior_box_var, T* output) {
+  const int64_t row = target_box.dims()[0];
+  const int64_t col = prior_box.dims()[0];
+  const int64_t len = prior_box.dims()[1];
+
+  const auto* targets = target_box.data<T>();
+  const auto* priors = prior_box.data<T>();
+  const auto* variances = prior_box_var.data<T>();
+
+  for (int64_t i = 0; i < row; ++i) {
+    const int64_t t = i * len;  // first element of target box i
+    for (int64_t j = 0; j < col; ++j) {
+      const int64_t p = j * len;  // first element of prior box j
+
+      // Prior box geometry.
+      T prior_w = priors[p + 2] - priors[p];
+      T prior_h = priors[p + 3] - priors[p + 1];
+      T prior_cx = (priors[p + 2] + priors[p]) / 2;
+      T prior_cy = (priors[p + 3] + priors[p + 1]) / 2;
+
+      // Target box geometry.
+      T target_cx = (targets[t + 2] + targets[t]) / 2;
+      T target_cy = (targets[t + 3] + targets[t + 1]) / 2;
+      T target_w = targets[t + 2] - targets[t];
+      T target_h = targets[t + 3] - targets[t + 1];
+
+      const size_t base = i * col * len + j * len;
+      output[base] = (target_cx - prior_cx) / prior_w / variances[p];
+      output[base + 1] = (target_cy - prior_cy) / prior_h / variances[p + 1];
+      output[base + 2] =
+          std::log(std::fabs(target_w / prior_w)) / variances[p + 2];
+      output[base + 3] =
+          std::log(std::fabs(target_h / prior_h)) / variances[p + 3];
+    }
+  }
+}
+// Decodes center-size offsets back into corner-form boxes.
+//
+// For each (i, j) pair the four encoded values are read from `target_box` at
+// index i * col * len + j * len (col = prior_box.dims()[0],
+// len = prior_box.dims()[1]) and combined with prior box j and its variances:
+//   center = variance * offset * prior size + prior center
+//   size   = exp(variance * offset) * prior size
+// The result is written to the same index of `output` as
+// (center_x - w/2, center_y - h/2, center_x + w/2, center_y + h/2).
+template <typename T>
+void DecodeCenterSize(const framework::Tensor& target_box,
+                      const framework::Tensor& prior_box,
+                      const framework::Tensor& prior_box_var, T* output) {
+  const int64_t row = target_box.dims()[0];
+  const int64_t col = prior_box.dims()[0];
+  const int64_t len = prior_box.dims()[1];
+
+  const auto* encoded = target_box.data<T>();
+  const auto* priors = prior_box.data<T>();
+  const auto* variances = prior_box_var.data<T>();
+
+  for (int64_t i = 0; i < row; ++i) {
+    for (int64_t j = 0; j < col; ++j) {
+      const size_t base = i * col * len + j * len;  // shared in/out index
+      const int64_t p = j * len;                    // first element of prior j
+
+      // Prior box geometry.
+      T prior_w = priors[p + 2] - priors[p];
+      T prior_h = priors[p + 3] - priors[p + 1];
+      T prior_cx = (priors[p + 2] + priors[p]) / 2;
+      T prior_cy = (priors[p + 3] + priors[p + 1]) / 2;
+
+      // Decoded box center and size.
+      T box_cx = variances[p] * encoded[base] * prior_w + prior_cx;
+      T box_cy = variances[p + 1] * encoded[base + 1] * prior_h + prior_cy;
+      T box_w = std::exp(variances[p + 2] * encoded[base + 2]) * prior_w;
+      T box_h = std::exp(variances[p + 3] * encoded[base + 3]) * prior_h;
+
+      // Convert center/size into the two corners.
+      output[base] = box_cx - box_w / 2;
+      output[base + 1] = box_cy - box_h / 2;
+      output[base + 2] = box_cx + box_w / 2;
+      output[base + 3] = box_cy + box_h / 2;
+    }
+  }
+}
+// Entry point of the box coder operator.
+//
+// Allocates the output as a float tensor of shape
+// {target rows, prior rows, prior row length} and fills it with either
+// EncodeCenterSize or DecodeCenterSize depending on param.CodeType().
+// Any other code type leaves the freshly allocated output unwritten.
+// The template parameter P is not used; all work is done in float.
+template <typename P>
+void BoxCoderCompute(const BoxCoderParam& param) {
+  const auto* priors = param.InputPriorBox();
+  const auto* prior_vars = param.InputPriorBoxVar();
+  const auto* targets = param.InputTargetBox();
+  const auto& mode = param.CodeType();
+
+  auto row = targets->dims()[0];
+  auto col = priors->dims()[0];
+  auto len = priors->dims()[1];
+
+  Tensor* boxes_out = param.OutputBox();
+  auto* out_data = boxes_out->mutable_data<float>({row, col, len});
+
+  if (mode == "encode_center_size") {
+    EncodeCenterSize<float>(*targets, *priors, *prior_vars, out_data);
+  } else if (mode == "decode_center_size") {
+    DecodeCenterSize<float>(*targets, *priors, *prior_vars, out_data);
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/central-arm-func/concat_arm_func.h
+++ b/src/operators/kernel/central-arm-func/concat_arm_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef CONCAT_OP
+#pragma once
+#include <vector>
+namespace paddle_mobile {
+namespace operators {
+// Concatenates tensors along `axis` by viewing each one as a 2-D matrix.
+//
+// The row count is the product of the first input's dimensions ahead of
+// `axis`; each input contributes numel / rows columns. Rows of the output
+// are built by copying the matching row of every input back to back.
+// `output` must already own storage large enough for the result.
+template <typename T>
+class ConcatFunctor {
+ public:
+  void operator()(const std::vector<framework::Tensor> &input, const int axis,
+                  framework::Tensor *output) {
+    const size_t tensor_count = input.size();
+
+    // Collapse the leading dimensions of the first input into a row count.
+    int rows = 1;
+    auto lead_dims = input[0].dims();
+    for (int d = 0; d < axis; ++d) {
+      rows *= lead_dims[d];
+    }
+
+    // Per-input row width and the total width of an output row.
+    int out_cols = 0;
+    std::vector<int64_t> widths(tensor_count);
+    for (size_t t = 0; t < tensor_count; ++t) {
+      int cols = input[t].numel() / rows;
+      out_cols += cols;
+      widths[t] = cols;
+    }
+
+    // Assemble the output one row at a time.
+    for (int r = 0; r < rows; ++r) {
+      T *dst_row = output->data<T>() + r * out_cols;
+      int filled = 0;
+      for (size_t t = 0; t < tensor_count; ++t) {
+        int width = widths[t];
+        const T *src_row = input[t].data<T>() + r * width;
+        memory::Copy(dst_row + filled, src_row, sizeof(T) * width);
+        filled += width;
+      }
+    }
+  }
+};
+// Entry point of the concat operator.
+//
+// Allocates the float output, then picks one of two strategies:
+//  - axis == 0 with fewer than 10 inputs: each input is copied as one
+//    contiguous chunk, one after another, into the output;
+//  - otherwise: the inputs are copied into a vector of tensors and handed
+//    to ConcatFunctor<float>.
+// The template parameter P is not used; all work is done in float.
+template <typename P>
+void ConcatCompute(const ConcatParam &param) {
+  auto inputs = param.Inputs();
+  auto *out = param.Out();
+  int64_t axis = param.Axis();
+  out->mutable_data<float>();
+
+  /// Direct copies are sometimes faster; the threshold may need more analysis.
+  const bool direct_copy = (axis == 0) && (inputs.size() < 10);
+  if (!direct_copy) {
+    std::vector<framework::Tensor> tensors(inputs.size());
+    for (size_t idx = 0; idx < inputs.size(); ++idx) {
+      tensors[idx] = *inputs[idx];
+    }
+    ConcatFunctor<float> concat;
+    concat(tensors, static_cast<int>(axis), out);
+    return;
+  }
+
+  size_t written = 0;  // number of floats already placed in the output
+  for (auto *in : inputs) {
+    auto in_stride = framework::stride_numel(in->dims());
+    auto out_stride = framework::stride_numel(out->dims());
+    auto dst = out->data<float>() + written;
+    auto src = in->data<float>();
+    PADDLE_MOBILE_ENFORCE(
+        in_stride.size() == out_stride.size(),
+        "src and dst tensor should have the same dims size.");
+    // in_stride[0] is used as the element count of the whole input.
+    memory::Copy(dst, src, sizeof(float) * in_stride[0]);
+    written += in_stride[0];
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h
@@ -15,14 +15,12 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBNRELU_OP
 #pragma once
-#include "operators/kernel/conv_add_bn_relu_kernel.h"
 #include "operators/math/depthwise_conv_3x3.h"
 #include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
+void ConvAddBNReluBasic(const FusionConvAddBNReluParam &param) {
-template <typename P>
-void ConvAddBNReluCompute(const FusionConvAddBNReluParam &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();
@@ -31,105 +29,122 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam &param) {
  auto new_bias_ptr = new_bias.data<float>();
  auto new_scale_ptr = new_scale.data<float>();
  int axis = param.Axis();
+  Tensor *output = param.Output();
+  math::expand_bias(bias, axis, output->dims());
+  output->ShareDataWith(bias);
  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();
-  Tensor *output = param.Output();
+  const int batch_size = static_cast<int>(input->dims()[0]);
  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  if (filter_shape_vec[2] == 3 && strides[0] == 1 && groups > 1) {
+  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-    math::DepthwiseConvAddBNRelu3x3s1p1(input, filter, output, &bias, 1,
+  size_t data_dim = filter_shape_vec.size() - 2;
-                                        &new_scale, &new_bias, 1, 1);
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  } else {
+  col_shape_vec[0] = input->dims()[1] / groups;
-    const int batch_size = static_cast<int>(input->dims()[0]);
+  for (size_t j = 0; j < data_dim; ++j) {
+    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    math::expand_bias(bias, axis, output->dims());
+    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-    output->ShareDataWith(bias);
+  }
+  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-    size_t data_dim = filter_shape_vec.size() - 2;
+  framework::DDim col_matrix_shape =
-    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+      framework::flatten_to_2d(col_shape, data_dim + 1);
-    col_shape_vec[0] = input->dims()[1] / groups;
-    for (size_t j = 0; j < data_dim; ++j) {
+  bool is_expand =
-      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-      col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+  Tensor col;
-    }
+  Tensor col_matrix;
-    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+  if (is_expand) {
+    col.mutable_data<float>(col_shape);
-    framework::DDim col_matrix_shape =
+    col_matrix.ShareDataWith(col);
-        framework::flatten_to_2d(col_shape, data_dim + 1);
+    col_matrix.Resize(col_matrix_shape);
+  }
-    bool is_expand =
-        math::IsExpand(filter_shape_vec, strides, paddings, dilations);
-    Tensor col;
-    Tensor col_matrix;
-    if (is_expand) {
-      col.mutable_data<float>(col_shape);
-      col_matrix.ShareDataWith(col);
-      col_matrix.Resize(col_matrix_shape);
-    }
-    framework::DDim input_shape = framework::slice_ddim(
+  framework::DDim input_shape = framework::slice_ddim(
-        input->dims(), 1, static_cast<int>(input->dims().size()));
+      input->dims(), 1, static_cast<int>(input->dims().size()));
-    framework::DDim filter_matrix_shape = {filter.dims()[0],
+  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                           filter.numel() / filter.dims()[0]};
+                                         filter.numel() / filter.dims()[0]};
-    filter.Resize(filter_matrix_shape);
+  filter.Resize(filter_matrix_shape);
-    framework::DDim output_matrix_shape = {
+  framework::DDim output_matrix_shape = {
-        output->dims()[1],
+      output->dims()[1],
-        output->numel() / (output->dims()[0] * output->dims()[1])};
+      output->numel() / (output->dims()[0] * output->dims()[1])};
-    // convolution operator: im2col(or vol2col) + gemm
+  // convolution operator: im2col(or vol2col) + gemm
-    int in_step = static_cast<int>(input->dims()[1]) / groups;
+  int in_step = static_cast<int>(input->dims()[1]) / groups;
-    int out_step = static_cast<int>(output->dims()[1]) / groups;
+  int out_step = static_cast<int>(output->dims()[1]) / groups;
-    math::Vol2ColFunctor<CPU, float> vol2col;
+  math::Vol2ColFunctor<CPU, float> vol2col;
-    math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-    for (int i = 0; i < batch_size; i++) {
+  for (int i = 0; i < batch_size; i++) {
-      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-      for (int g = 0; g < groups; g++) {
+    for (int g = 0; g < groups; g++) {
-        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-        if (!is_expand) {
+      if (!is_expand) {
-          col.ShareDataWith(in_slice);
+        col.ShareDataWith(in_slice);
-          col_matrix.ShareDataWith(col);
+        col_matrix.ShareDataWith(col);
-          col_matrix.Resize(col_matrix_shape);
+        col_matrix.Resize(col_matrix_shape);
-        } else if (data_dim == 2U) {
+      } else if (data_dim == 2U) {
-          // im2col
+        // im2col
-          im2col(in_slice, dilations, strides,
+        im2col(in_slice, dilations, strides,
-                 std::vector<int>{paddings[0], paddings[1], paddings[0],
+               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                  paddings[1]},
+                                paddings[1]},
-                 &col);
+               &col);
-        } else if (data_dim == 3U) {
+      } else if (data_dim == 3U) {
-          // vol2col
+        // vol2col
-          vol2col(in_slice, dilations, strides, paddings, &col);
+        vol2col(in_slice, dilations, strides, paddings, &col);
-        }
-        // gemm
-        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-        Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-        math::matmul<float>(filter_slice, false, col_matrix, false,
-                            static_cast<float>(1), &out_slice,
-                            static_cast<float>(1), false);
      }
+      // gemm
+      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+      math::matmul<float>(filter_slice, false, col_matrix, false,
+                          static_cast<float>(1), &out_slice,
+                          static_cast<float>(1));
    }
+  }
-    auto output_ptr = output->data<float>();
+  /// todo : use neon in special case instead of 2for(300ms)
-    for (int c = 0; c < output_matrix_shape[0]; c++) {
+  auto output_ptr = output->data<float>();
-      int start = c * output_matrix_shape[1];
+  for (int c = 0; c < output_matrix_shape[0]; c++) {
-      for (int j = 0; j < output_matrix_shape[1]; j++) {
+    int start = c * output_matrix_shape[1];
-        output_ptr[start + j] =
+    for (int j = 0; j < output_matrix_shape[1]; j++) {
-            output_ptr[start + j] * new_scale_ptr[c] + new_bias_ptr[c];
+      output_ptr[start + j] =
-        output_ptr[start + j] =
+          output_ptr[start + j] * new_scale_ptr[c] + new_bias_ptr[c];
-            output_ptr[start + j] < 0 ? 0 : output_ptr[start + j];
+      output_ptr[start + j] =
-      }
+          output_ptr[start + j] < 0 ? 0 : output_ptr[start + j];
    }
  }
 }
+// Dispatches the fused conv + add + batch-norm + ReLU computation.
+//
+// A convolution is sent to math::DepthwiseConvAddBNRelu3x3s1p1 when all of
+// the following hold:
+//   - groups == input channels == output channels,
+//   - the filter's dims [2] and [3] are both 3,
+//   - Strides()[0] == 1.
+// Everything else goes through ConvAddBNReluBasic. No stride-2 fast path is
+// enabled here, so stride-2 depthwise convolutions also take the generic
+// path. The template parameter P is not used.
+//
+// NOTE(review): only Strides()[0] is inspected and Paddings() is not checked
+// at all before calling the "s1p1" routine - confirm callers only reach this
+// with stride 1 / padding 1 on both axes.
+template <typename P>
+void ConvAddBNReluCompute(const FusionConvAddBNReluParam &param) {
+  const bool depthwise_3x3_s1 =
+      param.Groups() == param.Input()->dims()[1] &&
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1;
+
+  if (!depthwise_3x3_s1) {
+    ConvAddBNReluBasic(param);
+    return;
+  }
+
+  // Scratch bias tensor with one float per group; only needed on this path,
+  // so it is not allocated when the generic routine is used.
+  // NOTE(review): the buffer is allocated but never filled before being
+  // passed on - confirm the callee does not read it.
+  Tensor bias_scratch;
+  bias_scratch.mutable_data<float>({param.Groups()});
+  math::DepthwiseConvAddBNRelu3x3s1p1(
+      param.Input(), param.Filter(), param.Output(), &bias_scratch, 1,
+      param.NewScale(), param.NewBias(), 1, 1);
+}
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -15,19 +15,19 @@ limitations under the License. */
 #ifdef CONV_OP
 #pragma once
+#include <operators/math/depthwise_conv_3x3.h>
 #include <vector>
-#include "operators/math/conv_func.h"
 #include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
-template <typename P>
+inline void ConvBasic(const ConvParam &param) {
-void ConvCompute(const ConvParam &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor *output = param.Output();
-  output->mutable_data<float>();
  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
@@ -109,6 +109,27 @@ void ConvCompute(const ConvParam &param) {
  }
 }
+// Dispatches a convolution to the fastest available routine.
+//
+// A 3x3 depthwise convolution (groups == input channels == output channels,
+// filter dims [2] and [3] both 3) is sent to
+//   - math::DepthwiseConv3x3s1p1 when Strides()[0] == 1,
+//   - math::DepthwiseConv3x3     when Strides()[0] == 2.
+// Every other configuration goes through ConvBasic. A scratch bias tensor
+// with one float per group is allocated up front and handed to the
+// depthwise routines together with a `false` flag. The template parameter P
+// is not used.
+template <typename P>
+void ConvCompute(const ConvParam &param) {
+  Tensor bias_scratch;
+  bias_scratch.mutable_data<float>({param.Groups()});
+
+  const bool depthwise_3x3 =
+      param.Groups() == param.Input()->dims()[1] &&
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+      param.Filter()->dims()[2] == 3;
+
+  if (depthwise_3x3 && param.Strides()[0] == 1) {
+    math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
+                               &bias_scratch, false);
+    return;
+  }
+  if (depthwise_3x3 && param.Strides()[0] == 2) {
+    math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
+                           param.Filter(), &bias_scratch, param.Output(),
+                           false);
+    return;
+  }
+  ConvBasic(param);
+}
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
--- a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef ELEMENTWISEADD_OP
+#pragma once
+namespace paddle_mobile {
+namespace operators {
+// Binary functor that yields the sum of its two operands.
+template <typename T>
+struct AddFunctor {
+  T operator()(T a, T b) const {
+    return a + b;
+  }
+};
+// Entry point of the elementwise add operator.
+//
+// Allocates the float output and delegates to ElementwiseComputeEx with
+// AddFunctor<float>, passing both inputs and the axis taken from `param`.
+// The template parameter P is not used; all work is done in float.
+template <typename P>
+void ElementwiseAddCompute(const ElementwiseAddParam &param) {
+  const Tensor *lhs = param.InputX();
+  const Tensor *rhs = param.InputY();
+
+  Tensor *result = param.Out();
+  result->mutable_data<float>();
+
+  const int axis = param.Axis();
+  ElementwiseComputeEx<AddFunctor<float>, float>(lhs, rhs, axis,
+                                                 AddFunctor<float>(), result);
+}
+template class ElementwiseAddKernel<CPU, float>;  // explicit instantiation for CPU/float; NOTE(review): this header has no #include declaring the class template - confirm every includer declares it first
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
+++ b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
--- a/src/operators/kernel/central-arm-func/lrn_arm_func.h
+++ b/src/operators/kernel/central-arm-func/lrn_arm_func.h
--- a/src/operators/kernel/central-arm-func/mul_arm_func.h
+++ b/src/operators/kernel/central-arm-func/mul_arm_func.h
--- a/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h
+++ b/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h
--- a/src/operators/kernel/central-arm-func/prior_box_arm_func.h
+++ b/src/operators/kernel/central-arm-func/prior_box_arm_func.h
--- a/src/operators/kernel/central-arm-func/relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/relu_arm_func.h
--- a/src/operators/kernel/central-arm-func/reshape_arm_func.h
+++ b/src/operators/kernel/central-arm-func/reshape_arm_func.h
--- a/src/operators/kernel/central-arm-func/transpose_arm_func.h
+++ b/src/operators/kernel/central-arm-func/transpose_arm_func.h
--- a/src/operators/kernel/dropout_kernel.h
+++ b/src/operators/kernel/dropout_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef DROPOUT_OP
+#pragma once
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+// Kernel interface of the dropout operator, parameterised by device type
+// and element type. Both members are only declared here; their definitions
+// are not part of this header.
+template <typename DeviceType, typename T>
+class DropoutKernel : public framework::OpKernelBase<DeviceType, DropoutParam> {
+ public:
+  // Runs the operator on the tensors described by `param`.
+  void Compute(const DropoutParam& param) const;
+  // Preparation hook taking the operator parameters; returns a bool status.
+  bool Init(DropoutParam* para);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/im2sequence_kernel.h
+++ b/src/operators/kernel/im2sequence_kernel.h
--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
--- a/src/operators/math/depthwise_conv_3x3.h
+++ b/src/operators/math/depthwise_conv_3x3.h
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
--- a/test/framework/test_optimize.cpp
+++ b/test/framework/test_optimize.cpp
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
--- a/test/net/test_mobilenet+ssd.cpp
+++ b/test/net/test_mobilenet+ssd.cpp
--- a/test/net/test_mobilenet.cpp
+++ b/test/net/test_mobilenet.cpp
--- a/test/net/test_resnet.cpp
+++ b/test/net/test_resnet.cpp
--- a/test/net/test_squeezenet.cpp
+++ b/test/net/test_squeezenet.cpp
--- a/test/net/test_yolo.cpp
+++ b/test/net/test_yolo.cpp
--- a/test/operators/test_im2sequence_op.cpp
+++ b/test/operators/test_im2sequence_op.cpp
--- a/test/operators/test_sigmoid_op.cpp
+++ b/test/operators/test_sigmoid_op.cpp
--- a/test/test_include.h
+++ b/test/test_include.h
--- a/tools/build.sh
+++ b/tools/build.sh
--- a/tools/op.cmake
+++ b/tools/op.cmake