diff --git a/.gitignore b/.gitignore
index 532601bfe9222eae0d6be7378322ab1b9c2eb110..8f92118b08bb30531869c28d32d335cc47116350 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@
 *.lai
 *.la
 *.lib
+*.a

 # Executables
 *.exe
@@ -69,3 +70,25 @@ build
 # clion building directories
 cmake-build-debug
 cmake-build-release
+
+# ios
+tools/libomp.a
+
+# ios demo
+demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/
+demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg
+demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a
+*.xcuserstate
+/tools/quantification/quantify
+
+# metal
+Podfile.lock
+metal/Pods/
+SwiftProtobuf.framework
+paddle-mobile.xcworkspace
+metal/models/
+metal/images/
+*.a
+metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a
+*.xcuserdatad/
+*/xcuserdata/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 43aea26f59802c4e58cecaf2313288ba2d1f307b..bcab53eb12a87881900894e3ab1e657d17d6af1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,26 +1,71 @@
 cmake_minimum_required(VERSION 3.0)
+option(USE_OPENMP "openmp support" OFF)
+
 project(paddle-mobile)
 option(DEBUGING "enable debug mode" ON)
-option(USE_OPENMP "openmp support" OFF)
-option(USE_EXCEPTION "use std exception" ON)
-option(LOG_PROFILE "log profile" ON)
 # select the platform to build
 option(CPU "armv7 with neon" ON)
 option(MALI_GPU "mali gpu" OFF)
 option(FPGA "fpga" OFF)
-set(DEBUGING ON)
-file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c)
+option(USE_EXCEPTION "use std exception" OFF)
+option(LOG_PROFILE "log profile" OFF)
+file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
 file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
+include_directories(src/)
+
+if(IS_IOS)
+  set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
+else()
+  set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
+endif()
+
+if (DEBUGING)
+  message(STATUS "debug")
+  set(CMAKE_BUILD_TYPE Release)
+  set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
+  add_definitions(-DPADDLE_MOBILE_DEBUG)
+else ()
+  set(CMAKE_BUILD_TYPE Release)
+  set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
+  add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
+endif ()
+
+if (USE_EXCEPTION)
+  message(STATUS "use exception")
+  add_definitions(-DENABLE_EXCEPTION)
+  add_definitions(-fexceptions)
+else()
+  add_definitions(-fno-exceptions)
+endif ()
+
+if (LOG_PROFILE)
+  add_definitions(-DPADDLE_MOBILE_PROFILE)
+endif()
+
+if(USE_OPENMP)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+  add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
+endif()
+
+# platform control
+if (ARM_LINUX)
+  include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake")
+endif ()

 if (CPU)
   add_definitions(-DPADDLE_MOBILE_CPU)
 else()
-  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/arm/*.h)
-  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/arm/*.cc)
-  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/arm/*.cpp)
-
+  file(GLOB_RECURSE _tmp_list src/operators/kernel/arm/*.cpp src/operators/kernel/arm/*.cc)
+  foreach(f ${_tmp_list})
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+  endforeach()
+
+  file(GLOB_RECURSE _tmp_list_h src/operators/kernel/arm/*.h)
+  foreach(f ${_tmp_list_h})
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+  endforeach()
 endif()

 if (MALI_GPU)
@@ -37,85 +82,102 @@ if (MALI_GPU)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
 else()
-  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/*.h)
-  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/*.cc)
-  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/*.cpp)
-
-
+  file(GLOB_RECURSE _tmp_list src/operators/kernel/mali/*.cpp src/operators/kernel/mali/*.cc)
+  foreach(f ${_tmp_list})
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+  endforeach()
+
+  file(GLOB_RECURSE _tmp_list_h src/operators/kernel/mali/*.h)
+  foreach(f ${_tmp_list_h})
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+  endforeach()
 endif()

 if(FPGA)
+  set(DEBUGING ON)
+  add_definitions(-DPADDLE_MOBILE_DEBUG)
   add_definitions(-DPADDLE_MOBILE_FPGA)
 else()
-  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/fpga/*.h)
-  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/fpga/*.cc)
-  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/fpga/*.cpp)
+  file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
+  foreach(f ${_tmp_list})
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+  endforeach()
+
+  file(GLOB_RECURSE _tmp_list_h src/operators/kernel/fpga/*.h)
+  foreach(f ${_tmp_list_h})
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+  endforeach()
+
+
+  file(GLOB_RECURSE _tmp_list src/fpga/*.cpp src/fpga/*.cc)
+  foreach(f ${_tmp_list})
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+  endforeach()
+
+  file(GLOB_RECURSE _tmp_list_h src/fpga/*.h)
+  foreach(f ${_tmp_list_h})
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+  endforeach()
 endif()
-
-set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
-if (DEBUGING)
-  message(STATUS "debug")
-  set(CMAKE_BUILD_TYPE Debug)
-  set(CMAKE_CXX_FLAGS_DEBUG "-g -DNDEBUG")
-  add_definitions(-DPADDLE_MOBILE_DEBUG)
-  if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
-    add_definitions(-DARMV7)
+if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
-  endif ()
-else ()
-  set(CMAKE_BUILD_TYPE Release)
-  set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
-  add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
-endif ()
-
-if (USE_EXCEPTION)
-  message(STATUS "use exception")
-  add_definitions(-DENABLE_EXCEPTION)
-  add_definitions(-fexceptions)
 else()
-  add_definitions(-fno-exceptions)
+  list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
+  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
+  list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
 endif ()
-
-if (LOG_PROFILE)
-  add_definitions(-DPADDLE_MOBILE_PROFILE)
-endif()
-
-if(USE_OPENMP)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
-  add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
-endif()
-
-
-
-
-if (NOT ANDROID_NDK_TOOLCHAIN_INCLUDED)
-  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.cpp)
-  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.h)
-  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
+if (IS_IOS)
+else()
+  list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h)
+  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm)
+  list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
 endif ()
-
-include_directories(src/)
-
 set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
+# NET default
+if (FPGA)
+  set(NET "FPGAnets" CACHE STRING "select net type")
+else()
+  set(NET "default" CACHE STRING "select net type")
+endif()
+
+set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets" "NLP")
 include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
-# if (IS_IOS)
-# add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
+
+# build library
 if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
     list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
     add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
+elseif(IS_IOS)
+  if(USE_OPENMP)
+    add_library(paddle-mobile-stage0 STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
+    add_custom_target(paddle-mobile ALL
+      COMMAND libtool -static -o ${CMAKE_BINARY_DIR}/libpaddle-mobile.a ${CMAKE_CURRENT_LIST_DIR}/tools/libomp.a $<TARGET_FILE:paddle-mobile-stage0>
+      WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+      DEPENDS paddle-mobile
+    )
+    add_dependencies(paddle-mobile paddle-mobile-stage0)
+  else()
+    add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
+  endif()
 else ()
     add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
 endif ()

+# unit test
 if(DEBUGING)
-  add_subdirectory(test)
+  if(IS_IOS)
+  else()
+    add_subdirectory(test)
+  endif()
 endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1a25d65e02afb09dabc96e1ec241346cff34f6f2..a33db73e109042276b686e8ab74261273df87390 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -183,6 +183,9 @@ upstream

 Next, wait for review; if changes are requested, update the corresponding branch in origin following the steps above.

+![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294877166787.jpg)
+After that, you can submit your code.
+
 ## Deleting the remote branch

 After the PR has been merged into the main repository, you can delete the remote branch from the PR page.
@@ -219,7 +222,7 @@ upstream
   - Reason: if only one file is modified but a dozen-odd commits are submitted, each making a small change, this greatly burdens reviewers. They have to inspect every commit one by one to learn what was changed, and commits may even overwrite one another's changes.
   - Suggestion: keep the number of commits per submission as small as possible; `git commit --amend` can fold changes into the previous commit. For multiple commits already pushed to the remote repository, see [squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed).
   - Mind the message of each commit: it should reflect the content of that commit, not be written carelessly.
-3. If a Pull Request fixes an Issue, add `fix #issue_number` in the **first** comment box of the Pull Request; the corresponding Issue will then be closed automatically when the PUll Request is merged. Keywords include: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved; choose an appropriate one. See [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages) for details.
+3. If a Pull Request fixes an Issue, add `fix #issue_number` in the **first** comment box of the Pull Request; the corresponding Issue will then be closed automatically when the Pull Request is merged. Keywords include: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved; choose an appropriate one. See [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages) for details.

 In addition, when replying to reviewer comments, please observe the following conventions:
diff --git a/Dockerfile b/Dockerfile
index 8f54b870d594ece33d9c93ea40908668d3ad2f0e..f4fa3abcd29f613fe5f7a90f22a9736a3006bf3f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,7 +15,6 @@ RUN apt-get install -y --no-install-recommends \
         unzip \
         git \
         make \
-        cmake \
         cmake-curses-gui \
         python \
         python-pip \
@@ -25,9 +24,15 @@ RUN apt-get install -y --no-install-recommends \
         g++-arm-linux-gnueabi \
         gcc-arm-linux-gnueabi
 RUN apt-get autoremove -y && apt-get clean
-RUN pip install --upgrade pip
-RUN pip install wheel && pip install pre-commit
 RUN ln -s clang-format-5.0 /usr/bin/clang-format
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit
 RUN cd /tmp && curl -O http://mirrors.neusoft.edu.cn/android/repository/android-ndk-r17b-linux-x86_64.zip
+RUN curl -O https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \
+    tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \
+    mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \
+    mv /usr/bin/cmake /usr/bin/cmake.bak && ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \
+    mv /usr/bin/ccmake /usr/bin/ccmake.bak && ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake
 RUN cd /opt && unzip /tmp/android-ndk-r17b-linux-x86_64.zip
 ENV NDK_ROOT /opt/android-ndk-r17b
diff --git a/README.md b/README.md
index b6ae2beed999d146c64ffc9ee495373d9b77a175..de7dd530c94b4a3055cbf07a4a19a55c21457ed0 100644
--- a/README.md
+++ b/README.md
@@ -1,36 +1,149 @@
-# Paddle-Mobile
-
+# Paddle-Mobile
 [![Build Status](https://travis-ci.org/PaddlePaddle/paddle-mobile.svg?branch=develop&longCache=true&style=flat-square)](https://travis-ci.org/PaddlePaddle/paddle-mobile)
-[![License](https://img.shields.io/badge/license-Apache%202-brightgreen.svg)](LICENSE)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/doc)
+[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
+
+
+
+Welcome to the Paddle-Mobile GitHub project.
+
+Paddle-Mobile is a project under the PaddlePaddle organization: a deep learning framework dedicated to embedded platforms. Its design stays highly consistent with the latest fluid version of PaddlePaddle while adding extensive optimizations for embedded use; performance, binary size, energy consumption, and hardware-platform coverage have been considered since the very beginning of its design.
+
+## Online results in the Simple Search app
+
+The gif below shows the online main-object detection feature of the Simple Search app:
+
+![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)
+
+## Demo directory
+
+[Click here](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)
+
+## Features
+
+- **ARM CPU**
+
+|mobilenet arm v7|1 thread|2 threads|4 threads|
+|------------|----|-----|-----|
+|Kirin 970 (ms)|108.180|63.935|37.545|
+|Kirin 960 (ms)|108.588|63.073|36.822|
+|Snapdragon 845 (ms)|85.952|48.890|28.641|
+|Snapdragon 835 (ms)|105.434|62.752|37.131|
+|||||
+|mobilenetssd arm v7|1 thread|2 threads|4 threads|
+|Kirin 970 (ms)|212.686|127.205|77.485|
+|Kirin 960 (ms)|212.641|125.338|75.250|
+|Snapdragon 845 (ms)|182.863|95.671|56.857|
+|Snapdragon 835 (ms)|213.849|127.717|77.006|
+|||||
+|googlenet(v1) arm v7|1 thread|2 threads|4 threads|
+|Kirin 970 (ms)|335.288|234.559|161.295|
+|Kirin 960 (ms)|354.443|232.642|157.815|
+|Snapdragon 845 (ms)|282.007|173.146|122.148|
+|Snapdragon 835 (ms)|341.250|233.354|158.554|
+|||||
+|squeezenet arm v7|1 thread|2 threads|4 threads|
+|Kirin 970 (ms)|83.726|57.944|36.923|
+|Kirin 960 (ms)|85.835|55.762|36.496|
+|Snapdragon 845 (ms)|71.301|41.618|28.785|
+|Snapdragon 835 (ms)|82.407|56.176|36.455|
+|||||
+|yolo arm v7|1 thread|2 threads|4 threads|
+|Kirin 970 (ms)|129.658|79.993|49.969|
+|Kirin 960 (ms)|130.208|78.791|48.390|
+|Snapdragon 845 (ms)|109.244|61.736|40.600|
+|Snapdragon 835 (ms)|130.402|80.863|50.359|
+
+ Test device information:
+ Kirin 970: Honor V10 (2.36GHz * 4 + 1.8GHz * 4)
+ Kirin 960: Huawei Mate 9 (2.36GHz * 4 + 1.8GHz * 4)
+ Snapdragon 835: Xiaomi Mi 6 (2.45GHz * 4 + 1.9GHz * 4)
+ Snapdragon 845: OPPO Find X (2.80GHz * 4 + 1.8GHz * 4)
+
+- **Mali GPU**
+
+ Mali GPU support is developed jointly by Baidu and ARM; recently both teams have been working to run Paddle ops seamlessly on ACL (ARM Compute Library). Several network models, including squeezenet, googlenet, and resnet, are already supported, and the effort will continue until all mobile Paddle ops run efficiently on Mali GPU.
+
+- **GPU Metal implementation for Apple devices**
+
+|mobilenetfssd|speed|
+|------------|-----|
+|A9 (ms)|33.78|
+|A10 (ms)|24.05|
+|A11 (ms)|17.15|
+|||
+|genet|speed|
+|A9 (ms)|3.49|
+|A10 (ms)|2.54|
+|A11 (ms)|1.43|
+
+
+- **FPGA**
+
+ The FPGA implementation is in progress, based on the Xilinx ZU5 target development board.
+
+- **Flexibility**
+
+ * The paddle-mobile CPU build does not depend on any third-party library and can be integrated quickly.
+ * Platform switching is done through template specialization, allowing flexible switching among CPU, GPU, and other coprocessors (see the sketch following this README diff).
+ * The specific ops needed by a particular common network can be compiled selectively, reducing compile time and package size.
+ * Builds run in Docker, providing a uniform build environment.
+ * Highly extensible: other coprocessors are easy to add, and the high-performance ARM operator implementations make integration convenient for coprocessor developers.
+ * Directly compatible with paddle-fluid models; no extra conversion step is needed.
+
+- **Size**
+
+ paddle-mobile has taken mobile package size seriously since the start of its design; the CPU implementation has no external dependencies. During compilation, ops that the chosen network does not need are not linked in at all, and build-option tuning also helps compress the binary. Beyond binary size, we keep the code itself from growing too large; the whole repository is very compact.
+
+
+## Documentation
+
+### Design documentation
+
+The paddle-mobile design documentation is at the link below if you want to learn more; the [issues](https://github.com/PaddlePaddle/paddle-mobile/issues) record many of the early designs and discussions.
+[Design documentation link](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/design_doc.md)
+
+### Development documentation
+
+The development documentation mainly covers building, running, and similar topics. As a developer, you can use it together with the contribution guide.
+[Development documentation link](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_doc.md)
+### Contribution documentation
+- [Contribution guide link](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
+- The guide above covers the main code-contribution workflow; if you run into other problems in practice, you can file an [issue](https://github.com/PaddlePaddle/paddle-mobile/issues). We will deal with it as soon as we see it.

-This project is used to develop the next version deep learning freamwork for mobile device.

-# Development
+## Obtaining models
+Currently Paddle-Mobile supports only models trained with Paddle fluid. If the model you have is of a different kind, it must be converted before it can run.
+### 1. Train directly with Paddle Fluid
+This is the most reliable and recommended way.
+### 2. Convert a Caffe model to a Paddle Fluid model
+[Link](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
+### 3. ONNX
+ONNX stands for "Open Neural Network Exchange"; the project's goal is interoperability among different neural network development frameworks.

-[Used model in development](https://mms-mis.cdn.bcebos.com/paddle-mobile/models.zip)
+Besides training a fluid model directly with PaddlePaddle, some individual Paddle fluid models can also be obtained through ONNX conversion.

-## cross-compilation to android
+Baidu is currently working on ONNX support as well. The related conversion project is here: [paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx).

-* NDK is required
-* ANDROID_NDK environment variable is required
+![](http://7xop3k.com1.z0.glb.clouddn.com/15311951836000.jpg)

-```bash
-sh build.sh android
-```
+### 4. Download of some test models and test images
+[Download link](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)

-## build for x86
-paddle-mobile is to run on arm platform. x86 only used to test not arm assembly code. So do not recommend compiling x86.
+## Getting help

-Now only support osx.
+You are welcome to raise or help solve our issues; if you have questions, file an issue: [Github Issues](https://github.com/PaddlePaddle/paddle-mobile/issues).

-```
-sh build.sh mac
-```
+## Copyright and License
+Paddle-Mobile is released under the relatively permissive Apache-2.0 open source license: [Apache-2.0 license](LICENSE).
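The "platform switching through template specialization" feature above can be made concrete with a small, self-contained sketch. The `CPU`/`GPU_MALI` tags and the `ConvKernel` class below are illustrative stand-ins for the idea, not the actual paddle-mobile types:

```c++
#include <iostream>

// Illustrative platform tags; paddle-mobile's real device types differ.
struct CPU {};
struct GPU_MALI {};

// Primary template: each platform supplies its own specialization.
template <typename DeviceType>
struct ConvKernel;

template <>
struct ConvKernel<CPU> {
  void Compute() { std::cout << "conv via ARM NEON\n"; }
};

template <>
struct ConvKernel<GPU_MALI> {
  void Compute() { std::cout << "conv via ACL on Mali GPU\n"; }
};

int main() {
  // The platform is chosen at compile time through the template argument,
  // so unused platforms cost nothing in the final binary.
  ConvKernel<CPU>().Compute();
  ConvKernel<GPU_MALI>().Compute();
}
```

Because the selection happens at compile time, ops for platforms a build does not target can simply be excluded from the source list, which is exactly what the CMakeLists changes in this patch do.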
-## Old Version of Mobile-Deep-Learning
-The old version of MDL was I moved to here [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
+## Legacy Mobile-Deep-Learning
+The original MDL (Mobile-Deep-Learning) project has been moved to [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
diff --git a/demo/ReadMe.md b/demo/ReadMe.md
new file mode 100644
index 0000000000000000000000000000000000000000..aa71f75cb7526234bb0bb32e2e5e1f93c1789711
--- /dev/null
+++ b/demo/ReadMe.md
@@ -0,0 +1,11 @@
+## How to run the demos
+- Android demo download URL:
+  http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip
+- iOS demo download URL:
+  http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip
+
+Run the demo download script in the demo directory:
+```
+sh getDemo.sh
+```
+The demo projects are then downloaded and unpacked into the current directory.
\ No newline at end of file
diff --git a/demo/getDemo.sh b/demo/getDemo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b69461e01c710c30ce9a44714ed2d0cdae0c9819
--- /dev/null
+++ b/demo/getDemo.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip
+wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip
+unzip paddle-mobile%2FPaddleMobile_Android.zip
+unzip paddle-mobile%2FPaddleMobileDemo_iOS.zip
+rm -rf paddle-mobile%2FPaddleMobile_Android.zip
+rm -rf paddle-mobile%2FPaddleMobileDemo_iOS.zip
+rm -rf __MACOSX
\ No newline at end of file
diff --git a/doc/build.md b/doc/build.md
index 6a8521b593ccdeab464687e7eae79192d131d51b..1c1c906458a0dd5f525c9d5153d48356b907b23b 100644
--- a/doc/build.md
+++ b/doc/build.md
@@ -19,9 +19,14 @@ paddle-mobile dev 33b146787711 45 hours ago 372MB
 ```
 $ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
 root@5affd29d4fc5:/ # cd /paddle-mobile
+###
+### paddle-mobile supports the various platforms on the ARM architecture, including android, linux,
+### and others; different toolchain files can be used to generate the makefile you need
+###
 # generate the Makefile for android builds
 root@5affd29d4fc5:/ # rm CMakeCache.txt
 root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
+
 # generate the Makefile for linux builds
 root@5affd29d4fc5:/ # rm CMakeCache.txt
 root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
@@ -56,4 +61,4 @@ root@5affd29d4fc5:/ # make
 Build artifacts can be inspected on the host machine under the paddle-mobile directory, in build and test/build; they can be transferred to the device with adb or scp and executed there
 ## Without docker
-Without docker, you can build by generating a makefile directly with cmake. Building an android app with the ndk requires NDK_ROOT to be set correctly. Building a linux app requires installing arm-linux-gnueabi-gcc or a similar cross-compilation toolchain; you may need to set the CC and CXX environment variables, modify arm-linux-gnueabi.cmake under tools/toolchains/, or add a toolchain file of your own.
\ No newline at end of file
+Without docker, you can build by generating a makefile directly with cmake. Building an android app with the ndk requires NDK_ROOT to be set correctly. Building a linux app requires installing arm-linux-gnueabi-gcc or a similar cross-compilation toolchain; you may need to set the CC and CXX environment variables, modify arm-linux-gnueabi.cmake under tools/toolchains/, or add a toolchain file of your own.
diff --git a/doc/design_doc.md b/doc/design_doc.md
index 3ab649236dcb93fd9181d424870a87fec418448d..bf5f78e8d805465418cad8989945f2afa7ab5587 100644
--- a/doc/design_doc.md
+++ b/doc/design_doc.md
@@ -3,8 +3,7 @@

 #### Below is the execution flow chart of the paddle-mobile code:

-![Execution flow chart](./images/flow_chart.png "Execution flow chart")
-
+![Execution flow chart](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305189473720.png)
 #### The main parts are the Loader, Program, Executor, op, kernel, and scope/variable/Tensor modules

@@ -15,11 +14,13 @@

 First, a look at the model. Models come in two layouts. In one, the parameter files are separate, as in the figure below; the red box marks the protobuf file describing the model structure, and the remaining files hold parameters

-![Model description](./images/model_desc.png "Model description")
+![Model description](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305190629577.png)
+
 In the other, the parameters are combined into a single file, as in the figure below; the red box marks the protobuf file describing the model structure, and the other file holds the combined parameters

-![Model description combined](./images/model_desc_combined.png "Model description combined")
+![Model description combined](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191057130.png)
+
 The loader module loads the model structure information (the protobuf file in the red box) into memory and optimizes the model structure (for example, fusing several fine-grained ops into a coarse-grained one, such as fusing conv, add, batchnorm, and relu into conv\_add\_batchnorm\_relu), which makes algorithmic optimization convenient.
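To make the module split concrete before the document moves on to kernels, here is a minimal sketch of the load-then-predict flow that the flow chart describes. The `Loader`/`Executor`/`Tensor` names mirror the modules named in this document, but the exact signatures under src/ may differ; treat this as an illustration, not the definitive API:

```c++
// Minimal sketch of the flow in the diagram above (illustrative signatures).
#include "io/io.h"  // assumed location of the Loader/Executor declarations

int main() {
  // Loader: read the protobuf model description into memory and, with
  // optimize=true, fuse fine-grained ops into coarse-grained ones
  // (e.g. conv + add + batchnorm + relu -> conv_add_batchnorm_relu).
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load("../models/googlenet", /*optimize=*/true);

  // Executor: instantiate the ops and kernels of the (optimized) program.
  paddle_mobile::Executor<paddle_mobile::CPU> executor(program, /*batch_size=*/1);

  // Feed an input Tensor and run every op in order.
  paddle_mobile::framework::Tensor input;
  // ... fill `input` with preprocessed image data ...
  auto output = executor.Predict(input);
  return 0;
}
```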
@@ -160,7 +161,7 @@ sh build.sh android yolo

 ### V. kernel
 A kernel is the low-level computational implementation of an op. It has two main functions, Init and Compute, used respectively for initialization/preprocessing and for the computation itself. Notably, kernels are specialized by template to different platforms, as shown in the figure:

-![Device specialization](./images/devices.png "Device specialization")
+![Device specialization](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191401976.png)

 Kernel implementations for different platforms are specializations of the same kernel class for different template parameters. There are currently three platforms: arm, mali, and fpga. The central-arm-func\ directory in the figure holds the ARM implementations of the op kernels and underpins the kernels in the arm\ directory; and since the ARM processor serves as the central processor, central-arm-func\ can also back other coprocessors. For example, if some op kernel has no FPGA implementation yet, the ARM implementation here can be called directly.
diff --git a/doc/development_doc.md b/doc/development_doc.md
index 5673e043bda80e414d7e841928283764bd2b310e..3f45f956f00e78c23b60b4c108b8c90cf4065e04 100644
--- a/doc/development_doc.md
+++ b/doc/development_doc.md
@@ -1,10 +1,14 @@
+### iOS & Android Development Documentation
+
 # iOS Development Documentation

 ## Building

-### I. Build with build.sh
+```sh
+
+# from the paddle-mobile directory:
+cd tools

-```sh
 sh build.sh ios

 # to build only the ops of a particular model, run the following instead
 sh build.sh ios googlenet

 # after the build finishes, enter the output directory `../build/release/ios/build`
 cd ../build/release/ios/build
 ```

+#### FAQ:

-### II. Build with Xcode
+1. No iOS SDK's found in default search path ...

-We provide the Xcode build environment that iOS developers are more familiar with:
-open PaddleMobile.xcworkspace under ios/ to build PaddleMobile or run the Demo
+   This happens because tools/ios-cmake/ios.toolchain.cmake cannot find the path of the iOS SDK you used most recently, so it has to be specified manually.
+   Taking my current environment as an example, add my local iOS SDK path before line 143 of tools/ios-cmake/ios.toolchain.cmake: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")

-### III. Integration
+## Integration

-#### If you use the C++ interface
-Drag
-
-```
-libpaddle-mobile.a
-io.h
-program.h
-types.h
-lod_tensor.h
-tensor.h
-```
-into your project; io.h is the interface file, and the interface comments can be viewed on [github](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/src/io/io.h)
+```
+Drag the output of the previous step:
+libpaddle-mobile.a

-#### If you use the OC interface
-Drag the following, generated by the Xcode build,
-```
-libPaddleMobile.a
+PaddleMobile.h from under /src/ios_io/
 ```
-into your project; the interface is as follows:
+into your project
+
+#### OC interface
+
+The interface is as follows:

 ```
 /*
-    Create the singleton object
+    Create an object
 */
-+ (instancetype)sharedInstance;
+- (instancetype)init;

 /*
     Load the model and allocate memory
 */
@@ -57,12 +54,12 @@ PaddleMobile.h
 /*
     Run prediction; means and scale are the preprocessing parameters used when the model was trained. If no such preprocessing was done at training time, call predict directly
 */
-- (NSArray *)predict:(CGImageRef)image means:(NSArray *)means scale:(float)scale;
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale;

 /*
     Run prediction
 */
-- (NSArray *)predict:(CGImageRef)image;
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim;

 /*
     Release memory
 */
@@ -72,14 +69,184 @@ PaddleMobile.h
 ```

+# Android Development Documentation
+You can cross-compile a paddle-mobile library for the Android platform in either of two ways:
+- Build in a Docker container
+- Cross-compile on Linux
+## Building in a Docker container
+### 1. Install docker
+To install docker, see the official documentation: [https://docs.docker.com/install/](https://docs.docker.com/install/)
+### 2. Set up the build environment with docker
+First enter the paddle-mobile directory and run `docker build`.
+Taking Linux/Mac as an example (on Windows, running this in the 'Docker Quickstart Terminal' is recommended):
+```
+$ docker build -t paddle-mobile:dev - < Dockerfile
+```
+`docker images` shows the newly built image:
+```
+$ docker images
+REPOSITORY    TAG    IMAGE ID        CREATED       SIZE
+paddle-mobile dev    33b146787711    45 hours ago  372MB
+```
+### 3. Build with docker
+Enter the paddle-mobile directory and run docker run:
+```
+$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
+root@5affd29d4fc5:/ # cd /paddle-mobile
+# generate the Makefile for android builds
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
+# generate the Makefile for linux builds
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
+```
+### 4. Set build options
+Build options can be set through ccmake:
+```
+root@5affd29d4fc5:/ # ccmake .
+                                                     Page 1 of 1
+ CMAKE_ASM_FLAGS
+ CMAKE_ASM_FLAGS_DEBUG
+ CMAKE_ASM_FLAGS_RELEASE
+ CMAKE_BUILD_TYPE
+ CMAKE_INSTALL_PREFIX             /usr/local
+ CMAKE_TOOLCHAIN_FILE             /paddle-mobile/tools/toolchains/arm-android-neon.cmake
+ CPU                              ON
+ DEBUGING                         ON
+ FPGA                             OFF
+ LOG_PROFILE                      ON
+ MALI_GPU                         OFF
+ NET                              googlenet
+ USE_EXCEPTION                    ON
+ USE_OPENMP                       OFF
+```
+After changing options, press `c`, then `g` to regenerate the Makefile.
+### 5. Build
+Build with the make command:
+
+```
+root@5affd29d4fc5:/ # make
+```
+### 6. Inspect the build artifacts
+Build artifacts can be inspected on the host machine under the paddle-mobile directory, in build and test/build; they can be transferred to the device with adb or scp and executed there
+
+## Cross-compiling on Linux
+### Preparing the cross-compilation environment
+##### Download the Android NDK
+
+To cross-compile paddle-mobile from source, prepare the cross-compilation environment in advance. The C/C++ cross-compilation toolchain for Android is the [Android NDK](https://developer.android.com/ndk/); you can download it yourself, or fetch it with the following commands:
+- Mac
+```
+wget https://dl.google.com/android/repository/android-ndk-r17b-darwin-x86_64.zip
+unzip android-ndk-r17b-darwin-x86_64.zip
+
+```
+- Linux
+```
+wget https://dl.google.com/android/repository/android-ndk-r17b-linux-x86_64.zip
+unzip android-ndk-r17b-linux-x86_64.zip
+```
+
+##### Set the environment variable
+The standalone toolchain bundled with the project locates the NDK through the NDK_ROOT environment variable, so it must be configured:
+
+```
+export NDK_ROOT="path to ndk"
+```
+### Run the build
+From the paddle-mobile root directory, run:
+```
+cd tools
+sh build.sh android
+
+```
+When it finishes, the generated .so is in the build directory, and the unit-test executables are in the test/build directory.
+##### Tips:
+For a smaller library, you can build one that supports only a specified model structure.
+For example, running:
+
+```
+sh build.sh android googlenet
+```
+yields a smaller library that supports googlenet.
+
+## Testing
+After the build, an automated test script is provided to push the models and library files needed by the unit tests to an Android device. Run:
+
+```
+cd tools/android-debug-script
+sh run_on_android.sh (npm)   # the optional npm argument selects whether model files are transferred to the phone
+```
+The following prompt appears:
+
+```
+**** choose OP or NET to test ****
+which to test :
+```
+Enter a name to run the corresponding test.
+
+## Deployment
+Android apps can call the underlying C/C++ through JNI; the JNI interfaces exposed by paddle-mobile are listed below (a schematic sketch of the predict bridge follows this document):
+
+##### 1. load: load the model parameters
+- For loading a model whose parameter files are separate
+```
+/**
+ * Load separated parameters
+ * @param modelDir
+ * @return
+ */
+public static native boolean load(String modelDir);
+```
+- For loading a model whose parameters are combined into one file
+```
+/**
+ * Load combined parameters
+ * @param modelPath
+ * @param paramPath
+ * @return
+ */
+public static native boolean loadCombined(String modelPath, String paramPath);
+
+```
+##### 2. predict: run prediction
+- predict interface taking a preprocessed RGB array
+```
+/**
+ * @param buf input data
+ * @return output data
+ */
+JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
+    JNIEnv *env, jclass thiz, jfloatArray buf);
+```
+- predict interface taking raw YUV data
+```
+/**
+ *
+ * @param buf        byte array in yuv420 format
+ * @param imgWidth   width of the YUV data
+ * @param imgHeight  height of the YUV data
+ * @param ddims      shape of the input data
+ * @param meanValues per-channel means used when the model was trained
+ * @return
+ */
+public static native float[] predictYuv(byte[] buf, int imgWidth, int imgHeight, int[] ddims, float[] meanValues);
+
+```
+##### 3. clear: destroy the instance and release memory
+
+```
+JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env,
+                                                       jclass thiz);
+```
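As a rough illustration of how a JNI entry point like predictImage bridges Java and the C++ predictor, here is a schematic sketch. `RunPredictor` is an assumed stand-in for the shared predictor instance kept by src/jni/paddle_mobile_jni.cpp; the real code may differ:

```c++
// Schematic sketch of a JNI predict entry point (illustrative only).
#include <jni.h>
#include <vector>

// Assumed bridge into the C++ predictor; the real code keeps a shared
// predictor instance and calls its predict method.
std::vector<float> RunPredictor(const float *data, int len);

extern "C" JNIEXPORT jfloatArray JNICALL
Java_com_baidu_paddle_PML_predictImage(JNIEnv *env, jclass thiz,
                                       jfloatArray buf) {
  // Copy the preprocessed RGB input out of the Java float array.
  jsize len = env->GetArrayLength(buf);
  jfloat *input = env->GetFloatArrayElements(buf, nullptr);

  std::vector<float> output = RunPredictor(input, static_cast<int>(len));

  // JNI_ABORT: release the buffer without copying changes back.
  env->ReleaseFloatArrayElements(buf, input, JNI_ABORT);

  // Wrap the result in a new Java float array for the caller.
  jfloatArray result = env->NewFloatArray(static_cast<jsize>(output.size()));
  env->SetFloatArrayRegion(result, 0, static_cast<jsize>(output.size()),
                           output.data());
  return result;
}
```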
diff --git a/doc/quantification.md b/doc/quantification.md
new file mode 100644
index 0000000000000000000000000000000000000000..04a93116a08c094ef71861cec1bb3262304c4cb7
--- /dev/null
+++ b/doc/quantification.md
@@ -0,0 +1,39 @@
+# Quantification: model quantization and dequantization
+
+## Background
+Models trained from some networks, such as AlexNet, are fairly large and not well suited to mobile devices.
+
+
+## Ways to deal with oversized models
+1. Choose a model structure suited to mobile, such as mobilenet, googlenet, yolo, or squeezenet;
+2. Use the quantization tool we provide, which can shrink a float32 model to about 1/4 of its original size with almost no loss of accuracy.
+
+- - - - -
+## The quantization tool
+
+### Model conversion tool directory:
+
+- [Quantization tool directory](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/tools/quantification)
+
+- [Model conversion tool](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/convert.cpp)
+
+#### Usage
+- [Tool usage](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/README.md)
+
+## How to load a quantized model
+A quantification parameter, false by default, has been added to the load method. To load a quantized model, pass the parameter as needed.
+
+[Source code](https://github.com/PaddlePaddle/paddle-mobile/blob/55302b33ea3bd68c9797d8f65e527544792b8095/src/io/paddle_mobile.h)
+
+```c++
+bool Load(const std::string &dirname, bool optimize = false,
+          bool quantification = false, int batch_size = 1);
+```
+
+- - - - -
+
+
+
+
+
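For concreteness, a minimal usage sketch of this interface follows, assuming the `PaddleMobile` class from the linked src/io/paddle_mobile.h (the Load signature is quoted above; the class and header names are taken on trust from that link):

```c++
#include "io/paddle_mobile.h"  // assumed header path, from the link above

int main() {
  // CPU predictor; the template parameter selects the platform.
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> pm;

  // optimize = true enables graph fusion; quantification = true tells Load
  // that the parameter files are quantized and must be dequantized while
  // loading, matching the signature quoted in this document.
  bool ok = pm.Load("../models/mobilenet", /*optimize=*/true,
                    /*quantification=*/true);
  return ok ? 0 : 1;
}
```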
diff --git a/ios/PaddleMobile.xcworkspace/contents.xcworkspacedata b/ios/PaddleMobile.xcworkspace/contents.xcworkspacedata
deleted file mode 100644
index 7c3243eefa5fdeaa40b8697bd8e0b5ba2daeeb55..0000000000000000000000000000000000000000
--- a/ios/PaddleMobile.xcworkspace/contents.xcworkspacedata
+++ /dev/null
@@ -1,10 +0,0 @@
- - - - - - -
diff --git a/ios/PaddleMobile.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate b/ios/PaddleMobile.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate
deleted file mode 100644
index a74810d22b023830a6e44d19984ff92302eb84a3..0000000000000000000000000000000000000000
Binary files a/ios/PaddleMobile.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate and /dev/null differ
diff --git a/ios/PaddleMobile.xcworkspace/xcuserdata/liuruilong.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist b/ios/PaddleMobile.xcworkspace/xcuserdata/liuruilong.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist
deleted file mode 100644
index ed9a9b4d42c454d27017c91c04ce8b8a518ac029..0000000000000000000000000000000000000000
--- a/ios/PaddleMobile.xcworkspace/xcuserdata/liuruilong.xcuserdatad/xcdebugger/Breakpoints_v2.xcbkptlist
+++ /dev/null
@@ -1,5 +0,0 @@
- - -
diff --git a/ios/PaddleMobile/PaddleMobile.xcodeproj/project.pbxproj b/ios/PaddleMobile/PaddleMobile.xcodeproj/project.pbxproj
deleted file mode 100644
index 7907ac8955996b25f9173a6114bccad3b1e1bed9..0000000000000000000000000000000000000000
--- a/ios/PaddleMobile/PaddleMobile.xcodeproj/project.pbxproj
+++ /dev/null
@@ -1,965 +0,0 @@
-// !$*UTF8*$!
-{ - archiveVersion = 1; - classes = { - }; - objectVersion = 50; - objects = { - -/* Begin PBXBuildFile section */ - FC086BB420E7839B00D85EF7 /* PaddleMobile.m in Sources */ = {isa = PBXBuildFile; fileRef = FC086BB320E7839B00D85EF7 /* PaddleMobile.m */; }; - FC086BB520E7839B00D85EF7 /* PaddleMobile.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = FC086BB220E7839B00D85EF7 /* PaddleMobile.h */; }; - FC086DC620E7841E00D85EF7 /* t_malloc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086CFE20E7841E00D85EF7 /* t_malloc.cpp */; }; - FC086DC720E7841E00D85EF7 /* lrn_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0320E7841E00D85EF7 /* lrn_op.cpp */; }; - FC086DC820E7841E00D85EF7 /* sigmoid_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0520E7841E00D85EF7 /* sigmoid_op.cpp */; }; - FC086DC920E7841E00D85EF7 /* box_coder_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0620E7841E00D85EF7 /* box_coder_op.cpp */; }; - FC086DCA20E7841E00D85EF7 /* feed_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0720E7841E00D85EF7 /* feed_op.cpp */; }; - FC086DCB20E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0A20E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.cpp */; }; - FC086DCC20E7841E00D85EF7 /* reshape_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0B20E7841E00D85EF7 /* reshape_op.cpp */; }; - FC086DCD20E7841E00D85EF7 /* concat_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0C20E7841E00D85EF7 /* concat_op.cpp */; }; - FC086DCE20E7841E00D85EF7 /* transpose_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0D20E7841E00D85EF7 /* transpose_op.cpp */; }; - FC086DCF20E7841E00D85EF7 /* prior_box_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0E20E7841E00D85EF7 /* prior_box_op.cpp */; }; - FC086DD020E7841E00D85EF7 /* fusion_conv_add_relu_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D0F20E7841E00D85EF7 /* fusion_conv_add_relu_op.cpp */; }; - FC086DD120E7841E00D85EF7 /* softmax_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D1520E7841E00D85EF7 /* softmax_op.cpp */; }; - FC086DD220E7841E00D85EF7 /* depthwise_conv_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D1720E7841E00D85EF7 /* depthwise_conv_op.cpp */; }; - FC086DD320E7841E00D85EF7 /* elementwise_add_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D1A20E7841E00D85EF7 /* elementwise_add_op.cpp */; }; - FC086DD420E7841E00D85EF7 /* gemm.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D1F20E7841E00D85EF7 /* gemm.cpp */; }; - FC086DD520E7841E00D85EF7 /* pool_2x2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2220E7841E00D85EF7 /* pool_2x2.cpp */; }; - FC086DD620E7841E00D85EF7 /* im2col.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2320E7841E00D85EF7 /* im2col.cpp */; }; - FC086DD720E7841E00D85EF7 /* vol2col.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2620E7841E00D85EF7 /* vol2col.cpp */; }; - FC086DD820E7841E00D85EF7 /* math_function.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2720E7841E00D85EF7 /* math_function.cpp */; }; - FC086DD920E7841E00D85EF7 /* pool_3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2820E7841E00D85EF7 /* pool_3x3.cpp */; }; - FC086DDA20E7841E00D85EF7 /* pooling.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2B20E7841E00D85EF7 /* pooling.cpp */; }; - FC086DDB20E7841E00D85EF7 /* depthwise_conv_3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2D20E7841E00D85EF7 
/* depthwise_conv_3x3.cpp */; }; - FC086DDC20E7841E00D85EF7 /* softmax.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D2F20E7841E00D85EF7 /* softmax.cpp */; }; - FC086DDD20E7841E00D85EF7 /* fetch_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3420E7841E00D85EF7 /* fetch_op.cpp */; }; - FC086DDE20E7841E00D85EF7 /* fusion_conv_add.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3520E7841E00D85EF7 /* fusion_conv_add.cpp */; }; - FC086DDF20E7841E00D85EF7 /* op_param.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3620E7841E00D85EF7 /* op_param.cpp */; }; - FC086DE020E7841E00D85EF7 /* mul_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3A20E7841E00D85EF7 /* mul_op.cpp */; }; - FC086DE120E7841E00D85EF7 /* relu_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3B20E7841E00D85EF7 /* relu_op.cpp */; }; - FC086DE220E7841E00D85EF7 /* conv_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3C20E7841E00D85EF7 /* conv_op.cpp */; }; - FC086DE320E7841E00D85EF7 /* fusion_fc_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D3D20E7841E00D85EF7 /* fusion_fc_op.cpp */; }; - FC086DE420E7841E00D85EF7 /* batchnorm_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D4020E7841E00D85EF7 /* batchnorm_op.cpp */; }; - FC086DE520E7841E00D85EF7 /* pool_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D4220E7841E00D85EF7 /* pool_op.cpp */; }; - FC086DE620E7841E00D85EF7 /* multiclass_nms_op.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D4420E7841E00D85EF7 /* multiclass_nms_op.cpp */; }; - FC086DE720E7841E00D85EF7 /* acl_tensor.cc in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5220E7841E00D85EF7 /* acl_tensor.cc */; }; - FC086DE820E7841E00D85EF7 /* acl_operator.cc in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5320E7841E00D85EF7 /* acl_operator.cc */; }; - FC086DE920E7841E00D85EF7 /* conv_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5420E7841E00D85EF7 /* conv_kernel.cpp */; }; - FC086DEA20E7841E00D85EF7 /* conv_add_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5620E7841E00D85EF7 /* conv_add_kernel.cpp */; }; - FC086DEB20E7841E00D85EF7 /* relu_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5820E7841E00D85EF7 /* relu_kernel.cpp */; }; - FC086DEC20E7841E00D85EF7 /* mul_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5920E7841E00D85EF7 /* mul_kernel.cpp */; }; - FC086DED20E7841E00D85EF7 /* elementwise_add_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5A20E7841E00D85EF7 /* elementwise_add_kernel.cpp */; }; - FC086DEE20E7841E00D85EF7 /* softmax_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5C20E7841E00D85EF7 /* softmax_kernel.cpp */; }; - FC086DEF20E7841E00D85EF7 /* concat_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5D20E7841E00D85EF7 /* concat_kernel.cpp */; }; - FC086DF020E7841E00D85EF7 /* pool_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5E20E7841E00D85EF7 /* pool_kernel.cpp */; }; - FC086DF120E7841E00D85EF7 /* reshape_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D5F20E7841E00D85EF7 /* reshape_kernel.cpp */; }; - FC086DF220E7841E00D85EF7 /* lrn_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D6020E7841E00D85EF7 /* lrn_kernel.cpp */; }; - FC086DF320E7841E00D85EF7 /* fushion_fc_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D6120E7841E00D85EF7 /* fushion_fc_kernel.cpp */; }; - FC086DF420E7841E00D85EF7 /* 
batchnorm_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D6220E7841E00D85EF7 /* batchnorm_kernel.cpp */; }; - FC086DF520E7841E00D85EF7 /* conv_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D6F20E7841E00D85EF7 /* conv_kernel.cpp */; }; - FC086DF620E7841E00D85EF7 /* prior_box_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7420E7841E00D85EF7 /* prior_box_kernel.cpp */; }; - FC086DF720E7841E00D85EF7 /* conv_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7520E7841E00D85EF7 /* conv_kernel.cpp */; }; - FC086DF820E7841E00D85EF7 /* conv_add_bn_relu_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7620E7841E00D85EF7 /* conv_add_bn_relu_kernel.cpp */; }; - FC086DF920E7841E00D85EF7 /* box_coder_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7720E7841E00D85EF7 /* box_coder_kernel.cpp */; }; - FC086DFA20E7841E00D85EF7 /* conv_add_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7820E7841E00D85EF7 /* conv_add_kernel.cpp */; }; - FC086DFB20E7841E00D85EF7 /* sigmoid_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7920E7841E00D85EF7 /* sigmoid_kernel.cpp */; }; - FC086DFC20E7841E00D85EF7 /* relu_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7A20E7841E00D85EF7 /* relu_kernel.cpp */; }; - FC086DFD20E7841E00D85EF7 /* mul_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7B20E7841E00D85EF7 /* mul_kernel.cpp */; }; - FC086DFE20E7841E00D85EF7 /* elementwise_add_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7C20E7841E00D85EF7 /* elementwise_add_kernel.cpp */; }; - FC086DFF20E7841E00D85EF7 /* conv_add_relu_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7D20E7841E00D85EF7 /* conv_add_relu_kernel.cpp */; }; - FC086E0020E7841E00D85EF7 /* transpose_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7E20E7841E00D85EF7 /* transpose_kernel.cpp */; }; - FC086E0120E7841E00D85EF7 /* depthwise_conv_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D7F20E7841E00D85EF7 /* depthwise_conv_kernel.cpp */; }; - FC086E0220E7841E00D85EF7 /* softmax_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8020E7841E00D85EF7 /* softmax_kernel.cpp */; }; - FC086E0320E7841E00D85EF7 /* concat_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8120E7841E00D85EF7 /* concat_kernel.cpp */; }; - FC086E0420E7841E00D85EF7 /* fusion_fc_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8220E7841E00D85EF7 /* fusion_fc_kernel.cpp */; }; - FC086E0520E7841E00D85EF7 /* pool_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8320E7841E00D85EF7 /* pool_kernel.cpp */; }; - FC086E0620E7841E00D85EF7 /* reshape_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8420E7841E00D85EF7 /* reshape_kernel.cpp */; }; - FC086E0720E7841E00D85EF7 /* lrn_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8520E7841E00D85EF7 /* lrn_kernel.cpp */; }; - FC086E0820E7841E00D85EF7 /* batchnorm_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8620E7841E00D85EF7 /* batchnorm_kernel.cpp */; }; - FC086E0920E7841E00D85EF7 /* multiclass_nms_kernel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8720E7841E00D85EF7 /* multiclass_nms_kernel.cpp */; }; - FC086E0A20E7841E00D85EF7 /* framework.pb-c.c in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8B20E7841E00D85EF7 /* framework.pb-c.c */; }; - FC086E0B20E7841E00D85EF7 /* tensor_util.cpp in Sources */ = {isa = 
PBXBuildFile; fileRef = FC086D8C20E7841E00D85EF7 /* tensor_util.cpp */; }; - FC086E0C20E7841E00D85EF7 /* operator.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D8F20E7841E00D85EF7 /* operator.cpp */; }; - FC086E0D20E7841E00D85EF7 /* ddim.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D9020E7841E00D85EF7 /* ddim.cpp */; }; - FC086E0E20E7841E00D85EF7 /* scope.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D9320E7841E00D85EF7 /* scope.cpp */; }; - FC086E0F20E7841E00D85EF7 /* attribute.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D9920E7841E00D85EF7 /* attribute.cpp */; }; - FC086E1020E7841E00D85EF7 /* op_desc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D9C20E7841E00D85EF7 /* op_desc.cpp */; }; - FC086E1120E7841E00D85EF7 /* program_desc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086D9D20E7841E00D85EF7 /* program_desc.cpp */; }; - FC086E1220E7841E00D85EF7 /* node.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DA320E7841E00D85EF7 /* node.cpp */; }; - FC086E1320E7841E00D85EF7 /* program_optimize.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DA620E7841E00D85EF7 /* program_optimize.cpp */; }; - FC086E1420E7841E00D85EF7 /* block_desc.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DA720E7841E00D85EF7 /* block_desc.cpp */; }; - FC086E1520E7841E00D85EF7 /* lod_tensor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DAB20E7841E00D85EF7 /* lod_tensor.cpp */; }; - FC086E1620E7841E00D85EF7 /* io.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DB320E7841E00D85EF7 /* io.cpp */; }; - FC086E1720E7841E00D85EF7 /* types.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DB620E7841E00D85EF7 /* types.cpp */; }; - FC086E1820E7841E00D85EF7 /* openmp-fix.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DBA20E7841E00D85EF7 /* openmp-fix.cpp */; }; - FC086E1920E7841E00D85EF7 /* protobuf-c.c in Sources */ = {isa = PBXBuildFile; fileRef = FC086DC120E7841E00D85EF7 /* protobuf-c.c */; }; - FC086E1A20E7841E00D85EF7 /* paddle_mobile_jni.cpp in Sources */ = {isa = PBXBuildFile; fileRef = FC086DC420E7841E00D85EF7 /* paddle_mobile_jni.cpp */; }; -/* End PBXBuildFile section */ - -/* Begin PBXCopyFilesBuildPhase section */ - FC086BAD20E7839B00D85EF7 /* CopyFiles */ = { - isa = PBXCopyFilesBuildPhase; - buildActionMask = 2147483647; - dstPath = "include/$(PRODUCT_NAME)"; - dstSubfolderSpec = 16; - files = ( - FC086BB520E7839B00D85EF7 /* PaddleMobile.h in CopyFiles */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXCopyFilesBuildPhase section */ - -/* Begin PBXFileReference section */ - FC086BAF20E7839B00D85EF7 /* libPaddleMobile.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libPaddleMobile.a; sourceTree = BUILT_PRODUCTS_DIR; }; - FC086BB220E7839B00D85EF7 /* PaddleMobile.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = PaddleMobile.h; sourceTree = ""; }; - FC086BB320E7839B00D85EF7 /* PaddleMobile.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = PaddleMobile.m; sourceTree = ""; }; - FC086CFE20E7841E00D85EF7 /* t_malloc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = t_malloc.cpp; sourceTree = ""; }; - FC086CFF20E7841E00D85EF7 /* t_malloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = t_malloc.h; sourceTree = ""; }; - FC086D0120E7841E00D85EF7 /* feed_op.h */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.c.h; path = feed_op.h; sourceTree = ""; }; - FC086D0220E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fusion_conv_add_bn_relu_op.h; sourceTree = ""; }; - FC086D0320E7841E00D85EF7 /* lrn_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lrn_op.cpp; sourceTree = ""; }; - FC086D0420E7841E00D85EF7 /* op_param.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_param.h; sourceTree = ""; }; - FC086D0520E7841E00D85EF7 /* sigmoid_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sigmoid_op.cpp; sourceTree = ""; }; - FC086D0620E7841E00D85EF7 /* box_coder_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = box_coder_op.cpp; sourceTree = ""; }; - FC086D0720E7841E00D85EF7 /* feed_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = feed_op.cpp; sourceTree = ""; }; - FC086D0820E7841E00D85EF7 /* mul_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mul_op.h; sourceTree = ""; }; - FC086D0920E7841E00D85EF7 /* prior_box_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = prior_box_op.h; sourceTree = ""; }; - FC086D0A20E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fusion_conv_add_bn_relu_op.cpp; sourceTree = ""; }; - FC086D0B20E7841E00D85EF7 /* reshape_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = reshape_op.cpp; sourceTree = ""; }; - FC086D0C20E7841E00D85EF7 /* concat_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = concat_op.cpp; sourceTree = ""; }; - FC086D0D20E7841E00D85EF7 /* transpose_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = transpose_op.cpp; sourceTree = ""; }; - FC086D0E20E7841E00D85EF7 /* prior_box_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = prior_box_op.cpp; sourceTree = ""; }; - FC086D0F20E7841E00D85EF7 /* fusion_conv_add_relu_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fusion_conv_add_relu_op.cpp; sourceTree = ""; }; - FC086D1020E7841E00D85EF7 /* lrn_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lrn_op.h; sourceTree = ""; }; - FC086D1120E7841E00D85EF7 /* multiclass_nms_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = multiclass_nms_op.h; sourceTree = ""; }; - FC086D1220E7841E00D85EF7 /* relu_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = relu_op.h; sourceTree = ""; }; - FC086D1320E7841E00D85EF7 /* fusion_conv_add.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fusion_conv_add.h; sourceTree = ""; }; - FC086D1420E7841E00D85EF7 /* conv_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_op.h; sourceTree = ""; }; - FC086D1520E7841E00D85EF7 /* softmax_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.cpp.cpp; path = softmax_op.cpp; sourceTree = ""; }; - FC086D1620E7841E00D85EF7 /* pool_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pool_op.h; sourceTree = ""; }; - FC086D1720E7841E00D85EF7 /* depthwise_conv_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = depthwise_conv_op.cpp; sourceTree = ""; }; - FC086D1820E7841E00D85EF7 /* softmax_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = softmax_op.h; sourceTree = ""; }; - FC086D1920E7841E00D85EF7 /* elementwise_add_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = elementwise_add_op.h; sourceTree = ""; }; - FC086D1A20E7841E00D85EF7 /* elementwise_add_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = elementwise_add_op.cpp; sourceTree = ""; }; - FC086D1B20E7841E00D85EF7 /* fetch_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fetch_op.h; sourceTree = ""; }; - FC086D1D20E7841E00D85EF7 /* elementwise_op_function.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = elementwise_op_function.h; sourceTree = ""; }; - FC086D1E20E7841E00D85EF7 /* softmax.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = softmax.h; sourceTree = ""; }; - FC086D1F20E7841E00D85EF7 /* gemm.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = gemm.cpp; sourceTree = ""; }; - FC086D2020E7841E00D85EF7 /* math_function.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = math_function.h; sourceTree = ""; }; - FC086D2120E7841E00D85EF7 /* conv_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_func.h; sourceTree = ""; }; - FC086D2220E7841E00D85EF7 /* pool_2x2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pool_2x2.cpp; sourceTree = ""; }; - FC086D2320E7841E00D85EF7 /* im2col.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = im2col.cpp; sourceTree = ""; }; - FC086D2420E7841E00D85EF7 /* gemm.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = gemm.h; sourceTree = ""; }; - FC086D2520E7841E00D85EF7 /* im2col.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = im2col.h; sourceTree = ""; }; - FC086D2620E7841E00D85EF7 /* vol2col.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = vol2col.cpp; sourceTree = ""; }; - FC086D2720E7841E00D85EF7 /* math_function.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = math_function.cpp; sourceTree = ""; }; - FC086D2820E7841E00D85EF7 /* pool_3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pool_3x3.cpp; sourceTree = ""; }; - FC086D2920E7841E00D85EF7 /* pool_2x2.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pool_2x2.h; sourceTree = ""; }; - FC086D2A20E7841E00D85EF7 /* depthwise_conv_3x3.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = depthwise_conv_3x3.h; sourceTree = ""; }; - FC086D2B20E7841E00D85EF7 /* pooling.cpp 
*/ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pooling.cpp; sourceTree = ""; }; - FC086D2C20E7841E00D85EF7 /* pool_3x3.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pool_3x3.h; sourceTree = ""; }; - FC086D2D20E7841E00D85EF7 /* depthwise_conv_3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = depthwise_conv_3x3.cpp; sourceTree = ""; }; - FC086D2E20E7841E00D85EF7 /* vol2col.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vol2col.h; sourceTree = ""; }; - FC086D2F20E7841E00D85EF7 /* softmax.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = softmax.cpp; sourceTree = ""; }; - FC086D3020E7841E00D85EF7 /* transform.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = transform.h; sourceTree = ""; }; - FC086D3120E7841E00D85EF7 /* pooling.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pooling.h; sourceTree = ""; }; - FC086D3220E7841E00D85EF7 /* math_func_neon.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = math_func_neon.h; sourceTree = ""; }; - FC086D3320E7841E00D85EF7 /* fusion_conv_add_relu_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fusion_conv_add_relu_op.h; sourceTree = ""; }; - FC086D3420E7841E00D85EF7 /* fetch_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fetch_op.cpp; sourceTree = ""; }; - FC086D3520E7841E00D85EF7 /* fusion_conv_add.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fusion_conv_add.cpp; sourceTree = ""; }; - FC086D3620E7841E00D85EF7 /* op_param.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = op_param.cpp; sourceTree = ""; }; - FC086D3720E7841E00D85EF7 /* transpose_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = transpose_op.h; sourceTree = ""; }; - FC086D3820E7841E00D85EF7 /* fusion_fc_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fusion_fc_op.h; sourceTree = ""; }; - FC086D3920E7841E00D85EF7 /* batchnorm_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = batchnorm_op.h; sourceTree = ""; }; - FC086D3A20E7841E00D85EF7 /* mul_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mul_op.cpp; sourceTree = ""; }; - FC086D3B20E7841E00D85EF7 /* relu_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = relu_op.cpp; sourceTree = ""; }; - FC086D3C20E7841E00D85EF7 /* conv_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_op.cpp; sourceTree = ""; }; - FC086D3D20E7841E00D85EF7 /* fusion_fc_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fusion_fc_op.cpp; sourceTree = ""; }; - FC086D3E20E7841E00D85EF7 /* box_coder_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = box_coder_op.h; sourceTree = ""; }; - FC086D3F20E7841E00D85EF7 /* concat_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = 
concat_op.h; sourceTree = ""; }; - FC086D4020E7841E00D85EF7 /* batchnorm_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = batchnorm_op.cpp; sourceTree = ""; }; - FC086D4120E7841E00D85EF7 /* reshape_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = reshape_op.h; sourceTree = ""; }; - FC086D4220E7841E00D85EF7 /* pool_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pool_op.cpp; sourceTree = ""; }; - FC086D4320E7841E00D85EF7 /* sigmoid_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sigmoid_op.h; sourceTree = ""; }; - FC086D4420E7841E00D85EF7 /* multiclass_nms_op.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = multiclass_nms_op.cpp; sourceTree = ""; }; - FC086D4620E7841E00D85EF7 /* relu_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = relu_kernel.h; sourceTree = ""; }; - FC086D4720E7841E00D85EF7 /* multiclass_nms_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = multiclass_nms_kernel.h; sourceTree = ""; }; - FC086D4820E7841E00D85EF7 /* depthwise_conv_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = depthwise_conv_kernel.h; sourceTree = ""; }; - FC086D4920E7841E00D85EF7 /* lrn_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lrn_kernel.h; sourceTree = ""; }; - FC086D4A20E7841E00D85EF7 /* pool_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = pool_kernel.h; sourceTree = ""; }; - FC086D4B20E7841E00D85EF7 /* fusion_fc_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fusion_fc_kernel.h; sourceTree = ""; }; - FC086D4C20E7841E00D85EF7 /* box_coder_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = box_coder_kernel.h; sourceTree = ""; }; - FC086D4D20E7841E00D85EF7 /* concat_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = concat_kernel.h; sourceTree = ""; }; - FC086D4E20E7841E00D85EF7 /* mul_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mul_kernel.h; sourceTree = ""; }; - FC086D4F20E7841E00D85EF7 /* softmax_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = softmax_kernel.h; sourceTree = ""; }; - FC086D5020E7841E00D85EF7 /* batchnorm_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = batchnorm_kernel.h; sourceTree = ""; }; - FC086D5220E7841E00D85EF7 /* acl_tensor.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = acl_tensor.cc; sourceTree = ""; }; - FC086D5320E7841E00D85EF7 /* acl_operator.cc */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = acl_operator.cc; sourceTree = ""; }; - FC086D5420E7841E00D85EF7 /* conv_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_kernel.cpp; sourceTree = ""; }; - FC086D5520E7841E00D85EF7 /* acl_operator.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = acl_operator.h; sourceTree = ""; 
}; - FC086D5620E7841E00D85EF7 /* conv_add_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_add_kernel.cpp; sourceTree = ""; }; - FC086D5720E7841E00D85EF7 /* acl_tensor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = acl_tensor.h; sourceTree = ""; }; - FC086D5820E7841E00D85EF7 /* relu_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = relu_kernel.cpp; sourceTree = ""; }; - FC086D5920E7841E00D85EF7 /* mul_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mul_kernel.cpp; sourceTree = ""; }; - FC086D5A20E7841E00D85EF7 /* elementwise_add_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = elementwise_add_kernel.cpp; sourceTree = ""; }; - FC086D5C20E7841E00D85EF7 /* softmax_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = softmax_kernel.cpp; sourceTree = ""; }; - FC086D5D20E7841E00D85EF7 /* concat_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = concat_kernel.cpp; sourceTree = ""; }; - FC086D5E20E7841E00D85EF7 /* pool_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pool_kernel.cpp; sourceTree = ""; }; - FC086D5F20E7841E00D85EF7 /* reshape_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = reshape_kernel.cpp; sourceTree = ""; }; - FC086D6020E7841E00D85EF7 /* lrn_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lrn_kernel.cpp; sourceTree = ""; }; - FC086D6120E7841E00D85EF7 /* fushion_fc_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fushion_fc_kernel.cpp; sourceTree = ""; }; - FC086D6220E7841E00D85EF7 /* batchnorm_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = batchnorm_kernel.cpp; sourceTree = ""; }; - FC086D6320E7841E00D85EF7 /* elementwise_add_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = elementwise_add_kernel.h; sourceTree = ""; }; - FC086D6520E7841E00D85EF7 /* conv_arm_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_arm_func.h; sourceTree = ""; }; - FC086D6620E7841E00D85EF7 /* conv_add_bn_relu_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_add_bn_relu_func.h; sourceTree = ""; }; - FC086D6720E7841E00D85EF7 /* conv_add_relu_arm_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_add_relu_arm_func.h; sourceTree = ""; }; - FC086D6820E7841E00D85EF7 /* depthwise_conv_arm_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = depthwise_conv_arm_func.h; sourceTree = ""; }; - FC086D6920E7841E00D85EF7 /* batchnorm_arm_func.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = batchnorm_arm_func.h; sourceTree = ""; }; - FC086D6A20E7841E00D85EF7 /* conv_add_relu_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_add_relu_kernel.h; sourceTree = ""; }; - FC086D6B20E7841E00D85EF7 /* reshape_kernel.h 
*/ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = reshape_kernel.h; sourceTree = ""; }; - FC086D6C20E7841E00D85EF7 /* transpose_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = transpose_kernel.h; sourceTree = ""; }; - FC086D6D20E7841E00D85EF7 /* conv_add_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_add_kernel.h; sourceTree = ""; }; - FC086D6F20E7841E00D85EF7 /* conv_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_kernel.cpp; sourceTree = ""; }; - FC086D7020E7841E00D85EF7 /* conv_add_bn_relu_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_add_bn_relu_kernel.h; sourceTree = ""; }; - FC086D7120E7841E00D85EF7 /* prior_box_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = prior_box_kernel.h; sourceTree = ""; }; - FC086D7220E7841E00D85EF7 /* conv_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = conv_kernel.h; sourceTree = ""; }; - FC086D7420E7841E00D85EF7 /* prior_box_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = prior_box_kernel.cpp; sourceTree = ""; }; - FC086D7520E7841E00D85EF7 /* conv_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_kernel.cpp; sourceTree = ""; }; - FC086D7620E7841E00D85EF7 /* conv_add_bn_relu_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_add_bn_relu_kernel.cpp; sourceTree = ""; }; - FC086D7720E7841E00D85EF7 /* box_coder_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = box_coder_kernel.cpp; sourceTree = ""; }; - FC086D7820E7841E00D85EF7 /* conv_add_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_add_kernel.cpp; sourceTree = ""; }; - FC086D7920E7841E00D85EF7 /* sigmoid_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sigmoid_kernel.cpp; sourceTree = ""; }; - FC086D7A20E7841E00D85EF7 /* relu_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = relu_kernel.cpp; sourceTree = ""; }; - FC086D7B20E7841E00D85EF7 /* mul_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mul_kernel.cpp; sourceTree = ""; }; - FC086D7C20E7841E00D85EF7 /* elementwise_add_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = elementwise_add_kernel.cpp; sourceTree = ""; }; - FC086D7D20E7841E00D85EF7 /* conv_add_relu_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = conv_add_relu_kernel.cpp; sourceTree = ""; }; - FC086D7E20E7841E00D85EF7 /* transpose_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = transpose_kernel.cpp; sourceTree = ""; }; - FC086D7F20E7841E00D85EF7 /* depthwise_conv_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = depthwise_conv_kernel.cpp; sourceTree = ""; }; - FC086D8020E7841E00D85EF7 /* softmax_kernel.cpp */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = softmax_kernel.cpp; sourceTree = ""; }; - FC086D8120E7841E00D85EF7 /* concat_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = concat_kernel.cpp; sourceTree = ""; }; - FC086D8220E7841E00D85EF7 /* fusion_fc_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fusion_fc_kernel.cpp; sourceTree = ""; }; - FC086D8320E7841E00D85EF7 /* pool_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pool_kernel.cpp; sourceTree = ""; }; - FC086D8420E7841E00D85EF7 /* reshape_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = reshape_kernel.cpp; sourceTree = ""; }; - FC086D8520E7841E00D85EF7 /* lrn_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lrn_kernel.cpp; sourceTree = ""; }; - FC086D8620E7841E00D85EF7 /* batchnorm_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = batchnorm_kernel.cpp; sourceTree = ""; }; - FC086D8720E7841E00D85EF7 /* multiclass_nms_kernel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = multiclass_nms_kernel.cpp; sourceTree = ""; }; - FC086D8820E7841E00D85EF7 /* sigmoid_kernel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sigmoid_kernel.h; sourceTree = ""; }; - FC086D8920E7841E00D85EF7 /* depthwise_conv_op.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = depthwise_conv_op.h; sourceTree = ""; }; - FC086D8B20E7841E00D85EF7 /* framework.pb-c.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "framework.pb-c.c"; sourceTree = ""; }; - FC086D8C20E7841E00D85EF7 /* tensor_util.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = tensor_util.cpp; sourceTree = ""; }; - FC086D8D20E7841E00D85EF7 /* operator.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = operator.h; sourceTree = ""; }; - FC086D8E20E7841E00D85EF7 /* op_info.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_info.h; sourceTree = ""; }; - FC086D8F20E7841E00D85EF7 /* operator.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = operator.cpp; sourceTree = ""; }; - FC086D9020E7841E00D85EF7 /* ddim.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ddim.cpp; sourceTree = ""; }; - FC086D9120E7841E00D85EF7 /* tensor_util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tensor_util.h; sourceTree = ""; }; - FC086D9220E7841E00D85EF7 /* variable.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = variable.h; sourceTree = ""; }; - FC086D9320E7841E00D85EF7 /* scope.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = scope.cpp; sourceTree = ""; }; - FC086D9420E7841E00D85EF7 /* data_layout.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = data_layout.h; sourceTree = ""; }; - FC086D9520E7841E00D85EF7 /* lod_tensor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType 
= sourcecode.c.h; path = lod_tensor.h; sourceTree = ""; }; - FC086D9620E7841E00D85EF7 /* dim.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dim.h; sourceTree = ""; }; - FC086D9720E7841E00D85EF7 /* framework.pb-c.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "framework.pb-c.h"; sourceTree = ""; }; - FC086D9820E7841E00D85EF7 /* op_kernel_type.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_kernel_type.h; sourceTree = ""; }; - FC086D9920E7841E00D85EF7 /* attribute.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = attribute.cpp; sourceTree = ""; }; - FC086D9A20E7841E00D85EF7 /* op_proto_maker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_proto_maker.h; sourceTree = ""; }; - FC086D9C20E7841E00D85EF7 /* op_desc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = op_desc.cpp; sourceTree = ""; }; - FC086D9D20E7841E00D85EF7 /* program_desc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = program_desc.cpp; sourceTree = ""; }; - FC086D9E20E7841E00D85EF7 /* var_desc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = var_desc.h; sourceTree = ""; }; - FC086D9F20E7841E00D85EF7 /* program_desc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = program_desc.h; sourceTree = ""; }; - FC086DA020E7841E00D85EF7 /* op_desc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_desc.h; sourceTree = ""; }; - FC086DA220E7841E00D85EF7 /* fusion_op_register.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fusion_op_register.h; sourceTree = ""; }; - FC086DA320E7841E00D85EF7 /* node.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = node.cpp; sourceTree = ""; }; - FC086DA420E7841E00D85EF7 /* node.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = node.h; sourceTree = ""; }; - FC086DA520E7841E00D85EF7 /* program_optimize.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = program_optimize.h; sourceTree = ""; }; - FC086DA620E7841E00D85EF7 /* program_optimize.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = program_optimize.cpp; sourceTree = ""; }; - FC086DA720E7841E00D85EF7 /* block_desc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = block_desc.cpp; sourceTree = ""; }; - FC086DA820E7841E00D85EF7 /* program.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = program.h; sourceTree = ""; }; - FC086DA920E7841E00D85EF7 /* tensor_desc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tensor_desc.h; sourceTree = ""; }; - FC086DAA20E7841E00D85EF7 /* block_desc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = block_desc.h; sourceTree = ""; }; - FC086DAB20E7841E00D85EF7 /* lod_tensor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = lod_tensor.cpp; sourceTree = ""; }; - FC086DAC20E7841E00D85EF7 /* framework.proto */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = framework.proto; sourceTree = ""; }; - FC086DAD20E7841E00D85EF7 /* ddim.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ddim.h; sourceTree = ""; }; - FC086DAE20E7841E00D85EF7 /* attribute.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = attribute.h; sourceTree = ""; }; - FC086DAF20E7841E00D85EF7 /* scope.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = scope.h; sourceTree = ""; }; - FC086DB020E7841E00D85EF7 /* tensor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tensor.h; sourceTree = ""; }; - FC086DB120E7841E00D85EF7 /* op_registry.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = op_registry.h; sourceTree = ""; }; - FC086DB320E7841E00D85EF7 /* io.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = io.cpp; sourceTree = ""; }; - FC086DB420E7841E00D85EF7 /* io.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = io.h; sourceTree = ""; }; - FC086DB620E7841E00D85EF7 /* types.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = types.cpp; sourceTree = ""; }; - FC086DB720E7841E00D85EF7 /* threadpool.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = threadpool.h; sourceTree = ""; }; - FC086DB820E7841E00D85EF7 /* types.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = types.h; sourceTree = ""; }; - FC086DB920E7841E00D85EF7 /* protobuf-c.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "protobuf-c.h"; sourceTree = ""; }; - FC086DBA20E7841E00D85EF7 /* openmp-fix.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = "openmp-fix.cpp"; sourceTree = ""; }; - FC086DBB20E7841E00D85EF7 /* dep_core.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dep_core.h; sourceTree = ""; }; - FC086DBC20E7841E00D85EF7 /* common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = common.h; sourceTree = ""; }; - FC086DBD20E7841E00D85EF7 /* log.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = log.h; sourceTree = ""; }; - FC086DBE20E7841E00D85EF7 /* macros.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = macros.h; sourceTree = ""; }; - FC086DBF20E7841E00D85EF7 /* type_define.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = type_define.h; sourceTree = ""; }; - FC086DC020E7841E00D85EF7 /* enforce.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = enforce.h; sourceTree = ""; }; - FC086DC120E7841E00D85EF7 /* protobuf-c.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "protobuf-c.c"; sourceTree = ""; }; - FC086DC220E7841E00D85EF7 /* variant.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = variant.h; sourceTree = ""; }; - FC086DC420E7841E00D85EF7 /* paddle_mobile_jni.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = paddle_mobile_jni.cpp; sourceTree = 
""; }; - FC086DC520E7841E00D85EF7 /* paddle_mobile_jni.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = paddle_mobile_jni.h; sourceTree = ""; }; - FC2428A520E78DF20095932F /* MacroDefine.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = MacroDefine.h; sourceTree = ""; }; -/* End PBXFileReference section */ - -/* Begin PBXFrameworksBuildPhase section */ - FC086BAC20E7839B00D85EF7 /* Frameworks */ = { - isa = PBXFrameworksBuildPhase; - buildActionMask = 2147483647; - files = ( - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXFrameworksBuildPhase section */ - -/* Begin PBXGroup section */ - FC086BA620E7839B00D85EF7 = { - isa = PBXGroup; - children = ( - FC086BB120E7839B00D85EF7 /* PaddleMobile */, - FC086BB020E7839B00D85EF7 /* Products */, - ); - sourceTree = ""; - }; - FC086BB020E7839B00D85EF7 /* Products */ = { - isa = PBXGroup; - children = ( - FC086BAF20E7839B00D85EF7 /* libPaddleMobile.a */, - ); - name = Products; - sourceTree = ""; - }; - FC086BB120E7839B00D85EF7 /* PaddleMobile */ = { - isa = PBXGroup; - children = ( - FC086CFC20E7841E00D85EF7 /* src */, - FC086BB220E7839B00D85EF7 /* PaddleMobile.h */, - FC086BB320E7839B00D85EF7 /* PaddleMobile.m */, - FC2428A520E78DF20095932F /* MacroDefine.h */, - ); - path = PaddleMobile; - sourceTree = ""; - }; - FC086CFC20E7841E00D85EF7 /* src */ = { - isa = PBXGroup; - children = ( - FC086CFD20E7841E00D85EF7 /* memory */, - FC086D0020E7841E00D85EF7 /* operators */, - FC086D8A20E7841E00D85EF7 /* framework */, - FC086DB220E7841E00D85EF7 /* io */, - FC086DB520E7841E00D85EF7 /* common */, - FC086DC320E7841E00D85EF7 /* jni */, - ); - name = src; - path = ../../../src; - sourceTree = ""; - }; - FC086CFD20E7841E00D85EF7 /* memory */ = { - isa = PBXGroup; - children = ( - FC086CFE20E7841E00D85EF7 /* t_malloc.cpp */, - FC086CFF20E7841E00D85EF7 /* t_malloc.h */, - ); - path = memory; - sourceTree = ""; - }; - FC086D0020E7841E00D85EF7 /* operators */ = { - isa = PBXGroup; - children = ( - FC086D0120E7841E00D85EF7 /* feed_op.h */, - FC086D0220E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.h */, - FC086D0320E7841E00D85EF7 /* lrn_op.cpp */, - FC086D0420E7841E00D85EF7 /* op_param.h */, - FC086D0520E7841E00D85EF7 /* sigmoid_op.cpp */, - FC086D0620E7841E00D85EF7 /* box_coder_op.cpp */, - FC086D0720E7841E00D85EF7 /* feed_op.cpp */, - FC086D0820E7841E00D85EF7 /* mul_op.h */, - FC086D0920E7841E00D85EF7 /* prior_box_op.h */, - FC086D0A20E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.cpp */, - FC086D0B20E7841E00D85EF7 /* reshape_op.cpp */, - FC086D0C20E7841E00D85EF7 /* concat_op.cpp */, - FC086D0D20E7841E00D85EF7 /* transpose_op.cpp */, - FC086D0E20E7841E00D85EF7 /* prior_box_op.cpp */, - FC086D0F20E7841E00D85EF7 /* fusion_conv_add_relu_op.cpp */, - FC086D1020E7841E00D85EF7 /* lrn_op.h */, - FC086D1120E7841E00D85EF7 /* multiclass_nms_op.h */, - FC086D1220E7841E00D85EF7 /* relu_op.h */, - FC086D1320E7841E00D85EF7 /* fusion_conv_add.h */, - FC086D1420E7841E00D85EF7 /* conv_op.h */, - FC086D1520E7841E00D85EF7 /* softmax_op.cpp */, - FC086D1620E7841E00D85EF7 /* pool_op.h */, - FC086D1720E7841E00D85EF7 /* depthwise_conv_op.cpp */, - FC086D1820E7841E00D85EF7 /* softmax_op.h */, - FC086D1920E7841E00D85EF7 /* elementwise_add_op.h */, - FC086D1A20E7841E00D85EF7 /* elementwise_add_op.cpp */, - FC086D1B20E7841E00D85EF7 /* fetch_op.h */, - FC086D1C20E7841E00D85EF7 /* math */, - FC086D3320E7841E00D85EF7 /* fusion_conv_add_relu_op.h */, - FC086D3420E7841E00D85EF7 /* fetch_op.cpp */, - 
FC086D3520E7841E00D85EF7 /* fusion_conv_add.cpp */, - FC086D3620E7841E00D85EF7 /* op_param.cpp */, - FC086D3720E7841E00D85EF7 /* transpose_op.h */, - FC086D3820E7841E00D85EF7 /* fusion_fc_op.h */, - FC086D3920E7841E00D85EF7 /* batchnorm_op.h */, - FC086D3A20E7841E00D85EF7 /* mul_op.cpp */, - FC086D3B20E7841E00D85EF7 /* relu_op.cpp */, - FC086D3C20E7841E00D85EF7 /* conv_op.cpp */, - FC086D3D20E7841E00D85EF7 /* fusion_fc_op.cpp */, - FC086D3E20E7841E00D85EF7 /* box_coder_op.h */, - FC086D3F20E7841E00D85EF7 /* concat_op.h */, - FC086D4020E7841E00D85EF7 /* batchnorm_op.cpp */, - FC086D4120E7841E00D85EF7 /* reshape_op.h */, - FC086D4220E7841E00D85EF7 /* pool_op.cpp */, - FC086D4320E7841E00D85EF7 /* sigmoid_op.h */, - FC086D4420E7841E00D85EF7 /* multiclass_nms_op.cpp */, - FC086D4520E7841E00D85EF7 /* kernel */, - FC086D8920E7841E00D85EF7 /* depthwise_conv_op.h */, - ); - path = operators; - sourceTree = ""; - }; - FC086D1C20E7841E00D85EF7 /* math */ = { - isa = PBXGroup; - children = ( - FC086D1D20E7841E00D85EF7 /* elementwise_op_function.h */, - FC086D1E20E7841E00D85EF7 /* softmax.h */, - FC086D1F20E7841E00D85EF7 /* gemm.cpp */, - FC086D2020E7841E00D85EF7 /* math_function.h */, - FC086D2120E7841E00D85EF7 /* conv_func.h */, - FC086D2220E7841E00D85EF7 /* pool_2x2.cpp */, - FC086D2320E7841E00D85EF7 /* im2col.cpp */, - FC086D2420E7841E00D85EF7 /* gemm.h */, - FC086D2520E7841E00D85EF7 /* im2col.h */, - FC086D2620E7841E00D85EF7 /* vol2col.cpp */, - FC086D2720E7841E00D85EF7 /* math_function.cpp */, - FC086D2820E7841E00D85EF7 /* pool_3x3.cpp */, - FC086D2920E7841E00D85EF7 /* pool_2x2.h */, - FC086D2A20E7841E00D85EF7 /* depthwise_conv_3x3.h */, - FC086D2B20E7841E00D85EF7 /* pooling.cpp */, - FC086D2C20E7841E00D85EF7 /* pool_3x3.h */, - FC086D2D20E7841E00D85EF7 /* depthwise_conv_3x3.cpp */, - FC086D2E20E7841E00D85EF7 /* vol2col.h */, - FC086D2F20E7841E00D85EF7 /* softmax.cpp */, - FC086D3020E7841E00D85EF7 /* transform.h */, - FC086D3120E7841E00D85EF7 /* pooling.h */, - FC086D3220E7841E00D85EF7 /* math_func_neon.h */, - ); - path = math; - sourceTree = ""; - }; - FC086D4520E7841E00D85EF7 /* kernel */ = { - isa = PBXGroup; - children = ( - FC086D4620E7841E00D85EF7 /* relu_kernel.h */, - FC086D4720E7841E00D85EF7 /* multiclass_nms_kernel.h */, - FC086D4820E7841E00D85EF7 /* depthwise_conv_kernel.h */, - FC086D4920E7841E00D85EF7 /* lrn_kernel.h */, - FC086D4A20E7841E00D85EF7 /* pool_kernel.h */, - FC086D4B20E7841E00D85EF7 /* fusion_fc_kernel.h */, - FC086D4C20E7841E00D85EF7 /* box_coder_kernel.h */, - FC086D4D20E7841E00D85EF7 /* concat_kernel.h */, - FC086D4E20E7841E00D85EF7 /* mul_kernel.h */, - FC086D4F20E7841E00D85EF7 /* softmax_kernel.h */, - FC086D5020E7841E00D85EF7 /* batchnorm_kernel.h */, - FC086D5120E7841E00D85EF7 /* mali */, - FC086D6320E7841E00D85EF7 /* elementwise_add_kernel.h */, - FC086D6420E7841E00D85EF7 /* central-arm-func */, - FC086D6A20E7841E00D85EF7 /* conv_add_relu_kernel.h */, - FC086D6B20E7841E00D85EF7 /* reshape_kernel.h */, - FC086D6C20E7841E00D85EF7 /* transpose_kernel.h */, - FC086D6D20E7841E00D85EF7 /* conv_add_kernel.h */, - FC086D6E20E7841E00D85EF7 /* fpga */, - FC086D7020E7841E00D85EF7 /* conv_add_bn_relu_kernel.h */, - FC086D7120E7841E00D85EF7 /* prior_box_kernel.h */, - FC086D7220E7841E00D85EF7 /* conv_kernel.h */, - FC086D7320E7841E00D85EF7 /* arm */, - FC086D8820E7841E00D85EF7 /* sigmoid_kernel.h */, - ); - path = kernel; - sourceTree = ""; - }; - FC086D5120E7841E00D85EF7 /* mali */ = { - isa = PBXGroup; - children = ( - FC086D5220E7841E00D85EF7 /* acl_tensor.cc */, - 
FC086D5320E7841E00D85EF7 /* acl_operator.cc */, - FC086D5420E7841E00D85EF7 /* conv_kernel.cpp */, - FC086D5520E7841E00D85EF7 /* acl_operator.h */, - FC086D5620E7841E00D85EF7 /* conv_add_kernel.cpp */, - FC086D5720E7841E00D85EF7 /* acl_tensor.h */, - FC086D5820E7841E00D85EF7 /* relu_kernel.cpp */, - FC086D5920E7841E00D85EF7 /* mul_kernel.cpp */, - FC086D5A20E7841E00D85EF7 /* elementwise_add_kernel.cpp */, - FC086D5B20E7841E00D85EF7 /* ACL_Android */, - FC086D5C20E7841E00D85EF7 /* softmax_kernel.cpp */, - FC086D5D20E7841E00D85EF7 /* concat_kernel.cpp */, - FC086D5E20E7841E00D85EF7 /* pool_kernel.cpp */, - FC086D5F20E7841E00D85EF7 /* reshape_kernel.cpp */, - FC086D6020E7841E00D85EF7 /* lrn_kernel.cpp */, - FC086D6120E7841E00D85EF7 /* fushion_fc_kernel.cpp */, - FC086D6220E7841E00D85EF7 /* batchnorm_kernel.cpp */, - ); - path = mali; - sourceTree = ""; - }; - FC086D5B20E7841E00D85EF7 /* ACL_Android */ = { - isa = PBXGroup; - children = ( - ); - path = ACL_Android; - sourceTree = ""; - }; - FC086D6420E7841E00D85EF7 /* central-arm-func */ = { - isa = PBXGroup; - children = ( - FC086D6520E7841E00D85EF7 /* conv_arm_func.h */, - FC086D6620E7841E00D85EF7 /* conv_add_bn_relu_func.h */, - FC086D6720E7841E00D85EF7 /* conv_add_relu_arm_func.h */, - FC086D6820E7841E00D85EF7 /* depthwise_conv_arm_func.h */, - FC086D6920E7841E00D85EF7 /* batchnorm_arm_func.h */, - ); - path = "central-arm-func"; - sourceTree = ""; - }; - FC086D6E20E7841E00D85EF7 /* fpga */ = { - isa = PBXGroup; - children = ( - FC086D6F20E7841E00D85EF7 /* conv_kernel.cpp */, - ); - path = fpga; - sourceTree = ""; - }; - FC086D7320E7841E00D85EF7 /* arm */ = { - isa = PBXGroup; - children = ( - FC086D7420E7841E00D85EF7 /* prior_box_kernel.cpp */, - FC086D7520E7841E00D85EF7 /* conv_kernel.cpp */, - FC086D7620E7841E00D85EF7 /* conv_add_bn_relu_kernel.cpp */, - FC086D7720E7841E00D85EF7 /* box_coder_kernel.cpp */, - FC086D7820E7841E00D85EF7 /* conv_add_kernel.cpp */, - FC086D7920E7841E00D85EF7 /* sigmoid_kernel.cpp */, - FC086D7A20E7841E00D85EF7 /* relu_kernel.cpp */, - FC086D7B20E7841E00D85EF7 /* mul_kernel.cpp */, - FC086D7C20E7841E00D85EF7 /* elementwise_add_kernel.cpp */, - FC086D7D20E7841E00D85EF7 /* conv_add_relu_kernel.cpp */, - FC086D7E20E7841E00D85EF7 /* transpose_kernel.cpp */, - FC086D7F20E7841E00D85EF7 /* depthwise_conv_kernel.cpp */, - FC086D8020E7841E00D85EF7 /* softmax_kernel.cpp */, - FC086D8120E7841E00D85EF7 /* concat_kernel.cpp */, - FC086D8220E7841E00D85EF7 /* fusion_fc_kernel.cpp */, - FC086D8320E7841E00D85EF7 /* pool_kernel.cpp */, - FC086D8420E7841E00D85EF7 /* reshape_kernel.cpp */, - FC086D8520E7841E00D85EF7 /* lrn_kernel.cpp */, - FC086D8620E7841E00D85EF7 /* batchnorm_kernel.cpp */, - FC086D8720E7841E00D85EF7 /* multiclass_nms_kernel.cpp */, - ); - path = arm; - sourceTree = ""; - }; - FC086D8A20E7841E00D85EF7 /* framework */ = { - isa = PBXGroup; - children = ( - FC086D8B20E7841E00D85EF7 /* framework.pb-c.c */, - FC086D8C20E7841E00D85EF7 /* tensor_util.cpp */, - FC086D8D20E7841E00D85EF7 /* operator.h */, - FC086D8E20E7841E00D85EF7 /* op_info.h */, - FC086D8F20E7841E00D85EF7 /* operator.cpp */, - FC086D9020E7841E00D85EF7 /* ddim.cpp */, - FC086D9120E7841E00D85EF7 /* tensor_util.h */, - FC086D9220E7841E00D85EF7 /* variable.h */, - FC086D9320E7841E00D85EF7 /* scope.cpp */, - FC086D9420E7841E00D85EF7 /* data_layout.h */, - FC086D9520E7841E00D85EF7 /* lod_tensor.h */, - FC086D9620E7841E00D85EF7 /* dim.h */, - FC086D9720E7841E00D85EF7 /* framework.pb-c.h */, - FC086D9820E7841E00D85EF7 /* op_kernel_type.h */, - 
FC086D9920E7841E00D85EF7 /* attribute.cpp */, - FC086D9A20E7841E00D85EF7 /* op_proto_maker.h */, - FC086D9B20E7841E00D85EF7 /* program */, - FC086DAB20E7841E00D85EF7 /* lod_tensor.cpp */, - FC086DAC20E7841E00D85EF7 /* framework.proto */, - FC086DAD20E7841E00D85EF7 /* ddim.h */, - FC086DAE20E7841E00D85EF7 /* attribute.h */, - FC086DAF20E7841E00D85EF7 /* scope.h */, - FC086DB020E7841E00D85EF7 /* tensor.h */, - FC086DB120E7841E00D85EF7 /* op_registry.h */, - ); - path = framework; - sourceTree = ""; - }; - FC086D9B20E7841E00D85EF7 /* program */ = { - isa = PBXGroup; - children = ( - FC086D9C20E7841E00D85EF7 /* op_desc.cpp */, - FC086D9D20E7841E00D85EF7 /* program_desc.cpp */, - FC086D9E20E7841E00D85EF7 /* var_desc.h */, - FC086D9F20E7841E00D85EF7 /* program_desc.h */, - FC086DA020E7841E00D85EF7 /* op_desc.h */, - FC086DA120E7841E00D85EF7 /* program-optimize */, - FC086DA720E7841E00D85EF7 /* block_desc.cpp */, - FC086DA820E7841E00D85EF7 /* program.h */, - FC086DA920E7841E00D85EF7 /* tensor_desc.h */, - FC086DAA20E7841E00D85EF7 /* block_desc.h */, - ); - path = program; - sourceTree = ""; - }; - FC086DA120E7841E00D85EF7 /* program-optimize */ = { - isa = PBXGroup; - children = ( - FC086DA220E7841E00D85EF7 /* fusion_op_register.h */, - FC086DA320E7841E00D85EF7 /* node.cpp */, - FC086DA420E7841E00D85EF7 /* node.h */, - FC086DA520E7841E00D85EF7 /* program_optimize.h */, - FC086DA620E7841E00D85EF7 /* program_optimize.cpp */, - ); - path = "program-optimize"; - sourceTree = ""; - }; - FC086DB220E7841E00D85EF7 /* io */ = { - isa = PBXGroup; - children = ( - FC086DB320E7841E00D85EF7 /* io.cpp */, - FC086DB420E7841E00D85EF7 /* io.h */, - ); - path = io; - sourceTree = ""; - }; - FC086DB520E7841E00D85EF7 /* common */ = { - isa = PBXGroup; - children = ( - FC086DB620E7841E00D85EF7 /* types.cpp */, - FC086DB720E7841E00D85EF7 /* threadpool.h */, - FC086DB820E7841E00D85EF7 /* types.h */, - FC086DB920E7841E00D85EF7 /* protobuf-c.h */, - FC086DBA20E7841E00D85EF7 /* openmp-fix.cpp */, - FC086DBB20E7841E00D85EF7 /* dep_core.h */, - FC086DBC20E7841E00D85EF7 /* common.h */, - FC086DBD20E7841E00D85EF7 /* log.h */, - FC086DBE20E7841E00D85EF7 /* macros.h */, - FC086DBF20E7841E00D85EF7 /* type_define.h */, - FC086DC020E7841E00D85EF7 /* enforce.h */, - FC086DC120E7841E00D85EF7 /* protobuf-c.c */, - FC086DC220E7841E00D85EF7 /* variant.h */, - ); - path = common; - sourceTree = ""; - }; - FC086DC320E7841E00D85EF7 /* jni */ = { - isa = PBXGroup; - children = ( - FC086DC420E7841E00D85EF7 /* paddle_mobile_jni.cpp */, - FC086DC520E7841E00D85EF7 /* paddle_mobile_jni.h */, - ); - path = jni; - sourceTree = ""; - }; -/* End PBXGroup section */ - -/* Begin PBXNativeTarget section */ - FC086BAE20E7839B00D85EF7 /* PaddleMobile */ = { - isa = PBXNativeTarget; - buildConfigurationList = FC086BB820E7839B00D85EF7 /* Build configuration list for PBXNativeTarget "PaddleMobile" */; - buildPhases = ( - FC086BAB20E7839B00D85EF7 /* Sources */, - FC086BAC20E7839B00D85EF7 /* Frameworks */, - FC086BAD20E7839B00D85EF7 /* CopyFiles */, - ); - buildRules = ( - ); - dependencies = ( - ); - name = PaddleMobile; - productName = PaddleMobile; - productReference = FC086BAF20E7839B00D85EF7 /* libPaddleMobile.a */; - productType = "com.apple.product-type.library.static"; - }; -/* End PBXNativeTarget section */ - -/* Begin PBXProject section */ - FC086BA720E7839B00D85EF7 /* Project object */ = { - isa = PBXProject; - attributes = { - LastUpgradeCheck = 0930; - ORGANIZATIONNAME = orange; - TargetAttributes = { - FC086BAE20E7839B00D85EF7 = { - 
CreatedOnToolsVersion = 9.3.1; - }; - }; - }; - buildConfigurationList = FC086BAA20E7839B00D85EF7 /* Build configuration list for PBXProject "PaddleMobile" */; - compatibilityVersion = "Xcode 9.3"; - developmentRegion = en; - hasScannedForEncodings = 0; - knownRegions = ( - en, - ); - mainGroup = FC086BA620E7839B00D85EF7; - productRefGroup = FC086BB020E7839B00D85EF7 /* Products */; - projectDirPath = ""; - projectRoot = ""; - targets = ( - FC086BAE20E7839B00D85EF7 /* PaddleMobile */, - ); - }; -/* End PBXProject section */ - -/* Begin PBXSourcesBuildPhase section */ - FC086BAB20E7839B00D85EF7 /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - FC086DCE20E7841E00D85EF7 /* transpose_op.cpp in Sources */, - FC086DD820E7841E00D85EF7 /* math_function.cpp in Sources */, - FC086DE120E7841E00D85EF7 /* relu_op.cpp in Sources */, - FC086E0920E7841E00D85EF7 /* multiclass_nms_kernel.cpp in Sources */, - FC086E0220E7841E00D85EF7 /* softmax_kernel.cpp in Sources */, - FC086DCD20E7841E00D85EF7 /* concat_op.cpp in Sources */, - FC086DCA20E7841E00D85EF7 /* feed_op.cpp in Sources */, - FC086DD920E7841E00D85EF7 /* pool_3x3.cpp in Sources */, - FC086DF020E7841E00D85EF7 /* pool_kernel.cpp in Sources */, - FC086E1A20E7841E00D85EF7 /* paddle_mobile_jni.cpp in Sources */, - FC086DF620E7841E00D85EF7 /* prior_box_kernel.cpp in Sources */, - FC086DC620E7841E00D85EF7 /* t_malloc.cpp in Sources */, - FC086DD320E7841E00D85EF7 /* elementwise_add_op.cpp in Sources */, - FC086E0E20E7841E00D85EF7 /* scope.cpp in Sources */, - FC086DDE20E7841E00D85EF7 /* fusion_conv_add.cpp in Sources */, - FC086DFF20E7841E00D85EF7 /* conv_add_relu_kernel.cpp in Sources */, - FC086DD720E7841E00D85EF7 /* vol2col.cpp in Sources */, - FC086E0B20E7841E00D85EF7 /* tensor_util.cpp in Sources */, - FC086E1320E7841E00D85EF7 /* program_optimize.cpp in Sources */, - FC086DF820E7841E00D85EF7 /* conv_add_bn_relu_kernel.cpp in Sources */, - FC086DC820E7841E00D85EF7 /* sigmoid_op.cpp in Sources */, - FC086E0D20E7841E00D85EF7 /* ddim.cpp in Sources */, - FC086E0120E7841E00D85EF7 /* depthwise_conv_kernel.cpp in Sources */, - FC086DDB20E7841E00D85EF7 /* depthwise_conv_3x3.cpp in Sources */, - FC086BB420E7839B00D85EF7 /* PaddleMobile.m in Sources */, - FC086E1420E7841E00D85EF7 /* block_desc.cpp in Sources */, - FC086DC920E7841E00D85EF7 /* box_coder_op.cpp in Sources */, - FC086DDF20E7841E00D85EF7 /* op_param.cpp in Sources */, - FC086DD520E7841E00D85EF7 /* pool_2x2.cpp in Sources */, - FC086DFD20E7841E00D85EF7 /* mul_kernel.cpp in Sources */, - FC086E0C20E7841E00D85EF7 /* operator.cpp in Sources */, - FC086DE020E7841E00D85EF7 /* mul_op.cpp in Sources */, - FC086E1520E7841E00D85EF7 /* lod_tensor.cpp in Sources */, - FC086DE720E7841E00D85EF7 /* acl_tensor.cc in Sources */, - FC086DDD20E7841E00D85EF7 /* fetch_op.cpp in Sources */, - FC086DE220E7841E00D85EF7 /* conv_op.cpp in Sources */, - FC086DDA20E7841E00D85EF7 /* pooling.cpp in Sources */, - FC086DEF20E7841E00D85EF7 /* concat_kernel.cpp in Sources */, - FC086DE520E7841E00D85EF7 /* pool_op.cpp in Sources */, - FC086DE820E7841E00D85EF7 /* acl_operator.cc in Sources */, - FC086DF220E7841E00D85EF7 /* lrn_kernel.cpp in Sources */, - FC086E0F20E7841E00D85EF7 /* attribute.cpp in Sources */, - FC086E0520E7841E00D85EF7 /* pool_kernel.cpp in Sources */, - FC086DDC20E7841E00D85EF7 /* softmax.cpp in Sources */, - FC086E0420E7841E00D85EF7 /* fusion_fc_kernel.cpp in Sources */, - FC086E1220E7841E00D85EF7 /* node.cpp in Sources */, - FC086E0820E7841E00D85EF7 /* batchnorm_kernel.cpp 
in Sources */, - FC086DCC20E7841E00D85EF7 /* reshape_op.cpp in Sources */, - FC086DE920E7841E00D85EF7 /* conv_kernel.cpp in Sources */, - FC086E1920E7841E00D85EF7 /* protobuf-c.c in Sources */, - FC086DF920E7841E00D85EF7 /* box_coder_kernel.cpp in Sources */, - FC086DF120E7841E00D85EF7 /* reshape_kernel.cpp in Sources */, - FC086DF720E7841E00D85EF7 /* conv_kernel.cpp in Sources */, - FC086DCF20E7841E00D85EF7 /* prior_box_op.cpp in Sources */, - FC086E1720E7841E00D85EF7 /* types.cpp in Sources */, - FC086DF320E7841E00D85EF7 /* fushion_fc_kernel.cpp in Sources */, - FC086DEB20E7841E00D85EF7 /* relu_kernel.cpp in Sources */, - FC086E0620E7841E00D85EF7 /* reshape_kernel.cpp in Sources */, - FC086E0720E7841E00D85EF7 /* lrn_kernel.cpp in Sources */, - FC086DE620E7841E00D85EF7 /* multiclass_nms_op.cpp in Sources */, - FC086E1120E7841E00D85EF7 /* program_desc.cpp in Sources */, - FC086E0320E7841E00D85EF7 /* concat_kernel.cpp in Sources */, - FC086DEC20E7841E00D85EF7 /* mul_kernel.cpp in Sources */, - FC086DFB20E7841E00D85EF7 /* sigmoid_kernel.cpp in Sources */, - FC086E1820E7841E00D85EF7 /* openmp-fix.cpp in Sources */, - FC086DF420E7841E00D85EF7 /* batchnorm_kernel.cpp in Sources */, - FC086DEA20E7841E00D85EF7 /* conv_add_kernel.cpp in Sources */, - FC086E1620E7841E00D85EF7 /* io.cpp in Sources */, - FC086DD620E7841E00D85EF7 /* im2col.cpp in Sources */, - FC086DC720E7841E00D85EF7 /* lrn_op.cpp in Sources */, - FC086DD220E7841E00D85EF7 /* depthwise_conv_op.cpp in Sources */, - FC086DFA20E7841E00D85EF7 /* conv_add_kernel.cpp in Sources */, - FC086E0A20E7841E00D85EF7 /* framework.pb-c.c in Sources */, - FC086DD020E7841E00D85EF7 /* fusion_conv_add_relu_op.cpp in Sources */, - FC086DCB20E7841E00D85EF7 /* fusion_conv_add_bn_relu_op.cpp in Sources */, - FC086DFC20E7841E00D85EF7 /* relu_kernel.cpp in Sources */, - FC086DE320E7841E00D85EF7 /* fusion_fc_op.cpp in Sources */, - FC086E0020E7841E00D85EF7 /* transpose_kernel.cpp in Sources */, - FC086DEE20E7841E00D85EF7 /* softmax_kernel.cpp in Sources */, - FC086DE420E7841E00D85EF7 /* batchnorm_op.cpp in Sources */, - FC086DED20E7841E00D85EF7 /* elementwise_add_kernel.cpp in Sources */, - FC086DF520E7841E00D85EF7 /* conv_kernel.cpp in Sources */, - FC086DD120E7841E00D85EF7 /* softmax_op.cpp in Sources */, - FC086E1020E7841E00D85EF7 /* op_desc.cpp in Sources */, - FC086DD420E7841E00D85EF7 /* gemm.cpp in Sources */, - FC086DFE20E7841E00D85EF7 /* elementwise_add_kernel.cpp in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXSourcesBuildPhase section */ - -/* Begin XCBuildConfiguration section */ - FC086BB620E7839B00D85EF7 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - CLANG_ANALYZER_NONNULL = YES; - CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; - CLANG_CXX_LIBRARY = "libc++"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - CLANG_ENABLE_OBJC_WEAK = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; - CLANG_WARN_DOCUMENTATION_COMMENTS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = YES; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - 
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - CODE_SIGN_IDENTITY = "iPhone Developer"; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = dwarf; - ENABLE_STRICT_OBJC_MSGSEND = YES; - ENABLE_TESTABILITY = YES; - GCC_C_LANGUAGE_STANDARD = gnu11; - GCC_DYNAMIC_NO_PIC = NO; - GCC_NO_COMMON_BLOCKS = YES; - GCC_OPTIMIZATION_LEVEL = 0; - GCC_PREPROCESSOR_DEFINITIONS = ( - "DEBUG=1", - "$(inherited)", - ); - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 11.3; - MTL_ENABLE_DEBUG_INFO = YES; - ONLY_ACTIVE_ARCH = YES; - SDKROOT = iphoneos; - }; - name = Debug; - }; - FC086BB720E7839B00D85EF7 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - CLANG_ANALYZER_NONNULL = YES; - CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; - CLANG_CXX_LIBRARY = "libc++"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - CLANG_ENABLE_OBJC_WEAK = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; - CLANG_WARN_DOCUMENTATION_COMMENTS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = YES; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - CODE_SIGN_IDENTITY = "iPhone Developer"; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; - ENABLE_NS_ASSERTIONS = NO; - ENABLE_STRICT_OBJC_MSGSEND = YES; - GCC_C_LANGUAGE_STANDARD = gnu11; - GCC_NO_COMMON_BLOCKS = YES; - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 11.3; - MTL_ENABLE_DEBUG_INFO = NO; - SDKROOT = iphoneos; - VALIDATE_PRODUCT = YES; - }; - name = Release; - }; - FC086BB920E7839B00D85EF7 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = Z5M2UUN5YV; - HEADER_SEARCH_PATHS = ../../src; - OTHER_LDFLAGS = "-ObjC"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SKIP_INSTALL = YES; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Debug; - }; - FC086BBA20E7839B00D85EF7 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = Z5M2UUN5YV; - HEADER_SEARCH_PATHS = ../../src; - OTHER_LDFLAGS = "-ObjC"; - PRODUCT_NAME = "$(TARGET_NAME)"; - SKIP_INSTALL = YES; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Release; - }; -/* End XCBuildConfiguration section */ - -/* Begin XCConfigurationList section */ - FC086BAA20E7839B00D85EF7 /* Build configuration list for PBXProject "PaddleMobile" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - FC086BB620E7839B00D85EF7 /* Debug */, - FC086BB720E7839B00D85EF7 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; - FC086BB820E7839B00D85EF7 /* Build configuration list for PBXNativeTarget "PaddleMobile" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - FC086BB920E7839B00D85EF7 /* Debug */, - FC086BBA20E7839B00D85EF7 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; -/* End XCConfigurationList section */ - }; - rootObject = FC086BA720E7839B00D85EF7 /* Project object */; -} diff --git a/ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate b/ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate deleted file mode 100644 index d1170b2289cb3302e40f5101b720bc835d08e7e1..0000000000000000000000000000000000000000 Binary files a/ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate and /dev/null differ diff --git a/ios/PaddleMobile/PaddleMobile.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist b/ios/PaddleMobile/PaddleMobile.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist deleted file mode 100644 index a877b7cd221cba607fdb31df85bfa008b95d988c..0000000000000000000000000000000000000000 --- a/ios/PaddleMobile/PaddleMobile.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist +++ /dev/null @@ -1,14 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> -<plist version="1.0"> -<dict> - <key>SchemeUserState</key> - <dict> - <key>PaddleMobile.xcscheme</key> - <dict> - <key>orderHint</key> - <integer>1</integer> - </dict> - </dict> -</dict> -</plist> diff --git a/ios/PaddleMobile/PaddleMobile/MacroDefine.h b/ios/PaddleMobile/PaddleMobile/MacroDefine.h deleted file mode 100644 index a09c420e87ec698345b6d1ffa4fd64c2c6ef9b47..0000000000000000000000000000000000000000 --- a/ios/PaddleMobile/PaddleMobile/MacroDefine.h +++ /dev/null @@ -1,13 +0,0 @@ -// -// MacroDefine.h -// PaddleMobile -// -// Created by liuRuiLong on 2018/6/30. -// Copyright © 2018年 orange. All rights reserved. -// - -#ifndef MacroDefine_h -#define MacroDefine_h - - -#endif /* MacroDefine_h */ diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.pbxproj b/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.pbxproj deleted file mode 100644 index e7e77ada15e84a6957a082c203c0121e118c5a3b..0000000000000000000000000000000000000000 --- a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.pbxproj +++ /dev/null @@ -1,340 +0,0 @@ -// !$*UTF8*$!
-{ - archiveVersion = 1; - classes = { - }; - objectVersion = 50; - objects = { - -/* Begin PBXBuildFile section */ - FC086BC920E783AF00D85EF7 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = FC086BC820E783AF00D85EF7 /* AppDelegate.m */; }; - FC086BCC20E783AF00D85EF7 /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = FC086BCB20E783AF00D85EF7 /* ViewController.m */; }; - FC086BCF20E783AF00D85EF7 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC086BCD20E783AF00D85EF7 /* Main.storyboard */; }; - FC086BD120E783B100D85EF7 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = FC086BD020E783B100D85EF7 /* Assets.xcassets */; }; - FC086BD420E783B100D85EF7 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC086BD220E783B100D85EF7 /* LaunchScreen.storyboard */; }; - FC086BD720E783B100D85EF7 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = FC086BD620E783B100D85EF7 /* main.m */; }; -/* End PBXBuildFile section */ - -/* Begin PBXFileReference section */ - FC086BC420E783AF00D85EF7 /* PaddleMobileDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = PaddleMobileDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; - FC086BC720E783AF00D85EF7 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; - FC086BC820E783AF00D85EF7 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; - FC086BCA20E783AF00D85EF7 /* ViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ViewController.h; sourceTree = ""; }; - FC086BCB20E783AF00D85EF7 /* ViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ViewController.m; sourceTree = ""; }; - FC086BCE20E783AF00D85EF7 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = ""; }; - FC086BD020E783B100D85EF7 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; - FC086BD320E783B100D85EF7 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = ""; }; - FC086BD520E783B100D85EF7 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; - FC086BD620E783B100D85EF7 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = ""; }; -/* End PBXFileReference section */ - -/* Begin PBXFrameworksBuildPhase section */ - FC086BC120E783AF00D85EF7 /* Frameworks */ = { - isa = PBXFrameworksBuildPhase; - buildActionMask = 2147483647; - files = ( - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXFrameworksBuildPhase section */ - -/* Begin PBXGroup section */ - FC086BBB20E783AF00D85EF7 = { - isa = PBXGroup; - children = ( - FC086BC620E783AF00D85EF7 /* PaddleMobileDemo */, - FC086BC520E783AF00D85EF7 /* Products */, - ); - sourceTree = ""; - }; - FC086BC520E783AF00D85EF7 /* Products */ = { - isa = PBXGroup; - children = ( - FC086BC420E783AF00D85EF7 /* PaddleMobileDemo.app */, - ); - name = Products; - sourceTree = ""; - }; - FC086BC620E783AF00D85EF7 /* PaddleMobileDemo */ = { - isa = PBXGroup; - children = ( - FC086BC720E783AF00D85EF7 /* AppDelegate.h */, - FC086BC820E783AF00D85EF7 /* 
AppDelegate.m */, - FC086BCA20E783AF00D85EF7 /* ViewController.h */, - FC086BCB20E783AF00D85EF7 /* ViewController.m */, - FC086BCD20E783AF00D85EF7 /* Main.storyboard */, - FC086BD020E783B100D85EF7 /* Assets.xcassets */, - FC086BD220E783B100D85EF7 /* LaunchScreen.storyboard */, - FC086BD520E783B100D85EF7 /* Info.plist */, - FC086BD620E783B100D85EF7 /* main.m */, - ); - path = PaddleMobileDemo; - sourceTree = ""; - }; -/* End PBXGroup section */ - -/* Begin PBXNativeTarget section */ - FC086BC320E783AF00D85EF7 /* PaddleMobileDemo */ = { - isa = PBXNativeTarget; - buildConfigurationList = FC086BDA20E783B100D85EF7 /* Build configuration list for PBXNativeTarget "PaddleMobileDemo" */; - buildPhases = ( - FC086BC020E783AF00D85EF7 /* Sources */, - FC086BC120E783AF00D85EF7 /* Frameworks */, - FC086BC220E783AF00D85EF7 /* Resources */, - ); - buildRules = ( - ); - dependencies = ( - ); - name = PaddleMobileDemo; - productName = PaddleMobileDemo; - productReference = FC086BC420E783AF00D85EF7 /* PaddleMobileDemo.app */; - productType = "com.apple.product-type.application"; - }; -/* End PBXNativeTarget section */ - -/* Begin PBXProject section */ - FC086BBC20E783AF00D85EF7 /* Project object */ = { - isa = PBXProject; - attributes = { - LastUpgradeCheck = 0930; - ORGANIZATIONNAME = orange; - TargetAttributes = { - FC086BC320E783AF00D85EF7 = { - CreatedOnToolsVersion = 9.3.1; - }; - }; - }; - buildConfigurationList = FC086BBF20E783AF00D85EF7 /* Build configuration list for PBXProject "PaddleMobileDemo" */; - compatibilityVersion = "Xcode 9.3"; - developmentRegion = en; - hasScannedForEncodings = 0; - knownRegions = ( - en, - Base, - ); - mainGroup = FC086BBB20E783AF00D85EF7; - productRefGroup = FC086BC520E783AF00D85EF7 /* Products */; - projectDirPath = ""; - projectRoot = ""; - targets = ( - FC086BC320E783AF00D85EF7 /* PaddleMobileDemo */, - ); - }; -/* End PBXProject section */ - -/* Begin PBXResourcesBuildPhase section */ - FC086BC220E783AF00D85EF7 /* Resources */ = { - isa = PBXResourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - FC086BD420E783B100D85EF7 /* LaunchScreen.storyboard in Resources */, - FC086BD120E783B100D85EF7 /* Assets.xcassets in Resources */, - FC086BCF20E783AF00D85EF7 /* Main.storyboard in Resources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXResourcesBuildPhase section */ - -/* Begin PBXSourcesBuildPhase section */ - FC086BC020E783AF00D85EF7 /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - FC086BCC20E783AF00D85EF7 /* ViewController.m in Sources */, - FC086BD720E783B100D85EF7 /* main.m in Sources */, - FC086BC920E783AF00D85EF7 /* AppDelegate.m in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXSourcesBuildPhase section */ - -/* Begin PBXVariantGroup section */ - FC086BCD20E783AF00D85EF7 /* Main.storyboard */ = { - isa = PBXVariantGroup; - children = ( - FC086BCE20E783AF00D85EF7 /* Base */, - ); - name = Main.storyboard; - sourceTree = ""; - }; - FC086BD220E783B100D85EF7 /* LaunchScreen.storyboard */ = { - isa = PBXVariantGroup; - children = ( - FC086BD320E783B100D85EF7 /* Base */, - ); - name = LaunchScreen.storyboard; - sourceTree = ""; - }; -/* End PBXVariantGroup section */ - -/* Begin XCBuildConfiguration section */ - FC086BD820E783B100D85EF7 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - CLANG_ANALYZER_NONNULL = YES; - CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; 
- CLANG_CXX_LIBRARY = "libc++"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - CLANG_ENABLE_OBJC_WEAK = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; - CLANG_WARN_DOCUMENTATION_COMMENTS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = YES; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - CODE_SIGN_IDENTITY = "iPhone Developer"; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = dwarf; - ENABLE_STRICT_OBJC_MSGSEND = YES; - ENABLE_TESTABILITY = YES; - GCC_C_LANGUAGE_STANDARD = gnu11; - GCC_DYNAMIC_NO_PIC = NO; - GCC_NO_COMMON_BLOCKS = YES; - GCC_OPTIMIZATION_LEVEL = 0; - GCC_PREPROCESSOR_DEFINITIONS = ( - "DEBUG=1", - "$(inherited)", - ); - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 11.3; - MTL_ENABLE_DEBUG_INFO = YES; - ONLY_ACTIVE_ARCH = YES; - SDKROOT = iphoneos; - }; - name = Debug; - }; - FC086BD920E783B100D85EF7 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ALWAYS_SEARCH_USER_PATHS = NO; - CLANG_ANALYZER_NONNULL = YES; - CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; - CLANG_CXX_LIBRARY = "libc++"; - CLANG_ENABLE_MODULES = YES; - CLANG_ENABLE_OBJC_ARC = YES; - CLANG_ENABLE_OBJC_WEAK = YES; - CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; - CLANG_WARN_BOOL_CONVERSION = YES; - CLANG_WARN_COMMA = YES; - CLANG_WARN_CONSTANT_CONVERSION = YES; - CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; - CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; - CLANG_WARN_DOCUMENTATION_COMMENTS = YES; - CLANG_WARN_EMPTY_BODY = YES; - CLANG_WARN_ENUM_CONVERSION = YES; - CLANG_WARN_INFINITE_RECURSION = YES; - CLANG_WARN_INT_CONVERSION = YES; - CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; - CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; - CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; - CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; - CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; - CLANG_WARN_STRICT_PROTOTYPES = YES; - CLANG_WARN_SUSPICIOUS_MOVE = YES; - CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; - CLANG_WARN_UNREACHABLE_CODE = YES; - CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - CODE_SIGN_IDENTITY = "iPhone Developer"; - COPY_PHASE_STRIP = NO; - DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; - ENABLE_NS_ASSERTIONS = NO; - ENABLE_STRICT_OBJC_MSGSEND = YES; - GCC_C_LANGUAGE_STANDARD = gnu11; - GCC_NO_COMMON_BLOCKS = YES; - GCC_WARN_64_TO_32_BIT_CONVERSION = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; - GCC_WARN_UNDECLARED_SELECTOR = YES; - GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; - GCC_WARN_UNUSED_FUNCTION = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 11.3; - MTL_ENABLE_DEBUG_INFO = NO; - SDKROOT = iphoneos; 
- VALIDATE_PRODUCT = YES; - }; - name = Release; - }; - FC086BDB20E783B100D85EF7 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = Z5M2UUN5YV; - INFOPLIST_FILE = PaddleMobileDemo/Info.plist; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - PRODUCT_BUNDLE_IDENTIFIER = orange.PaddleMobileDemo; - PRODUCT_NAME = "$(TARGET_NAME)"; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Debug; - }; - FC086BDC20E783B100D85EF7 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; - CODE_SIGN_STYLE = Automatic; - DEVELOPMENT_TEAM = Z5M2UUN5YV; - INFOPLIST_FILE = PaddleMobileDemo/Info.plist; - LD_RUNPATH_SEARCH_PATHS = ( - "$(inherited)", - "@executable_path/Frameworks", - ); - PRODUCT_BUNDLE_IDENTIFIER = orange.PaddleMobileDemo; - PRODUCT_NAME = "$(TARGET_NAME)"; - TARGETED_DEVICE_FAMILY = "1,2"; - }; - name = Release; - }; -/* End XCBuildConfiguration section */ - -/* Begin XCConfigurationList section */ - FC086BBF20E783AF00D85EF7 /* Build configuration list for PBXProject "PaddleMobileDemo" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - FC086BD820E783B100D85EF7 /* Debug */, - FC086BD920E783B100D85EF7 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; - FC086BDA20E783B100D85EF7 /* Build configuration list for PBXNativeTarget "PaddleMobileDemo" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - FC086BDB20E783B100D85EF7 /* Debug */, - FC086BDC20E783B100D85EF7 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; -/* End XCConfigurationList section */ - }; - rootObject = FC086BBC20E783AF00D85EF7 /* Project object */; -} diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate b/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate deleted file mode 100644 index 69b699dbf553971a96471ef864e4e848c9b17e12..0000000000000000000000000000000000000000 Binary files a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate and /dev/null differ diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist b/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist deleted file mode 100644 index 7caa9222e77f1e53c0ee45c298aacb330e870688..0000000000000000000000000000000000000000 --- a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist +++ /dev/null @@ -1,14 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> -<plist version="1.0"> -<dict> - <key>SchemeUserState</key> - <dict> - <key>PaddleMobileDemo.xcscheme</key> - <dict> - <key>orderHint</key> - <integer>0</integer> - </dict> - </dict> -</dict> -</plist> diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.m b/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.m deleted file mode 100644 index 12cc19636b43f7fc3634736c4b551b4aba29ce73..0000000000000000000000000000000000000000 --- a/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.m +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License. */
-
-#import "AppDelegate.h"
-
-@interface AppDelegate ()
-
-@end
-
-@implementation AppDelegate
-
-
-- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
-    // Override point for customization after application launch.
-    return YES;
-}
-
-
-- (void)applicationWillResignActive:(UIApplication *)application {
-    // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
-    // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game.
-}
-
-
-- (void)applicationDidEnterBackground:(UIApplication *)application {
-    // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
-    // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
-}
-
-
-- (void)applicationWillEnterForeground:(UIApplication *)application {
-    // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background.
-}
-
-
-- (void)applicationDidBecomeActive:(UIApplication *)application {
-    // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
-}
-
-
-- (void)applicationWillTerminate:(UIApplication *)application {
-    // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
-}
-
-
-@end
diff --git a/metal/Podfile b/metal/Podfile
new file mode 100644
index 0000000000000000000000000000000000000000..6e9a6c6e3713ceaafc8d1769d7ec731ecc78b615
--- /dev/null
+++ b/metal/Podfile
@@ -0,0 +1,19 @@
+platform :ios, '9.0'
+use_frameworks!
+
+workspace 'paddle-mobile.xcworkspace'
+
+target 'paddle-mobile-demo' do
+  project 'paddle-mobile-demo/paddle-mobile-demo.xcodeproj'
+  pod 'SwiftProtobuf', '~> 1.0'
+end
+
+target 'paddle-mobile' do
+  project 'paddle-mobile/paddle-mobile.xcodeproj'
+  pod 'SwiftProtobuf', '~> 1.0'
+end
+
+target 'paddle-mobile-unit-test' do
+  project 'paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj'
+  pod 'SwiftProtobuf', '~> 1.0'
+end
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
new file mode 100644
index 0000000000000000000000000000000000000000..d6114880efcaf528bd26fcda11e08ec68d943575
--- /dev/null
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
@@ -0,0 +1,562 @@
+// !$*UTF8*$!
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 50; + objects = { + +/* Begin PBXBuildFile section */ + 30D0ED21F392CFA3885B1002 /* Pods_paddle_mobile_demo.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 18896810981724F8A0FED62A /* Pods_paddle_mobile_demo.framework */; }; + C2E67E5E21524E460013F575 /* LoadPointerViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = C2E67E5D21524E460013F575 /* LoadPointerViewController.m */; }; + FC013928210204A3008100E3 /* PreProcessKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC013927210204A3008100E3 /* PreProcessKernel.metal */; }; + FC039B8220E11C550081E9F8 /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B8120E11C550081E9F8 /* AppDelegate.swift */; }; + FC039B8420E11C550081E9F8 /* ViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B8320E11C550081E9F8 /* ViewController.swift */; }; + FC039B8720E11C550081E9F8 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8520E11C550081E9F8 /* Main.storyboard */; }; + FC039B8920E11C560081E9F8 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8820E11C560081E9F8 /* Assets.xcassets */; }; + FC039B8C20E11C560081E9F8 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FC039B8A20E11C560081E9F8 /* LaunchScreen.storyboard */; }; + FC803BCD214D27930094B8E5 /* FPSCounter.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BCB214D27920094B8E5 /* FPSCounter.swift */; }; + FC803BCE214D27930094B8E5 /* VideoCapture.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BCC214D27920094B8E5 /* VideoCapture.swift */; }; + FCBCCC552122EF5500D94F7E /* MetalHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC542122EF5400D94F7E /* MetalHelper.swift */; }; + FCEBEC2C20E1391F00C0B14D /* paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */; }; + FCEBEC2D20E1391F00C0B14D /* paddle_mobile.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; }; + FCF437E8214B6DDB00943429 /* MultiPredictViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */; }; + FCFE9B692152858600DECA15 /* hand.jpg.zip in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B632152858600DECA15 /* hand.jpg.zip */; }; + FCFE9B6A2152858600DECA15 /* synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B642152858600DECA15 /* synset.txt */; }; + FCFE9B6B2152858600DECA15 /* banana.jpeg in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B652152858600DECA15 /* banana.jpeg */; }; + FCFE9B6C2152858600DECA15 /* hand.jpg in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B662152858600DECA15 /* hand.jpg */; }; + FCFE9B6D2152858600DECA15 /* iphone.JPG in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B672152858600DECA15 /* iphone.JPG */; }; + FCFE9B6E2152858600DECA15 /* paddle-mobile.png in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B682152858600DECA15 /* paddle-mobile.png */; }; + FCFE9C512152859600DECA15 /* genet_params in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B752152859500DECA15 /* genet_params */; }; + FCFE9C522152859600DECA15 /* genet_model in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9B762152859500DECA15 /* genet_model */; }; + FCFE9D232152859600DECA15 /* ar_model 
in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9C4C2152859500DECA15 /* ar_model */; }; + FCFE9D242152859600DECA15 /* ar_params in Resources */ = {isa = PBXBuildFile; fileRef = FCFE9C4D2152859500DECA15 /* ar_params */; }; +/* End PBXBuildFile section */ + +/* Begin PBXCopyFilesBuildPhase section */ + FCEBEC2E20E1392000C0B14D /* Embed Frameworks */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = ""; + dstSubfolderSpec = 10; + files = ( + FCEBEC2D20E1391F00C0B14D /* paddle_mobile.framework in Embed Frameworks */, + ); + name = "Embed Frameworks"; + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXCopyFilesBuildPhase section */ + +/* Begin PBXFileReference section */ + 081C9CF10DB06C58B8B6B039 /* Pods-paddle-mobile-demo.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-demo.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-demo/Pods-paddle-mobile-demo.release.xcconfig"; sourceTree = ""; }; + 18896810981724F8A0FED62A /* Pods_paddle_mobile_demo.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile_demo.framework; sourceTree = BUILT_PRODUCTS_DIR; }; + 878829884E1A14D7044721D5 /* Pods-paddle-mobile-demo.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-demo.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-demo/Pods-paddle-mobile-demo.debug.xcconfig"; sourceTree = ""; }; + C2E67E5C21524E460013F575 /* LoadPointerViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LoadPointerViewController.h; sourceTree = ""; }; + C2E67E5D21524E460013F575 /* LoadPointerViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = LoadPointerViewController.m; sourceTree = ""; }; + FC013927210204A3008100E3 /* PreProcessKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreProcessKernel.metal; sourceTree = ""; }; + FC039B7E20E11C550081E9F8 /* paddle-mobile-demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "paddle-mobile-demo.app"; sourceTree = BUILT_PRODUCTS_DIR; }; + FC039B8120E11C550081E9F8 /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = ""; }; + FC039B8320E11C550081E9F8 /* ViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ViewController.swift; sourceTree = ""; }; + FC039B8620E11C550081E9F8 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = ""; }; + FC039B8820E11C560081E9F8 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; + FC039B8B20E11C560081E9F8 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = ""; }; + FC039B8D20E11C560081E9F8 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + FC27991121343A39000B6BAD /* paddle-mobile-demo-Bridging-Header.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = "paddle-mobile-demo-Bridging-Header.h"; sourceTree = ""; }; + 
FC4FD97B2140EE250073E130 /* libc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libc++.tbd"; path = "usr/lib/libc++.tbd"; sourceTree = SDKROOT; }; + FC803BCB214D27920094B8E5 /* FPSCounter.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FPSCounter.swift; sourceTree = ""; }; + FC803BCC214D27920094B8E5 /* VideoCapture.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VideoCapture.swift; sourceTree = ""; }; + FCBCCC542122EF5400D94F7E /* MetalHelper.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MetalHelper.swift; sourceTree = ""; }; + FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; }; + FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MultiPredictViewController.swift; sourceTree = ""; }; + FCFE9B632152858600DECA15 /* hand.jpg.zip */ = {isa = PBXFileReference; lastKnownFileType = archive.zip; path = hand.jpg.zip; sourceTree = ""; }; + FCFE9B642152858600DECA15 /* synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = synset.txt; sourceTree = ""; }; + FCFE9B652152858600DECA15 /* banana.jpeg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = banana.jpeg; sourceTree = ""; }; + FCFE9B662152858600DECA15 /* hand.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = hand.jpg; sourceTree = ""; }; + FCFE9B672152858600DECA15 /* iphone.JPG */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = iphone.JPG; sourceTree = ""; }; + FCFE9B682152858600DECA15 /* paddle-mobile.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; path = "paddle-mobile.png"; sourceTree = ""; }; + FCFE9B752152859500DECA15 /* genet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = genet_params; sourceTree = ""; }; + FCFE9B762152859500DECA15 /* genet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = genet_model; sourceTree = ""; }; + FCFE9C4C2152859500DECA15 /* ar_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = ar_model; sourceTree = ""; }; + FCFE9C4D2152859500DECA15 /* ar_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = ar_params; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + FC039B7B20E11C550081E9F8 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + FCEBEC2C20E1391F00C0B14D /* paddle_mobile.framework in Frameworks */, + 30D0ED21F392CFA3885B1002 /* Pods_paddle_mobile_demo.framework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 5722B50FEC38F55CA9B6A57B /* Pods */ = { + isa = PBXGroup; + children = ( + 878829884E1A14D7044721D5 /* Pods-paddle-mobile-demo.debug.xcconfig */, + 081C9CF10DB06C58B8B6B039 /* Pods-paddle-mobile-demo.release.xcconfig */, + ); + name = Pods; + sourceTree = ""; + }; + 7B7DED984E9EE7BFB45E24E8 /* Frameworks */ = { + isa = PBXGroup; + children = ( + FC4FD97B2140EE250073E130 /* libc++.tbd */, + 18896810981724F8A0FED62A /* Pods_paddle_mobile_demo.framework */, + ); + name = Frameworks; + sourceTree = ""; + }; 
+ FC039B7520E11C550081E9F8 = { + isa = PBXGroup; + children = ( + FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */, + FC039B8020E11C550081E9F8 /* paddle-mobile-demo */, + FC039B7F20E11C550081E9F8 /* Products */, + 5722B50FEC38F55CA9B6A57B /* Pods */, + 7B7DED984E9EE7BFB45E24E8 /* Frameworks */, + ); + sourceTree = ""; + }; + FC039B7F20E11C550081E9F8 /* Products */ = { + isa = PBXGroup; + children = ( + FC039B7E20E11C550081E9F8 /* paddle-mobile-demo.app */, + ); + name = Products; + sourceTree = ""; + }; + FC039B8020E11C550081E9F8 /* paddle-mobile-demo */ = { + isa = PBXGroup; + children = ( + FCFE9B6F2152859500DECA15 /* models */, + FCFE9B622152858600DECA15 /* images */, + FC803BCA214D27920094B8E5 /* VideoCapture */, + FC8CFED2213519540094D569 /* Net */, + FC039B8120E11C550081E9F8 /* AppDelegate.swift */, + FC039B8320E11C550081E9F8 /* ViewController.swift */, + FC039B8520E11C550081E9F8 /* Main.storyboard */, + FC039B8820E11C560081E9F8 /* Assets.xcassets */, + FC039B8A20E11C560081E9F8 /* LaunchScreen.storyboard */, + FC039B8D20E11C560081E9F8 /* Info.plist */, + FC27991121343A39000B6BAD /* paddle-mobile-demo-Bridging-Header.h */, + FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */, + C2E67E5C21524E460013F575 /* LoadPointerViewController.h */, + C2E67E5D21524E460013F575 /* LoadPointerViewController.m */, + ); + path = "paddle-mobile-demo"; + sourceTree = ""; + }; + FC803BCA214D27920094B8E5 /* VideoCapture */ = { + isa = PBXGroup; + children = ( + FC803BCB214D27920094B8E5 /* FPSCounter.swift */, + FC803BCC214D27920094B8E5 /* VideoCapture.swift */, + ); + path = VideoCapture; + sourceTree = ""; + }; + FC8CFED2213519540094D569 /* Net */ = { + isa = PBXGroup; + children = ( + FC013927210204A3008100E3 /* PreProcessKernel.metal */, + FCBCCC542122EF5400D94F7E /* MetalHelper.swift */, + ); + path = Net; + sourceTree = ""; + }; + FCFE9B622152858600DECA15 /* images */ = { + isa = PBXGroup; + children = ( + FCFE9B632152858600DECA15 /* hand.jpg.zip */, + FCFE9B642152858600DECA15 /* synset.txt */, + FCFE9B652152858600DECA15 /* banana.jpeg */, + FCFE9B662152858600DECA15 /* hand.jpg */, + FCFE9B672152858600DECA15 /* iphone.JPG */, + FCFE9B682152858600DECA15 /* paddle-mobile.png */, + ); + name = images; + path = ../../images; + sourceTree = ""; + }; + FCFE9B6F2152859500DECA15 /* models */ = { + isa = PBXGroup; + children = ( + FCFE9B742152859500DECA15 /* genet */, + FCFE9C4B2152859500DECA15 /* fluid_fssd_new_ar */, + ); + name = models; + path = ../../models; + sourceTree = ""; + }; + FCFE9B742152859500DECA15 /* genet */ = { + isa = PBXGroup; + children = ( + FCFE9B752152859500DECA15 /* genet_params */, + FCFE9B762152859500DECA15 /* genet_model */, + ); + path = genet; + sourceTree = ""; + }; + FCFE9C4B2152859500DECA15 /* fluid_fssd_new_ar */ = { + isa = PBXGroup; + children = ( + FCFE9C4C2152859500DECA15 /* ar_model */, + FCFE9C4D2152859500DECA15 /* ar_params */, + ); + path = fluid_fssd_new_ar; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + FC039B7D20E11C550081E9F8 /* paddle-mobile-demo */ = { + isa = PBXNativeTarget; + buildConfigurationList = FC039B9020E11C560081E9F8 /* Build configuration list for PBXNativeTarget "paddle-mobile-demo" */; + buildPhases = ( + 9E041A9C487A2E44C709327E /* [CP] Check Pods Manifest.lock */, + FC039B7A20E11C550081E9F8 /* Sources */, + FC039B7B20E11C550081E9F8 /* Frameworks */, + FC039B7C20E11C550081E9F8 /* Resources */, + 84ED590C0E51ABA9C34F51B5 /* [CP] Embed Pods Frameworks */, + FCEBEC2E20E1392000C0B14D /* 
Embed Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = "paddle-mobile-demo"; + productName = "paddle-mobile-demo"; + productReference = FC039B7E20E11C550081E9F8 /* paddle-mobile-demo.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + FC039B7620E11C550081E9F8 /* Project object */ = { + isa = PBXProject; + attributes = { + LastSwiftUpdateCheck = 0930; + LastUpgradeCheck = 0930; + ORGANIZATIONNAME = orange; + TargetAttributes = { + FC039B7D20E11C550081E9F8 = { + CreatedOnToolsVersion = 9.3.1; + LastSwiftMigration = 0940; + }; + }; + }; + buildConfigurationList = FC039B7920E11C550081E9F8 /* Build configuration list for PBXProject "paddle-mobile-demo" */; + compatibilityVersion = "Xcode 9.3"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = FC039B7520E11C550081E9F8; + productRefGroup = FC039B7F20E11C550081E9F8 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + FC039B7D20E11C550081E9F8 /* paddle-mobile-demo */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + FC039B7C20E11C550081E9F8 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + FCFE9D232152859600DECA15 /* ar_model in Resources */, + FC039B8C20E11C560081E9F8 /* LaunchScreen.storyboard in Resources */, + FCFE9C522152859600DECA15 /* genet_model in Resources */, + FCFE9D242152859600DECA15 /* ar_params in Resources */, + FCFE9B6E2152858600DECA15 /* paddle-mobile.png in Resources */, + FCFE9C512152859600DECA15 /* genet_params in Resources */, + FCFE9B692152858600DECA15 /* hand.jpg.zip in Resources */, + FC039B8920E11C560081E9F8 /* Assets.xcassets in Resources */, + FCFE9B6A2152858600DECA15 /* synset.txt in Resources */, + FCFE9B6B2152858600DECA15 /* banana.jpeg in Resources */, + FCFE9B6D2152858600DECA15 /* iphone.JPG in Resources */, + FCFE9B6C2152858600DECA15 /* hand.jpg in Resources */, + FC039B8720E11C550081E9F8 /* Main.storyboard in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXShellScriptBuildPhase section */ + 84ED590C0E51ABA9C34F51B5 /* [CP] Embed Pods Frameworks */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputPaths = ( + "${SRCROOT}/../Pods/Target Support Files/Pods-paddle-mobile-demo/Pods-paddle-mobile-demo-frameworks.sh", + "${BUILT_PRODUCTS_DIR}/SwiftProtobuf/SwiftProtobuf.framework", + ); + name = "[CP] Embed Pods Frameworks"; + outputPaths = ( + "${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/SwiftProtobuf.framework", + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "\"${SRCROOT}/../Pods/Target Support Files/Pods-paddle-mobile-demo/Pods-paddle-mobile-demo-frameworks.sh\"\n"; + showEnvVarsInLog = 0; + }; + 9E041A9C487A2E44C709327E /* [CP] Check Pods Manifest.lock */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputPaths = ( + "${PODS_PODFILE_DIR_PATH}/Podfile.lock", + "${PODS_ROOT}/Manifest.lock", + ); + name = "[CP] Check Pods Manifest.lock"; + outputPaths = ( + "$(DERIVED_FILE_DIR)/Pods-paddle-mobile-demo-checkManifestLockResult.txt", + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? 
!= 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; + showEnvVarsInLog = 0; + }; +/* End PBXShellScriptBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + FC039B7A20E11C550081E9F8 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + FC039B8420E11C550081E9F8 /* ViewController.swift in Sources */, + FC803BCE214D27930094B8E5 /* VideoCapture.swift in Sources */, + FC013928210204A3008100E3 /* PreProcessKernel.metal in Sources */, + FCF437E8214B6DDB00943429 /* MultiPredictViewController.swift in Sources */, + FCBCCC552122EF5500D94F7E /* MetalHelper.swift in Sources */, + FC803BCD214D27930094B8E5 /* FPSCounter.swift in Sources */, + C2E67E5E21524E460013F575 /* LoadPointerViewController.m in Sources */, + FC039B8220E11C550081E9F8 /* AppDelegate.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin PBXVariantGroup section */ + FC039B8520E11C550081E9F8 /* Main.storyboard */ = { + isa = PBXVariantGroup; + children = ( + FC039B8620E11C550081E9F8 /* Base */, + ); + name = Main.storyboard; + sourceTree = ""; + }; + FC039B8A20E11C560081E9F8 /* LaunchScreen.storyboard */ = { + isa = PBXVariantGroup; + children = ( + FC039B8B20E11C560081E9F8 /* Base */, + ); + name = LaunchScreen.storyboard; + sourceTree = ""; + }; +/* End PBXVariantGroup section */ + +/* Begin XCBuildConfiguration section */ + FC039B8E20E11C560081E9F8 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CODE_SIGN_IDENTITY = "iPhone Developer"; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + 
IPHONEOS_DEPLOYMENT_TARGET = 11.3; + MTL_ENABLE_DEBUG_INFO = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + FC039B8F20E11C560081E9F8 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CODE_SIGN_IDENTITY = "iPhone Developer"; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 11.3; + MTL_ENABLE_DEBUG_INFO = NO; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + FC039B9120E11C560081E9F8 /* Debug */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 878829884E1A14D7044721D5 /* Pods-paddle-mobile-demo.debug.xcconfig */; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_IDENTITY = "iPhone Developer"; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = A798K58VVL; + ENABLE_BITCODE = NO; + INFOPLIST_FILE = "paddle-mobile-demo/Info.plist"; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = "com.baidu.paddle-mobile"; + PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE = ""; + PROVISIONING_PROFILE_SPECIFIER = ""; + SWIFT_OBJC_BRIDGING_HEADER = "paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h"; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + SWIFT_VERSION = 4.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + FC039B9220E11C560081E9F8 /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 081C9CF10DB06C58B8B6B039 /* Pods-paddle-mobile-demo.release.xcconfig */; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_IDENTITY = "iPhone Developer"; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = A798K58VVL; + ENABLE_BITCODE = NO; + INFOPLIST_FILE = 
"paddle-mobile-demo/Info.plist"; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = "com.baidu.paddle-mobile"; + PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE = ""; + PROVISIONING_PROFILE_SPECIFIER = ""; + SWIFT_OBJC_BRIDGING_HEADER = "paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h"; + SWIFT_VERSION = 4.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + FC039B7920E11C550081E9F8 /* Build configuration list for PBXProject "paddle-mobile-demo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + FC039B8E20E11C560081E9F8 /* Debug */, + FC039B8F20E11C560081E9F8 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + FC039B9020E11C560081E9F8 /* Build configuration list for PBXNativeTarget "paddle-mobile-demo" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + FC039B9120E11C560081E9F8 /* Debug */, + FC039B9220E11C560081E9F8 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = FC039B7620E11C550081E9F8 /* Project object */; +} diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/contents.xcworkspacedata similarity index 66% rename from ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata rename to metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/contents.xcworkspacedata index e4db9529ba656814e6a2bd889426662d914277eb..d363ac3d832069ff15c89241985b5be4f48a4e1a 100644 --- a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/contents.xcworkspacedata +++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/contents.xcworkspacedata @@ -2,6 +2,6 @@ + location = "self:paddle-mobile-demo.xcodeproj"> diff --git a/ios/PaddleMobile.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist similarity index 100% rename from ios/PaddleMobile.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist rename to metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate new file mode 100644 index 0000000000000000000000000000000000000000..c13bba168aef55d0004299258e02496fc2486236 Binary files /dev/null and b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate differ diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift new file mode 100644 index 0000000000000000000000000000000000000000..537fb06ed9e5b9100bea43b7acae9c014e0f4a78 --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift @@ -0,0 +1,51 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import UIKit + +@UIApplicationMain +class AppDelegate: UIResponder, UIApplicationDelegate { + + var window: UIWindow? + + func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool { + // Override point for customization after application launch. + return true + } + + func applicationWillResignActive(_ application: UIApplication) { + // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state. + // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game. + } + + func applicationDidEnterBackground(_ application: UIApplication) { + // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later. + // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits. + } + + func applicationWillEnterForeground(_ application: UIApplication) { + // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background. + } + + func applicationDidBecomeActive(_ application: UIApplication) { + // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface. + } + + func applicationWillTerminate(_ application: UIApplication) { + // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:. 
+    }
+
+
+}
+
diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/AppIcon.appiconset/Contents.json b/metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/AppIcon.appiconset/Contents.json
similarity index 100%
rename from ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/AppIcon.appiconset/Contents.json
rename to metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/AppIcon.appiconset/Contents.json
diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/Contents.json b/metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/Contents.json
similarity index 100%
rename from ios/PaddleMobileDemo/PaddleMobileDemo/Assets.xcassets/Contents.json
rename to metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/Contents.json
diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/LaunchScreen.storyboard b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/LaunchScreen.storyboard
similarity index 100%
rename from ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/LaunchScreen.storyboard
rename to metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/LaunchScreen.storyboard
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
new file mode 100644
index 0000000000000000000000000000000000000000..d72694fdacf5b46821ba6422fa77e095f92382b9
--- /dev/null
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
@@ -0,0 +1,312 @@
[Main.storyboard: 312 lines of Interface Builder XML defining the demo UI]
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Info.plist b/metal/paddle-mobile-demo/paddle-mobile-demo/Info.plist
new file mode 100644
index 0000000000000000000000000000000000000000..665ff9e0cdcc7a102a23bc7b28754ba794c59967
--- /dev/null
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Info.plist
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>$(DEVELOPMENT_LANGUAGE)</string>
+	<key>CFBundleExecutable</key>
+	<string>$(EXECUTABLE_NAME)</string>
+	<key>CFBundleIdentifier</key>
+	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>$(PRODUCT_NAME)</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleVersion</key>
+	<string>1</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>NSCameraUsageDescription</key>
+	<string>use camera</string>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>use album</string>
+	<key>UILaunchStoryboardName</key>
+	<string>LaunchScreen</string>
+	<key>UIMainStoryboardFile</key>
+	<string>Main</string>
+	<key>UIRequiredDeviceCapabilities</key>
+	<array>
+		<string>armv7</string>
+	</array>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+	</array>
+	<key>UISupportedInterfaceOrientations~ipad</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+</dict>
+</plist>
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.h b/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.h
new file mode 100644
index 0000000000000000000000000000000000000000..a876c236219817bf146cfa4a77eb9421f8472971
--- /dev/null
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.h
@@ -0,0 +1,13 @@
+//
+//  LoadPointerViewController.h
+//  paddle-mobile-demo
+//
+//  Created by Xiao,Haichun on 2018/9/19.
+//  Copyright © 2018 orange. All rights reserved.
+//
+
+#import <UIKit/UIKit.h>
+
+@interface LoadPointerViewController : UIViewController
+
+@end
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.m b/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.m
new file mode 100644
index 0000000000000000000000000000000000000000..857745686fbe750de08e8be357ccf5a4159eaae8
--- /dev/null
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/LoadPointerViewController.m
@@ -0,0 +1,171 @@
+//
+//  LoadPointerViewController.m
+//  paddle-mobile-demo
+//
+//  Created by Xiao,Haichun on 2018/9/19.
+//  Copyright © 2018 orange. All rights reserved.
+//
+
+#import "LoadPointerViewController.h"
+#import <Metal/Metal.h>
+#import "paddle-mobile-demo-Bridging-Header.h"
+
+@interface LoadPointerViewController ()
+
+@property (strong, nonatomic) id<MTLDevice> device;
+@property (strong, nonatomic) id<MTLTexture> texture;
+@property (strong, nonatomic) id<MTLCommandQueue> queue;
+@property (strong, nonatomic) PaddleMobileGPU *runner;
+@property (strong, nonatomic) ModelConfig *modelConfig;
+
+@end
+
+@implementation LoadPointerViewController
+
+- (void)viewDidLoad {
+    [super viewDidLoad];
+
+
+    self.device = MTLCreateSystemDefaultDevice();
+
+    self.queue = [self.device newCommandQueue];
+
+    // Do any additional setup after loading the view.
+//    NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"genet_model" withExtension:nil].path;
+//    NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"genet_params" withExtension:nil].path;
+
+    NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"ar_model" withExtension:nil].path;
+    NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"ar_params" withExtension:nil].path;
+
+    long fileSize;
+    FILE *fp;
+    fp = fopen([modelPath UTF8String], "rb");
+    fseek(fp, 0, SEEK_END);
+    fileSize = ftell(fp);
+    rewind(fp);
+    void *buffer = malloc(fileSize);
+    fread(buffer, 1, fileSize, fp);
+    fclose(fp);
+
+    long paramfileSize;
+    FILE *parmaFilePointer;
+    parmaFilePointer = fopen([paramPath UTF8String], "rb");
+    fseek(parmaFilePointer, 0, SEEK_END);
+    paramfileSize = ftell(parmaFilePointer);
+    rewind(parmaFilePointer);
+    void *parmaBuffer = malloc(paramfileSize);
+    fread(parmaBuffer, 1, paramfileSize, parmaFilePointer);
+    fclose(parmaFilePointer);
+
+    _modelConfig = [[ModelConfig alloc] init];
+//    _modelConfig.means = @[[NSNumber numberWithFloat:128.0], [NSNumber numberWithFloat:128.0], [NSNumber numberWithFloat:128.0]];
+//    _modelConfig.scale = 0.017;
+//    _modelConfig.dims = @[[NSNumber numberWithFloat:1], [NSNumber numberWithFloat:128.], [NSNumber numberWithFloat:128.0],[NSNumber numberWithFloat:3.0]];
+    _modelConfig.means = @[[NSNumber numberWithFloat:103.94], [NSNumber numberWithFloat:116.78], [NSNumber numberWithFloat:123.68]];
+    _modelConfig.scale = 1;
+    _modelConfig.dims = @[[NSNumber numberWithFloat:1], [NSNumber numberWithFloat:160.], [NSNumber numberWithFloat:160.0],[NSNumber numberWithFloat:3.0]];
+    _modelConfig.modelPointer = buffer;
+    _modelConfig.modelSize = (int)fileSize;
+    _modelConfig.paramPointer = parmaBuffer;
+    _modelConfig.paramSize = (int)paramfileSize;
+}
+- (IBAction)loaderButtonPressed:(id)sender {
+//    _runner = [[PaddleMobileGPU alloc] initWithCommandQueue:self.queue net:GenetType modelConfig:_modelConfig];
+    _runner = [[PaddleMobileGPU alloc] initWithCommandQueue:self.queue net:MobileNetSSDType modelConfig:_modelConfig];
+
+    [_runner load];
+}
+- (IBAction)predictButtonPressed:(id)sender {
+    [self predict];
+}
+
+- (id<MTLTexture>) createTextureFromImage:(UIImage*) image device:(id<MTLDevice>) device
+{
+    image =[UIImage imageWithCGImage:[image CGImage]
+                               scale:[image scale]
+                         orientation: UIImageOrientationLeft];
+
+    NSLog(@"orientation and size and stuff %ld %f %f", (long)image.imageOrientation, image.size.width, image.size.height);
+
+    CGImageRef imageRef = image.CGImage;
+
+    size_t width = self.view.frame.size.width;
+    size_t height = self.view.frame.size.height;
+
+    size_t bitsPerComponent = CGImageGetBitsPerComponent(imageRef);
+    size_t bitsPerPixel = CGImageGetBitsPerPixel(imageRef);
+
+    CGColorSpaceRef colorSpace = CGImageGetColorSpace(imageRef);
+
+    CGImageAlphaInfo alphaInfo = CGImageGetAlphaInfo(imageRef);
+
+    // NSLog(@"%@ %u", colorSpace, alphaInfo);
+
+    CGBitmapInfo bitmapInfo = kCGBitmapByteOrderDefault | alphaInfo;
+    // NSLog(@"bitmap info %u", bitmapInfo);
+
+
+    CGContextRef context = CGBitmapContextCreate( NULL, width, height, bitsPerComponent, (bitsPerPixel / 8) * width, colorSpace, bitmapInfo);
+
+    if( !context )
+    {
+        NSLog(@"Failed to load image, probably an unsupported texture type");
+        return nil;
+    }
+
+    CGContextDrawImage( context, CGRectMake( 0, 0, width, height ), image.CGImage);
+
+
+    MTLPixelFormat format = MTLPixelFormatRGBA8Unorm;
+
+    MTLTextureDescriptor *texDesc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
+                                                                                       width:width
+                                                                                      height:height
+                                                                                   mipmapped:NO];
+    id<MTLTexture> texture = [device newTextureWithDescriptor:texDesc];
+
+    [texture replaceRegion:MTLRegionMake2D(0, 0, width, height)
+               mipmapLevel:0
+                 withBytes:CGBitmapContextGetData(context)
+               bytesPerRow:4 * width];
+
+    return texture;
+}
+
+- (void)predict {
+    _texture = [self createTextureFromImage:[UIImage imageNamed:@"hand.jpg"] device:self.device];
+    NSTimeInterval startTime = [[NSDate date] timeIntervalSince1970];
+    NSInteger max = 428;
+    for (int i = 0;i < max; i ++) {
+        [_runner predict:_texture withCompletion:^(BOOL success , NSArray *result) {
+            if (success) {
+                if (i == max -1) {
+                    double time = [[NSDate date] timeIntervalSince1970] - startTime;
+                    time = (time/max)*1000;
+                    NSLog(@"gap ==== %fms",time);
+                }
+//                for (int i = 0; i < result.count; i ++) {
+//                    NSNumber *number = result[i];
+//                    NSLog(@"result %d = %f:",i, [number floatValue]);
+//                }
+            }
+        }];
+    }
+}
+
+- (void)didReceiveMemoryWarning {
+    [super didReceiveMemoryWarning];
+    // Dispose of any resources that can be recreated.
+}
+
+/*
+#pragma mark - Navigation
+
+// In a storyboard-based application, you will often want to do a little preparation before navigation
+- (void)prepareForSegue:(UIStoryboardSegue *)segue sender:(id)sender {
+    // Get the new view controller using [segue destinationViewController].
+    // Pass the selected object to the new view controller.
+}
+*/
+
+@end
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
new file mode 100644
index 0000000000000000000000000000000000000000..bd07da61d0215b243372c27addf60efc3b2ad7d6
--- /dev/null
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
@@ -0,0 +1,66 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
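The Objective-C loader above reads the whole model and parameter files into malloc'd buffers and hands the raw pointers to PaddleMobileGPU through a ModelConfig. A minimal Swift sketch of the same pointer-based flow, assuming ModelConfig and PaddleMobileGPU import into Swift via the bridging header (the resource names are the ones bundled by this demo):

```swift
import Foundation

// Sketch only: mirrors the pointer-based loading in LoadPointerViewController above.
func makeARModelConfig() -> ModelConfig? {
    guard
        let modelPath = Bundle.main.url(forResource: "ar_model", withExtension: nil)?.path,
        let paramPath = Bundle.main.url(forResource: "ar_params", withExtension: nil)?.path,
        let modelData = FileManager.default.contents(atPath: modelPath),
        let paramData = FileManager.default.contents(atPath: paramPath) else { return nil }

    // ModelConfig keeps the raw pointers alive past this call, so copy into buffers we own.
    let modelBuffer = malloc(modelData.count)!
    modelData.copyBytes(to: modelBuffer.assumingMemoryBound(to: UInt8.self), count: modelData.count)
    let paramBuffer = malloc(paramData.count)!
    paramData.copyBytes(to: paramBuffer.assumingMemoryBound(to: UInt8.self), count: paramData.count)

    let config = ModelConfig()
    config.means = [103.94, 116.78, 123.68]  // same BGR means as the Objective-C above
    config.scale = 1
    config.dims = [1, 160, 160, 3]           // NHWC input shape used for the AR net
    config.modelPointer = modelBuffer
    config.modelSize = Int32(modelData.count)
    config.paramPointer = paramBuffer
    config.paramSize = Int32(paramData.count)
    return config
}
```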
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import UIKit +import paddle_mobile + +class MultiPredictViewController: UIViewController { + var runner1: Runner! + var runner2: Runner! + override func viewDidLoad() { + super.viewDidLoad() + let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device) + let genet = Genet.init(device: MetalHelper.shared.device) + runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU) + let queue2 = MetalHelper.shared.device.makeCommandQueue() + + runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU) + } + + @IBAction func predictAct(_ sender: Any) { + let success = self.runner2.load() +// DispatchQueue.global().async { + let image1 = UIImage.init(named: "hand.jpg") +// let success = self.runner2.load() +// if success { +// for i in 0..<10000 { +// print(i) +// self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in +// print("result1: ") +//// print(res) +// }) +// } +// } else { +// print("load failed") +// } +// self.runner1.clear() +// } +// return +// DispatchQueue.global().async { +//// sleep(1) +// let image1 = UIImage.init(named: "banana.jpeg") +//// if success { +// for _ in 0..<10 { +// self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in +// print("result2: ") +// print(res) +// }) +// } +//// } else { +//// print("load failed") +//// } +//// self.runner2.clear() +// } + } +} diff --git a/ios/PaddleMobile/PaddleMobile/PaddleMobile.h b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MetalHelper.swift similarity index 60% rename from ios/PaddleMobile/PaddleMobile/PaddleMobile.h rename to metal/paddle-mobile-demo/paddle-mobile-demo/Net/MetalHelper.swift index 3878c54c8a19a99e535bd8ad90eb1e19e28757c3..d314e8b3f8845ef95b36b4b25e61809d353f0f24 100644 --- a/ios/PaddleMobile/PaddleMobile/PaddleMobile.h +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MetalHelper.swift @@ -12,15 +12,22 @@ See the License for the specific language governing permissions and limitations under the License. */ -#import -#import +import Metal +import MetalKit +import Foundation +import paddle_mobile -@interface PaddleMobile : NSObject +class MetalHelper { + let device: MTLDevice + let queue: MTLCommandQueue + let textureLoader: MTKTextureLoader + static let shared: MetalHelper = MetalHelper.init() + private init(){ + device = MTLCreateSystemDefaultDevice()! + queue = device.makeCommandQueue()! 
+        textureLoader = MTKTextureLoader.init(device: device)
+    }
+
-+ (instancetype)sharedInstance;
-- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
-- (NSArray *)predict:(CGImageRef)image means:(NSArray *)means scale:(float)scale;
-- (NSArray *)predict:(CGImageRef)image;
-- (void)clear;
+}
-@end
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PaddleMobile.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PaddleMobile.swift
new file mode 100644
index 0000000000000000000000000000000000000000..a954328acae3a80643ad849d58cd6ac86bf7865e
--- /dev/null
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PaddleMobile.swift
@@ -0,0 +1,9 @@
+//
+//  PaddleMobile.swift
+//  paddle-mobile-demo
+//
+//  Created by liuRuiLong on 2018/9/5.
+//  Copyright © 2018 orange. All rights reserved.
+//
+
+import Foundation
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..ac07e449bc5919a37a57143aa6881f79507a45b4
--- /dev/null
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
@@ -0,0 +1,137 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+
+kernel void mobilenet_preprocess(
+                                 texture2d<float, access::read> inTexture [[texture(0)]],
+                                 texture2d<float, access::write> outTexture [[texture(1)]],
+                                 uint2 gid [[thread_position_in_grid]])
+{
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);
+    const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+}
+
+kernel void mobilenet_preprocess_half(
+                                      texture2d<half, access::read> inTexture [[texture(0)]],
+                                      texture2d<half, access::write> outTexture [[texture(1)]],
+                                      uint2 gid [[thread_position_in_grid]])
+{
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f);
+    const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+}
+
+kernel void mobilenet_ssd_preprocess(
+                                     texture2d<float, access::read> inTexture [[texture(0)]],
+                                     texture2d<float, access::write> outTexture [[texture(1)]],
+                                     uint2 gid [[thread_position_in_grid]])
+{
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);
+    const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+}
+
+kernel void mobilenet_ssd_preprocess_half(
+                                          texture2d<half, access::read> inTexture [[texture(0)]],
+                                          texture2d<half, access::write> outTexture [[texture(1)]],
+                                          uint2 gid [[thread_position_in_grid]])
+{
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f);
+    const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+}
+
+kernel void genet_preprocess(texture2d<float, access::read> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
+{
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f);
+    const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+}
+
+kernel void genet_preprocess_half(texture2d<half, access::read> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
+{
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f);
+    const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+}
+
+kernel void mobilent_ar_preprocess(texture2d<float, access::read> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
+{
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f);
+    const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+}
+
+kernel void mobilent_ar_preprocess_half(texture2d<half, access::read> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]])
+{
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f);
+    const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
+    outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
+}
+
+kernel void scale(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) return;
+    float w_stride = inTexture.get_width() / outTexture.get_width();
+    float h_stride = inTexture.get_height() / outTexture.get_height();
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0);
+    outTexture.write(input, gid);
+}
+
+kernel void scale_half(texture2d<float, access::sample> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height()) return;
+    float w_stride = inTexture.get_width() / outTexture.get_width();
+    float h_stride = inTexture.get_height() / outTexture.get_height();
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0);
+    outTexture.write(half4(input), gid);
+}
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
new file mode 100644
index 0000000000000000000000000000000000000000..f9e841f9c2a3060e775726023b6d5cfc3eeb679d
--- /dev/null
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift
@@ -0,0 +1,31 @@
+
+
+import Foundation
+import QuartzCore
+
+public class FPSCounter {
+    private(set) public var fps: Double = 0
+
+    var frames = 0
+    var startTime: CFTimeInterval = 0
+
+    public func start() {
+        frames = 0
+        startTime = CACurrentMediaTime()
+    }
+
+    public func frameCompleted() {
+        frames += 1
+        let now = CACurrentMediaTime()
+        let elapsed = now - startTime
+        if elapsed > 0.1 {
+            let current = Double(frames) / elapsed
+            let smoothing = 0.75
+            fps = smoothing*fps + (1 - smoothing)*current
+            if elapsed > 1 {
+                frames = 0
+                startTime = CACurrentMediaTime()
+            }
+        }
+    }
+}
diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
new file mode 100644
index 0000000000000000000000000000000000000000..c235ed2f0391bdc97e9e182c0e9897814a0518fa
--- /dev/null
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
@@ -0,0 +1,218 @@
+
+import UIKit
+import Metal
+import CoreVideo
+import AVFoundation
+
+@available(iOS 10.0, *)
+@objc public protocol VideoCaptureDelegate: NSObjectProtocol {
+    @objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime)
+    @objc optional func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime)
+    @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhoto previewImage: UIImage?)
+    @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhotoTexture texture: MTLTexture?)
+}
+
+/**
+  Simple interface to the iPhone's camera.
+*/
+@available(iOS 10.0, *)
+public class VideoCapture: NSObject {
+    public var previewLayer: AVCaptureVideoPreviewLayer?
+    public weak var delegate: VideoCaptureDelegate?
+    public var fps = -1
+    private let device: MTLDevice?
+ private let videoOrientation: AVCaptureVideoOrientation + private var textureCache: CVMetalTextureCache? + private let captureSession = AVCaptureSession() + private let videoOutput = AVCaptureVideoDataOutput() + private let photoOutput = AVCapturePhotoOutput() + private let queue = DispatchQueue(label: "net.machinethink.camera-queue") + private var lastTimestamp = CMTime() + private let cameraPosition: AVCaptureDevice.Position + public init(device: MTLDevice? = nil, orientation: AVCaptureVideoOrientation = .portrait, position: AVCaptureDevice.Position = .back) { + self.device = device + self.videoOrientation = orientation + self.cameraPosition = position + super.init() + } + + public func setUp(sessionPreset: AVCaptureSession.Preset = .medium, + completion: @escaping (Bool) -> Void) { + queue.async { + let success = self.setUpCamera(sessionPreset: sessionPreset) + DispatchQueue.main.async { + completion(success) + } + } + } + + func fontCamera() -> AVCaptureDevice? { + let deveices = AVCaptureDevice.DiscoverySession.init(deviceTypes: [.builtInWideAngleCamera], mediaType: AVMediaType.video, position: .front).devices + return deveices.first + + } + + func setUpCamera(sessionPreset: AVCaptureSession.Preset) -> Bool { + if let inDevice = device{ + guard CVMetalTextureCacheCreate(kCFAllocatorDefault, nil, inDevice, nil, &textureCache) == kCVReturnSuccess else { + print("Error: could not create a texture cache") + return false + } + } + + captureSession.beginConfiguration() + captureSession.sessionPreset = sessionPreset + + var oCaptureDevice: AVCaptureDevice? + switch cameraPosition { + case .back: + oCaptureDevice = AVCaptureDevice.default(for: AVMediaType.video) + break + case .front: + oCaptureDevice = fontCamera() + break + default: + break + } + + guard let captureDevice = oCaptureDevice else { + print("Error: no video devices available") + return false + } + + guard let videoInput = try? AVCaptureDeviceInput(device: captureDevice) else { + print("Error: could not create AVCaptureDeviceInput") + return false + } + + if captureSession.canAddInput(videoInput) { + captureSession.addInput(videoInput) + } + + let previewLayer = AVCaptureVideoPreviewLayer(session: captureSession) + previewLayer.videoGravity = AVLayerVideoGravity.resizeAspect + previewLayer.connection?.videoOrientation = self.videoOrientation + self.previewLayer = previewLayer + + let settings: [String : Any] = [ + kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA) + ] + + videoOutput.videoSettings = settings + videoOutput.alwaysDiscardsLateVideoFrames = true + videoOutput.setSampleBufferDelegate(self, queue: queue) + if captureSession.canAddOutput(videoOutput) { + captureSession.addOutput(videoOutput) + } + + // We want the buffers to be in portrait orientation otherwise they are + // rotated by 90 degrees. Need to set this _after_ addOutput()! + videoOutput.connection(with: AVMediaType.video)?.videoOrientation = self.videoOrientation + + if captureSession.canAddOutput(photoOutput) { + captureSession.addOutput(photoOutput) + } + + captureSession.commitConfiguration() + return true + } + + public func start() { + if !captureSession.isRunning { + captureSession.startRunning() + } + } + + public func stop() { + if captureSession.isRunning { + captureSession.stopRunning() + } + } + + /* Captures a single frame of the camera input. 
*/ + public func capturePhoto() { + let settings = AVCapturePhotoSettings(format: [kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)]) + settings.previewPhotoFormat = [ + kCVPixelBufferPixelFormatTypeKey as String: settings.__availablePreviewPhotoPixelFormatTypes[0], + kCVPixelBufferWidthKey as String: 480, + kCVPixelBufferHeightKey as String: 360, + ] + photoOutput.capturePhoto(with: settings, delegate: self) + } + + func convertToMTLTexture(sampleBuffer: CMSampleBuffer?) -> MTLTexture? { + if let textureCache = textureCache, let sampleBuffer = sampleBuffer, let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) { + let width = CVPixelBufferGetWidth(imageBuffer) + let height = CVPixelBufferGetHeight(imageBuffer) + var texture: CVMetalTexture? + CVMetalTextureCacheCreateTextureFromImage(kCFAllocatorDefault, textureCache, imageBuffer, nil, .bgra8Unorm, width, height, 0, &texture) + if let texture = texture { + return CVMetalTextureGetTexture(texture) + } + } + return nil + } + + func convertToUIImage(sampleBuffer: CMSampleBuffer?) -> UIImage? { + if let sampleBuffer = sampleBuffer, + let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) { + let width = CVPixelBufferGetWidth(imageBuffer) + let height = CVPixelBufferGetHeight(imageBuffer) + let rect = CGRect(x: 0, y: 0, width: CGFloat(width), height: CGFloat(height)) + let ciImage = CIImage(cvPixelBuffer: imageBuffer) + let ciContext = CIContext(options: nil) + if let cgImage = ciContext.createCGImage(ciImage, from: rect) { + return UIImage(cgImage: cgImage) + } + } + return nil + } +} + + +@available(iOS 10.0, *) +extension VideoCapture: AVCaptureVideoDataOutputSampleBufferDelegate { + public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + // Because lowering the capture device's FPS looks ugly in the preview, + // we capture at full speed but only call the delegate at its desired + // framerate. If `fps` is -1, we run at the full framerate. + let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer) + let deltaTime = timestamp - lastTimestamp + if fps == -1 || deltaTime >= CMTimeMake(1, Int32(fps)) { + lastTimestamp = timestamp + self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp) + if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{ + let texture = convertToMTLTexture(sampleBuffer: sampleBuffer) + delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp) + } + } + } + + public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + print("dropped frame") + } +} + +@available(iOS 10.0, *) +extension VideoCapture: AVCapturePhotoCaptureDelegate { + public func photoOutput(_ captureOutput: AVCapturePhotoOutput, + didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?, + previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?, + resolvedSettings: AVCaptureResolvedPhotoSettings, + bracketSettings: AVCaptureBracketedStillImageSettings?, + error: Error?) { + var imageTexture: MTLTexture? + var previewImage: UIImage? + if error == nil { + if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))) ?? 
false { + imageTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer) + self.delegate?.videoCapture?(self, didCapturePhotoTexture: imageTexture) + } + + if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))) ?? false { + previewImage = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer) + self.delegate?.videoCapture?(self, didCapturePhoto: previewImage) + } + } + } +} diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift new file mode 100644 index 0000000000000000000000000000000000000000..1c6d0a91c9bf1d202091282e43859270a238edaa --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift @@ -0,0 +1,305 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import UIKit +import MetalKit +import CoreMedia +import paddle_mobile +import MetalPerformanceShaders + +var platform: Platform = .GPU +let threadSupport: [(Platform, String)] = [(.GPU, "GPU"), (.CPU, "CPU")] + +//.mobilenet_ssd : Runner.init(inNet: MobileNet_ssd_hand.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform), +let modelHelperMap: [SupportModel : Runner] = [ + .genet : Runner.init(inNet: Genet.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform), + .mobilenet_ssd_ar : Runner.init(inNet: MobileNet_ssd_AR.init(device: MetalHelper.shared.device), commandQueue: MetalHelper.shared.queue, inPlatform: platform)] +//, .genet : Genet.init() +//let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()] + +let netSupport: [SupportModel : Net] = [.genet : Genet.init(device: MetalHelper.shared.device), .mobilenet_ssd_ar : MobileNet_ssd_AR.init(device: MetalHelper.shared.device)] + +enum SupportModel: String { + // case mobilenet = "mobilenet" +// case mobilenet_ssd = "mobilenetssd" + case genet = "genet" + case mobilenet_ssd_ar = "mobilenetssd_ar" + + static func supportedModels() -> [SupportModel] { + // .mobilenet, + // .mobilenet_ssd, + return [.genet, .mobilenet_ssd_ar] + } +} + +class ViewController: UIViewController { + @IBOutlet weak var resultTextView: UITextView! + @IBOutlet weak var selectImageView: UIImageView! + @IBOutlet weak var elapsedTimeLabel: UILabel! + @IBOutlet weak var modelPickerView: UIPickerView! + @IBOutlet weak var threadPickerView: UIPickerView! + @IBOutlet weak var videoView: UIView! +// var videoCapture: VideoCapture! + + var selectImage: UIImage? + var inputPointer: UnsafeMutablePointer<Float>? + var modelType: SupportModel = SupportModel.supportedModels()[0] + var toPredictTexture: MTLTexture? + + var runner: Runner!
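+
+  // A minimal sketch of the Runner flow that the actions below drive, using
+  // only names defined in this file (Genet, MetalHelper, Runner). Added for
+  // illustration; it is hypothetical and not wired to any UI action.
+  private func exampleRunOnce(on image: CGImage) {
+    let r = Runner.init(inNet: Genet.init(device: MetalHelper.shared.device),
+                        commandQueue: MetalHelper.shared.queue,
+                        inPlatform: .GPU)
+    guard r.load() else { return }       // compile the program for the GPU
+    r.getTexture(image: image) { (texture) in
+      r.predict(texture: texture) { (success, resultHolder) in
+        resultHolder?.releasePointer()   // result buffers are released manually
+      }
+    }
+  }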
+ + var threadNum = 1 + + @IBAction func loadAct(_ sender: Any) { + runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue, inPlatform: platform) + + if platform == .CPU { + if inputPointer == nil { + inputPointer = runner.preproccess(image: selectImage!.cgImage!) + } + } else if platform == .GPU { + if self.toPredictTexture == nil { + runner.getTexture(image: selectImage!.cgImage!) { [weak self] (texture) in + self?.toPredictTexture = texture + } + } + } else { + fatalError("unsupported platform") + } + + if runner.load() { + print("load success!") + } else { + print("load error!") + } + } + + @IBAction func selectImageAct(_ sender: Any) { + let imagePicker = UIImagePickerController() + imagePicker.sourceType = .camera + imagePicker.delegate = self + self.present(imagePicker, animated: true, completion: nil) + } + + @IBAction func clearAct(_ sender: Any) { + runner.clear() + } + + @IBAction func predictAct(_ sender: Any) { + let max = 50 + switch platform { + case .GPU: + guard let inTexture = toPredictTexture else { + resultTextView.text = "Please select an image!" + return + } + + // warm up before timing + for _ in 0..<10 { + runner.predict(texture: inTexture) { (success, resultHolder) in + resultHolder?.releasePointer() + } + } + + let startDate = Date.init() + for i in 0..<max { + runner.predict(texture: inTexture) { [weak self] (success, resultHolder) in + if success && i == max - 1 { + let time = Date.init().timeIntervalSince(startDate) + DispatchQueue.main.async { + self?.elapsedTimeLabel.text = "\(time / Double(max) * 1000.0) ms" + } + } + resultHolder?.releasePointer() + } + } + default: + break + } + } +} + +extension ViewController: UIPickerViewDataSource, UIPickerViewDelegate { + func numberOfComponents(in pickerView: UIPickerView) -> Int { + if pickerView == modelPickerView { + return 1 + } else if pickerView == threadPickerView { + return 1 + } else { + fatalError() + } + } + + func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int { + if pickerView == modelPickerView { + return SupportModel.supportedModels().count + } else if pickerView == threadPickerView { + return threadSupport.count + } else { + fatalError() + } + } + + public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? { + if pickerView == modelPickerView { + return SupportModel.supportedModels()[row].rawValue + } else if pickerView == threadPickerView { + return threadSupport[row].1 + } else { + fatalError() + } + } + + public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) { + if pickerView == modelPickerView { + self.modelType = SupportModel.supportedModels()[row] + } else if pickerView == threadPickerView { + platform = threadSupport[row].0 + } else { + fatalError() + } + } +} + +extension ViewController: UIImagePickerControllerDelegate, UINavigationControllerDelegate { + func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) { + picker.dismiss(animated: true) { [weak self] in + guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? 
UIImage else{ + fatalError("no image") + } + sSelf.selectImage = image + sSelf.selectImageView.image = image + sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in + sSelf.toPredictTexture = texture + }) + } + } +} + +var bool1 = false +extension ViewController: VideoCaptureDelegate{ + func predictTexture(texture: MTLTexture){ + runner.scaleTexture(input: texture) { (scaledTexture) in + self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in +// print(resultHolder!.result![0]) + resultHolder?.releasePointer() + }) + } + } + + +// @available(iOS 10.0, *) +// func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) { +//// if !bool1 { +//// DispatchQueue.main.asyncAfter(deadline: DispatchTime.init(uptimeNanoseconds: 500000000)) { +// self.predictTexture(texture: texture!) +//// } +// +// +//// bool1 = true +//// } +// +// } + +} + + + + diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h b/metal/paddle-mobile-demo/paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h new file mode 100644 index 0000000000000000000000000000000000000000..92de82860ccd372ba0eae962edd1b271986f1862 --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/paddle-mobile-demo-Bridging-Header.h @@ -0,0 +1,5 @@ +// +// Use this file to import your target's public headers that you would like to expose to Swift. +// + +#import diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.pbxproj b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.pbxproj new file mode 100644 index 0000000000000000000000000000000000000000..50d58bb45bb5c0e8e5ffbbe8f10ce3e41b770f7c --- /dev/null +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.pbxproj @@ -0,0 +1,478 @@ +// !$*UTF8*$! 
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 50; + objects = { + +/* Begin PBXBuildFile section */ + 168DA950D7D6CF91EBF70A17 /* Pods_paddle_mobile_unit_test.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8BCD4792E483BFEE9F5523DE /* Pods_paddle_mobile_unit_test.framework */; }; + FC607427211DF3B100B17547 /* synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FC60734E211DF3B000B17547 /* synset.txt */; }; + FC607428211DF3B100B17547 /* banana.jpeg in Resources */ = {isa = PBXBuildFile; fileRef = FC60734F211DF3B000B17547 /* banana.jpeg */; }; + FC607429211DF3B100B17547 /* iphone.JPG in Resources */ = {isa = PBXBuildFile; fileRef = FC607350211DF3B000B17547 /* iphone.JPG */; }; + FC60742A211DF3B100B17547 /* paddle-mobile.png in Resources */ = {isa = PBXBuildFile; fileRef = FC607351211DF3B000B17547 /* paddle-mobile.png */; }; + FC60742B211DF3B100B17547 /* params in Resources */ = {isa = PBXBuildFile; fileRef = FC607354211DF3B000B17547 /* params */; }; + FC60742C211DF3B100B17547 /* model in Resources */ = {isa = PBXBuildFile; fileRef = FC607355211DF3B000B17547 /* model */; }; + FC91818D211DAE9A00B6F354 /* paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FC91818C211DAE9A00B6F354 /* paddle_mobile.framework */; }; + FC91818E211DAE9A00B6F354 /* paddle_mobile.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = FC91818C211DAE9A00B6F354 /* paddle_mobile.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; }; + FCDFD409211D9185005AB38B /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDFD408211D9185005AB38B /* AppDelegate.swift */; }; + FCDFD40B211D9185005AB38B /* ViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDFD40A211D9185005AB38B /* ViewController.swift */; }; + FCDFD40E211D9185005AB38B /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FCDFD40C211D9185005AB38B /* Main.storyboard */; }; + FCDFD410211D9187005AB38B /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = FCDFD40F211D9187005AB38B /* Assets.xcassets */; }; + FCDFD413211D9187005AB38B /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = FCDFD411211D9187005AB38B /* LaunchScreen.storyboard */; }; +/* End PBXBuildFile section */ + +/* Begin PBXCopyFilesBuildPhase section */ + FC91818F211DAE9B00B6F354 /* Embed Frameworks */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = ""; + dstSubfolderSpec = 10; + files = ( + FC91818E211DAE9A00B6F354 /* paddle_mobile.framework in Embed Frameworks */, + ); + name = "Embed Frameworks"; + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXCopyFilesBuildPhase section */ + +/* Begin PBXFileReference section */ + 5CC132C848027BE970FB2637 /* Pods-paddle-mobile-unit-test.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-unit-test.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-unit-test/Pods-paddle-mobile-unit-test.debug.xcconfig"; sourceTree = "<group>"; }; + 72F34AE9677943FC580DE7F4 /* Pods-paddle-mobile-unit-test.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-unit-test.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-unit-test/Pods-paddle-mobile-unit-test.release.xcconfig"; sourceTree = "<group>"; }; + 8BCD4792E483BFEE9F5523DE /* Pods_paddle_mobile_unit_test.framework */ = {isa = 
PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile_unit_test.framework; sourceTree = BUILT_PRODUCTS_DIR; }; + FC60734E211DF3B000B17547 /* synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = synset.txt; sourceTree = "<group>"; }; + FC60734F211DF3B000B17547 /* banana.jpeg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = banana.jpeg; sourceTree = "<group>"; }; + FC607350211DF3B000B17547 /* iphone.JPG */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = iphone.JPG; sourceTree = "<group>"; }; + FC607351211DF3B000B17547 /* paddle-mobile.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; path = "paddle-mobile.png"; sourceTree = "<group>"; }; + FC607354211DF3B000B17547 /* params */ = {isa = PBXFileReference; lastKnownFileType = file; path = params; sourceTree = "<group>"; }; + FC607355211DF3B000B17547 /* model */ = {isa = PBXFileReference; lastKnownFileType = file; path = model; sourceTree = "<group>"; }; + FC91818C211DAE9A00B6F354 /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; }; + FCDFD405211D9185005AB38B /* paddle-mobile-unit-test.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "paddle-mobile-unit-test.app"; sourceTree = BUILT_PRODUCTS_DIR; }; + FCDFD408211D9185005AB38B /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; }; + FCDFD40A211D9185005AB38B /* ViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ViewController.swift; sourceTree = "<group>"; }; + FCDFD40D211D9185005AB38B /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = "<group>"; }; + FCDFD40F211D9187005AB38B /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; }; + FCDFD412211D9187005AB38B /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; }; + FCDFD414211D9187005AB38B /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + FCDFD402211D9185005AB38B /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + FC91818D211DAE9A00B6F354 /* paddle_mobile.framework in Frameworks */, + 168DA950D7D6CF91EBF70A17 /* Pods_paddle_mobile_unit_test.framework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + CF78F766C11CC8AD67269581 /* Frameworks */ = { + isa = PBXGroup; + children = ( + 8BCD4792E483BFEE9F5523DE /* Pods_paddle_mobile_unit_test.framework */, + ); + name = Frameworks; + sourceTree = "<group>"; + }; + F752428B187BC4E0928ACD3D /* Pods */ = { + isa = PBXGroup; + children = ( + 5CC132C848027BE970FB2637 /* Pods-paddle-mobile-unit-test.debug.xcconfig */, + 72F34AE9677943FC580DE7F4 /* Pods-paddle-mobile-unit-test.release.xcconfig */, + ); + name = Pods; + sourceTree = "<group>"; + }; + FC60734D211DF3B000B17547 /* images */ = { + isa = PBXGroup; + children = ( + FC60734E211DF3B000B17547 /* synset.txt */, + FC60734F211DF3B000B17547 /* banana.jpeg */, + 
FC607350211DF3B000B17547 /* iphone.JPG */, + FC607351211DF3B000B17547 /* paddle-mobile.png */, + ); + name = images; + path = ../../images; + sourceTree = "<group>"; + }; + FC607352211DF3B000B17547 /* models */ = { + isa = PBXGroup; + children = ( + FC607353211DF3B000B17547 /* mobilenet */, + ); + name = models; + path = ../../models; + sourceTree = "<group>"; + }; + FC607353211DF3B000B17547 /* mobilenet */ = { + isa = PBXGroup; + children = ( + FC607354211DF3B000B17547 /* params */, + FC607355211DF3B000B17547 /* model */, + ); + path = mobilenet; + sourceTree = "<group>"; + }; + FCDFD3FC211D9185005AB38B = { + isa = PBXGroup; + children = ( + FC91818C211DAE9A00B6F354 /* paddle_mobile.framework */, + FCDFD407211D9185005AB38B /* paddle-mobile-unit-test */, + FCDFD406211D9185005AB38B /* Products */, + F752428B187BC4E0928ACD3D /* Pods */, + CF78F766C11CC8AD67269581 /* Frameworks */, + ); + sourceTree = "<group>"; + }; + FCDFD406211D9185005AB38B /* Products */ = { + isa = PBXGroup; + children = ( + FCDFD405211D9185005AB38B /* paddle-mobile-unit-test.app */, + ); + name = Products; + sourceTree = "<group>"; + }; + FCDFD407211D9185005AB38B /* paddle-mobile-unit-test */ = { + isa = PBXGroup; + children = ( + FC60734D211DF3B000B17547 /* images */, + FC607352211DF3B000B17547 /* models */, + FCDFD408211D9185005AB38B /* AppDelegate.swift */, + FCDFD40A211D9185005AB38B /* ViewController.swift */, + FCDFD40C211D9185005AB38B /* Main.storyboard */, + FCDFD40F211D9187005AB38B /* Assets.xcassets */, + FCDFD411211D9187005AB38B /* LaunchScreen.storyboard */, + FCDFD414211D9187005AB38B /* Info.plist */, + ); + path = "paddle-mobile-unit-test"; + sourceTree = "<group>"; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + FCDFD404211D9185005AB38B /* paddle-mobile-unit-test */ = { + isa = PBXNativeTarget; + buildConfigurationList = FCDFD417211D9187005AB38B /* Build configuration list for PBXNativeTarget "paddle-mobile-unit-test" */; + buildPhases = ( + 5F5A9A9DC0C6307DEA4294C1 /* [CP] Check Pods Manifest.lock */, + FCDFD401211D9185005AB38B /* Sources */, + FCDFD402211D9185005AB38B /* Frameworks */, + FCDFD403211D9185005AB38B /* Resources */, + 53A2089068F9D64BB96D4322 /* [CP] Embed Pods Frameworks */, + FC91818F211DAE9B00B6F354 /* Embed Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = "paddle-mobile-unit-test"; + productName = "paddle-mobile-unit-test"; + productReference = FCDFD405211D9185005AB38B /* paddle-mobile-unit-test.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + FCDFD3FD211D9185005AB38B /* Project object */ = { + isa = PBXProject; + attributes = { + LastSwiftUpdateCheck = 0940; + LastUpgradeCheck = 0940; + ORGANIZATIONNAME = orange; + TargetAttributes = { + FCDFD404211D9185005AB38B = { + CreatedOnToolsVersion = 9.4.1; + }; + }; + }; + buildConfigurationList = FCDFD400211D9185005AB38B /* Build configuration list for PBXProject "paddle-mobile-unit-test" */; + compatibilityVersion = "Xcode 9.3"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = FCDFD3FC211D9185005AB38B; + productRefGroup = FCDFD406211D9185005AB38B /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + FCDFD404211D9185005AB38B /* paddle-mobile-unit-test */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + FCDFD403211D9185005AB38B /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 
FC607427211DF3B100B17547 /* synset.txt in Resources */, + FC60742B211DF3B100B17547 /* params in Resources */, + FC607428211DF3B100B17547 /* banana.jpeg in Resources */, + FC60742A211DF3B100B17547 /* paddle-mobile.png in Resources */, + FC607429211DF3B100B17547 /* iphone.JPG in Resources */, + FC60742C211DF3B100B17547 /* model in Resources */, + FCDFD413211D9187005AB38B /* LaunchScreen.storyboard in Resources */, + FCDFD410211D9187005AB38B /* Assets.xcassets in Resources */, + FCDFD40E211D9185005AB38B /* Main.storyboard in Resources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXShellScriptBuildPhase section */ + 53A2089068F9D64BB96D4322 /* [CP] Embed Pods Frameworks */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputPaths = ( + "${SRCROOT}/../Pods/Target Support Files/Pods-paddle-mobile-unit-test/Pods-paddle-mobile-unit-test-frameworks.sh", + "${BUILT_PRODUCTS_DIR}/SwiftProtobuf/SwiftProtobuf.framework", + ); + name = "[CP] Embed Pods Frameworks"; + outputPaths = ( + "${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/SwiftProtobuf.framework", + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "\"${SRCROOT}/../Pods/Target Support Files/Pods-paddle-mobile-unit-test/Pods-paddle-mobile-unit-test-frameworks.sh\"\n"; + showEnvVarsInLog = 0; + }; + 5F5A9A9DC0C6307DEA4294C1 /* [CP] Check Pods Manifest.lock */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputPaths = ( + "${PODS_PODFILE_DIR_PATH}/Podfile.lock", + "${PODS_ROOT}/Manifest.lock", + ); + name = "[CP] Check Pods Manifest.lock"; + outputPaths = ( + "$(DERIVED_FILE_DIR)/Pods-paddle-mobile-unit-test-checkManifestLockResult.txt", + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. 
Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; + showEnvVarsInLog = 0; + }; +/* End PBXShellScriptBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + FCDFD401211D9185005AB38B /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + FCDFD40B211D9185005AB38B /* ViewController.swift in Sources */, + FCDFD409211D9185005AB38B /* AppDelegate.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin PBXVariantGroup section */ + FCDFD40C211D9185005AB38B /* Main.storyboard */ = { + isa = PBXVariantGroup; + children = ( + FCDFD40D211D9185005AB38B /* Base */, + ); + name = Main.storyboard; + sourceTree = "<group>"; + }; + FCDFD411211D9187005AB38B /* LaunchScreen.storyboard */ = { + isa = PBXVariantGroup; + children = ( + FCDFD412211D9187005AB38B /* Base */, + ); + name = LaunchScreen.storyboard; + sourceTree = "<group>"; + }; +/* End PBXVariantGroup section */ + +/* Begin XCBuildConfiguration section */ + FCDFD415211D9187005AB38B /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CODE_SIGN_IDENTITY = "iPhone Developer"; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 11.4; + MTL_ENABLE_DEBUG_INFO = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + FCDFD416211D9187005AB38B /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + 
CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CODE_SIGN_IDENTITY = "iPhone Developer"; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 11.4; + MTL_ENABLE_DEBUG_INFO = NO; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + FCDFD418211D9187005AB38B /* Debug */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 5CC132C848027BE970FB2637 /* Pods-paddle-mobile-unit-test.debug.xcconfig */; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = A798K58VVL; + INFOPLIST_FILE = "paddle-mobile-unit-test/Info.plist"; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile-unit-test"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_VERSION = 4.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + FCDFD419211D9187005AB38B /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 72F34AE9677943FC580DE7F4 /* Pods-paddle-mobile-unit-test.release.xcconfig */; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + CODE_SIGN_STYLE = Automatic; + DEVELOPMENT_TEAM = A798K58VVL; + INFOPLIST_FILE = "paddle-mobile-unit-test/Info.plist"; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile-unit-test"; + PRODUCT_NAME = "$(TARGET_NAME)"; + SWIFT_VERSION = 4.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + FCDFD400211D9185005AB38B /* Build configuration list for PBXProject "paddle-mobile-unit-test" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + FCDFD415211D9187005AB38B /* Debug */, + FCDFD416211D9187005AB38B /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + FCDFD417211D9187005AB38B /* Build configuration list for PBXNativeTarget "paddle-mobile-unit-test" */ = { + isa = XCConfigurationList; 
+ buildConfigurations = ( + FCDFD418211D9187005AB38B /* Debug */, + FCDFD419211D9187005AB38B /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = FCDFD3FD211D9185005AB38B /* Project object */; +} diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/contents.xcworkspacedata new file mode 100644 index 0000000000000000000000000000000000000000..cb4dfcfed95671fcf6dca7b01068d171ad562443 --- /dev/null +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/contents.xcworkspacedata @@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "self:paddle-mobile-unit-test.xcodeproj">
+   </FileRef>
+</Workspace> diff --git a/ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist similarity index 100% rename from ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist rename to metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate new file mode 100644 index 0000000000000000000000000000000000000000..775d17b268941d24e8e9ebd7ac5ae26c2c0dbda9 Binary files /dev/null and b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate differ diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift new file mode 100644 index 0000000000000000000000000000000000000000..7817befaedf1aff04b75abd39cc6f7f06bc935d3 --- /dev/null +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import UIKit + +@UIApplicationMain +class AppDelegate: UIResponder, UIApplicationDelegate { + + var window: UIWindow? + + func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool { + // Override point for customization after application launch. + return true + } + + func applicationWillResignActive(_ application: UIApplication) { + // Sent when the application is about to move from active to inactive state. 
This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state. + // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game. + } + + func applicationDidEnterBackground(_ application: UIApplication) { + // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later. + // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits. + } + + func applicationWillEnterForeground(_ application: UIApplication) { + // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background. + } + + func applicationDidBecomeActive(_ application: UIApplication) { + // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface. + } + + func applicationWillTerminate(_ application: UIApplication) { + // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:. + } + + +} diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Assets.xcassets/AppIcon.appiconset/Contents.json b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Assets.xcassets/AppIcon.appiconset/Contents.json new file mode 100644 index 0000000000000000000000000000000000000000..d8db8d65fd79fd541b2b7eba75c7378af3448f9c --- /dev/null +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Assets.xcassets/AppIcon.appiconset/Contents.json @@ -0,0 +1,98 @@ +{ + "images" : [ + { + "idiom" : "iphone", + "size" : "20x20", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "20x20", + "scale" : "3x" + }, + { + "idiom" : "iphone", + "size" : "29x29", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "29x29", + "scale" : "3x" + }, + { + "idiom" : "iphone", + "size" : "40x40", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "40x40", + "scale" : "3x" + }, + { + "idiom" : "iphone", + "size" : "60x60", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "60x60", + "scale" : "3x" + }, + { + "idiom" : "ipad", + "size" : "20x20", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "20x20", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "29x29", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "29x29", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "40x40", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "40x40", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "76x76", + "scale" : "1x" + }, + { + "idiom" : "ipad", + "size" : "76x76", + "scale" : "2x" + }, + { + "idiom" : "ipad", + "size" : "83.5x83.5", + "scale" : "2x" + }, + { + "idiom" : "ios-marketing", + "size" : "1024x1024", + "scale" : "1x" + } + ], + "info" : { + "version" : 1, + "author" : "xcode" + } +} \ No newline at end of file diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Assets.xcassets/Contents.json b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Assets.xcassets/Contents.json new file mode 100644 index 
0000000000000000000000000000000000000000..da4a164c918651cdd1e11dca5cc62c333f097601 --- /dev/null +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Assets.xcassets/Contents.json @@ -0,0 +1,6 @@ +{ + "info" : { + "version" : 1, + "author" : "xcode" + } +} \ No newline at end of file diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Base.lproj/LaunchScreen.storyboard b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Base.lproj/LaunchScreen.storyboard new file mode 100644 index 0000000000000000000000000000000000000000..f83f6fd5810b9c852cf98563d82d5ed1e84ff893 --- /dev/null +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Base.lproj/LaunchScreen.storyboard @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/Main.storyboard b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Base.lproj/Main.storyboard similarity index 95% rename from ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/Main.storyboard rename to metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Base.lproj/Main.storyboard index d7c78a1255c016bde922c849eef8555881c207b6..03c13c2286150ad7416086bec99d2c46ccca6efc 100644 --- a/ios/PaddleMobileDemo/PaddleMobileDemo/Base.lproj/Main.storyboard +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Base.lproj/Main.storyboard @@ -9,7 +9,7 @@ - + diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo/Info.plist b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Info.plist similarity index 100% rename from ios/PaddleMobileDemo/PaddleMobileDemo/Info.plist rename to metal/paddle-mobile-unit-test/paddle-mobile-unit-test/Info.plist diff --git a/ios/PaddleMobile/PaddleMobile/PaddleMobile.m b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift similarity index 54% rename from ios/PaddleMobile/PaddleMobile/PaddleMobile.m rename to metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift index a350330b1d0fbf7072d35c41cab75da0b477c31d..98f03affa2a230b2698edf6bafe5e06def8986b6 100644 --- a/ios/PaddleMobile/PaddleMobile/PaddleMobile.m +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift @@ -1,43 +1,35 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - + http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#import "PaddleMobile.h" - -@implementation PaddleMobile - -+ (instancetype)sharedInstance{ - //TODO: imp - exit(0); -} +import UIKit +import Metal +//import MetalKit +import paddle_mobile -- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{ - //TODO: imp - exit(0); -} - -- (NSArray *)predict:(CGImageRef)image means:(NSArray *)means scale:(float)scale{ - //TODO: imp - exit(0); -} - -- (NSArray *)predict:(CGImageRef)image{ - //TODO: imp - exit(0); -} +class ViewController: UIViewController { + override func viewDidLoad() { + super.viewDidLoad() + let device = Metal.MTLCreateSystemDefaultDevice()! + let queue = device.makeCommandQueue()! 
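+    // MTLCreateSystemDefaultDevice() returns nil wherever Metal is
+    // unavailable (notably the iOS simulator of this era), so the force
+    // unwraps above will crash there. A defensive variant, if preferred:
+    //   guard let device = MTLCreateSystemDefaultDevice(),
+    //         let queue = device.makeCommandQueue() else { return }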
+ let test = PaddleMobileUnitTest.init( + inDevice: device, + inQueue: queue + ) + test.testConcat() +// test.testReshape() +// test.testTranspose() + print(" done ") + } -- (void)clear{ - //TODO: imp - exit(0); } -@end diff --git a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj new file mode 100644 index 0000000000000000000000000000000000000000..34d45528542d0d6a9d5ac153a7d6f818d962cbfd --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj @@ -0,0 +1,977 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 50; + objects = { + +/* Begin PBXBuildFile section */ + 4AA1EA862146625E00D0F791 /* BilinearInterpOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */; }; + 4AA1EA88214662BD00D0F791 /* BilinearInterpKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA87214662BD00D0F791 /* BilinearInterpKernel.swift */; }; + 4AA1EA8A2146631C00D0F791 /* BilinearInterp.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA892146631C00D0F791 /* BilinearInterp.metal */; }; + 4AA1EA8C2146640900D0F791 /* SplitOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA8B2146640900D0F791 /* SplitOp.swift */; }; + 4AA1EA8E2146647F00D0F791 /* SplitKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA8D2146647F00D0F791 /* SplitKernel.swift */; }; + 4AA1EA90214664CD00D0F791 /* Split.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA8F214664CD00D0F791 /* Split.metal */; }; + 4AA1EA92214665D700D0F791 /* ShapeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA91214665D700D0F791 /* ShapeOp.swift */; }; + 4AA1EA942146661500D0F791 /* ShapeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA932146661500D0F791 /* ShapeKernel.swift */; }; + 4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA972146666500D0F791 /* FlattenOp.swift */; }; + 4AA1EA9E2148D6F900D0F791 /* ConcatKernel.inc.metal in Headers */ = {isa = PBXBuildFile; fileRef = 4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */; }; + 4AA1EAA02148DEEE00D0F791 /* ReshapeKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */; }; + 4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */; }; + 4AA1EAA4214A295C00D0F791 /* Split.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA3214A295C00D0F791 /* Split.inc.metal */; }; + 4AA1EAA6214B5F6800D0F791 /* Shape.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA5214B5F6800D0F791 /* Shape.metal */; }; + 4AA1EAA8214B7AFB00D0F791 /* BilinearInterp.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA7214B7AFB00D0F791 /* BilinearInterp.inc.metal */; }; + 4AA1EAAA214F53D800D0F791 /* BoxCoder.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA9214F53D800D0F791 /* BoxCoder.inc.metal */; }; + 4AA1EAAC214F55C800D0F791 /* Softmax.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAAB214F55C800D0F791 /* Softmax.inc.metal */; }; + 4AA1EAAE214F5FD900D0F791 /* TransposeKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */; }; + 4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928762133F1DB005B6C3A /* BoxCoder.metal */; }; + 
4AF9287921341661005B6C3A /* Softmax.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF9287821341661005B6C3A /* Softmax.metal */; }; + 4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928812135673D005B6C3A /* ConcatKernel.metal */; }; + 4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF9288321357BE3005B6C3A /* Elementwise.metal */; }; + D3831F70E7E0B565B9AC22DA /* Pods_paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */; }; + FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC0226552138F33800F395E2 /* TransposeKernel.metal */; }; + FC0226582138F38D00F395E2 /* PoolKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC0226572138F38D00F395E2 /* PoolKernel.metal */; }; + FC039B6F20E11C3C0081E9F8 /* paddle_mobile.h in Headers */ = {isa = PBXBuildFile; fileRef = FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */; settings = {ATTRIBUTES = (Public, ); }; }; + FC039B9720E11C9A0081E9F8 /* Extensions.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9420E11C9A0081E9F8 /* Extensions.swift */; }; + FC039B9820E11C9A0081E9F8 /* Errors.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9520E11C9A0081E9F8 /* Errors.swift */; }; + FC039B9920E11C9A0081E9F8 /* Types.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9620E11C9A0081E9F8 /* Types.swift */; }; + FC039B9B20E11CA00081E9F8 /* Executor.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9A20E11CA00081E9F8 /* Executor.swift */; }; + FC039B9F20E11CB20081E9F8 /* Tensor.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9D20E11CB20081E9F8 /* Tensor.swift */; }; + FC039BA020E11CB20081E9F8 /* Dim.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9E20E11CB20081E9F8 /* Dim.swift */; }; + FC039BA220E11CB70081E9F8 /* Loader.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BA120E11CB70081E9F8 /* Loader.swift */; }; + FC039BA920E11CBC0081E9F8 /* ConvOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BA420E11CBC0081E9F8 /* ConvOp.swift */; }; + FC039BAA20E11CBC0081E9F8 /* ElementwiseAddOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BA520E11CBC0081E9F8 /* ElementwiseAddOp.swift */; }; + FC039BAB20E11CBC0081E9F8 /* Operator.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BA620E11CBC0081E9F8 /* Operator.swift */; }; + FC039BAC20E11CBC0081E9F8 /* BatchNormOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BA720E11CBC0081E9F8 /* BatchNormOp.swift */; }; + FC039BAD20E11CBC0081E9F8 /* ReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BA820E11CBC0081E9F8 /* ReluOp.swift */; }; + FC039BB820E11CC20081E9F8 /* framework.pb.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BAF20E11CC20081E9F8 /* framework.pb.swift */; }; + FC039BB920E11CC20081E9F8 /* Scope.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BB020E11CC20081E9F8 /* Scope.swift */; }; + FC039BBA20E11CC20081E9F8 /* TensorDesc.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BB120E11CC20081E9F8 /* TensorDesc.swift */; }; + FC039BBB20E11CC20081E9F8 /* ProgramDesc.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BB220E11CC20081E9F8 /* ProgramDesc.swift */; }; + FC039BBC20E11CC20081E9F8 /* VarDesc.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BB320E11CC20081E9F8 /* VarDesc.swift */; }; + FC039BBD20E11CC20081E9F8 /* Program.swift 
in Sources */ = {isa = PBXBuildFile; fileRef = FC039BB420E11CC20081E9F8 /* Program.swift */; }; + FC039BBE20E11CC20081E9F8 /* OpDesc.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BB520E11CC20081E9F8 /* OpDesc.swift */; }; + FC039BBF20E11CC20081E9F8 /* Attribute.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BB620E11CC20081E9F8 /* Attribute.swift */; }; + FC039BC020E11CC20081E9F8 /* BlockDesc.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039BB720E11CC20081E9F8 /* BlockDesc.swift */; }; + FC0E2DBA20EE3B8D009C1FAC /* ReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */; }; + FC0E2DBC20EE45FE009C1FAC /* ConvKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */; }; + FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */; }; + FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */; }; + FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC1B16B220EC9A4F00678B91 /* Kernels.metal */; }; + FC292C5421421B2F00CF622F /* PaddleMobileGPU.h in Headers */ = {isa = PBXBuildFile; fileRef = FC292C5321421B2E00CF622F /* PaddleMobileGPU.h */; settings = {ATTRIBUTES = (Public, ); }; }; + FC292C5621421B4600CF622F /* PaddleMobileGPU.m in Sources */ = {isa = PBXBuildFile; fileRef = FC292C5521421B4600CF622F /* PaddleMobileGPU.m */; }; + FC292C81214255BD00CF622F /* CPUCompute.mm in Sources */ = {isa = PBXBuildFile; fileRef = FC292C7C214255BC00CF622F /* CPUCompute.mm */; }; + FC292C82214255BD00CF622F /* MobileNetSSD.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC292C7E214255BC00CF622F /* MobileNetSSD.swift */; }; + FC292C85214257CB00CF622F /* CPUCompute.h in Headers */ = {isa = PBXBuildFile; fileRef = FC292C7D214255BC00CF622F /* CPUCompute.h */; settings = {ATTRIBUTES = (Public, ); }; }; + FC292C872142624800CF622F /* Genet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC292C862142624800CF622F /* Genet.swift */; }; + FC33B0F02147659000714A93 /* MobileNet.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC33B0EF2147659000714A93 /* MobileNet.swift */; }; + FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */; }; + FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; }; + FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; }; + FC4FD9752140E1DE0073E130 /* PaddleMobile.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */; }; + FC4FD9792140E4980073E130 /* PaddleMobileCPU.h in Headers */ = {isa = PBXBuildFile; fileRef = FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */; settings = {ATTRIBUTES = (Public, ); }; }; + FC4FD97A2140E4980073E130 /* libpaddle-mobile.a in Frameworks */ = {isa = PBXBuildFile; fileRef = FC4FD9782140E4980073E130 /* libpaddle-mobile.a */; }; + FC4FD97E2140F2C30073E130 /* libstdc++.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = FC4FD97D2140F2C30073E130 /* libstdc++.tbd */; }; + FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */ = {isa = 
PBXBuildFile; fileRef = FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */; }; + FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC60DB8820E9AAA500FF203F /* MetalExtension.swift */; }; + FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */; }; + FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */; }; + FC803BC3214CB79C0094B8E5 /* ConvAddPreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */; }; + FC803BC5214CB8F00094B8E5 /* ConvAddPrelu.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */; }; + FC803BC7214CBA820094B8E5 /* Macro.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC6214CBA820094B8E5 /* Macro.metal */; }; + FC803BC9214CFC8D0094B8E5 /* FetchKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */; }; + FC82735920E3C04200BE430A /* OpCreator.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC82735820E3C04200BE430A /* OpCreator.swift */; }; + FC9A19E32148C31300CD9CBF /* MobilenetSSD_AR.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */; }; + FC9D037920E229E4000F735A /* OpParam.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D037820E229E4000F735A /* OpParam.swift */; }; + FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D037F20E22FBB000F735A /* FeedOp.swift */; }; + FC9D038220E2312E000F735A /* FetchOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D038120E2312E000F735A /* FetchOp.swift */; }; + FC9D038420E23B01000F735A /* Texture.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D038320E23B01000F735A /* Texture.swift */; }; + FCA3A1632132A4AC00084FE5 /* ReshapeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */; }; + FCA3A1652132A5EB00084FE5 /* Common.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA3A1642132A5EB00084FE5 /* Common.metal */; }; + FCA67B1721364EF000BD58AA /* ConvTransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */; }; + FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD42138272900BD58AA /* ConvAddMetal.metal */; }; + FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */; }; + FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */; }; + FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */; }; + FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */; }; + FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */; }; + FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5C2122F8A100D94F7E /* 
DepthwiseConvOp.swift */; }; + FCBCCC5F2122FB3B00D94F7E /* PriorBoxOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5E2122FB3B00D94F7E /* PriorBoxOp.swift */; }; + FCBCCC612122FBDF00D94F7E /* PriorBoxKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */; }; + FCBCCC632122FCC000D94F7E /* TransposeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC622122FCC000D94F7E /* TransposeKernel.swift */; }; + FCBCCC652122FCD700D94F7E /* TransposeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC642122FCD700D94F7E /* TransposeOp.swift */; }; + FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC66212306B000D94F7E /* ConcatOp.swift */; }; + FCBCCC69212306D300D94F7E /* ConcatKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC68212306D300D94F7E /* ConcatKernel.swift */; }; + FCBCCC6B2123071700D94F7E /* BoxcoderOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */; }; + FCBCCC6D2123073A00D94F7E /* BoxcoderKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */; }; + FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */; }; + FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */; }; + FCD04E6620F314C50007374F /* PoolOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6520F314C50007374F /* PoolOp.swift */; }; + FCD04E6820F315020007374F /* PoolKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6720F315020007374F /* PoolKernel.swift */; }; + FCD04E6A20F319EC0007374F /* SoftmaxOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6920F319EC0007374F /* SoftmaxOp.swift */; }; + FCD04E6C20F31A280007374F /* SoftmaxKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6B20F31A280007374F /* SoftmaxKernel.swift */; }; + FCD04E6E20F31B4B0007374F /* ReshapeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6D20F31B4B0007374F /* ReshapeOp.swift */; }; + FCD04E7020F31B720007374F /* ReshapeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6F20F31B720007374F /* ReshapeKernel.swift */; }; + FCD04E7220F343420007374F /* ConvAddOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E7120F343420007374F /* ConvAddOp.swift */; }; + FCD04E7420F3437E0007374F /* ConvAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E7320F3437E0007374F /* ConvAddKernel.swift */; }; + FCDC0FEB21099A1D00DC9EFB /* Tools.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDC0FEA21099A1D00DC9EFB /* Tools.swift */; }; + FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */; }; + FCDDC6C8212FA3CA00E5EF74 /* ConvTransposeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */; }; + FCDDC6CA212FDF6800E5EF74 /* BatchNormKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */; }; + FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */; }; + FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */ = {isa = PBXBuildFile; 
fileRef = FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */; }; + FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */; }; + FCE3A1A92153DE5100C37CDE /* ConvAddAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */; }; + FCE3A1AB2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */; }; + FCE3A1AD2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */; }; + FCE3A1AF2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */; }; + FCE3A1B12153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */; }; + FCE3A1B32153E91900C37CDE /* ElementwiseAddPreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */; }; + FCE9D7B7214F869000B520C3 /* Net.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B6214F869000B520C3 /* Net.swift */; }; + FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */; }; + FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCEB6849212F00DB00D2448E /* PreluKernel.metal */; }; + FCEB684C212F093800D2448E /* PreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEB684B212F093800D2448E /* PreluOp.swift */; }; + FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */; }; + FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */; }; + FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF2D73720E64E70007AC5F5 /* Kernel.swift */; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BilinearInterpOp.swift; sourceTree = ""; }; + 4AA1EA87214662BD00D0F791 /* BilinearInterpKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BilinearInterpKernel.swift; sourceTree = ""; }; + 4AA1EA892146631C00D0F791 /* BilinearInterp.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BilinearInterp.metal; sourceTree = ""; }; + 4AA1EA8B2146640900D0F791 /* SplitOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SplitOp.swift; sourceTree = ""; }; + 4AA1EA8D2146647F00D0F791 /* SplitKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SplitKernel.swift; sourceTree = ""; }; + 4AA1EA8F214664CD00D0F791 /* Split.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Split.metal; sourceTree = ""; }; + 4AA1EA91214665D700D0F791 /* ShapeOp.swift */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ShapeOp.swift; sourceTree = ""; }; + 4AA1EA932146661500D0F791 /* ShapeKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ShapeKernel.swift; sourceTree = ""; }; + 4AA1EA972146666500D0F791 /* FlattenOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FlattenOp.swift; sourceTree = ""; }; + 4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */ = {isa = PBXFileReference; explicitFileType = sourcecode.metal; fileEncoding = 4; path = ConcatKernel.inc.metal; sourceTree = ""; }; + 4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */ = {isa = PBXFileReference; explicitFileType = sourcecode.metal; fileEncoding = 4; path = ReshapeKernel.inc.metal; sourceTree = ""; }; + 4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FlattenKernel.swift; sourceTree = ""; }; + 4AA1EAA3214A295C00D0F791 /* Split.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Split.inc.metal; sourceTree = ""; }; + 4AA1EAA5214B5F6800D0F791 /* Shape.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Shape.metal; sourceTree = ""; }; + 4AA1EAA7214B7AFB00D0F791 /* BilinearInterp.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BilinearInterp.inc.metal; sourceTree = ""; }; + 4AA1EAA9214F53D800D0F791 /* BoxCoder.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.inc.metal; sourceTree = ""; }; + 4AA1EAAB214F55C800D0F791 /* Softmax.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Softmax.inc.metal; sourceTree = ""; }; + 4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = TransposeKernel.inc.metal; sourceTree = ""; }; + 4AF928762133F1DB005B6C3A /* BoxCoder.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.metal; sourceTree = ""; }; + 4AF9287821341661005B6C3A /* Softmax.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Softmax.metal; sourceTree = ""; }; + 4AF928812135673D005B6C3A /* ConcatKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConcatKernel.metal; sourceTree = ""; }; + 4AF9288321357BE3005B6C3A /* Elementwise.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Elementwise.metal; sourceTree = ""; }; + CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile/Pods-paddle-mobile.debug.xcconfig"; sourceTree = ""; }; + DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; }; + E2A7957C92EDA5C3BEC0FFC2 /* Pods-paddle-mobile.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = 
"Pods-paddle-mobile.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile/Pods-paddle-mobile.release.xcconfig"; sourceTree = ""; }; + FC0226552138F33800F395E2 /* TransposeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = TransposeKernel.metal; sourceTree = ""; }; + FC0226572138F38D00F395E2 /* PoolKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PoolKernel.metal; sourceTree = ""; }; + FC039B6A20E11C3C0081E9F8 /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; }; + FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = paddle_mobile.h; sourceTree = ""; }; + FC039B6E20E11C3C0081E9F8 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + FC039B9420E11C9A0081E9F8 /* Extensions.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Extensions.swift; sourceTree = ""; }; + FC039B9520E11C9A0081E9F8 /* Errors.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Errors.swift; sourceTree = ""; }; + FC039B9620E11C9A0081E9F8 /* Types.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Types.swift; sourceTree = ""; }; + FC039B9A20E11CA00081E9F8 /* Executor.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Executor.swift; sourceTree = ""; }; + FC039B9D20E11CB20081E9F8 /* Tensor.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Tensor.swift; sourceTree = ""; }; + FC039B9E20E11CB20081E9F8 /* Dim.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Dim.swift; sourceTree = ""; }; + FC039BA120E11CB70081E9F8 /* Loader.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Loader.swift; sourceTree = ""; }; + FC039BA420E11CBC0081E9F8 /* ConvOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ConvOp.swift; sourceTree = ""; }; + FC039BA520E11CBC0081E9F8 /* ElementwiseAddOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ElementwiseAddOp.swift; sourceTree = ""; }; + FC039BA620E11CBC0081E9F8 /* Operator.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Operator.swift; sourceTree = ""; }; + FC039BA720E11CBC0081E9F8 /* BatchNormOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BatchNormOp.swift; sourceTree = ""; }; + FC039BA820E11CBC0081E9F8 /* ReluOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ReluOp.swift; sourceTree = ""; }; + FC039BAF20E11CC20081E9F8 /* framework.pb.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = framework.pb.swift; sourceTree = ""; }; + FC039BB020E11CC20081E9F8 /* Scope.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Scope.swift; sourceTree = ""; }; + FC039BB120E11CC20081E9F8 /* TensorDesc.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.swift; path = TensorDesc.swift; sourceTree = ""; }; + FC039BB220E11CC20081E9F8 /* ProgramDesc.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ProgramDesc.swift; sourceTree = ""; }; + FC039BB320E11CC20081E9F8 /* VarDesc.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = VarDesc.swift; sourceTree = ""; }; + FC039BB420E11CC20081E9F8 /* Program.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Program.swift; sourceTree = ""; }; + FC039BB520E11CC20081E9F8 /* OpDesc.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = OpDesc.swift; sourceTree = ""; }; + FC039BB620E11CC20081E9F8 /* Attribute.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Attribute.swift; sourceTree = ""; }; + FC039BB720E11CC20081E9F8 /* BlockDesc.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BlockDesc.swift; sourceTree = ""; }; + FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ReluKernel.swift; sourceTree = ""; }; + FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvKernel.swift; sourceTree = ""; }; + FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BatchNormKernel.swift; sourceTree = ""; }; + FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddKernel.swift; sourceTree = ""; }; + FC1B16B220EC9A4F00678B91 /* Kernels.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Kernels.metal; sourceTree = ""; }; + FC292C5321421B2E00CF622F /* PaddleMobileGPU.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobileGPU.h; sourceTree = ""; }; + FC292C5521421B4600CF622F /* PaddleMobileGPU.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = PaddleMobileGPU.m; sourceTree = ""; }; + FC292C7C214255BC00CF622F /* CPUCompute.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = CPUCompute.mm; sourceTree = ""; }; + FC292C7D214255BC00CF622F /* CPUCompute.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = CPUCompute.h; sourceTree = ""; }; + FC292C7E214255BC00CF622F /* MobileNetSSD.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MobileNetSSD.swift; sourceTree = ""; }; + FC292C862142624800CF622F /* Genet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Genet.swift; sourceTree = ""; }; + FC33B0EF2147659000714A93 /* MobileNet.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MobileNet.swift; sourceTree = ""; }; + FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PaddleMobileUnitTest.swift; sourceTree = ""; }; + FC4CB74820F0B954007C0C6D /* ConvKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = ""; }; + FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; 
path = ProgramOptimize.swift; sourceTree = ""; }; + FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = PaddleMobile.swift; sourceTree = ""; }; + FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobileCPU.h; sourceTree = ""; }; + FC4FD9782140E4980073E130 /* libpaddle-mobile.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; path = "libpaddle-mobile.a"; sourceTree = ""; }; + FC4FD97D2140F2C30073E130 /* libstdc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libstdc++.tbd"; path = "usr/lib/libstdc++.tbd"; sourceTree = SDKROOT; }; + FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Texture2DTo2DArrayKernel.swift; sourceTree = ""; }; + FC60DB8820E9AAA500FF203F /* MetalExtension.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MetalExtension.swift; sourceTree = ""; }; + FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddPreluOp.swift; sourceTree = ""; }; + FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddPreluKernel.swift; sourceTree = ""; }; + FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddPreluKernel.metal; sourceTree = ""; }; + FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddPrelu.inc.metal; sourceTree = ""; }; + FC803BC6214CBA820094B8E5 /* Macro.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Macro.metal; sourceTree = ""; }; + FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = FetchKernel.metal; sourceTree = ""; }; + FC82735820E3C04200BE430A /* OpCreator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpCreator.swift; sourceTree = ""; }; + FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MobilenetSSD_AR.swift; sourceTree = ""; }; + FC9D037820E229E4000F735A /* OpParam.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpParam.swift; sourceTree = ""; }; + FC9D037F20E22FBB000F735A /* FeedOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FeedOp.swift; sourceTree = ""; }; + FC9D038120E2312E000F735A /* FetchOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FetchOp.swift; sourceTree = ""; }; + FC9D038320E23B01000F735A /* Texture.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Texture.swift; sourceTree = ""; }; + FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReshapeKernel.metal; sourceTree = ""; }; + FCA3A1642132A5EB00084FE5 /* Common.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Common.metal; sourceTree = ""; }; + FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = 
ConvTransposeKernel.metal; sourceTree = ""; }; + FCA67CD42138272900BD58AA /* ConvAddMetal.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddMetal.metal; sourceTree = ""; }; + FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddBNReluKernel.metal; sourceTree = ""; }; + FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvBNReluKernel.metal; sourceTree = ""; }; + FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DwConvBNReluOp.swift; sourceTree = ""; }; + FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluOp.swift; sourceTree = ""; }; + FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluKernel.swift; sourceTree = ""; }; + FCBCCC5C2122F8A100D94F7E /* DepthwiseConvOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DepthwiseConvOp.swift; sourceTree = ""; }; + FCBCCC5E2122FB3B00D94F7E /* PriorBoxOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PriorBoxOp.swift; sourceTree = ""; }; + FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PriorBoxKernel.swift; sourceTree = ""; }; + FCBCCC622122FCC000D94F7E /* TransposeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TransposeKernel.swift; sourceTree = ""; }; + FCBCCC642122FCD700D94F7E /* TransposeOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TransposeOp.swift; sourceTree = ""; }; + FCBCCC66212306B000D94F7E /* ConcatOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConcatOp.swift; sourceTree = ""; }; + FCBCCC68212306D300D94F7E /* ConcatKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConcatKernel.swift; sourceTree = ""; }; + FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BoxcoderOp.swift; sourceTree = ""; }; + FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BoxcoderKernel.swift; sourceTree = ""; }; + FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MulticlassNMSOp.swift; sourceTree = ""; }; + FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MulticlassNMSKernel.swift; sourceTree = ""; }; + FCD04E6520F314C50007374F /* PoolOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolOp.swift; sourceTree = ""; }; + FCD04E6720F315020007374F /* PoolKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolKernel.swift; sourceTree = ""; }; + FCD04E6920F319EC0007374F /* SoftmaxOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SoftmaxOp.swift; sourceTree = ""; }; + FCD04E6B20F31A280007374F /* SoftmaxKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SoftmaxKernel.swift; sourceTree = ""; }; + FCD04E6D20F31B4B0007374F 
/* ReshapeOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ReshapeOp.swift; sourceTree = ""; }; + FCD04E6F20F31B720007374F /* ReshapeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ReshapeKernel.swift; sourceTree = ""; }; + FCD04E7120F343420007374F /* ConvAddOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddOp.swift; sourceTree = ""; }; + FCD04E7320F3437E0007374F /* ConvAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddKernel.swift; sourceTree = ""; }; + FCDC0FEA21099A1D00DC9EFB /* Tools.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Tools.swift; sourceTree = ""; }; + FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreluKernel.swift; sourceTree = ""; }; + FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvTransposeKernel.swift; sourceTree = ""; }; + FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = BatchNormKernel.metal; sourceTree = ""; }; + FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReluKernel.metal; sourceTree = ""; }; + FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PriorBoxKernel.metal; sourceTree = ""; }; + FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvTransposeOp.swift; sourceTree = ""; }; + FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddAddPreluOp.swift; sourceTree = ""; }; + FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddAddPreluKernel.swift; sourceTree = ""; }; + FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddPreluOp.swift; sourceTree = ""; }; + FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddPreluKernel.swift; sourceTree = ""; }; + FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.inc.metal; sourceTree = ""; }; + FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.metal; sourceTree = ""; }; + FCE9D7B6214F869000B520C3 /* Net.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Net.swift; sourceTree = ""; }; + FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = NMSFetchResultKernel.metal; sourceTree = ""; }; + FCEB6849212F00DB00D2448E /* PreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreluKernel.metal; sourceTree = ""; }; + FCEB684B212F093800D2448E /* PreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreluOp.swift; sourceTree = ""; }; + 
FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = ConvAddBatchNormReluOp.swift; path = "paddle-mobile/Operators/ConvAddBatchNormReluOp.swift"; sourceTree = SOURCE_ROOT; }; + FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddBatchNormReluKernel.swift; sourceTree = ""; }; + FCF2D73720E64E70007AC5F5 /* Kernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; name = Kernel.swift; path = "paddle-mobile/Operators/Kernels/Base/Kernel.swift"; sourceTree = SOURCE_ROOT; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + FC039B6620E11C3C0081E9F8 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + FC4FD97E2140F2C30073E130 /* libstdc++.tbd in Frameworks */, + D3831F70E7E0B565B9AC22DA /* Pods_paddle_mobile.framework in Frameworks */, + FC4FD97A2140E4980073E130 /* libpaddle-mobile.a in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 336CBE234BF5DE48658DE65F /* Frameworks */ = { + isa = PBXGroup; + children = ( + FC4FD97D2140F2C30073E130 /* libstdc++.tbd */, + DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */, + ); + name = Frameworks; + sourceTree = ""; + }; + 8EB858F9B68D372C9F1CA263 /* Pods */ = { + isa = PBXGroup; + children = ( + CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */, + E2A7957C92EDA5C3BEC0FFC2 /* Pods-paddle-mobile.release.xcconfig */, + ); + name = Pods; + sourceTree = ""; + }; + FC039B6020E11C3C0081E9F8 = { + isa = PBXGroup; + children = ( + FC039B6C20E11C3C0081E9F8 /* paddle-mobile */, + FC039B6B20E11C3C0081E9F8 /* Products */, + 8EB858F9B68D372C9F1CA263 /* Pods */, + 336CBE234BF5DE48658DE65F /* Frameworks */, + ); + sourceTree = ""; + }; + FC039B6B20E11C3C0081E9F8 /* Products */ = { + isa = PBXGroup; + children = ( + FC039B6A20E11C3C0081E9F8 /* paddle_mobile.framework */, + ); + name = Products; + sourceTree = ""; + }; + FC039B6C20E11C3C0081E9F8 /* paddle-mobile */ = { + isa = PBXGroup; + children = ( + FCE9D7B6214F869000B520C3 /* Net.swift */, + FC9A19E22148C31300CD9CBF /* MobilenetSSD_AR.swift */, + FC33B0EF2147659000714A93 /* MobileNet.swift */, + FC292C862142624800CF622F /* Genet.swift */, + FC292C7E214255BC00CF622F /* MobileNetSSD.swift */, + FC292C7C214255BC00CF622F /* CPUCompute.mm */, + FC292C7D214255BC00CF622F /* CPUCompute.h */, + FC292C5521421B4600CF622F /* PaddleMobileGPU.m */, + FC292C5321421B2E00CF622F /* PaddleMobileGPU.h */, + FC4FD9762140E4920073E130 /* CPU */, + FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */, + FC039BAE20E11CC20081E9F8 /* Program */, + FC039BA320E11CBC0081E9F8 /* Operators */, + FC039B9C20E11CB20081E9F8 /* framework */, + FC039B9320E11C9A0081E9F8 /* Common */, + FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */, + FC039B6E20E11C3C0081E9F8 /* Info.plist */, + ); + path = "paddle-mobile"; + sourceTree = ""; + }; + FC039B9320E11C9A0081E9F8 /* Common */ = { + isa = PBXGroup; + children = ( + FC039B9420E11C9A0081E9F8 /* Extensions.swift */, + FC039B9520E11C9A0081E9F8 /* Errors.swift */, + FC039B9620E11C9A0081E9F8 /* Types.swift */, + FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */, + FC60DB8820E9AAA500FF203F /* MetalExtension.swift */, + FCDC0FEA21099A1D00DC9EFB /* Tools.swift */, + ); + path = Common; + sourceTree = ""; + }; + 
FC039B9C20E11CB20081E9F8 /* framework */ = { + isa = PBXGroup; + children = ( + FC039BA120E11CB70081E9F8 /* Loader.swift */, + FC039B9A20E11CA00081E9F8 /* Executor.swift */, + FC039B9D20E11CB20081E9F8 /* Tensor.swift */, + FC039B9E20E11CB20081E9F8 /* Dim.swift */, + FC9D038320E23B01000F735A /* Texture.swift */, + ); + path = framework; + sourceTree = ""; + }; + FC039BA320E11CBC0081E9F8 /* Operators */ = { + isa = PBXGroup; + children = ( + FC086BA520E67E8500D85EF7 /* Kernels */, + FCD592FA20E248EC00252966 /* Base */, + FCEBC0F320F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift */, + FC039BA420E11CBC0081E9F8 /* ConvOp.swift */, + FC039BA520E11CBC0081E9F8 /* ElementwiseAddOp.swift */, + FC039BA720E11CBC0081E9F8 /* BatchNormOp.swift */, + FC039BA820E11CBC0081E9F8 /* ReluOp.swift */, + FC9D037F20E22FBB000F735A /* FeedOp.swift */, + FC9D038120E2312E000F735A /* FetchOp.swift */, + FCD04E6520F314C50007374F /* PoolOp.swift */, + FCD04E6920F319EC0007374F /* SoftmaxOp.swift */, + FCD04E6D20F31B4B0007374F /* ReshapeOp.swift */, + FCD04E7120F343420007374F /* ConvAddOp.swift */, + FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */, + FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */, + FCBCCC5C2122F8A100D94F7E /* DepthwiseConvOp.swift */, + FCBCCC5E2122FB3B00D94F7E /* PriorBoxOp.swift */, + FCBCCC642122FCD700D94F7E /* TransposeOp.swift */, + FCBCCC66212306B000D94F7E /* ConcatOp.swift */, + FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */, + 4AA1EA8B2146640900D0F791 /* SplitOp.swift */, + 4AA1EA91214665D700D0F791 /* ShapeOp.swift */, + 4AA1EA972146666500D0F791 /* FlattenOp.swift */, + 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */, + FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */, + FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */, + FCEB684B212F093800D2448E /* PreluOp.swift */, + FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */, + FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */, + FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */, + ); + path = Operators; + sourceTree = ""; + }; + FC039BAE20E11CC20081E9F8 /* Program */ = { + isa = PBXGroup; + children = ( + FC039BAF20E11CC20081E9F8 /* framework.pb.swift */, + FC039BB020E11CC20081E9F8 /* Scope.swift */, + FC039BB120E11CC20081E9F8 /* TensorDesc.swift */, + FC039BB220E11CC20081E9F8 /* ProgramDesc.swift */, + FC039BB320E11CC20081E9F8 /* VarDesc.swift */, + FC039BB420E11CC20081E9F8 /* Program.swift */, + FC039BB520E11CC20081E9F8 /* OpDesc.swift */, + FC039BB620E11CC20081E9F8 /* Attribute.swift */, + FC039BB720E11CC20081E9F8 /* BlockDesc.swift */, + FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */, + ); + path = Program; + sourceTree = ""; + }; + FC086BA520E67E8500D85EF7 /* Kernels */ = { + isa = PBXGroup; + children = ( + FCDDC6CD212FE02100E5EF74 /* Base */, + FCEB6837212F00B100D2448E /* metal */, + FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */, + FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */, + FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */, + FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */, + FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */, + FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */, + FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */, + FCD04E6720F315020007374F /* PoolKernel.swift */, + FCD04E6B20F31A280007374F /* SoftmaxKernel.swift */, + FCD04E6F20F31B720007374F /* ReshapeKernel.swift */, + 4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */, + FCD04E7320F3437E0007374F /* ConvAddKernel.swift */, + FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */, + 
FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */, + FCBCCC622122FCC000D94F7E /* TransposeKernel.swift */, + FCBCCC68212306D300D94F7E /* ConcatKernel.swift */, + FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */, + 4AA1EA8D2146647F00D0F791 /* SplitKernel.swift */, + 4AA1EA932146661500D0F791 /* ShapeKernel.swift */, + 4AA1EA87214662BD00D0F791 /* BilinearInterpKernel.swift */, + FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */, + FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */, + FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */, + FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */, + FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */, + ); + path = Kernels; + sourceTree = ""; + }; + FC4FD9762140E4920073E130 /* CPU */ = { + isa = PBXGroup; + children = ( + FC4FD9782140E4980073E130 /* libpaddle-mobile.a */, + FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */, + ); + path = CPU; + sourceTree = ""; + }; + FCD592FA20E248EC00252966 /* Base */ = { + isa = PBXGroup; + children = ( + FC9D037820E229E4000F735A /* OpParam.swift */, + FC039BA620E11CBC0081E9F8 /* Operator.swift */, + FC82735820E3C04200BE430A /* OpCreator.swift */, + ); + path = Base; + sourceTree = ""; + }; + FCDDC6CD212FE02100E5EF74 /* Base */ = { + isa = PBXGroup; + children = ( + FCF2D73720E64E70007AC5F5 /* Kernel.swift */, + ); + path = Base; + sourceTree = ""; + }; + FCEB6837212F00B100D2448E /* metal */ = { + isa = PBXGroup; + children = ( + 4AF928812135673D005B6C3A /* ConcatKernel.metal */, + 4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */, + 4AF9288321357BE3005B6C3A /* Elementwise.metal */, + FC1B16B220EC9A4F00678B91 /* Kernels.metal */, + FC4CB74820F0B954007C0C6D /* ConvKernel.metal */, + 4AF928762133F1DB005B6C3A /* BoxCoder.metal */, + 4AA1EAA9214F53D800D0F791 /* BoxCoder.inc.metal */, + 4AA1EAA5214B5F6800D0F791 /* Shape.metal */, + 4AA1EA8F214664CD00D0F791 /* Split.metal */, + 4AA1EAA3214A295C00D0F791 /* Split.inc.metal */, + 4AA1EA892146631C00D0F791 /* BilinearInterp.metal */, + 4AA1EAA7214B7AFB00D0F791 /* BilinearInterp.inc.metal */, + 4AF9287821341661005B6C3A /* Softmax.metal */, + 4AA1EAAB214F55C800D0F791 /* Softmax.inc.metal */, + FCEB6849212F00DB00D2448E /* PreluKernel.metal */, + FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */, + FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */, + FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */, + FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */, + 4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */, + FCA3A1642132A5EB00084FE5 /* Common.metal */, + FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */, + FCA67CD42138272900BD58AA /* ConvAddMetal.metal */, + FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */, + FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */, + FC0226552138F33800F395E2 /* TransposeKernel.metal */, + 4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */, + FC0226572138F38D00F395E2 /* PoolKernel.metal */, + FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */, + FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */, + FC803BC6214CBA820094B8E5 /* Macro.metal */, + FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */, + FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */, + FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */, + FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */, + ); + path = metal; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXHeadersBuildPhase section */ + FC039B6720E11C3C0081E9F8 /* Headers */ = { + isa = PBXHeadersBuildPhase; + 
buildActionMask = 2147483647; + files = ( + FC4FD9792140E4980073E130 /* PaddleMobileCPU.h in Headers */, + FC292C85214257CB00CF622F /* CPUCompute.h in Headers */, + FC292C5421421B2F00CF622F /* PaddleMobileGPU.h in Headers */, + 4AA1EA9E2148D6F900D0F791 /* ConcatKernel.inc.metal in Headers */, + FC039B6F20E11C3C0081E9F8 /* paddle_mobile.h in Headers */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXHeadersBuildPhase section */ + +/* Begin PBXNativeTarget section */ + FC039B6920E11C3C0081E9F8 /* paddle-mobile */ = { + isa = PBXNativeTarget; + buildConfigurationList = FC039B7220E11C3C0081E9F8 /* Build configuration list for PBXNativeTarget "paddle-mobile" */; + buildPhases = ( + AF33BB8D0770A77AC22B5EF4 /* [CP] Check Pods Manifest.lock */, + FC039B6520E11C3C0081E9F8 /* Sources */, + FC039B6620E11C3C0081E9F8 /* Frameworks */, + FC039B6720E11C3C0081E9F8 /* Headers */, + FC039B6820E11C3C0081E9F8 /* Resources */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = "paddle-mobile"; + productName = "paddle-mobile"; + productReference = FC039B6A20E11C3C0081E9F8 /* paddle_mobile.framework */; + productType = "com.apple.product-type.framework"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + FC039B6120E11C3C0081E9F8 /* Project object */ = { + isa = PBXProject; + attributes = { + LastUpgradeCheck = 0930; + ORGANIZATIONNAME = orange; + TargetAttributes = { + FC039B6920E11C3C0081E9F8 = { + CreatedOnToolsVersion = 9.3.1; + LastSwiftMigration = 0940; + }; + }; + }; + buildConfigurationList = FC039B6420E11C3C0081E9F8 /* Build configuration list for PBXProject "paddle-mobile" */; + compatibilityVersion = "Xcode 9.3"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + ); + mainGroup = FC039B6020E11C3C0081E9F8; + productRefGroup = FC039B6B20E11C3C0081E9F8 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + FC039B6920E11C3C0081E9F8 /* paddle-mobile */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXResourcesBuildPhase section */ + FC039B6820E11C3C0081E9F8 /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXShellScriptBuildPhase section */ + AF33BB8D0770A77AC22B5EF4 /* [CP] Check Pods Manifest.lock */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputPaths = ( + "${PODS_PODFILE_DIR_PATH}/Podfile.lock", + "${PODS_ROOT}/Manifest.lock", + ); + name = "[CP] Check Pods Manifest.lock"; + outputPaths = ( + "$(DERIVED_FILE_DIR)/Pods-paddle-mobile-checkManifestLockResult.txt", + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. 
Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; + showEnvVarsInLog = 0; + }; +/* End PBXShellScriptBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + FC039B6520E11C3C0081E9F8 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */, + 4AA1EAAA214F53D800D0F791 /* BoxCoder.inc.metal in Sources */, + FC039B9F20E11CB20081E9F8 /* Tensor.swift in Sources */, + FC803BC9214CFC8D0094B8E5 /* FetchKernel.metal in Sources */, + FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */, + 4AF9287921341661005B6C3A /* Softmax.metal in Sources */, + 4AA1EA942146661500D0F791 /* ShapeKernel.swift in Sources */, + FC0E2DBC20EE45FE009C1FAC /* ConvKernel.swift in Sources */, + FC039BAA20E11CBC0081E9F8 /* ElementwiseAddOp.swift in Sources */, + FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */, + FCBCCC6B2123071700D94F7E /* BoxcoderOp.swift in Sources */, + 4AA1EAAE214F5FD900D0F791 /* TransposeKernel.inc.metal in Sources */, + 4AA1EAA4214A295C00D0F791 /* Split.inc.metal in Sources */, + FC803BC7214CBA820094B8E5 /* Macro.metal in Sources */, + FC039B9B20E11CA00081E9F8 /* Executor.swift in Sources */, + 4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */, + FCD04E7020F31B720007374F /* ReshapeKernel.swift in Sources */, + FCE3A1B12153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal in Sources */, + FCD04E7220F343420007374F /* ConvAddOp.swift in Sources */, + FC039BBB20E11CC20081E9F8 /* ProgramDesc.swift in Sources */, + FCE3A1AB2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift in Sources */, + FC9D037920E229E4000F735A /* OpParam.swift in Sources */, + FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */, + FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */, + FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */, + FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */, + FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift in Sources */, + FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */, + FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */, + 4AA1EA8C2146640900D0F791 /* SplitOp.swift in Sources */, + FC292C81214255BD00CF622F /* CPUCompute.mm in Sources */, + FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */, + 4AA1EAAC214F55C800D0F791 /* Softmax.inc.metal in Sources */, + FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */, + 4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */, + FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */, + FC33B0F02147659000714A93 /* MobileNet.swift in Sources */, + FCEB684C212F093800D2448E /* PreluOp.swift in Sources */, + 4AA1EAA8214B7AFB00D0F791 /* BilinearInterp.inc.metal in Sources */, + FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */, + FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */, + FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */, + 4AA1EA8A2146631C00D0F791 /* BilinearInterp.metal in Sources */, + FCDDC6CA212FDF6800E5EF74 /* BatchNormKernel.metal in Sources */, + FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */, + FC039BBA20E11CC20081E9F8 /* TensorDesc.swift in Sources */, + FC039BA020E11CB20081E9F8 /* Dim.swift in Sources */, + FC039BB820E11CC20081E9F8 /* framework.pb.swift in Sources */, + 
FC039B9920E11C9A0081E9F8 /* Types.swift in Sources */, + FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */, + FCA3A1632132A4AC00084FE5 /* ReshapeKernel.metal in Sources */, + FC4FD9752140E1DE0073E130 /* PaddleMobile.swift in Sources */, + FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */, + FC039BA920E11CBC0081E9F8 /* ConvOp.swift in Sources */, + FC9D038420E23B01000F735A /* Texture.swift in Sources */, + FCE3A1B32153E91900C37CDE /* ElementwiseAddPreluKernel.metal in Sources */, + 4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */, + 4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */, + FCBCCC652122FCD700D94F7E /* TransposeOp.swift in Sources */, + 4AA1EAA6214B5F6800D0F791 /* Shape.metal in Sources */, + FCD04E6E20F31B4B0007374F /* ReshapeOp.swift in Sources */, + FC039B9820E11C9A0081E9F8 /* Errors.swift in Sources */, + FC039BBF20E11CC20081E9F8 /* Attribute.swift in Sources */, + 4AA1EA8E2146647F00D0F791 /* SplitKernel.swift in Sources */, + FCD04E7420F3437E0007374F /* ConvAddKernel.swift in Sources */, + FC039BB920E11CC20081E9F8 /* Scope.swift in Sources */, + FC292C5621421B4600CF622F /* PaddleMobileGPU.m in Sources */, + FCD04E6620F314C50007374F /* PoolOp.swift in Sources */, + FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */, + FC039BAC20E11CBC0081E9F8 /* BatchNormOp.swift in Sources */, + FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */, + FC039BBC20E11CC20081E9F8 /* VarDesc.swift in Sources */, + FC292C872142624800CF622F /* Genet.swift in Sources */, + FC803BC5214CB8F00094B8E5 /* ConvAddPrelu.inc.metal in Sources */, + 4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */, + FCBCCC632122FCC000D94F7E /* TransposeKernel.swift in Sources */, + FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */, + FCDC0FEB21099A1D00DC9EFB /* Tools.swift in Sources */, + FC0E2DBA20EE3B8D009C1FAC /* ReluKernel.swift in Sources */, + 4AA1EA862146625E00D0F791 /* BilinearInterpOp.swift in Sources */, + FCBCCC6D2123073A00D94F7E /* BoxcoderKernel.swift in Sources */, + FCBCCC69212306D300D94F7E /* ConcatKernel.swift in Sources */, + FCDDC6C8212FA3CA00E5EF74 /* ConvTransposeKernel.swift in Sources */, + FC82735920E3C04200BE430A /* OpCreator.swift in Sources */, + FCA3A1652132A5EB00084FE5 /* Common.metal in Sources */, + 4AA1EA92214665D700D0F791 /* ShapeOp.swift in Sources */, + FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */, + FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */, + FCE3A1AF2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift in Sources */, + FCE9D7B7214F869000B520C3 /* Net.swift in Sources */, + FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */, + FC039BAB20E11CBC0081E9F8 /* Operator.swift in Sources */, + FCD04E6A20F319EC0007374F /* SoftmaxOp.swift in Sources */, + FC292C82214255BD00CF622F /* MobileNetSSD.swift in Sources */, + FCBCCC612122FBDF00D94F7E /* PriorBoxKernel.swift in Sources */, + FCBCCC5F2122FB3B00D94F7E /* PriorBoxOp.swift in Sources */, + FC9D038220E2312E000F735A /* FetchOp.swift in Sources */, + FCA67B1721364EF000BD58AA /* ConvTransposeKernel.metal in Sources */, + FC039BBD20E11CC20081E9F8 /* Program.swift in Sources */, + FC039BA220E11CB70081E9F8 /* Loader.swift in Sources */, + FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */, + FCD04E6C20F31A280007374F /* SoftmaxKernel.swift in Sources */, + FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */, + 4AA1EAA02148DEEE00D0F791 /* ReshapeKernel.inc.metal in Sources */, + 
FC9A19E32148C31300CD9CBF /* MobilenetSSD_AR.swift in Sources */, + FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */, + FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */, + FCE3A1A92153DE5100C37CDE /* ConvAddAddPreluOp.swift in Sources */, + FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */, + FCE3A1AD2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift in Sources */, + FC039BC020E11CC20081E9F8 /* BlockDesc.swift in Sources */, + FC803BC3214CB79C0094B8E5 /* ConvAddPreluKernel.metal in Sources */, + 4AA1EA90214664CD00D0F791 /* Split.metal in Sources */, + FCD04E6820F315020007374F /* PoolKernel.swift in Sources */, + FC0226582138F38D00F395E2 /* PoolKernel.metal in Sources */, + FC039BAD20E11CBC0081E9F8 /* ReluOp.swift in Sources */, + FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */, + FC039BBE20E11CC20081E9F8 /* OpDesc.swift in Sources */, + 4AA1EA88214662BD00D0F791 /* BilinearInterpKernel.swift in Sources */, + FC039B9720E11C9A0081E9F8 /* Extensions.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + FC039B7020E11C3C0081E9F8 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CODE_SIGN_IDENTITY = "iPhone Developer"; + COPY_PHASE_STRIP = NO; + CURRENT_PROJECT_VERSION = 1; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 11.3; + MTL_ENABLE_DEBUG_INFO = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + VERSIONING_SYSTEM = "apple-generic"; + VERSION_INFO_PREFIX = ""; + }; + name = Debug; + }; + FC039B7120E11C3C0081E9F8 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + 
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + CODE_SIGN_IDENTITY = "iPhone Developer"; + COPY_PHASE_STRIP = NO; + CURRENT_PROJECT_VERSION = 1; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 11.3; + MTL_ENABLE_DEBUG_INFO = NO; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + VERSIONING_SYSTEM = "apple-generic"; + VERSION_INFO_PREFIX = ""; + }; + name = Release; + }; + FC039B7320E11C3C0081E9F8 /* Debug */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */; + buildSettings = { + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_IDENTITY = ""; + CODE_SIGN_STYLE = Automatic; + DEFINES_MODULE = YES; + DEVELOPMENT_TEAM = A798K58VVL; + DYLIB_COMPATIBILITY_VERSION = 1; + DYLIB_CURRENT_VERSION = 1; + DYLIB_INSTALL_NAME_BASE = "@rpath"; + ENABLE_BITCODE = NO; + INFOPLIST_FILE = "paddle-mobile/Info.plist"; + INSTALL_PATH = "$(LOCAL_LIBRARY_DIR)/Frameworks"; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + "@loader_path/Frameworks", + ); + LIBRARY_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)/paddle-mobile/CPU", + ); + MACH_O_TYPE = mh_dylib; + MTL_LANGUAGE_REVISION = UseDeploymentTarget; + PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile"; + PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; + SKIP_INSTALL = YES; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + SWIFT_VERSION = 4.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + FC039B7420E11C3C0081E9F8 /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = E2A7957C92EDA5C3BEC0FFC2 /* Pods-paddle-mobile.release.xcconfig */; + buildSettings = { + CLANG_ENABLE_MODULES = YES; + CODE_SIGN_IDENTITY = ""; + CODE_SIGN_STYLE = Automatic; + DEFINES_MODULE = YES; + DEVELOPMENT_TEAM = A798K58VVL; + DYLIB_COMPATIBILITY_VERSION = 1; + DYLIB_CURRENT_VERSION = 1; + DYLIB_INSTALL_NAME_BASE = "@rpath"; + ENABLE_BITCODE = NO; + INFOPLIST_FILE = "paddle-mobile/Info.plist"; + INSTALL_PATH = 
"$(LOCAL_LIBRARY_DIR)/Frameworks"; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + "@loader_path/Frameworks", + ); + LIBRARY_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)/paddle-mobile/CPU", + ); + MACH_O_TYPE = mh_dylib; + MTL_LANGUAGE_REVISION = UseDeploymentTarget; + PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile"; + PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; + SKIP_INSTALL = YES; + SWIFT_VERSION = 4.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + FC039B6420E11C3C0081E9F8 /* Build configuration list for PBXProject "paddle-mobile" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + FC039B7020E11C3C0081E9F8 /* Debug */, + FC039B7120E11C3C0081E9F8 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + FC039B7220E11C3C0081E9F8 /* Build configuration list for PBXNativeTarget "paddle-mobile" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + FC039B7320E11C3C0081E9F8 /* Debug */, + FC039B7420E11C3C0081E9F8 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = FC039B6120E11C3C0081E9F8 /* Project object */; +} diff --git a/ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/contents.xcworkspacedata similarity index 68% rename from ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/contents.xcworkspacedata rename to metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/contents.xcworkspacedata index 18f14d0d53b03b7326c6b613e445438ab35e4bed..bb84e46b46d8c2c496c068dc15f2304785ed8e31 100644 --- a/ios/PaddleMobile/PaddleMobile.xcodeproj/project.xcworkspace/contents.xcworkspacedata +++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/contents.xcworkspacedata @@ -2,6 +2,6 @@ + location = "self:paddle-mobile.xcodeproj"> diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist similarity index 100% rename from ios/PaddleMobileDemo/PaddleMobileDemo.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist rename to metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist diff --git a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate new file mode 100644 index 0000000000000000000000000000000000000000..6b9abef67c5d123b4fb22117ed3f4f575de52aa0 Binary files /dev/null and b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.xcworkspace/xcuserdata/liuruilong.xcuserdatad/UserInterfaceState.xcuserstate differ diff --git a/metal/paddle-mobile/paddle-mobile/CPU/PaddleMobileCPU.h b/metal/paddle-mobile/paddle-mobile/CPU/PaddleMobileCPU.h new file mode 100644 index 0000000000000000000000000000000000000000..c68d81f328f4ce9a9bf16624f677b2996644c35c --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/CPU/PaddleMobileCPU.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#import <CoreImage/CoreImage.h>
+#import <Foundation/Foundation.h>
+
+@interface PaddleMobileCPUResult: NSObject
+
+@property (assign, nonatomic, readonly) float *output;
+
+@property (assign, nonatomic, readonly) int outputSize;
+
+-(void)releaseOutput;
+
+@end
+
+@interface PaddleMobileCPU : NSObject
+
+/*
+ Create the instance.
+*/
+- (instancetype)init;
+
+/*
+ Load the model and allocate memory.
+*/
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weightsPath;
+
+/*
+ Load a model stored as separate files; pass the directory that contains the model.
+*/
+- (BOOL)load:(NSString *)modelAndWeightPath;
+
+/*
+ * Load a combined model from memory.
+ * */
+- (BOOL)LoadCombinedMemory:(size_t)modelLen
+               andModelBuf:(const uint8_t *)modelBuf
+         andModelParamsLen:(size_t)combinedParamsLen
+      andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf;
+
+/*
+ * Preprocess the image. The caller allocates the output buffer and is
+ * responsible for releasing it.
+ * */
+-(void)preprocess:(CGImageRef)image
+           output:(float *)output
+            means:(NSArray *)means
+            scale:(float)scale
+              dim:(NSArray *)dim;
+
+/*
+ * Run prediction on preprocessed data. When you are done with the result,
+ * call its releaseOutput method to free it.
+ * */
+- (PaddleMobileCPUResult *)predictInput:(float *)input
+                                    dim:(NSArray *)dim;
+
+/*
+ Run prediction. means and scale are the preprocessing parameters applied when
+ the model was trained; if no such preprocessing was done at training time,
+ use predict:dim: directly.
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale;
+
+/*
+ Run prediction with the default means of 0 and scale of 1.0.
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim;
+
+/*
+ Release all memory.
+*/
+- (void)clear;
+
+@end
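A minimal sketch of how a client would drive the interface above, assuming a combined model/params pair bundled with the app; the resource names and the 1x3x224x224 input shape are illustrative, not part of this change:

    PaddleMobileCPU *runner = [[PaddleMobileCPU alloc] init];
    // Hypothetical resource names; any combined model/params pair works the same way.
    NSString *modelPath = [[NSBundle mainBundle] pathForResource:@"model" ofType:nil];
    NSString *paramsPath = [[NSBundle mainBundle] pathForResource:@"params" ofType:nil];
    if ([runner load:modelPath andWeightsPath:paramsPath]) {
      NSArray *dim = @[@1, @3, @224, @224];  // NCHW input shape (illustrative)
      float *input = (float *)malloc(1 * 3 * 224 * 224 * sizeof(float));
      // ... fill `input`, e.g. via -preprocess:output:means:scale:dim: ...
      PaddleMobileCPUResult *result = [runner predictInput:input dim:dim];
      // ... consume result.output / result.outputSize ...
      [result releaseOutput];  // free the result, as documented above
      free(input);
      [runner clear];
    }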
diff --git a/metal/paddle-mobile/paddle-mobile/CPUCompute.h b/metal/paddle-mobile/paddle-mobile/CPUCompute.h new file mode 100644 index 0000000000000000000000000000000000000000..ed12dd60df4ea06944fdf4ff9b635fc12a99120e --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/CPUCompute.h @@ -0,0 +1,45 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#import <Foundation/Foundation.h>
+
+
+@interface CPUResult: NSObject
+@property (assign, nonatomic) float *output;
+@property (assign, nonatomic) int outputSize;
+@end
+
+@interface NMSCompute: NSObject
+
+@property (assign, nonatomic) float scoreThredshold;
+
+@property (assign, nonatomic) int nmsTopK;
+
+@property (assign, nonatomic) int keepTopK;
+
+@property (assign, nonatomic) float nmsEta;
+
+@property (assign, nonatomic) float nmsThreshold;
+
+@property (assign, nonatomic) int background_label;
+
+@property (strong, nonatomic) NSArray<NSNumber *> *scoreDim;
+
+@property (strong, nonatomic) NSArray<NSNumber *> *bboxDim;
+
+-(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox;
+
+@end
diff --git a/metal/paddle-mobile/paddle-mobile/CPUCompute.mm b/metal/paddle-mobile/paddle-mobile/CPUCompute.mm new file mode 100644 index 0000000000000000000000000000000000000000..b97153765b46bb63d604d8845eee08d91283481d --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/CPUCompute.mm @@ -0,0 +1,322 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+
+#import "CPUCompute.h"
+
+#import <map>
+#import <vector>
+#import <utility>
+#import <algorithm>
+
+
+
+
+struct NMSParam {
+
+ float *score_data;
+
+ float *box_data;
+
+ float *output;
+
+ int output_size;
+
+ std::vector<int> score_dim;
+
+ std::vector<int> box_dim;
+
+ float scoreThredshold;
+
+ int nmsTopK;
+
+ int keepTopK;
+
+ float nmsEta;
+
+ float nmsThreshold;
+
+ int background_label;
+};
+
+
+constexpr int kOutputDim = 6;
+constexpr int kBBoxSize = 4;
+
+template <class T>
+bool SortScorePairDescend(const std::pair<float, T>& pair1,
+ const std::pair<float, T>& pair2) {
+ return pair1.first > pair2.first;
+}
+
+template <class T>
+static inline void GetMaxScoreIndex(
+ const std::vector<T>& scores, const T threshold, int top_k,
+ std::vector<std::pair<T, int>>* sorted_indices) {
+ for (size_t i = 0; i < scores.size(); ++i) {
+ if (scores[i] > threshold) {
+ sorted_indices->push_back(std::make_pair(scores[i], i));
+ }
+ }
+ // Sort the score pair according to the scores in descending order
+ std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
+ SortScorePairDescend<int>);
+ // Keep top_k scores if needed.
+ if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
+ sorted_indices->resize(top_k);
+ }
+}
+
+template <class T>
+static inline T BBoxArea(const T* box, const bool normalized) {
+ if (box[2] < box[0] || box[3] < box[1]) {
+ // If coordinate values are invalid
+ // (e.g. xmax < xmin or ymax < ymin), return 0.
+ return static_cast<T>(0.);
+ } else {
+ const T w = box[2] - box[0];
+ const T h = box[3] - box[1];
+ if (normalized) {
+ return w * h;
+ } else {
+ // If coordinate values are not within range [0, 1].
+ return (w + 1) * (h + 1);
+ }
+ }
+}
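For reference, JaccardOverlap below is the standard intersection-over-union test,

$$\mathrm{IoU}(b_1, b_2) = \frac{\mathrm{area}(b_1 \cap b_2)}{\mathrm{area}(b_1) + \mathrm{area}(b_2) - \mathrm{area}(b_1 \cap b_2)},$$

with disjoint boxes scoring 0; NMSFast then greedily keeps the highest-scoring boxes whose overlap with every already-kept box stays at or below the (eta-adapted) threshold.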
+
+template <class T>
+static inline T JaccardOverlap(const T* box1, const T* box2,
+ const bool normalized) {
+ if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
+ box2[3] < box1[1]) {
+ return static_cast<T>(0.);
+ } else {
+ const T inter_xmin = std::max(box1[0], box2[0]);
+ const T inter_ymin = std::max(box1[1], box2[1]);
+ const T inter_xmax = std::min(box1[2], box2[2]);
+ const T inter_ymax = std::min(box1[3], box2[3]);
+ const T inter_w = inter_xmax - inter_xmin;
+ const T inter_h = inter_ymax - inter_ymin;
+ const T inter_area = inter_w * inter_h;
+ const T bbox1_area = BBoxArea(box1, normalized);
+ const T bbox2_area = BBoxArea(box2, normalized);
+ return inter_area / (bbox1_area + bbox2_area - inter_area);
+ }
+}
+
+template <class T>
+static inline void NMSFast(
+ const T *bbox_data,
+ std::vector<int> bbox_dim,
+ const T *score_data,
+ const T score_threshold, const T nms_threshold,
+ const T eta, const int top_k,
+ std::vector<int>* selected_indices) {
+ // The total boxes for each instance.
+ int num_boxes = bbox_dim[0];
+ // 4: [xmin ymin xmax ymax]
+ int box_size = bbox_dim[1];
+
+ std::vector<T> scores_data(num_boxes);
+ std::copy_n(score_data, num_boxes, scores_data.begin());
+ std::vector<std::pair<T, int>> sorted_indices;
+ GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
+
+ selected_indices->clear();
+ T adaptive_threshold = nms_threshold;
+
+ while (sorted_indices.size() != 0) {
+ const int idx = sorted_indices.front().second;
+ bool keep = true;
+ for (size_t k = 0; k < selected_indices->size(); ++k) {
+ if (keep) {
+ const int kept_idx = (*selected_indices)[k];
+ T overlap = JaccardOverlap(bbox_data + idx * box_size,
+ bbox_data + kept_idx * box_size, true);
+ keep = overlap <= adaptive_threshold;
+ } else {
+ break;
+ }
+ }
+ if (keep) {
+ selected_indices->push_back(idx);
+ }
+ sorted_indices.erase(sorted_indices.begin());
+ if (keep && eta < 1 && adaptive_threshold > 0.5) {
+ adaptive_threshold *= eta;
+ }
+ }
+}
+
+template <class T>
+void MultiClassNMS(const T *boxes_data,
+ const std::vector<int> &box_dim,
+ const T *scores_data,
+ const std::vector<int> &score_dim,
+ std::map<int, std::vector<int>>* indices, int* num_nmsed_out,
+ const int& background_label, const int& nms_top_k,
+ const int& keep_top_k, const T& nms_threshold,
+ const T& nms_eta, const T& score_threshold) {
+
+ int64_t class_num = score_dim[0];
+ int64_t predict_dim = score_dim[1];
+ int num_det = 0;
+ for (int c = 0; c < class_num; ++c) {
+ if (c == background_label) continue;
+ const T *score_data = scores_data + c * predict_dim;
+
+ /// [c] is key
+ NMSFast(boxes_data, box_dim, score_data, score_threshold, nms_threshold, nms_eta,
+ nms_top_k, &((*indices)[c]));
+ num_det += (*indices)[c].size();
+ }
+
+ *num_nmsed_out = num_det;
+ if (keep_top_k > -1 && num_det > keep_top_k) {
+ std::vector<std::pair<T, std::pair<int, int>>> score_index_pairs;
+ for (const auto& it : *indices) {
+ int label = it.first;
+ const T* sdata = scores_data + label * predict_dim;
+ const std::vector<int>& label_indices = it.second;
+ for (size_t j = 0; j < label_indices.size(); ++j) {
+ int idx = label_indices[j];
+ // PADDLE_ENFORCE_LT(idx, predict_dim);
+ score_index_pairs.push_back(std::make_pair(sdata[idx], std::make_pair(label, idx)));
+ }
+ }
+ // Keep top k results per image.
+ std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
+ SortScorePairDescend<std::pair<int, int>>);
+ score_index_pairs.resize(keep_top_k);
+
+ // Store the new indices.
+ std::map<int, std::vector<int>> new_indices;
+ for (size_t j = 0; j < score_index_pairs.size(); ++j) {
+ int label = score_index_pairs[j].second.first;
+ int idx = score_index_pairs[j].second.second;
+ new_indices[label].push_back(idx);
+ }
+ new_indices.swap(*indices);
+ *num_nmsed_out = keep_top_k;
+ }
+}
+
+template <class T>
+void MultiClassOutput(const T *scores_data,
+ const std::vector<int> &score_dim,
+ const T *bboxes_data,
+ T *outputs_data,
+ const std::map<int, std::vector<int>>& selected_indices) {
+ int predict_dim = score_dim[1];
+ int count = 0;
+ for (const auto& it : selected_indices) {
+ /// one batch
+ int label = it.first;
+ const T* sdata = scores_data + label * predict_dim;
+ const std::vector<int>& indices = it.second;
+ for (size_t j = 0; j < indices.size(); ++j) {
+ int idx = indices[j];
+ const T* bdata = bboxes_data + idx * kBBoxSize;
+ outputs_data[count * kOutputDim] = label; // label
+ outputs_data[count * kOutputDim + 1] = sdata[idx]; // score
+ // xmin, ymin, xmax, ymax
+ std::memcpy(outputs_data + count * kOutputDim + 2, bdata, 4 * sizeof(T));
+ count++;
+ }
+ }
+}
+
+void MultiClassNMSCompute(NMSParam *param) {
+ assert(param->score_dim[0] == 1);
+ assert(param->box_dim[0] == 1);
+ assert (param->score_dim.size() == 3);
+ assert(param->box_dim.size() == 3);
+
+ float* outputs;
+ auto background_label = param->background_label;
+ auto nms_top_k = param->nmsTopK;
+ auto keep_top_k = param->keepTopK;
+ auto nms_threshold = param->nmsThreshold;
+ auto nms_eta = param->nmsEta;
+ auto score_threshold = param->scoreThredshold;
+
+ std::vector<int> score_dim_one_batch = {param->score_dim[1], param->score_dim[2]};
+ std::vector<int> box_dim_one_batch = {param->box_dim[1], param->box_dim[2]};
+
+ std::vector<size_t> batch_starts = {0};
+
+ std::map<int, std::vector<int>> indices;
+ int num_nmsed_out = 0;
+
+ MultiClassNMS(param->box_data, box_dim_one_batch, param->score_data, score_dim_one_batch, &indices, &num_nmsed_out,
+ background_label, nms_top_k, keep_top_k, nms_threshold,
+ nms_eta, score_threshold);
+ batch_starts.push_back(batch_starts.back() + num_nmsed_out);
+
+ int output_size = 0;
+ int num_kept = batch_starts.back();
+ if (num_kept == 0) {
+ outputs = new float[1];
+ outputs[0] = -1;
+ output_size = 1;
+ } else {
+ outputs = new float[num_kept * kOutputDim];
+ int64_t s = batch_starts[0];
+ int64_t e = batch_starts[1];
+ if (e > s) {
+ MultiClassOutput(param->score_data, score_dim_one_batch, param->box_data, outputs, indices);
+ }
+ output_size = num_kept * kOutputDim;
+ }
+ param->output = outputs;
+ param->output_size = output_size;
+}
+
+@implementation CPUResult
+@end
+
+@implementation NMSCompute
+
+-(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox {
+ NMSParam param;
+ param.box_data = bbox;
+ param.score_data = score;
+ param.background_label = self.background_label;
+ param.scoreThredshold = self.scoreThredshold;
+ param.nmsTopK = self.nmsTopK;
+ param.keepTopK = self.keepTopK;
+ param.nmsEta = self.nmsEta;
+ param.nmsThreshold = self.nmsThreshold;
+ std::vector<int> score_dim;
+ for (int i = 0; i < self.scoreDim.count; ++i) {
+ score_dim.push_back(self.scoreDim[i].intValue);
+ }
+ param.score_dim = score_dim;
+
+ std::vector<int> box_dim;
+ for (int i = 0; i < self.bboxDim.count; ++i) {
+ box_dim.push_back(self.bboxDim[i].intValue);
+ }
+ param.box_dim = box_dim;
+ MultiClassNMSCompute(&param);
+ CPUResult *cr = [[CPUResult alloc] init];
+ cr.output = param.output;
+ cr.outputSize = param.output_size;
+ return cr;
+}
+
+@end
+
+
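A hypothetical Swift-side driver for this wrapper (the dims and thresholds are illustrative SSD-style values; the commented-out fetchResult code later in this diff uses the same call shape):

```swift
// score/bbox are flattened fetch outputs of the detection net.
var scores = [Float](repeating: 0, count: 1 * 21 * 1917)
var bboxes = [Float](repeating: 0, count: 1 * 1917 * 4)

let nms = NMSCompute()
nms.scoreThredshold = 0.25     // property name as declared in CPUCompute.h
nms.nmsTopK = 400
nms.keepTopK = 200
nms.nmsEta = 1.0
nms.nmsThreshold = 0.45
nms.background_label = 0
nms.scoreDim = [1, 21, 1917]   // [batch, classes, boxes]
nms.bboxDim = [1, 1917, 4]     // [batch, boxes, 4]

if let result = nms.compute(withScore: &scores, andBBoxs: &bboxes) {
    // Each kept detection is kOutputDim = 6 floats:
    // [label, score, xmin, ymin, xmax, ymax].
    // Nothing in the wrapper frees result.output, so the caller owns it.
    print("kept \(result.outputSize / 6) detections")
}
```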
diff --git a/metal/paddle-mobile/paddle-mobile/Common/Errors.swift b/metal/paddle-mobile/paddle-mobile/Common/Errors.swift new file mode 100644 index 0000000000000000000000000000000000000000..decb9509a613710232de9a006e5289662fe2cae5 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Common/Errors.swift @@ -0,0 +1,24 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+public enum PaddleMobileError: Error{
+ case loaderError(message: String)
+ case netError(message: String)
+ case memoryError(message: String)
+ case paramError(message: String)
+ case opError(message: String)
+ case predictError(message: String)
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Common/Extensions.swift b/metal/paddle-mobile/paddle-mobile/Common/Extensions.swift new file mode 100644 index 0000000000000000000000000000000000000000..4c38a1b7b42e21f88b3b1c8825c181bb83293a54 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Common/Extensions.swift @@ -0,0 +1,127 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+// Custom ?! operator: if the value on the left of ?! is an optional, it is implicitly unwrapped; if it holds a value, that value is returned, and if it is nil we fatalError with the message passed in
+precedencegroup ExecutedOrFatalError{
+ associativity: left
+ higherThan: AssignmentPrecedence
+}
+infix operator ?!: ExecutedOrFatalError
+public func ?!<T>(option: T?, excuteOrError: @autoclosure () -> String) -> T{
+ if let inOpt = option {
+ return inOpt
+ }else{
+ print(excuteOrError())
+ fatalError(excuteOrError())
+ }
+}
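In short, `a ?! "msg"` unwraps the optional `a` or traps with `msg`; the nets later in this diff rely on it heavily, for example:

```swift
// Unwrap-or-die with a readable message instead of a bare force unwrap.
let modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null"
```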
+
+//Lense
+struct Lense<A, B> {
+ let from: (A) -> B
+ let to: (B, A) -> A
+}
+
+precedencegroup CombineLense{
+ associativity: left
+ higherThan: AssignmentPrecedence
+}
+
+infix operator >>>: CombineLense
+func >>><A, B, C>(left: Lense<B, C>, right: Lense<A, B>) -> Lense<A, C> {
+ return Lense<A, C>.init(from: { (a) -> C in
+ left.from(right.from(a))
+ }, to: { (c, a) -> A in
+ right.to( left.to(c, right.from(a)),a)
+ })
+}
+
+protocol CIntIndex {
+ associatedtype T;
+ subscript(index: CInt) -> T { get set};
+}
+
+extension Array: CIntIndex{
+ typealias T = Element
+ subscript(index: CInt) -> T {
+ get{
+ guard Int64(Int.max) >= Int64(index) else{
+ fatalError("cint index out of Int range")
+ }
+ return self[Int(index)]
+ }
+ set{
+ guard Int64(Int.max) >= Int64(index) else{
+ fatalError("cint index out of Int range")
+ }
+ self[Int(index)] = newValue
+ }
+
+ }
+}
+
+extension Array where Element: AnyObject{
+ mutating func remove(element: Element) {
+ if let index = index(where: { (node) -> Bool in
+ return unsafeBitCast(element, to: Int.self) == unsafeBitCast(node, to: Int.self)
+ }) {
+ remove(at: index)
+ }
+ }
+
+}
+
+//MARK: Array extension
+extension Array where Element: Comparable{
+
+ /// Return the top r elements of the array, pairing each element with its position in the original array as the first member of the tuple
+ ///
+ /// - Parameter r: number of leading elements to return
+ /// - Returns: [(original position, sorted element)]
+ public func top(r: Int) -> [(Int, Element)] {
+ precondition(r <= self.count)
+ return Array<(Int, Element)>(zip(0..<self.count, self).sorted{ $0.1 > $1.1 }.prefix(through: r - 1))
+ }
+}
+
+extension Array {
+ public func strideArray(inCount: Int = 20) -> [(Int, Element)] {
+ if count < inCount {
+ return (0..<count).map { ($0, self[$0]) }
+ } else {
+ let stride = count / inCount
+ var result: [(Int, Element)] = []
+ for i in 0..<inCount {
+ result.append((i * stride, self[i * stride]))
+ }
+ return result
+ }
+ }
+}
+
+extension String{
+ func cStr() -> UnsafePointer<Int8>? {
+ return (self as NSString).utf8String
+ }
+}
+
+func address<T: AnyObject>(o: T) -> String {
+ return String.init(format: "%018p", unsafeBitCast(o, to: Int.self))
+}
+
+
+
+
diff --git a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift new file mode 100644 index 0000000000000000000000000000000000000000..3be8c118613b3e9d6a9247fd731cc74392392d5b --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift @@ -0,0 +1,605 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+fileprivate var defaultMetalLibrary: MTLLibrary?
+fileprivate var paddleMobileMetalLibrary: MTLLibrary?
+
+extension MTLDevice {
+ func defaultLibrary() -> MTLLibrary {
+ if defaultMetalLibrary == nil {
+ defaultMetalLibrary = makeDefaultLibrary()
+ }
+ if let inDefaultLib = defaultMetalLibrary {
+ return inDefaultLib
+ } else {
+ fatalError(" default metal library is nil")
+ }
+ }
+
+ func paddleMobileLibrary() -> MTLLibrary {
+ if paddleMobileMetalLibrary == nil {
+ guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else {
+ fatalError("Couldn't find paddle mobile library")
+ }
+ do {
+ paddleMobileMetalLibrary = try makeLibrary(filepath: path)
+ } catch _ {
+ fatalError("Couldn't load paddle mobile library")
+ }
+ }
+
+ if let inPaddleMobileLib = paddleMobileMetalLibrary {
+ return inPaddleMobileLib
+ } else {
+ fatalError("PaddleMobile metal library is nil")
+ }
+ }
+
+ func pipeLine(funcName: String, inPaddleMobileLib: Bool = true) -> MTLComputePipelineState {
+ let useLib = inPaddleMobileLib ? paddleMobileLibrary() : defaultLibrary()
+ guard let function = useLib.makeFunction(name: funcName) else {
+ fatalError(" function " + funcName + " not found")
+ }
+ do {
+ let pipLine = try makeComputePipelineState(function: function)
+ return pipLine
+ } catch let error {
+ print(error)
+ fatalError("make pipeline error occurred : \(error)")
+ }
+
+ }
+
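A sketch of how these helpers chain together inside the framework (the kernel name is illustrative; any function compiled into paddle-mobile's default.metallib would do):

```swift
import Metal

let device = MTLCreateSystemDefaultDevice() ?! "no Metal device"
// Look up the kernel in paddle-mobile's metallib and build its pipeline state.
let pipeline = device.pipeLine(funcName: "softmax", inPaddleMobileLib: true)
```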

+ func makeBuffer<P: PrecisionType>(value: [P]) -> MTLBuffer {
+ let buffer = makeBuffer(length: value.count * MemoryLayout<P>.size, options: MTLResourceOptions.storageModeShared)
+ let contents = buffer?.contents().bindMemory(to: P.self, capacity: value.count * MemoryLayout<P>.size)
+ for i in 0..<value.count {
+ contents?[i] = value[i]
+ }
+ return buffer!
+ }
+
+ func texture2tensor_loop<P>(texture: MTLTexture, cb: ([Int], P)->Void) -> Void {
+ let bpR = texture.width * 4 * MemoryLayout<P>

.size
+ let bpI = texture.height * bpR
+ let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1))
+ for i in 0..<texture.arrayLength {
+ let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>

.allocate(capacity: bpI) + texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i) + for tx in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + if dim.count == 3 { + return texture2tensor_3(texture: texture, dim: dim, transpose: transpose) + } else if dim.count == 2 { + return texture2tensor_2(texture: texture, dim: dim, transpose: transpose) + } else if dim.count == 1 { + return texture2tensor_1(texture: texture, dim: dim, transpose: transpose) + } + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture { + if value.count > 0 { + assert(value.count == dim.reduce(1) { $0 * $1 }) + } + + var tdim: [Int] = [1, 1, 1, 1] + for i in 0.. 0 { + var rcount: Int = (ndim[0] * ndim[3] + 3) / 4 + rcount = rcount * 4 * ndim[1] * ndim[2] + var nvalue: [Float32] = .init(repeating: 0.0, count: rcount) + + for i0 in 0.. = UnsafeMutablePointer(mutating: nvalue) + let outputP: UnsafeMutablePointer = UnsafeMutablePointer(mutating: xvalue) + float32ToFloat16(input: pointer, output: outputP, count: rcount) + let bpR = ndim[2] * 4 * 2 + let bpI = ndim[1] * bpR + for i in 0.. = UnsafeMutablePointer(mutating: nvalue) + let bpR = ndim[2] * 4 * MemoryLayout

.size + let bpI = ndim[1] * bpR + for i in 0..(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{ + + let textureDesc = MTLTextureDescriptor.init() + textureDesc.width = textureWidth + textureDesc.height = textureHeight + textureDesc.depth = 1 + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.pixelFormat = .rgba32Float + textureDesc.textureType = .type2DArray + textureDesc.storageMode = .shared + textureDesc.cpuCacheMode = .defaultCache + textureDesc.arrayLength = arrayLength + let texture = makeTexture(descriptor: textureDesc)! + + if value.count >= 4{ + let counts = arrayLength * 4 * textureWidth * textureHeight + let pointer: UnsafeMutablePointer

<P> = UnsafeMutablePointer<P>

.allocate(capacity: counts * MemoryLayout<P>

.size) + for i in 0...size + let bytesPerImage = texture.height * bytesPerRow + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth)) + for i in 0..(stridable: Bool = true) -> [(index: Int, value: P)] { + var arr: [P] = floatArray { (p: P) -> P in + return p; + } + var result: [(index: Int, value: P)] = [] + if arr.count > 100 && stridable { + for j in stride(from: 0, to: arr.count , by: arr.count / 100){ + result.append((j, arr[j])) + } + } else { + for j in 0..(res: (P) -> T) -> [T] { + var fArr: [T] = [] + if textureType == .type2DArray { + for i in 0...size, alignment: MemoryLayout

<P>.alignment)
+ let bytesPerRow = width * depth * 4 * MemoryLayout<P>

.size
+ let bytesPerImage = width * height * depth * 4 * MemoryLayout<P>

.size
+ let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
+ getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i)
+ let p = bytes.assumingMemoryBound(to: P.self)
+
+ for j in 0..<width * height * depth * 4 {
+ fArr.append(res(p[j]))
+ }
+ }
+ } else if textureType == .type2D {
+ let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<P>.size, alignment: MemoryLayout<P>

.alignment)
+ let bytesPerRow = width * depth * 4 * MemoryLayout<P>

.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) + let p = bytes.assumingMemoryBound(to: P.self) + + for j in 0.. [Float32] { + if pixelFormat == .rgba32Float { + let float32Array = floatArray { (f: Float32) -> Float32 in + return f + } + return float32Array + } else if pixelFormat == .rgba16Float { + + var float16Array = floatArray { (f: Float16) -> Float16 in + return f + } + return float16To32(input: &float16Array, count: float16Array.count) + } else { + fatalError() + } + } + + func logDesc(header: String = "", stridable: Bool = true) -> T? { + print(header) + print("texture: \(self)") + // let res: [(index: Int, value: T)] = stridableFloatArray(stridable: stridable) + // print(res) + + if textureType == .type2DArray { + for i in 0...size, alignment: MemoryLayout.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout.size + let bytesPerImage = width * height * depth * 4 * MemoryLayout.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) + let p = bytes.assumingMemoryBound(to: T.self) + str += "2d array count : \(width * height * depth * 4) \n" + if stridable && width * height * depth * 4 > 20 { + for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 20){ + str += " index \(j): \(p[j])" + } + } else { + for j in 0...size, alignment: MemoryLayout.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) + let p = bytes.assumingMemoryBound(to: T.self) + str += "2d count : \(width * width * 4) \n" + + if stridable { + for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 20){ + str += "index \(j): \(p[j]) " + } + } else { + for j in 0.. [Float32] { + var textureArray: [Float32] + if pixelFormat == .rgba32Float { + textureArray = floatArray { (i : Float32) -> Float32 in + return i + } + } else if pixelFormat == .rgba16Float { + + var textureFloat16Array = floatArray { (i : Float16) -> Float16 in + return i + } + textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) + } else { + fatalError(" 目前还不支持其他类型 ") + } + + var output: [Float32] = [] + for s in 0.. [Float32] { +// print("origin dim: \(dim)") +// print("texture: ") +// print(self) + + var textureArray: [Float32] + if pixelFormat == .rgba32Float { + textureArray = floatArray { (i : Float32) -> Float32 in + return i + } + } else if pixelFormat == .rgba16Float { + var textureFloat16Array = floatArray { (i : Float16) -> Float16 in + return i + } + textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) + } else { + fatalError(" 目前还不支持其他类型 ") + } + + var output: [Float32] = [] + let numOfASlice = dim.h * dim.w * 4 + for h in 0.. 
dim.c { + for i in 0..<(4 - ((sliceIndex * 4 + 4) - dim.c)) { + let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i] + output.append(value) + } + } else { + for i in 0..<4 { + let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i] + output.append(value) + } + } + } + } + } + return output + } + +} + + +public extension MTLBuffer { + func logDesc(header: String = "", stridable: Bool = true) -> T? { + print(header) + print("MTLBuffer: \(self) ") + var str = "" + if stridable && length/MemoryLayout.stride > 1000{ + for j in stride(from: 0, to: length, by: length/MemoryLayout.stride / 100){ + str += " \(contents().assumingMemoryBound(to: T.self)[j])" + } + } else { + for i in 0...size { + str += " \(contents().assumingMemoryBound(to: T.self)[i])" + } + } + print(str) + return nil + } + + func makeTexture(textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture { + let textureDesc = MTLTextureDescriptor.init() + textureDesc.width = textureWidth + textureDesc.height = textureHeight + textureDesc.depth = 1 + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.pixelFormat = .rgba32Float + textureDesc.textureType = .type2DArray + textureDesc.storageMode = .shared + textureDesc.cpuCacheMode = .defaultCache + textureDesc.arrayLength = arrayLength + let texture = makeTexture(descriptor: textureDesc, offset: 0, bytesPerRow: textureWidth * 4 * 4)! + return texture + } + + func array() -> [T] { + var array: [T] = [] + let pointer = contents().bindMemory(to: T.self, capacity: length) + for i in 0..<(length / MemoryLayout.size) { + array.append(pointer[i]) + } + return array; + } +} + diff --git a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift new file mode 100644 index 0000000000000000000000000000000000000000..91afae6f6415d187a69063381f3a27a6bbe92b81 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift @@ -0,0 +1,343 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Metal +import Foundation + +public class PaddleMobileUnitTest { + let device: MTLDevice + let queue: MTLCommandQueue + public init(inDevice: MTLDevice, inQueue: MTLCommandQueue) { + device = inDevice + queue = inQueue + } + + private func indentPrintTensor(tensor: [Float32], dim: [Int], ix: [Int], indentLevel: Int) { + let indent = Array.init(repeating: " ", count: indentLevel).joined(separator: "") + var tx = ix + if dim.count == indentLevel + 1 { + var log: String = indent + "[" + for i in 0.. 0 { + log += ", " + } + log += tensor[c].description + } + log += "]" + if (indentLevel > 0) && (ix[indentLevel - 1] < dim[indentLevel - 1] - 1) { + log += "," + } + print(log) + } else { + print(indent + "[") + for i in 0.. 
0) && (ix[indentLevel - 1] < dim[indentLevel - 1] - 1) { + print(indent + "],") + } else { + print(indent + "]") + } + } + } + + private func tensorPrint(tensor: [Float32], dim: [Int]) { + var detectPos = -1 + var odim = 1 + var ndim = dim + for i in 0..= -1) + if (detectPos == -1) { + assert(tensor.count == odim) + } else { + assert(tensor.count % odim == 0) + ndim[detectPos] = tensor.count / odim + } + indentPrintTensor(tensor: tensor, dim: ndim, ix: dim.map { $0 * 0 }, indentLevel: 0) + } + + public func testConcat() { +// let buffer = queue.makeCommandBuffer() ?! "buffer is nil" +// var it: [[Float32]] = [] +// for _ in 0..<7 { +// it.append((0..<12).map { Float32($0) }) +// } +// let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) } +// let output = device.tensor2texture(value: [Float32](), dim: [3, 28]) +// +// let param = ConcatTestParam.init( +// input: input, +// output: output, +// dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]], +// axis: 1, +// odim: [3, 28] +// ) +// let concatKernel = ConcatKernel.init(device: device, testParam: param) +// concatKernel.test(cmdBuffer: buffer, param: param) +// buffer.addCompletedHandler { (buffer) in +// for i in 0...init(device: device, testParam: param) +// reshapeKernel.test(commandBuffer: buffer, testParam: param) +// buffer.addCompletedHandler { (buffer) in +// let _: Float32? = inTexture.logDesc() +// let _: Float32? = outTexture.logDesc() +// self.tensorPrint(tensor: input, dim: [2, 3, 4]) +// let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [4, 6]) +// self.tensorPrint(tensor: tx, dim: [4, 6]) +// } + +// let input: [Float32] = (0..<24).map { Float32($0) } +// let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) +// let outTexture = device.tensor2texture(value: [Float32](), dim: [24]) +// let mp = ReshapeMetalParam.init( +// idim: (1, 2, 3, 4), +// itrans: (0, 1, 2, 3), +// odim: (1, 1, 1, 24), +// otrans: (0, 1, 2, 3) +// ) +// let param = ReshapeTestParam.init( +// inputTexture: inTexture, +// outputTexture: outTexture, +// param: mp +// ) +// let reshapeKernel = ReshapeKernel.init(device: device, testParam: param) +// reshapeKernel.test(commandBuffer: buffer, testParam: param) +// buffer.addCompletedHandler { (buffer) in +// let _: Float32? = inTexture.logDesc() +// let _: Float32? = outTexture.logDesc() +// self.tensorPrint(tensor: input, dim: [2, 3, 4]) +// let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24]) +// self.tensorPrint(tensor: tx, dim: [24]) +// } +// +// +// buffer.commit() + } + + public func testTranspose() { + + let buffer = queue.makeCommandBuffer() ?! 
"buffer is nil" +// var input: [Float32] = [] +// for i in 0..<72 { +// input.append(Float32(i)) +// } +//// let inputTexture = device.makeFloatTexture(value: input, textureWidth: 3, textureHeight: 2, arrayLength: 3) +// let inputTexture = device.tensor2texture(value: input, dim: [4, 3, 2, 3]); +// // group 1 +// let outputTexture = device.tensor2texture(value: [Float32](), dim: [3, 3, 2, 4]) +// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 3, oC: 4, axis: [3, 1, 2, 0]) +//// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [3, 0, 2, 1]) +//// // group 2 +//// let outputTexture = device.makeFloatTexture(value: [Float32](), textureWidth: 3, textureHeight: 3, arrayLength: 6) +//// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 4, axis: [3, 0, 2, 1]) +//// +// let transposeKernel = TransposeKernel.init(device: device, testParam: param) +// +// transposeKernel.test(commandBuffer: buffer, param: param) +// +// buffer.addCompletedHandler { (buffer) in +// let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false) +// let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false) +// self.tensorPrint(tensor: input, dim: [4, 3, 2, 3]) +// let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4]) +// self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4]) +// } +// +// let input: [Float32] = (0..<24).map { Float32($0) } +// let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) +// let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2]) +// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1]) +// let transposeKernel = TransposeKernel.init(device: device, testParam: param) +// +// transposeKernel.test(commandBuffer: buffer, param: param) +// +// buffer.addCompletedHandler { (buffer) in +// let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false) +// let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false) +// self.tensorPrint(tensor: input, dim: [2, 3, 4]) +// let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2]) +// self.tensorPrint(tensor: tx, dim: [3, 4, 2]) +// } +// + buffer.commit() + } + + public func testConvAddBnRelu() { + let buffer = queue.makeCommandBuffer() ?! 
" buffer is nil " + + let input: [Float32] = [ + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + ] + + let filter: [Float32] = [ + //1.0 + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + //2.0 + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + //3.0 + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + //4.0 + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + ] + + let biase: [Float32] = [1.0, 1.0, 1.0, 100.0] + let newScalue: [Float32] = [1.0, 1.0, 1.0, 1.0] + let newBiase: [Float32] = [1.0, 1.0, 1.0, 1.0] + + let inputeTexture = device.makeFloatTexture(value: input, textureWidth: 3, textureHeight: 3, arrayLength: 1) + + //filter + let filterBuffer = device.makeBuffer(value: filter) + + // biase + let biaseBuffer = device.makeBuffer(value: biase) + + // new scale + let newScalueBuffer = device.makeBuffer(value: newScalue) + + // new biase + let newBiaseBuffer = device.makeBuffer(value: newBiase) + + //output + let outputTexture = device.makeFloatTexture(value: [Float32](), textureWidth: 2, textureHeight: 2, arrayLength: 1) + + let filterSize: (width: Int, height: Int, channel: Int) = (3, 3, 4) + let paddings: (Int, Int) = (1, 1) + let stride: (Int, Int) = (2, 2) + + let offsetX = filterSize.width/2 - paddings.0 + let offsetY = filterSize.height/2 - paddings.1 + + let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1)) + + let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize) + + + + let convAddBnReluKernel = ConvAddBatchNormReluKernel.init(device: device, testParam: param) + + convAddBnReluKernel.test(commandBuffer: buffer, param: param) + + buffer.addCompletedHandler { (buffer) in + let _: Float32? = inputeTexture.logDesc(header: "input texture", stridable: false) + let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false) + } + + buffer.commit() + } +} + + + diff --git a/metal/paddle-mobile/paddle-mobile/Common/Tools.swift b/metal/paddle-mobile/paddle-mobile/Common/Tools.swift new file mode 100644 index 0000000000000000000000000000000000000000..23ad7113971de3d0843abe17accfe3d67f0caaa9 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Common/Tools.swift @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+func writeToLibrary<P: PrecisionType>(fileName: String, array: [P]) {
+ let libraryPath = NSSearchPathForDirectoriesInDomains(.libraryDirectory, .userDomainMask, true).last ?! " library path get error "
+ let filePath = libraryPath + "/" + fileName
+ let fileManager = FileManager.init()
+ fileManager.createFile(atPath: filePath, contents: nil, attributes: nil)
+ let fileHandler = FileHandle.init(forWritingAtPath: filePath) ?! " file handler nil "
+ let data = Data.init(buffer: UnsafeBufferPointer.init(start: array, count: array.count))
+ fileHandler.write(data)
+ fileHandler.closeFile()
+}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Common/Types.swift b/metal/paddle-mobile/paddle-mobile/Common/Types.swift new file mode 100644 index 0000000000000000000000000000000000000000..a1197ed2188a263af3c0819fec09b584af501dd3 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Common/Types.swift @@ -0,0 +1,294 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+import Accelerate
+
+public protocol SummableMultipliable: Equatable {
+ static func +(lhs: Self, rhs: Self) -> Self
+ static func *(lhs: Self, rhs: Self) -> Self
+ static func -(lhs: Self, rhs: Self) -> Self
+}
+public protocol PrecisionType: SummableMultipliable{
+ init(inFloat: Float32)
+ init(inFloat16: Float16)
+ init<P: PrecisionType>(_ inP: P)
+ static var bitSize: UInt { get }
+}
+
+public typealias Float16 = Int16
+extension Float16: PrecisionType {
+ public static func * (prefix: Float16, postfix: Float16) {
+ return prefix * postfix
+ }
+
+ public init<P>

(_ inP: P) where P : PrecisionType {
+ if P.bitSize == Float32.bitSize {
+ self = Float16(inFloat: inP as! Float32)
+ } else if P.bitSize == Float16.bitSize {
+ self = inP as! Float16
+ } else {
+ fatalError()
+ }
+ }
+
+ public static var bitSize: UInt {
+ return 16
+ }
+
+ public init(inFloat16: Float16) {
+ self = inFloat16
+ }
+ public init(inFloat: Float32) {
+ self = Int16(inFloat)
+ }
+}
+
+extension Float32: PrecisionType {
+ public init<P>

(_ inP: P) where P : PrecisionType { + if P.bitSize == Float32.bitSize { + self = inP as! Float32 + } else if P.bitSize == Float16.bitSize { + self = Float32.init(inP as! Float16) + } else { + fatalError() + } + } + + public init(inFloat: Float32) { + self = inFloat + } + + public init(inFloat16: Float16) { + self = Float32.init(inFloat16) + } + + public static var bitSize: UInt { + return 32 + } +} + +public func float32ToFloat16(input: UnsafeMutablePointer, output: UnsafeMutableRawPointer, count: Int) { + var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) + var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) + guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else { + fatalError(" float 32 to float 16 error ! ") + } +} + +public func float16To32(input: UnsafeMutablePointer, count: Int) -> [Float32] { + var output = Array.init(repeating: 0.0, count: count) + float16to32(input: input, output: &output, count: count) + return output +} + +public func float16to32(input: UnsafeMutablePointer, output: UnsafeMutablePointer, count: Int) { + var bufferFloat16 = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 2) + var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4) + if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError { + fatalError(" convert float16 to float32 error") + } +} + +// N - 0 C - 1 H - 2 W - 3 +struct DataLayout { + + static func NCHW(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout { + return DataLayout.init([(.N, dim[0]), (.C, dim[1]), (.H, dim[2]), (.W, dim[3])]) + } + + static func NHWC(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout { + return DataLayout.init([(.N, dim[0]), (.H, dim[1]), (.W, dim[2]), (.C, dim[3])]) + } + + func count() -> Int { + return layoutWithDim.count + } + + var N: Int? { + get { + for layoutDim in layoutWithDim { + if layoutDim.0 == .N { + return layoutDim.1 + } + } + return nil + } + set { + var newN = (Layout.N, newValue) + if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in + return layout == .N + }) { + fatalError() + } + } + } + var C: Int? { + get { + for layoutDim in layoutWithDim { + if layoutDim.0 == .C { + return layoutDim.1 + } + } + return nil + } + set { + var newN = (Layout.C, newValue) + if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in + return layout == .N + }) { + fatalError() + } + } + } + var H: Int? { + get { + for layoutDim in layoutWithDim { + if layoutDim.0 == .H { + return layoutDim.1 + } + } + return nil + } + set { + var newN = (Layout.H, newValue) + if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in + return layout == .H + }) { + fatalError() + } + } + } + var W: Int? 
{ + get { + for layoutDim in layoutWithDim { + if layoutDim.0 == .W { + return layoutDim.1 + } + } + return nil + } + set { + var newN = (Layout.W, newValue) + if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in + return layout == .W + }) { + fatalError() + } + } + } + + + init(_ inLayout: [(Layout, Int)]) { + layoutWithDim = inLayout + } + + func layout() -> [Layout] { + return layoutWithDim.map({ (layout: Layout, dim: Int) -> Layout in + return layout + }) + } + + var layoutWithDim: [(Layout, Int)] = [(.N, 0), (.C, 0), (.H, 0), (.W, 0)] + + func convertTo(inLayout: [Layout]) { + + } + + enum Layout: Int{ + case N = 0 + case C = 1 + case H = 2 + case W = 3 + static func defaultLayout() -> [Layout] { + return [N, C, H, W] + } + } +} + +extension DataLayout: Equatable { + public static func == (lhs: DataLayout, rhs: DataLayout) -> Bool { + if lhs.layoutWithDim.count == rhs.layoutWithDim.count { + var result = true + for i in 0.. { + guard let inResultBuffer = resultBuffer else { + fatalError() + } + return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: capacity) + } + +} + +extension FetchHolder: CustomStringConvertible, CustomDebugStringConvertible { + var description: String { + fatalError() +// return "\(result)" + } + + var debugDescription: String { + fatalError() +// return "\(result)" + } + + +} + + + diff --git a/metal/paddle-mobile/paddle-mobile/Genet.swift b/metal/paddle-mobile/paddle-mobile/Genet.swift new file mode 100644 index 0000000000000000000000000000000000000000..d803d1e99537e3a24d1fae5a5653d680bd811ac2 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Genet.swift @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +public class Genet: Net { + @objc public override init(device: MTLDevice) { + super.init(device: device) + means = [128.0, 128.0, 128.0] + scale = 0.017 + except = 0 + modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "genet_params", ofType: nil) ?! 
"para null" + modelDir = "" + preprocessKernel = GenetPreProccess.init(device: device) + dim = (n: 1, h: 128, w: 128, c: 3) + } + + @objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) { + super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize) + means = [128.0, 128.0, 128.0] + scale = 0.017 + except = 0 + modelPath = "" + paramPath = "" + modelDir = "" + preprocessKernel = GenetPreProccess.init(device: device) + dim = (n: 1, h: 128, w: 128, c: 3) + } + + class GenetPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = CusomKernel.Shape.init(inWidth: 128, inHeight: 128, inChannel: 3) + super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, usePaddleMobileLib: false) + } + } + + override public func resultStr(res: ResultHolder) -> String { +// fatalError() + return " \(res.result![0]) ... " + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Info.plist b/metal/paddle-mobile/paddle-mobile/Info.plist new file mode 100644 index 0000000000000000000000000000000000000000..1007fd9dd7d0af3071eced72a45c88fea7665976 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Info.plist @@ -0,0 +1,24 @@ + + + + + CFBundleDevelopmentRegion + $(DEVELOPMENT_LANGUAGE) + CFBundleExecutable + $(EXECUTABLE_NAME) + CFBundleIdentifier + $(PRODUCT_BUNDLE_IDENTIFIER) + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + $(PRODUCT_NAME) + CFBundlePackageType + FMWK + CFBundleShortVersionString + 1.0 + CFBundleVersion + $(CURRENT_PROJECT_VERSION) + NSPrincipalClass + + + diff --git a/metal/paddle-mobile/paddle-mobile/MobileNet.swift b/metal/paddle-mobile/paddle-mobile/MobileNet.swift new file mode 100644 index 0000000000000000000000000000000000000000..7d10a920d15e751f29fce7f9f6be71cd6a2d6b69 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/MobileNet.swift @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class MobileNet: Net{ + + class MobilenetPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = CusomKernel.Shape.init(inWidth: 224, inHeight: 224, inChannel: 3) + super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, usePaddleMobileLib: false) + } + } + + class PreWords { + var contents: [String] = [] + init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) { + if let filePath = inBundle.path(forResource: fileName, ofType: type) { + let string = try! 
String.init(contentsOfFile: filePath) + contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{ + String($0[$0.index($0.startIndex, offsetBy: 10)...]) + } + }else{ + fatalError("no file call \(fileName)") + } + } + subscript(index: Int) -> String { + return contents[index] + } + } + + let labels = PreWords.init(fileName: "synset") + + override public func resultStr(res: ResultHolder) -> String { + guard let resPointer = res.result else { + fatalError() + } + var s: [String] = [] + (0.. String { + return " \(res)" + } + + override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder { + +// guard let interRes = paddleMobileRes.intermediateResults else { +// fatalError(" need have inter result ") +// } +// +// guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? Texture else { +// fatalError(" need score ") +// } +// +// guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture else { +// fatalError() +// } +// +// var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3])) +//// print("score: ") +//// print(scoreFormatArr.strideArray()) +//// +// var bboxArr = bbox.metalTexture.float32Array() +//// print("bbox: ") +//// print(bboxArr.strideArray()) +// +// let nmsCompute = NMSCompute.init() +// nmsCompute.scoreThredshold = 0.01 +// nmsCompute.nmsTopK = 400 +// nmsCompute.keepTopK = 200 +// nmsCompute.nmsEta = 1.0 +// nmsCompute.nmsThreshold = 0.45 +// nmsCompute.background_label = 0; +// +// nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])] +// +// nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])] +// guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else { +// fatalError( " result error " ) +// } +// +// let output: [Float32] = result.map { $0.floatValue } +// +// +// return output + fatalError() + } + + + + +} diff --git a/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift b/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift new file mode 100644 index 0000000000000000000000000000000000000000..6c7bd9b9c6ae4f55327a370ceb1e682a8e5e7658 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift @@ -0,0 +1,153 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +public class MobileNet_ssd_AR: Net{ + @objc public override init(device: MTLDevice) { + super.init(device: device) + means = [103.94, 116.78, 123.68] + scale = 1 + except = 2 + modelPath = Bundle.main.path(forResource: "ar_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "ar_params", ofType: nil) ?! 
"para null" + modelDir = "" + preprocessKernel = MobilenetssdPreProccess.init(device: device) + dim = (n: 1, h: 160, w: 160, c: 3) + } + + @objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) { + super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize) + means = [103.94, 116.78, 123.68] + scale = 1 + except = 2 + modelPath = "" + paramPath = "" + modelDir = "" + preprocessKernel = MobilenetssdPreProccess.init(device: device) + dim = (n: 1, h: 160, w: 160, c: 3) + } + + class MobilenetssdPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = CusomKernel.Shape.init(inWidth: 160, inHeight: 160, inChannel: 3) + super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, usePaddleMobileLib: false) + } + } + + override public func resultStr(res: ResultHolder) -> String { + return " \(res.result![0])" + } + + override func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder { + guard let interRes = paddleMobileRes.intermediateResults else { + fatalError(" need have inter result ") + } + + guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? FetchHolder else { + fatalError(" need score ") + } + + guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else { + fatalError() + } + +// let startDate = Date.init() + +// print("scoreFormatArr: ") +//print((0.. + originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1] / 7, originTexture.tensorDim[0] * 7]) + + originTexture.dim = Dim.init(inDim: [1, 1, originTexture.dim[3] / 7, originTexture.dim[2] * 7]) + + originTexture.padToFourDim = Dim.init(inDim: [1, 1, originTexture.padToFourDim[3] / 7, originTexture.padToFourDim[2] * 7]) + + program.scope[output] = originTexture + + if i == 99 { + opDesc.attrs["axis"] = 0 + } else { + opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) } + } + } + + for i in [58, 59, 88, 89, 95, 96, 68, 69, 78, 79] { + let opDesc = program.programDesc.blocks[0].ops[i] + let output = opDesc.outputs["Out"]!.first! + let v = program.scope[output]! + + + + let originTexture = v as! Texture + originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]]) + opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) } + } + + for i in [60, 101, 90, 97, 70, 80] { + let opDesc = program.programDesc.blocks[0].ops[i] + let output = opDesc.outputs["Out"]!.first! + let v = program.scope[output]! + let originTexture = v as! Texture + originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]]) + opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1 + } + + for i in [102] { + let opDesc = program.programDesc.blocks[0].ops[i] + for output in opDesc.outputs["Out"]! { + let v = program.scope[output]! + let originTexture = v as! Texture + originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]]) + } + opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! 
Int) - 1 + print(" split axis \(opDesc.attrs["axis"])") + } + // 99 + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Net.swift b/metal/paddle-mobile/paddle-mobile/Net.swift new file mode 100644 index 0000000000000000000000000000000000000000..ce9ec98a66e685eec3a688a5a29402a76567b0e2 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Net.swift @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + + +import Foundation + +public class ResultHolder: NSObject { + @objc public let result: UnsafeMutablePointer? + @objc public let capacity: Int + + init(inResult: UnsafeMutablePointer?, inCapacity: Int) { + result = inResult + capacity = inCapacity + } + + @objc public func releasePointer() { + result?.deinitialize(count: capacity) + result?.deallocate() + } +} + +public class Net: NSObject { + var except: Int = 0 + var means: [Float] = [] + var scale: Float = 0.0 + var dim: (n: Int, h: Int, w: Int, c: Int) = (n: 0, h: 0, w: 0, c: 0) + var preprocessKernel: CusomKernel? = nil + var paramPointer: UnsafeMutableRawPointer? = nil + var paramSize: Int = 0 + var modelPointer: UnsafeMutableRawPointer? = nil + var modelSize: Int = 0 + var modelPath: String = "" + var paramPath: String = "" + var modelDir: String = "" + @objc public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) { + self.paramPointer = paramPointer + self.paramSize = paramSize + self.modelPointer = modePointer + self.modelSize = modelSize + super.init() + } + + + public func resultStr(res: ResultHolder) -> String { + fatalError() + } + + func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder { + return ResultHolder.init(inResult: paddleMobileRes.resultPointer, inCapacity: paddleMobileRes.capacity) + } + + @objc public init(device: MTLDevice) { + super.init() + } + + func updateProgram(program: Program) { + + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift new file mode 100644 index 0000000000000000000000000000000000000000..9806042e9eb339d6d15f2cbfebe924b548d29922 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +import Foundation + +fileprivate var singletons : [String : Any] = [:] +class OpCreator<P: PrecisionType> { + static var shared : OpCreator<P> { + let key = String(describing: P.self) + if let singleton = singletons[key] { + return singleton as! OpCreator<P> + } else { + let newSingleton = OpCreator<P>() + singletons[key] = newSingleton + return newSingleton + } + } + + func creat(device: MTLDevice, opDesc: OpDesc, scope: Scope) throws -> Runable & InferShaperable { + guard let opCreator = opCreators[opDesc.type] else { + throw PaddleMobileError.opError(message: "there is no " + opDesc.type + " yet") + } + + do { + return try opCreator(device, opDesc, scope) + } catch let error { + throw error + } + } + + let opCreators: [String : (MTLDevice, OpDesc, Scope) throws -> Runable & InferShaperable] = + [gConvType : ConvOp<P>.creat, + gBatchNormType : BatchNormOp<P>.creat, + gReluType : ReluOp<P>.creat, + gElementwiseAddType : ElementwiseAddOp<P>.creat, + gFeedType : FeedOp<P>.creat, + gFetchType : FetchOp<P>.creat, + gConvAddBatchNormReluType : ConvAddBatchNormReluOp<P>.creat, + gPooType : PoolOp<P>.creat, + gSoftmaxType : SoftmaxOp<P>.creat, + gReshapeType : ReshapeOp<P>.creat, + gConvAddType : ConvAddOp<P>.creat, + gDepthConvType : DepthConvOp<P>.creat, + gConcatType : ConcatOp<P>.creat, + gBoxcoderType : BoxcoderOp<P>.creat, + gConvBnReluType : ConvBNReluOp<P>.creat, + gDwConvBnReluType : DwConvBNReluOp<P>.creat, + gMulticlassNMSType : MulticlassNMSOp<P>.creat, + gTransposeType : TransposeOp<P>.creat, + gPriorBoxType : PriorBoxOp<P>.creat, + gPreluType : PreluOp<P>.creat, + gConv2dTransposeType : ConvTransposeOp<P>.creat, + gBilinearInterpType : BilinearInterpOp<P>.creat, + gSplit : SplitOp<P>.creat, + gShape : ShapeOp<P>.creat, + gFlatten : FlattenOp<P>.creat, + gConvAddPreluType : ConvAddPreluOp<P>.creat, + gConvAddAddPreluType : ConvAddAddPreluOp<P>.creat, + gElementwiseAddPreluType: ElementwiseAddPreluOp<P>.creat] + + private init(){} +}
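OpCreator keeps one singleton per generic specialization by keying a module-level dictionary on String(describing: P.self); Swift does not allow static stored properties in generic types, which is why the registry lives outside the class. A self-contained sketch of the pattern (the names here are illustrative, not from the diff):

fileprivate var registry: [String : Any] = [:]

final class PerTypeSingleton<T> {
    static var shared: PerTypeSingleton<T> {
        let key = String(describing: T.self) // e.g. "Float", "Int"
        if let existing = registry[key] {
            return existing as! PerTypeSingleton<T>
        }
        let fresh = PerTypeSingleton<T>()
        registry[key] = fresh
        return fresh
    }
    private init() {}
}

// PerTypeSingleton<Float>.shared and PerTypeSingleton<Int>.shared are distinct instances.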
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpParam.swift b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpParam.swift new file mode 100644 index 0000000000000000000000000000000000000000..9f868e35864d59be5711c4ac0a02787638eeae8f --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpParam.swift @@ -0,0 +1,220 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +/* + let opInputsOutputsKey = [gConvType : (inputs: ["Input"], outputs: ["Output"]), + gBatchNormType : (inputs: ["X"], outputs: ["Y"]), + gReluType : (inputs: ["X"], outputs: ["Out"]), + gElementwiseAdd : (inputs: ["X", "Y"], outputs: ["Out"])] + */ + +protocol OpParam { + associatedtype OutputType: Variant + var output: OutputType { get set } + func outputDesc() -> String + + associatedtype ParamPrecisionType: PrecisionType + init(opDesc: OpDesc, inScope: Scope) throws + static func getFirstTensor<VarType>(key: String, map: [String : [String]], from: Scope) throws -> VarType + static func inputX<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputBiase<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputMean<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputScale<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputVariance<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputFilter<VarType>(paraInputs: [String : [String]], from: Scope) throws -> VarType + static func input<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType + static func output<VarType>(outputs: [String : [String]], from: Scope) throws -> VarType + static func outputY<VarType>(outputs: [String : [String]], from: Scope) throws -> VarType + static func inputY<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType + + static func inputImage<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType + + static func outputBoxes<VarType>(outputs: [String : [String]], from: Scope) throws -> VarType + + static func outputOut<VarType>(outputs: [String : [String]], from: Scope) throws -> VarType + + static func outputVariances<VarType>(outputs: [String : [String]], from: Scope) throws -> VarType + + static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T + + static func paramInputAlpha<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType + +} + +extension OpParam { + func outputDesc() -> String { + return output.debugDescription + } + + static func getFirstTensor<VarType>(key: String, map: [String : [String]], from: Scope) throws -> VarType { + guard let mapKeys = map[key], mapKeys.count > 0 else { + throw PaddleMobileError.paramError(message: key + " not found in \(map) or maped values is empty") + } + guard let variant = from[mapKeys[0]] else { + throw PaddleMobileError.paramError(message: mapKeys[0] + " not found in scope") + } + + guard let v = variant as? VarType else { + throw PaddleMobileError.paramError(message: " type error") + } + return v + } + + static func outputVariances<VarType>(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorVariances: VarType = try getFirstTensor(key: "Variances", map: outputs, from: from) + return tensorVariances + } catch let error { + throw error + } + } + + static func paramInputAlpha<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let alphaTensor: VarType = try getFirstTensor(key: "Alpha", map: inputs, from: from) + return alphaTensor + } catch let error { + throw error + } + } + + static func inputImage<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorImage: VarType = try getFirstTensor(key: "Image", map: inputs, from: from) + return tensorImage + } catch let error { + throw error + } + } + + static func inputX<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorX: VarType = try getFirstTensor(key: "X", map: inputs, from: from) + return tensorX + } catch let error { + throw error + } + } + + static func outputBoxes<VarType>(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorBox: VarType = try getFirstTensor(key: "Boxes", map: outputs, from: from) + return tensorBox + } catch let error { + throw error + } + } + + static func input<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorInput: VarType = try getFirstTensor(key: "Input", map: inputs, from: from) + return tensorInput + } catch let error { + throw error + } + } + + static func output<VarType>(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorOutput: VarType = try getFirstTensor(key: "Output", map: outputs, from: from) + return tensorOutput + } catch let error { + throw error + } + } + static func outputY<VarType>(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorOutputY: VarType = try getFirstTensor(key: "Y", map: outputs, from: from) + return tensorOutputY + } catch let error { + throw error + } + } + static func inputY<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorY: VarType = try getFirstTensor(key: "Y", map: inputs, from: from) + return tensorY + } catch let error { + throw error + } + } + + static func outputOut<VarType>(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let out: VarType = try getFirstTensor(key: "Out", map: outputs, from: from) + return out + } catch let error { + throw error + } + } + static func inputFilter<VarType>(paraInputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorFilter: VarType = try getFirstTensor(key: "Filter", map: paraInputs, from: from) + return tensorFilter + } catch let error { + throw error + } + } + + static func inputBiase<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorBias: VarType = try getFirstTensor(key: "Bias", map: inputs, from: from) + return tensorBias + } catch let error { + throw error + } + } + + static func inputMean<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorMean: VarType = try getFirstTensor(key: "Mean", map: inputs, from: from) + return tensorMean + } catch let error { + throw error + } + } + + static func inputScale<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorScale: VarType = try getFirstTensor(key: "Scale", map: inputs, from: from) + return tensorScale + } catch let error { + throw error + } + } + + static func inputVariance<VarType>(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorVariance: VarType = try getFirstTensor(key: "Variance", map: inputs, from: from) + return tensorVariance + } catch let error { + throw error + } + } + + static func getAttr<T>(key: String, attrs: [String : Attr]) throws -> T { + guard let attr = attrs[key] else { + throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" ) + } + + guard let tAttr = attr as? T else { + throw PaddleMobileError.paramError(message: "key: \(key) attr: \(attr) type error" ) + } + return tAttr + } +}
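getAttr resolves its generic parameter from the annotation at the call site, so one helper serves every attribute type; a standalone sketch of that inference (the attrs literal is invented for illustration):

enum ParamError: Error { case missing(String), typeMismatch(String) }

func getAttrDemo<T>(key: String, attrs: [String : Any]) throws -> T {
    guard let attr = attrs[key] else { throw ParamError.missing(key) }
    guard let typed = attr as? T else { throw ParamError.typeMismatch(key) }
    return typed
}

do {
    let attrs: [String : Any] = ["strides": [Int32]([2, 2]), "groups": Int(1)]
    let strides: [Int32] = try getAttrDemo(key: "strides", attrs: attrs) // T inferred as [Int32]
    let groups: Int = try getAttrDemo(key: "groups", attrs: attrs)       // T inferred as Int
    print(strides, groups)
} catch {
    print(error)
}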
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift b/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift new file mode 100644 index 0000000000000000000000000000000000000000..40698da5ecb047dbf557cea18556616020ee9750 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift @@ -0,0 +1,192 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Metal +import Foundation + +protocol Fusion { + static func fusionNode() -> Node + static func change() -> [String : [(from: String, to: String)]] + static func fusionType() -> String + static func needCheck() -> [(Int, String)] +} +extension Fusion { + static func needCheck() -> [(Int, String)] { + return [] + } +} + +protocol Runable { + func run(device: MTLDevice, buffer: MTLCommandBuffer) throws + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws + func delogOutput() + func inputVariant() -> [String : [Variant]] + func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) +} + +extension Runable where Self: OperatorProtocol { + func run(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try runImpl(device: device, buffer: buffer) + } catch let error { + throw error + } + } + + func inputVariant() -> [String : [Variant]] { +// return [:] + fatalError(" op \(type) need implement inputVariant") + } + + func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) { + fatalError(" need implement ") + } + + func delogOutput() { + print(type + ": has no implementation" ) + } +} + +protocol Creator where Self: OperatorProtocol { + associatedtype OpType: OperatorProtocol & Runable & InferShaperable + static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> OpType +} + +extension Creator where Self: OperatorProtocol { + static func creat(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> OpType { + do { + return try OpType.provide(device: device, opDesc: opDesc, inScope: inScope) + } catch let error { + throw error + } + } +} + +protocol InferShaperable { + func inferShape() +} + +protocol OperatorProtocol { + associatedtype ParamType + associatedtype KerType: Computable where Self.KerType.ParamType == ParamType + var type: String { get } + var scope: Scope { get } + var inputs: [String : [String]] { get } + var paraInputs: [String : [String]] { get set } + var outpus: [String : [String]] { get } + var attrs: [String : Attr] { get } + var para: ParamType { get } + var kernel: KerType { get } + init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws +} + +extension OperatorProtocol { + static func provide(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws -> Self { + do { + return try Self.init(device: device, opDesc: opDesc, inScope: inScope) + } catch let error { + throw error + } + } +} + +class Operator<KernelType: Computable, ParameterType: OpParam>: OperatorProtocol where KernelType.ParamType == ParameterType { + typealias ParamType = ParameterType + typealias KerType = KernelType + let type: String + let inputs: [String : [String]] + var paraInputs: [String : [String]] + let outpus: [String : [String]] + let attrs: [String : Attr] + let para: ParamType + let scope: Scope + var kernel: KerType + required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws { +// print("create op: \(opDesc.type)") + type = opDesc.type + scope = inScope + inputs = opDesc.inputs + outpus = opDesc.outputs + attrs = opDesc.attrs + paraInputs = opDesc.paraInputs + do { + para = try ParamType.init(opDesc:opDesc, inScope: inScope) + } catch let error { + throw error + } + kernel = KernelType.init(device: device, param: para) + } +} + +// op infos +let gFetchType = "fetch" +let gFeedType = "feed" +let gConvType = "conv2d" +let gBatchNormType = "batch_norm" +let gReluType = "relu" +let gElementwiseAddType = "elementwise_add" +let gConvAddBatchNormReluType = "conv_add_batchnorm_relu" +let gPooType = "pool2d" +let gSoftmaxType = "softmax" +let gReshapeType = "reshape" +let gConvAddType = "conv_add" +let gDepthConvType = "depthwise_conv2d" +let gPriorBoxType = "prior_box" +let gTransposeType = "transpose" +let gConcatType = "concat" +let gBoxcoderType = "box_coder" +let gMulticlassNMSType = "multiclass_nms" +let gConvBnReluType = "conv_bn_relu" +let gDwConvBnReluType = "depth_conv_bn_relu" +let gPreluType = "prelu" +let gConv2dTransposeType = "conv2d_transpose" +let gBilinearInterpType = "bilinear_interp" +let gSplit = "split" +let gShape = "shape" +let gFlatten = "flatten" +let gConvAddPreluType = "conv_add_prelu" +let gConvAddAddPreluType = "conv_add_add_prelu" +let gElementwiseAddPreluType = "elementwise_add_prelu" + +let opInfos = [gConvType : (inputs: ["Input"], outputs: ["Output"]), + gBatchNormType : (inputs: ["X"], outputs: ["Y"]), + gReluType : (inputs: ["X"], outputs: ["Out"]), + gElementwiseAddType : (inputs: ["X"], outputs: ["Out"]), + gFeedType : (inputs: ["X"], outputs: ["Out"]), + gFetchType : (inputs: ["X"], outputs: ["Out"]), + gConvAddBatchNormReluType : (inputs: ["Input"], outputs: ["Out"]), + gPooType : (inputs: ["X"], outputs: ["Out"]), + gSoftmaxType : (inputs: ["X"], outputs: ["Out"]), + gReshapeType : (inputs: ["X"], outputs: ["Out"]), + gConvAddType : (inputs: ["Input"], outputs: ["Out"]), + gDepthConvType : (inputs: ["Input"], outputs: ["Output"]), + gConcatType : (inputs: ["X"], outputs: ["Out"]), + gBoxcoderType : (inputs: ["PriorBox", "PriorBoxVar", "TargetBox"], outputs: ["OutputBox"]), + gTransposeType : (inputs: ["X"], outputs: ["Out"]), + gConvBnReluType : (inputs: ["Input"], outputs: ["Out"]), + gDwConvBnReluType : (inputs: ["Input"], outputs: ["Out"]), + gMulticlassNMSType : (inputs: ["BBoxes", "Scores"], outputs: ["Out"]), + gPriorBoxType : (inputs: ["Input", "Image"], outputs: ["Boxes", "Variances"]), + gPreluType : (inputs: ["X"], outputs: ["Out"]), + gConv2dTransposeType : (inputs: ["Input"], outputs: ["Output"]), + gBilinearInterpType : (inputs: ["X"], outputs: ["Out"]), + gSplit : (inputs: ["X"], outputs: ["Out"]), + gShape : (inputs: ["Input"], outputs: ["Out"]), + gFlatten : (inputs: ["X"], outputs: ["Out"]), + gConvAddPreluType : (inputs: ["Input"], outputs: ["Out"]), + gConvAddAddPreluType : (inputs: ["Input"], outputs: ["Out"]), + gElementwiseAddPreluType : (inputs: ["X"], outputs: ["Out"]) + ]
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..9fc20f8a597d39d3b628c5e1033f9c5cceac45ed --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/BatchNormOp.swift @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class BatchNormParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope) + if input.transpose != [0, 2, 3, 1] { + fatalError("batch norm only accepts NHWC") + } + output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope) + bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope) + mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope) + scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope) + variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope) + epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs) + momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs) + } catch let error { + throw error + } + } + let input: Texture<P> + var output: Texture<P> + let bias: Tensor<P> + let mean: Tensor<P> + let scale: Tensor<P> + let variance: Tensor<P> + let epsilon: Float + let momentum: Float +} + +class BatchNormOp<P: PrecisionType>: Operator<BatchNormKernel<P>, BatchNormParam<P>>, Runable, Creator, InferShaperable { + typealias OpType = BatchNormOp<P> + + func inferShape() { + para.output.dim = para.input.dim + } + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + } +}
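For reference, the per-channel transform the batch-norm kernel applies is y = scale * (x - mean) / sqrt(variance + epsilon) + bias; a CPU sketch with one value per channel (shapes simplified; the Metal kernel reads NHWC texture slices):

import Foundation

func batchNormReference(x: [Float], mean: [Float], variance: [Float],
                        scale: [Float], bias: [Float], epsilon: Float) -> [Float] {
    precondition(x.count == mean.count) // one value per channel in this toy shape
    return (0..<x.count).map { c in
        scale[c] * (x[c] - mean[c]) / sqrt(variance[c] + epsilon) + bias[c]
    }
}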
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/BilinearInterpOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/BilinearInterpOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..8db64ac3a473fe59e7821f11abeb3437c337459d --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/BilinearInterpOp.swift @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class BilinearInterpParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope) + out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs) + out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs) + } catch let error { + throw error + } + if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) { + fatalError() + } + } + let input: Texture<P> + var output: Texture<P> + let out_h: Int + let out_w: Int +} + +class BilinearInterpOp<P: PrecisionType>: Operator<BilinearInterpKernel<P>, BilinearInterpParam<P>>, Runable, Creator, InferShaperable { + + typealias OpType = BilinearInterpOp<P> + + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) +// print(outputArray) + print(outputArray.strideArray()) + } + +}
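A CPU reference for what bilinear_interp computes on a single channel, assuming simple corner-aligned scaling (the Metal kernel may differ in edge handling):

func bilinearReference(src: [Float], srcH: Int, srcW: Int, dstH: Int, dstW: Int) -> [Float] {
    var dst = [Float](repeating: 0, count: dstH * dstW)
    let ratioH = dstH > 1 ? Float(srcH - 1) / Float(dstH - 1) : 0
    let ratioW = dstW > 1 ? Float(srcW - 1) / Float(dstW - 1) : 0
    for y in 0..<dstH {
        for x in 0..<dstW {
            let fy = ratioH * Float(y), fx = ratioW * Float(x)
            let y0 = Int(fy), x0 = Int(fx)
            let y1 = min(y0 + 1, srcH - 1), x1 = min(x0 + 1, srcW - 1)
            let ly = fy - Float(y0), lx = fx - Float(x0)
            let top = src[y0 * srcW + x0] * (1 - lx) + src[y0 * srcW + x1] * lx
            let bottom = src[y1 * srcW + x0] * (1 - lx) + src[y1 * srcW + x1] * lx
            dst[y * dstW + x] = top * (1 - ly) + bottom * ly
        }
    }
    return dst
}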
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..0e1d5f0c53128bbc2f0b5e94d2075eecdef0fcc6 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class BoxcoderParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + priorBox = try BoxcoderParam.getFirstTensor(key: "PriorBox", map: opDesc.inputs, from: inScope) + priorBoxVar = try BoxcoderParam.getFirstTensor(key: "PriorBoxVar", map: opDesc.inputs, from: inScope) + targetBox = try BoxcoderParam.getFirstTensor(key: "TargetBox", map: opDesc.inputs, from: inScope) + output = try BoxcoderParam.getFirstTensor(key: "OutputBox", map: opDesc.outputs, from: inScope) + codeType = try BoxcoderParam.getAttr(key: "code_type", attrs: opDesc.attrs) + boxNormalized = try BoxcoderParam.getAttr(key: "box_normalized", attrs: opDesc.attrs) + } catch let error { + throw error + } + assert(priorBox.tensorDim.cout() == 2) + assert(priorBoxVar.tensorDim.cout() == 2) + assert(targetBox.tensorDim.cout() == 3) + assert(output.tensorDim.cout() == 3) + assert(priorBox.transpose == [0, 1, 2, 3]) + assert(priorBoxVar.transpose == [0, 1, 2, 3]) + assert(targetBox.transpose == [0, 1, 2, 3]) + assert(codeType == "decode_center_size") // encode_center_size is not implemented + assert((targetBox.tensorDim.cout() == 3) && (targetBox.tensorDim[0] == 1)) // N must be 1 (only handle batch size = 1) + } + let priorBox: Texture<P> + let priorBoxVar: Texture<P> + let targetBox: Texture<P> + var output: Texture<P> + let codeType: String + let boxNormalized: Bool +} + +class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P>>, Runable, Creator, InferShaperable { + + typealias OpType = BoxcoderOp<P> + + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose) + let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose) + let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose) + let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(" prior box var ") + print(pbv.strideArray()) + print(" target box ") + print(tb.strideArray()) + print(" prior box ") + print(pb.strideArray()) + print(" output ") + print(out.strideArray()) + } + +}
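The only code type handled above, decode_center_size, turns a prior box plus predicted offsets back into a corner-form box; a scalar sketch of the standard SSD formula for one box (layout [xmin, ymin, xmax, ymax]; this mirrors the math, not the kernel's memory layout):

import Foundation

func decodeCenterSize(prior: [Float], variances: [Float], target: [Float]) -> [Float] {
    let pw = prior[2] - prior[0], ph = prior[3] - prior[1]
    let pcx = prior[0] + 0.5 * pw, pcy = prior[1] + 0.5 * ph
    let cx = variances[0] * target[0] * pw + pcx
    let cy = variances[1] * target[1] * ph + pcy
    let w = exp(variances[2] * target[2]) * pw
    let h = exp(variances[3] * target[3]) * ph
    return [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h]
}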
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/CNNMPSConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/CNNMPSConvOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..8ba74a1c31456d7cb6e9ad67974bc02055313958 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/CNNMPSConvOp.swift @@ -0,0 +1,75 @@ +// +// CNNConvAddBatchNormReluOp.swift +// paddle-mobile + +import Foundation + +class CNNMPSConvTestParam: TestParam { + var outputTexture: MTLTexture? + var metalParam: MetalConvParam + let filterPointer: UnsafeMutableRawPointer + let biasePointer: UnsafeMutablePointer<Float> + let filterSize: (width: Int, height: Int, channel: Int) + init(inMetalParam: MetalConvParam, inFilter: [Float], inBiase: [Float], inFilterSize: (width: Int, height: Int, channel: Int)) { + metalParam = inMetalParam + filterPointer = UnsafeMutableRawPointer.init(mutating: inFilter) + biasePointer = UnsafeMutablePointer.init(mutating: inBiase) + filterSize = inFilterSize + } +} + +@available(iOS 10.0, *) +class CNNMPSConvOp<P: PrecisionType>: Operator<CNNConvKernel<P>, CNNConvParam<P>>, Runable, Creator, InferShaperable, Fusion { + + typealias OpType = CNNMPSConvOp<P> + + required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws { + fatalError() + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + } + + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode-->Node.init(inType: gElementwiseAdd); + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gMPSCNNConvType + } + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..<strides.count { + let dilation: Int = Int(dilations[i]) + let filterSize: Int = filterDim[i + 1] + let inputSize: Int = inDims[i + 1] + let padding: Int = Int(paddings[i]) + let stride: Int = Int(strides[i]) + let dKernel = dilation * (filterSize - 1) + 1 + let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1 + outDim.append(outputSize) + } + outDim.append(filterDim[0]) + para.output.dim = Dim.init(inDim: outDim) + } +}
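The inferShape loop restored above computes the usual convolution output extent, out = (in + 2*pad - dilatedKernel) / stride + 1; a worked check with invented numbers:

func convOutSize(input: Int, padding: Int, kernel: Int, dilation: Int, stride: Int) -> Int {
    let dKernel = dilation * (kernel - 1) + 1 // e.g. 1*(3-1)+1 == 3
    return (input + 2 * padding - dKernel) / stride + 1
}

// 224x224 input, 3x3 kernel, pad 1, stride 2: (224 + 2 - 3)/2 + 1 == 112
assert(convOutSize(input: 224, padding: 1, kernel: 3, dilation: 1, stride: 2) == 112)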
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift new file mode 100644 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConcatParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + guard let xlist = opDesc.inputs["X"] else { + fatalError() + } + for x in xlist { + guard let variant = inScope[x], let v = variant as? Texture<P> else { + fatalError() + } + if transpose.count == 0 { + transpose = v.transpose + } + if v.transpose != transpose { + fatalError() + } + + input.append(v) + } + axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs) + output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error + } + } + var input: [Texture<P>] = [] + var output: Texture<P> + var transpose: [Int] = [] + let axis: Int +} + +class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Runable, Creator, InferShaperable { + + typealias OpType = ConcatOp<P> + + func inferShape() { + // let dim = para.input.reduce([0, 0]) {[$0[0] + $1.dim[0], $1.dim[1]]} + // para.output.dim = Dim.init(inDim: dim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + } + +}
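Concat insists every input texture share one transpose before joining along axis; the array-level equivalent for axis 1 of a 2-D shape shows why the non-concat dimensions must already agree:

// Concatenate two row-major 2-D arrays along axis 1: [rows, aCols] ++ [rows, bCols].
func concatAxis1(_ a: [[Float]], _ b: [[Float]]) -> [[Float]] {
    precondition(a.count == b.count) // non-concat dims must match
    return zip(a, b).map { $0 + $1 }
}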
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddAddPreluOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..e5bded65a1a8944d337fea65995af79cab580105 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddAddPreluOp.swift @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConvAddAddPreluParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try ConvAddAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddAddPreluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvAddAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs) + alpha = try ConvAddAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try ConvAddAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + y = try ConvAddAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } + } + + let input: Texture<P> + let y: Tensor<P> + let filter: Tensor<P> + let mode: String + let alpha: Tensor<P> + var output: Texture<P> + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int +} + +class ConvAddAddPreluOp<P: PrecisionType>: Operator<ConvAddAddPreluKernel<P>, ConvAddAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion { + typealias OpType = ConvAddAddPreluOp<P> + + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvAddAddPreluType + } + + static func needCheck() -> [(Int, String)] { + return [(2, "Y"), (2, "X")] + } + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..<strides.count { + let dilation: Int = Int(dilations[i]) + let filterSize: Int = filterDim[i + 1] + let inputSize: Int = inDims[i + 1] + let padding: Int = Int(paddings[i]) + let stride: Int = Int(strides[i]) + let dKernel = dilation * (filterSize - 1) + 1 + let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1 + outDim.append(outputSize) + } + outDim.append(filterDim[0]) + para.output.dim = Dim.init(inDim: outDim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } +}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift new file mode 100644 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConvAddBatchNormReluParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) + + groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs) + variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) + bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) + + scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) + mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) + y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } + } + + let input: Texture<P> + + let variance: Tensor<P> + let bias: Tensor<P> + let mean: Tensor<P> + let scale: Tensor<P> + let y: Tensor<P> + let filter: Tensor<P> + let epsilon: Float32 + var newScale: MTLBuffer? + var newBiase: MTLBuffer? + + var output: Texture<P> + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int +} + +class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKernel<P>, ConvAddBatchNormReluParam<P>>, Runable, Creator, InferShaperable, Fusion { + + typealias OpType = ConvAddBatchNormReluOp<P> + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..<strides.count { + let dilation: Int = Int(dilations[i]) + let filterSize: Int = filterDim[i + 1] + let inputSize: Int = inDims[i + 1] + let padding: Int = Int(paddings[i]) + let stride: Int = Int(strides[i]) + let dKernel = dilation * (filterSize - 1) + 1 + let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1 + outDim.append(outputSize) + } + outDim.append(filterDim[0]) + para.output.dim = Dim.init(inDim: outDim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) + --> Node.init(inType: gBatchNormType) + --> Node.init(inType: gReluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvAddBatchNormReluType + } + + func delogOutput() { + print(" conv add batchnorm relu output ") + print(para.output.toTensor().strideArray()) + // let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false) + // para.filter.logDataPointer(header: "filter data pointer: ") + // print("filter: \(para.filter)") + + // print("biase: \(para.y)") + // print("padding: \(para.paddings)") + // print("stride: \(para.stride)") + + // let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false) + // let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false) + // let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false) + + // let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false) + } +}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..5e184844d886beb19ac5ff297f8a270af8a076fa --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConvAddParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs) + + y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } + } + + let input: Texture<P> + let y: Tensor<P> + let filter: Tensor<P> + + var output: Texture<P> + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int +} + +class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion { + typealias OpType = ConvAddOp<P> + + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvAddType + } + + func inferShape() { + + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..<strides.count { + let dilation: Int = Int(dilations[i]) + let filterSize: Int = filterDim[i + 1] + let inputSize: Int = inDims[i + 1] + let padding: Int = Int(paddings[i]) + let stride: Int = Int(strides[i]) + let dKernel = dilation * (filterSize - 1) + 1 + let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1 + outDim.append(outputSize) + } + outDim.append(filterDim[0]) + para.output.dim = Dim.init(inDim: outDim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } +}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddPreluOp.swift new file mode 100644 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddPreluOp.swift +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConvAddPreluParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs) + alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } + } + + let input: Texture<P> + let y: Tensor<P> + let filter: Tensor<P> + let mode: String + let alpha: Tensor<P> + var output: Texture<P> + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int +} + +class ConvAddPreluOp<P: PrecisionType>: Operator<ConvAddPreluKernel<P>, ConvAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion { + typealias OpType = ConvAddPreluOp<P> + + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvAddPreluType + } + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..<strides.count { + let dilation: Int = Int(dilations[i]) + let filterSize: Int = filterDim[i + 1] + let inputSize: Int = inDims[i + 1] + let padding: Int = Int(paddings[i]) + let stride: Int = Int(strides[i]) + let dKernel = dilation * (filterSize - 1) + 1 + let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1 + outDim.append(outputSize) + } + outDim.append(filterDim[0]) + para.output.dim = Dim.init(inDim: outDim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } +}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift new file mode 100644 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConvBNReluParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try ConvBNReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvBNReluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvBNReluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvBNReluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvBNReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvBNReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + epsilon = try ConvBNReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) + + groups = try ConvBNReluParam.getAttr(key: "groups", attrs: opDesc.attrs) + variance = try ConvBNReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) + bias = try ConvBNReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) + scale = try ConvBNReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) + mean = try ConvBNReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } + } + + let input: Texture<P> + + let variance: Tensor<P> + let bias: Tensor<P> + let mean: Tensor<P> + let scale: Tensor<P> + let filter: Tensor<P> + let epsilon: Float32 + var newScale: MTLBuffer? + var newBiase: MTLBuffer? + + var output: Texture<P> + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int +} + +class ConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion { + typealias OpType = ConvBNReluOp<P> + + func inputs() -> [Variant] { + return [para.input, para.variance, para.bias, para.mean, para.scale, para.filter] + } + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..<strides.count { + let dilation: Int = Int(dilations[i]) + let filterSize: Int = filterDim[i + 1] + let inputSize: Int = inDims[i + 1] + let padding: Int = Int(paddings[i]) + let stride: Int = Int(strides[i]) + let dKernel = dilation * (filterSize - 1) + 1 + let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1 + outDim.append(outputSize) + } + outDim.append(filterDim[0]) + para.output.dim = Dim.init(inDim: outDim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gBatchNormType) + --> Node.init(inType: gReluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvBnReluType + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) + } + +}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..e82eb1f4753f0ebfdb5a949c85181a0ae52ea2da --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvOp.swift @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConvParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvParam.output(outputs: opDesc.outputs, from: inScope) + stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs) + + } catch let error { + throw error + } + } + + let input: Texture<P> + let filter: Tensor<P> + var output: Texture<P> + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int +} + +class ConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, Creator, InferShaperable { + typealias OpType = ConvOp<P> + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..<strides.count { + let dilation: Int = Int(dilations[i]) + let filterSize: Int = filterDim[i + 1] + let inputSize: Int = inDims[i + 1] + let padding: Int = Int(paddings[i]) + let stride: Int = Int(strides[i]) + let dKernel = dilation * (filterSize - 1) + 1 + let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1 + outDim.append(outputSize) + } + outDim.append(filterDim[0]) + para.output.dim = Dim.init(inDim: outDim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } +}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift new file mode 100644 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConvTransposeParam<P: PrecisionType>: ConvParam<P> { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + try super.init(opDesc: opDesc, inScope: inScope) + } catch let error { + throw error + } + } +} + +class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTransposeParam<P>>, Runable, Creator, InferShaperable { + + typealias OpType = ConvTransposeOp<P> + + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + + print(" \(type) output: ") + let padToFourDim = para.output.padToFourDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])) + print(output.strideArray()) + } else { + print(" not implement") + } + } +}
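The delog paths in these ops decode either an NCHW-ordered ([0, 1, 2, 3]) or NHWC-ordered ([0, 2, 3, 1]) texture; the flat-offset mapping being undone is the usual one:

// Flat offset of element (n, c, h, w) in an NHWC-laid-out buffer of the given shape.
func nhwcOffset(n: Int, c: Int, h: Int, w: Int,
                dims: (n: Int, c: Int, h: Int, w: Int)) -> Int {
    return ((n * dims.h + h) * dims.w + w) * dims.c + c
}

// For shape (1, 3, 2, 2): element (0, c:1, h:1, w:0) sits at ((0*2+1)*2+0)*3+1 == 7.
assert(nhwcOffset(n: 0, c: 1, h: 1, w: 0, dims: (1, 3, 2, 2)) == 7)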
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..ec76eecf1fc9736d9dff6a4cf0d69a314a9b1e0d --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class DepthConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, Creator, InferShaperable { + + typealias OpType = DepthConvOp<P> + + required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws { + do { + try super.init(device: device, opDesc: opDesc, inScope: inScope) + } catch let error { + throw error + } + } + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..<strides.count { + let dilation: Int = Int(dilations[i]) + let filterSize: Int = filterDim[i + 1] + let inputSize: Int = inDims[i + 1] + let padding: Int = Int(paddings[i]) + let stride: Int = Int(strides[i]) + let dKernel = dilation * (filterSize - 1) + 1 + let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1 + outDim.append(outputSize) + } + outDim.append(filterDim[0]) + para.output.dim = Dim.init(inDim: outDim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } +} + +class DwConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion { + typealias OpType = ConvBNReluOp<P> + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..<strides.count { + let dilation: Int = Int(dilations[i]) + let filterSize: Int = filterDim[i + 1] + let inputSize: Int = inDims[i + 1] + let padding: Int = Int(paddings[i]) + let stride: Int = Int(strides[i]) + let dKernel = dilation * (filterSize - 1) + 1 + let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1 + outDim.append(outputSize) + } + outDim.append(filterDim[0]) + para.output.dim = Dim.init(inDim: outDim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gDepthConvType) + _ = beginNode + --> Node.init(inType: gBatchNormType) + --> Node.init(inType: gReluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gDwConvBnReluType + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) + } +}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..ae040dd65f74fc222275bc579338107f2ea188fd --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift @@ -0,0 +1,98 @@ +///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. */ + +import Foundation + +class ElementwiseAddParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + inputX = try ElementwiseAddParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ElementwiseAddParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try ElementwiseAddParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } + do { + inputY = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch _ { + let tensorY: Tensor<P> = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) + let device = inputX.metalTexture!.device + inputY = Texture.init(device: device, inDim: tensorY.dim) + let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel())) + inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: computePrecision) + } + +// required init(device: MTLDevice, param: ElementwiseAddParam<P>) { +// param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) +// if computePrecision == .Float32 { +// super.init(device: device, inFunctionName: "elementwise_add") +// } else if computePrecision == .Float16 { +// super.init(device: device, inFunctionName: "elementwise_add_half") +// } else { +// fatalError() +// } +// } + + var offset = axis + if axis == -1 { + offset = inputX.tensorDim.cout() - inputY.tensorDim.cout() + } + for i in 0..<(inputY.tensorDim.cout()) { + assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i]) + } + } + + var inputX: Texture<P> + var inputY: Texture<P> + var output: Texture<P> + var axis: Int +} + +class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, ElementwiseAddParam<P>>, Runable, Creator, InferShaperable { + typealias OpType = ElementwiseAddOp<P> + + func inferShape() { +// para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output) + + let padToFourDim = para.output.padToFourDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + } else { + print(" not implement") + } + } +}
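The axis handling above lines Y's dims up against a trailing slice of X's dims (axis == -1 meaning "align to the tail"); the same check on plain arrays:

func broadcastOffset(xDims: [Int], yDims: [Int], axis: Int) -> Int {
    return axis == -1 ? xDims.count - yDims.count : axis
}

let xDims = [1, 3, 224, 224], yDims = [3]
let offset = broadcastOffset(xDims: xDims, yDims: yDims, axis: 1)
for i in 0..<yDims.count { assert(xDims[offset + i] == yDims[i]) } // channel dims line up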
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddPreluOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..333303e9bb7c1224ff50d69b5523edabe0fc81a6 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddPreluOp.swift @@ -0,0 +1,119 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ElementwiseAddPreluParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + alpha = try ElementwiseAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try ElementwiseAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + inputX = try ElementwiseAddPreluParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ElementwiseAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try ElementwiseAddPreluParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } + do { + inputY = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch _ { + let tensorY: Tensor<P> = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + let device = inputX.metalTexture!.device + inputY = Texture.init(device: device, inDim: tensorY.dim) + let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel())) + inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: computePrecision) + } + + // required init(device: MTLDevice, param: ElementwiseAddParam<P>) { + // param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) + // if computePrecision == .Float32 { + // super.init(device: device, inFunctionName: "elementwise_add") + // } else if computePrecision == .Float16 { + // super.init(device: device, inFunctionName: "elementwise_add_half") + // } else { + // fatalError() + // } + // } + + var offset = axis + if axis == -1 { + offset = inputX.tensorDim.cout() - inputY.tensorDim.cout() + } + for i in 0..<(inputY.tensorDim.cout()) { + assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i]) + } + } + + let mode: String + let alpha: Tensor<P> + var inputX: Texture<P> + var inputY: Texture<P> + var output: Texture<P> + var axis: Int +} + +class ElementwiseAddPreluOp<P: PrecisionType>: Operator<ElementwiseAddPreluKernel<P>, ElementwiseAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion { + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gElementwiseAddType) + _ = beginNode + --> Node.init(inType: gPreluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gElementwiseAddPreluType + } + + typealias OpType = ElementwiseAddPreluOp<P> + + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output) + + let padToFourDim = para.output.padToFourDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + } else { + print(" not implement") + } + } +}
<P>
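+ // Annotation (not part of the original patch): `input` below is a computed
+ // property, so the feed texture is looked up in the scope on every access,
+ // presumably because the caller installs a fresh InputTexture per prediction.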
+ var input: InputTexture { + return scope.input() as! InputTexture + } + let scope: Scope + + required init(opDesc: OpDesc, inScope: Scope) throws { + scope = inScope + do { + output = try FeedParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error + } + } + + typealias ParamPrecisionType = P +} + +class FeedOp: Operator, FeedParam
<P>
>, Runable, Creator, InferShaperable { + typealias OpType = FeedOp
<P>
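+ // Annotation (not part of the original patch): inferShape() below only logs; the
+ // commented-out line suggests the intended propagation was
+ // para.output.dim = para.input.expectDim, with shapes otherwise taken as-is from
+ // the converted model.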
+ + func inferShape() { + // print("feed input: \(para.input.expectDim)") + print("feed output: \(para.output.dim)") + // para.output.dim = + // para.output.dim = para.input.expectDim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + + // let resizeKernel = ResizeKernel
<P>
.init(device: device) + // let resizeParam = ResizeParam.init(input: para.input.mtlTexture, output: para.output.metalTexture, expectDim: para.input.expectDim) + // do { + // try resizeKernel.compute(commandBuffer: buffer, param: resizeParam) + // } catch let error { + // throw error + // } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) + } +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..ade5b09099b69f4784b33a3b108cfcfe1aa1ea7f --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/FetchOp.swift @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class FetchParam: OpParam{ + var output: FetchHolder + let input: Texture
<P>
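+ // Annotation (not part of the original patch): fetch moves the result off the
+ // GPU, so the output is a plain FetchHolder buffer rather than a texture; it is
+ // sized with input.numel() elements and tagged with the tensor dims so the CPU
+ // side can reinterpret the flat result.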
+ let scope: Scope + required init(opDesc: OpDesc, inScope: Scope) throws { + scope = inScope + do { + input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope) + output = FetchHolder.init(inCapacity: input.numel(), inDim: input.tensorDim.dims) + scope.setOutput(output: output) + } catch let error { + throw error + } + } + + typealias ParamPrecisionType = P +} + +class FetchKernel: Kernel, Computable { + + func compute(commandBuffer: MTLCommandBuffer, param: FetchParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: FetchParam
<P>
) { + param.output.initBuffer(device: device) + if computePrecision == .Float16 { + if param.input.transpose == [0, 2, 3, 1] { + super.init(device: device, inFunctionName: "fetch_half") + } else { +// fatalError(" not support ") + super.init(device: device, inFunctionName: "fetch_placeholder_half") + print(" not support ") + } + } else if computePrecision == .Float32 { + if param.input.transpose == [0, 2, 3, 1] { + super.init(device: device, inFunctionName: "fetch") + } else { + print(" not support ") + super.init(device: device, inFunctionName: "fetch_placeholder") +// fatalError(" not support ") + } + } else { + fatalError(" not support ") + } + } +} + +class FetchOp: Operator< FetchKernel
<P>
, FetchParam
<P>
>, Runable, Creator, InferShaperable { + + typealias OpType = FetchOp
<P>
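+ // Annotation (not part of the original patch): FetchKernel above uses the fast
+ // "fetch"/"fetch_half" functions only for inputs in [0, 2, 3, 1] transpose; other
+ // layouts fall back to the "fetch_placeholder" variants, which the authors mark
+ // as unsupported (the fatalError is commented out in favor of a printed warning).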
+ + func inferShape() { + print(para.input.dim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/FlattenOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/FlattenOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..4fc5f222932ce98c4bf3e29bdf6cd8c666f5f9f1 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/FlattenOp.swift @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class FlattenParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } + } + let input: Texture
<P>
+ var output: Texture
<P>
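+ // Annotation (not part of the original patch): in Paddle's flatten operator the
+ // input shape is collapsed into a 2-D matrix around `axis`: dims before `axis`
+ // form the rows and the remaining dims form the columns, e.g. [2, 3, 4, 5] with
+ // axis == 2 flattens to [6, 20].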
+ let axis: Int +} + + +class FlattenOp<P: PrecisionType>: Operator<FlattenKernel<P>, FlattenParam
<P>
>, Runable, Creator, InferShaperable{ + + typealias OpType = FlattenOp
<P>
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + } + +} + + + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..f58358761f820809685510fa4e9b5ff237567b3c --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Metal +import Foundation + +public protocol TestParam { +} + +public protocol Testable { + associatedtype TestParamType: TestParam + func test(commandBuffer: MTLCommandBuffer, param: TestParamType) + init(device: MTLDevice, testParam: TestParamType) +} + + +protocol Computable { + associatedtype ParamType: OpParam + func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws + init(device: MTLDevice, param: ParamType) +} + +protocol KernelProtocol { + var pipline: MTLComputePipelineState { get set } + var functionName: String { get set } + +} + +open class Kernel { + let pipline: MTLComputePipelineState + let functionName: String + public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = true) { + pipline = device.pipeLine(funcName: inFunctionName, inPaddleMobileLib: usePaddleMobileLib) + functionName = inFunctionName + } +} + +open class CusomKernel: Kernel { + public struct Shape { + public let width: Int + public let height: Int + public let channel: Int + public init(inWidth: Int, inHeight: Int, inChannel: Int){ + width = inWidth + height = inHeight + channel = inChannel + } + } + public let outputTexture: MTLTexture + public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) { + let textureDesc = MTLTextureDescriptor.init() + textureDesc.textureType = .type2D + textureDesc.width = outputDim.width + textureDesc.height = outputDim.height + textureDesc.depth = (outputDim.channel + 3) / 4 + + if computePrecision == .Float16 { + textureDesc.pixelFormat = .rgba16Float + } else if computePrecision == .Float32 { + textureDesc.pixelFormat = .rgba32Float + } else { + fatalError() + } + + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.storageMode = .shared + outputTexture = device.makeTexture(descriptor: textureDesc) ?! 
" make texture error " + + super.init(device: device, inFunctionName: inFunctionName, usePaddleMobileLib: usePaddleMobileLib) + } + + public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(inputTexuture, index: 0) + encoder.setTexture(outputTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: outputTexture) + encoder.endEncoding() + } + +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..dad8d0c6ac2e5a93273573473c700179f8b90a37 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class BatchNormKernel: Kernel, Computable { + required init(device: MTLDevice, param: BatchNormParam
<P>
) { + let count = param.variance.dim.numel() + let varianceP = param.variance.data.pointer + let meanP = param.mean.data.pointer + let scaleP = param.scale.data.pointer + let biasP = param.bias.data.pointer + for i in 0..) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBuffer(param.scale.buffer, offset: 0, index: 0) + encoder.setBuffer(param.bias.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormReluKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..fca5719553038732b1646fb8b15885bd03bd5624 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormReluKernel.swift @@ -0,0 +1,91 @@ +// +// BatchNormRelu.swift +// paddle-mobile +// +// Created by zhangxinjun on 2018/8/23. +// Copyright © 2018年 orange. All rights reserved. +// + +import Foundation + + +class BatchNormReluParam: BatchNormParam
<P>
{ + +} + +class BatchNormReluKernel: Kernel, Computable{ + + + typealias ParamType = BatchNormReluParam
<P>
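+ // Annotation (not part of the original patch): the init below folds batch norm
+ // into one per-channel affine pair so the shader only evaluates
+ // relu(newScale * x + newBias). A minimal sketch of the fold as a hypothetical
+ // standalone function (Float32 only, names invented for illustration):
+ //
+ //   func foldBatchNorm(scale: [Float], bias: [Float], mean: [Float],
+ //                      variance: [Float], epsilon: Float) -> ([Float], [Float]) {
+ //     var newScale = [Float](repeating: 0, count: scale.count)
+ //     var newBias = [Float](repeating: 0, count: scale.count)
+ //     for i in 0..<scale.count {
+ //       let invStd = 1 / (variance[i] + epsilon).squareRoot()
+ //       newScale[i] = scale[i] * invStd
+ //       newBias[i] = bias[i] - mean[i] * invStd * scale[i]
+ //     }
+ //     return (newScale, newBias)
+ //   }
+ //
+ // since scale * (x - mean) / sqrt(variance + epsilon) + bias rearranges to
+ // newScale * x + newBias.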
+ var newScale: MTLBuffer + var newBias: MTLBuffer + + required init(device: MTLDevice, testParam: BatchNormReluTestParam) { + + newScale = testParam.newScaleBuffer + newBias = testParam.newBiaseBuffer + + super.init(device: device, inFunctionName: "batch_norm_relu_3x3") + } + + required init(device: MTLDevice, param: BatchNormReluParam
<P>
) { + guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else { + fatalError() + } + guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else { + fatalError() + } + self.newScale = newScale + self.newBias = newBias + + super.init(device: device, inFunctionName: "batch_norm_relu_3x3") + + + let varianceBuffer : MTLBuffer = param.inputVariance.buffer + + var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length) + let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self) + for i in 0..<(varianceBuffer.length / MemoryLayout
<P>
.stride) { + invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot() + } + + let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self) + let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self) + let scale : MTLBuffer = param.inputScale.buffer + let scaleContents = scale.contents().assumingMemoryBound(to: P.self) + let bias : MTLBuffer = param.inputBias.buffer + let biasContents = bias.contents().assumingMemoryBound(to: P.self) + let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0..<(newScale.length / MemoryLayout
<P>
.stride) { + newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i])) + newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i])) + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: BatchNormReluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + encoder.setTexture(param.input as? MTLTexture, index: 0) + encoder.setTexture(param.output as? MTLTexture, index: 1) + encoder.setBuffer(newScale, offset: 0, index: 0) + encoder.setBuffer(newBias, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output as! MTLTexture) + encoder.endEncoding() + } + + func testCompute(commandBuffer: MTLCommandBuffer, testParam: BatchNormReluTestParam) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + encoder.setTexture(testParam.inputTexture, index: 0) + encoder.setTexture(testParam.outputTexture, index: 1) + encoder.setBuffer(newScale, offset: 0, index: 0) + encoder.setBuffer(newBias, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture) + encoder.endEncoding() + } + + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BilinearInterpKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BilinearInterpKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..7f3e7433760cc1fa4d093b08027bce7c79172532 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BilinearInterpKernel.swift @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct BilinearInterpMetalParam { + var ratio_h: Float32 + var ratio_w: Float32 +} + +class BilinearInterpKernel<P: PrecisionType>: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: BilinearInterpParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + var ratio_h: Float32 = 0 + var ratio_w: Float32 = 0 + if param.output.tensorDim.dims[2] > 1 { + ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1) + } + if param.output.tensorDim.dims[3] > 1 { + ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1) + } + var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) + encoder.setBytes(&p, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: BilinearInterpParam
<P>
) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "bilinear_interp_float") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "bilinear_interp_half") + } else { + fatalError() + } + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..c084d9b28e1dc7019a14d3ae317ddf8a64547830 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct BoxcoderMetalParam { +} + +class BoxcoderKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.priorBox.metalTexture, index: 0) + encoder.setTexture(param.priorBoxVar.metalTexture, index: 1) + encoder.setTexture(param.targetBox.metalTexture, index: 2) + encoder.setTexture(param.output.metalTexture, index: 3) + var bmp = BoxcoderMetalParam.init() + encoder.setBytes(&bmp, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: BoxcoderParam
<P>
) { + param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "boxcoder_float") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "boxcoder_half") + } else { + fatalError() + } + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/CNNConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/CNNConvKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..14a5bd521455632c8a67e4c1a8ebdedc6c460aa5 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/CNNConvKernel.swift @@ -0,0 +1,176 @@ +// +// CNNConvKernel.swift +// paddle-mobile +// + +import Foundation +import Metal +import Accelerate +import MetalPerformanceShaders + +@available(iOS 10.0, *) +class WeightsDataSource: NSObject, MPSCNNConvolutionDataSource { + + let desc: MPSCNNConvolutionDescriptor + let weight:UnsafeMutableRawPointer + let bias:UnsafeMutablePointer<Float> + + + + init(inDesc: MPSCNNConvolutionDescriptor, inWeight: UnsafeMutableRawPointer, inBias: UnsafeMutablePointer<Float>) { + desc = inDesc + weight = inWeight + bias = inBias + } + + + func dataType() -> MPSDataType { + return .float32 + } + + func descriptor() -> MPSCNNConvolutionDescriptor { + return desc + } + + func weights() -> UnsafeMutableRawPointer { + return self.weight + } + + func biasTerms() -> UnsafeMutablePointer<Float>? { + return self.bias + } + + func load() -> Bool { + return true + } + + func purge() { + } + + func label() -> String? { + return "Conv" + } + + +} + +@available(iOS 10.0, *) +class CNNConvParam<P: PrecisionType>: OpParam{ + + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + filter = try CNNConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try CNNConvParam.input(inputs: opDesc.inputs, from: inScope) + output = try CNNConvParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try CNNConvParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try CNNConvParam.getAttr(key: "paddings", attrs: opDesc.attrs) + // Not a concern for now + dilations = try CNNConvParam.getAttr(key: "dilations", attrs: opDesc.attrs) + // Not a concern for now + groups = try CNNConvParam.getAttr(key: "groups", attrs: opDesc.attrs) + + variance = try CNNConvParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) + // bias + y = try CNNConvParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } + } + + var input: Texture
<P>
+ let variance: Tensor + let y: Tensor + let filter: Tensor + var output: Texture
<P>
+ let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int +} + +@available(iOS 10.0, *) +class CNNConvKernel: Kernel, Computable { + + typealias ParamType = CNNConvParam
<P>
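+ // Annotation (not part of the original patch): unlike the hand-written Metal
+ // kernels in this patch, this op delegates to MetalPerformanceShaders:
+ // WeightsDataSource hands the filter and bias pointers to MPSCNNConvolution,
+ // whose data-source initializer requires iOS 11 (hence the #available checks
+ // with an empty "Fallback on earlier versions" branch below).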
+ + var mpsImageCreator: MpsImageCreator
<P>
? + var activation:MPSCNNNeuron? + var conv:MPSCNNConvolution? + var weightDataSource:WeightsDataSource? + var param: CNNConvParam
<P>
? + var device: MTLDevice? + + + required init(device:MTLDevice, testParam:CNNMPSConvTestParam) { + self.device = device + + let desc = MPSCNNConvolutionDescriptor(kernelWidth: testParam.filterSize.width, kernelHeight: testParam.filterSize.height, inputFeatureChannels: testParam.filterSize.channel, outputFeatureChannels: testParam.filterSize.channel, neuronFilter: activation) + + desc.strideInPixelsX = Int(testParam.metalParam.offsetX) + desc.strideInPixelsY = Int(testParam.metalParam.offsetY) + + + weightDataSource = WeightsDataSource(inDesc: desc, inWeight:testParam.filterPointer, inBias:testParam.biasePointer) + + if #available(iOS 11.0, *) { + conv = MPSCNNConvolution(device: self.device!, weights: weightDataSource!) + } else { + // Fallback on earlier versions + } + + super.init(device: device, inFunctionName: "") + } + + required init(device:MTLDevice, param:CNNConvParam
<P>
) { + + self.device = device + + let inChannels: Int + let outChannels: Int + + if param.y.dim.cout() == 4 { + inChannels = (param.y.dim[3]) + outChannels = inChannels + } else { + inChannels = 0 + outChannels = inChannels + } + + let desc = MPSCNNConvolutionDescriptor(kernelWidth: param.filter.width, kernelHeight: param.filter.height, inputFeatureChannels: inChannels, outputFeatureChannels: outChannels, neuronFilter: activation) + + desc.strideInPixelsX = Int(param.stride[0]) + desc.strideInPixelsY = Int(param.stride[1]) + + + weightDataSource = WeightsDataSource(inDesc: desc, inWeight:param.filter.data.pointer as! UnsafeMutablePointer, inBias: param.y.data.pointer as! UnsafeMutablePointer) + + if #available(iOS 11.0, *) { + conv = MPSCNNConvolution(device: self.device!, weights: weightDataSource!) + } else { + // Fallback on earlier versions + } + + super.init(device: device, inFunctionName: "") + } + + func compute(commandBuffer: MTLCommandBuffer, param: CNNConvParam
<P>
) throws { + let inputImage:MPSImage = (mpsImageCreator?.createMPSImage(device: device!))! + var outputImage = (mpsImageCreator?.createMPSImage(device: device!))! + + // Performs the conv and add steps; add takes the bias as its parameter and is invoked through the Metal API + conv?.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage) + + param.input = outputImage.texture as! Texture
<P>
+ } + + func testCompute(commandBuffer: MTLCommandBuffer, testParam: CNNMPSConvTestParam) throws { + let inputImage:MPSImage = (mpsImageCreator?.createMPSImage(device: device!))! + var outputImage = (mpsImageCreator?.createMPSImage(device: device!))! + + // Performs the conv and add steps; add takes the bias as its parameter and is invoked through the Metal API + conv?.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage) + + testParam.outputTexture = outputImage.texture + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Concat.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Concat.swift new file mode 100644 index 0000000000000000000000000000000000000000..25f0a21bfff420566d06a59dca626805dd0ce6e0 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Concat.swift @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConcatKernel<P: PrecisionType>: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: ConcatParam
<P>
) { + super.init(device: device, inFunctionName: "concat") + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..81ef46c0b3e919615d07f667851007e95b02d54f --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift @@ -0,0 +1,147 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct ConcatTestParam: TestParam { + var input: [MTLTexture] + var output: MTLTexture + var dims: [[Int]] + var axis: Int + var odim: [Int] +} + +struct ConcatMetalParam { + var odim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) + var axis: Int32 = 0 + var offset: Int32 = 0 + var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var vdim: (Int32, Int32, Int32, Int32, Int32, Int32) = (0, 0, 0, 0, 0, 0) +} + +class ConcatKernel: Kernel, Computable{ + var v = "normal" + var pm = ConcatMetalParam.init() + func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam
<P>
) throws { + + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + let num = param.input.count + for i in 0...size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: ConcatParam
<P>
) { + param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision) + let orank = param.output.tensorDim.cout() + let num = param.input.count + assert(num <= 6) + var axis = 4 - param.output.tensorDim.cout() + param.axis + for i in 0..<4 { + if param.transpose[i] == axis { + axis = i + break + } + } + pm.axis = Int32(axis) + pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3])) + pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3])) + var vdim: [Int] = [0, 0, 0, 0, 0, 0] + for i in 0..: Kernel, Computable { + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvAddAddPreluParam
<P>
) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.y.initBuffer(device: device, precision: computePrecision) + param.alpha.initBuffer(device: device, precision: computePrecision) + + if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half") + } + + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half") + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half") + } + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half") + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half") + } + } else { + fatalError(" unsupport yet ") + } + } else if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float") + } + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float") + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float") + 
} + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float") + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float") + } + } else { + fatalError(" unsupport yet ") + } + } else { + fatalError() + } + + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) + + let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + + // print(" function: \(functionName)") + // print("offset x: \(offsetX)") + // print("offset y: \(offsetY)") + + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + // print("metal param: ") + // print(inMetalParam) + + metalParam = inMetalParam + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddAddPreluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..66324dd47086fd7c1ccffb674c0f8b8623416e0d --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -0,0 +1,179 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct ConvAddBatchNormReluTestParam: TestParam { + let inputTexture: MTLTexture + let outputTexture: MTLTexture + var metalParam: MetalConvParam + let filterBuffer: MTLBuffer + let biaseBuffer: MTLBuffer + let newScaleBuffer: MTLBuffer + let newBiaseBuffer: MTLBuffer + let filterSize: (width: Int, height: Int, channel: Int) + init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { + inputTexture = inInputTexture + outputTexture = inOutputTexture + metalParam = inMetalParam + filterBuffer = inFilterBuffer + biaseBuffer = inBiaseBuffer + newScaleBuffer = inNewScaleBuffer + newBiaseBuffer = inNewBiaseBuffer + filterSize = inFilterSize + } +} + +class ConvAddBatchNormReluKernel: Kernel, Computable, Testable { + required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam) { + if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") + } else if testParam.filterSize.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") + } else { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") + } + } + + var metalParam: MetalConvParam! + + required init(device: MTLDevice, param: ConvAddBatchNormReluParam
<P>
) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.y.initBuffer(device: device, precision: computePrecision) + param.variance.initBuffer(device: device, precision: .Float32) + param.mean.initBuffer(device: device, precision: .Float32) + param.scale.initBuffer(device: device, precision: .Float32) + param.bias.initBuffer(device: device, precision: .Float32) + + if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") + } else { + fatalError(" unsupport ") + } + } else if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half") + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } + + let offsetX = param.filter.width/2 - Int(param.paddings[0]) + let offsetY = param.filter.height/2 - Int(param.paddings[1]) + + print("offset x: \(offsetX)") + print("offset y: \(offsetY)") + + let offsetZ = 0.0 + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + + var invs: [P] = [] + let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0...stride { + let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) + invs.append(P(inv)) + } + + let newScale: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.scale.buffer.length) + let newBiase: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.bias.buffer.length) + + let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) + let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) + let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) + for i in 0..<param.scale.buffer.length / MemoryLayout<P>.stride { + newScale[i] = invs[i] * scaleContents[i] + newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] + } + +// var newScaleFP16: UnsafeMutableRawPointer +// +// float32ToFloat16(input: newScale as! UnsafeMutablePointer<Float32>, output: newScaleFP16, count: param.scale.buffer.length / MemoryLayout
<P>
.size) + + +// let newBiaseFloat16 = device.makeBuffer(length: <#T##Int#>, options: <#T##MTLResourceOptions#>) + + var newBiaseBuffer: MTLBuffer + var newScaleBuffer: MTLBuffer + + if computePrecision == .Float32 { + newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! + newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! + } else if computePrecision == .Float16 { + + newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + + float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout
<P>
.size) + + float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout
<P>
.size) + } else { + fatalError(" unsupport ") + } + + param.newBiase = newBiaseBuffer + param.newScale = newScaleBuffer + + newScale.deinitialize(count: param.scale.buffer.length) + newScale.deallocate() + + newBiase.deinitialize(count: param.bias.buffer.length) + newBiase.deallocate() + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.newScale!, offset: 0, index: 3) + encoder.setBuffer(param.newBiase!, offset: 0, index: 4) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + + encoder.setTexture(param.inputTexture, index: 0) + encoder.setTexture(param.outputTexture, index: 1) + var inMetalParam = param.metalParam + encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) + encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2) + encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3) + encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4) + encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..d5aa98d2606ceda5cbcf0f3f4c1fc0ed2adeed25 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConvAddKernel: Kernel, Computable { + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvAddParam
<P>
) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.y.initBuffer(device: device, precision: computePrecision) + + if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x1_half") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_3x3_half") + } else if param.filter.width == 1 && param.filter.height == 5 { + super.init(device: device, inFunctionName: "conv_add_5x1_half") + } else if param.filter.width == 5 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x5_half") + } else { + fatalError(" unsupport yet ") + } + } else if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x1") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3") + } else if param.filter.width == 1 && param.filter.height == 5 { + super.init(device: device, inFunctionName: "conv_add_5x1") + } else if param.filter.width == 5 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x5") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_3x3") + } else { + fatalError(" unsupport yet ") + } + } else { + fatalError() + } + + + + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) + + let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + +// print(" function: \(functionName)") +// print("offset x: \(offsetX)") +// print("offset y: \(offsetY)") + + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) +// print("metal param: ") +// print(inMetalParam) + + metalParam = inMetalParam + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddPreluKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..35d49953c656364799e8ca7400ef4bac445200a0 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddPreluKernel.swift @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ConvAddPreluKernel: Kernel, Computable { + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvAddPreluParam
<P>
) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.y.initBuffer(device: device, precision: computePrecision) + param.alpha.initBuffer(device: device, precision: computePrecision) + + if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half") + } + + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half") + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half") + } + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half") + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half") + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_half") + } + } else { + fatalError(" unsupport yet ") + } + } else if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float") + } + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float") + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float") + 
} + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float") + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float") + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float") + } + } else { + fatalError(" unsupport yet ") + } + } else { + fatalError() + } + + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) + + let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + + // print(" function: \(functionName)") + // print("offset x: \(offsetX)") + // print("offset y: \(offsetY)") + + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + // print("metal param: ") + // print(inMetalParam) + + metalParam = inMetalParam + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..e79f8f9be37c2575b28aef2e9169ab814c9587fe --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift @@ -0,0 +1,180 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation +import MetalPerformanceShaders + +struct ConvBNReluTestParam: TestParam { + let inputTexture: MTLTexture + let outputTexture: MTLTexture + var metalParam: MetalConvParam + let filterBuffer: MTLBuffer + let biaseBuffer: MTLBuffer + let newScaleBuffer: MTLBuffer + let newBiaseBuffer: MTLBuffer + let filterSize: (width: Int, height: Int, channel: Int) + init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { + + inputTexture = inInputTexture + outputTexture = inOutputTexture + metalParam = inMetalParam + filterBuffer = inFilterBuffer + biaseBuffer = inBiaseBuffer + newScaleBuffer = inNewScaleBuffer + newBiaseBuffer = inNewBiaseBuffer + filterSize = inFilterSize + } +} + +class ConvBNReluKernel: Kernel, Computable, Testable { + required init(device: MTLDevice, testParam: ConvBNReluTestParam) { + if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1") + } else if testParam.filterSize.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3") + } else { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3") + } + } + + var metalParam: MetalConvParam! + + required init(device: MTLDevice, param: ConvBNReluParam
<P>
) { + + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.variance.initBuffer(device: device, precision: .Float32) + param.mean.initBuffer(device: device, precision: .Float32) + param.scale.initBuffer(device: device, precision: .Float32) + param.bias.initBuffer(device: device, precision: .Float32) + + if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3") + } else { + fatalError(" unsupport ") + } + } else if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half") + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } + + + + let offsetX = param.filter.width/2 - Int(param.paddings[0]) + let offsetY = param.filter.height/2 - Int(param.paddings[1]) + +// print(" param filter width: \(param.filter.width)") +// print(" param filter height: \(param.filter.height)") +// +// print(" param paddings: \(param.paddings)") +// +// print("ConvBNReluKernel offset x: \(offsetX)") +// print("ConvBNReluKernel offset y: \(offsetY)") + + let offsetZ = 0.0 + + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + + var invs: [P] = [] + let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0...stride { + let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) + invs.append(P(inv)) + } + + let newScale: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.scale.buffer.length) + let newBiase: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.bias.buffer.length) + + let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) + let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) + let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) + for i in 0...stride { + newScale[i] = invs[i] * scaleContents[i] + newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] + } + + var newBiaseBuffer: MTLBuffer + var newScaleBuffer: MTLBuffer + + if computePrecision == .Float32 { + newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! + newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! + } else if computePrecision == .Float16 { + + newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + + float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout
<P>
.size) + + float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout
<P>
.size) + } else { + fatalError(" unsupport ") + } + + param.newBiase = newBiaseBuffer + param.newScale = newScaleBuffer + + newScale.deinitialize(count: param.scale.buffer.length) + newScale.deallocate() + + newBiase.deinitialize(count: param.bias.buffer.length) + newBiase.deallocate() + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam
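// Batch norm is folded into a per-channel affine at load time:
// inv = 1 / sqrt(variance + eps), newScale = scale * inv,
// newBias = bias - mean * scale * inv, so the fused shader only evaluates
// max(conv(x) * newScale + newBias, 0). A Float32-only sketch of that folding
// (helper name and array types are illustrative):
func foldBatchNorm(scale: [Float], bias: [Float], mean: [Float],
                   variance: [Float], epsilon: Float) -> (scale: [Float], bias: [Float]) {
    var newScale = [Float](repeating: 0, count: scale.count)
    var newBias = [Float](repeating: 0, count: scale.count)
    for i in 0..<scale.count {
        let inv = 1 / (variance[i] + epsilon).squareRoot()
        newScale[i] = scale[i] * inv
        newBias[i] = bias[i] - mean[i] * scale[i] * inv
    }
    return (newScale, newBias)
}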
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.newScale!, offset: 0, index: 2) + encoder.setBuffer(param.newBiase!, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + + encoder.setTexture(param.inputTexture, index: 0) + encoder.setTexture(param.outputTexture, index: 1) + var inMetalParam = param.metalParam + encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) + encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 2) + encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..345136a503d8eda6ad23f85ef01eb53fa539d453 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +public struct MetalConvParam { + let offsetX: Int16 + let offsetY: Int16 + let offsetZ: Int16 + let strideX: UInt16 + let strideY: UInt16 + let dilationX: UInt16 + let dilationY: UInt16 +} + +class ConvKernel: Kernel, Computable { + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvParam
<P>
) { + param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_1x1") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_3x3") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_3x3") + } else { + fatalError(" unsupport ") + } + + let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) + let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) + let offsetZ = 0.0 + + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..435776c850854f2fc4259e8a2089299da825f463 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift @@ -0,0 +1,83 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct MetalConvTransposeParam { + let kernelW: UInt16; + let kernelH: UInt16; + + let strideX: UInt16; + let strideY: UInt16; + + let paddingX: UInt16; + let paddingY: UInt16; + + let dilationX: UInt16; + let dilationY: UInt16; +} + +class ConvTransposeKernel: Kernel, Computable{ + var metalParam: MetalConvTransposeParam! + required init(device: MTLDevice, param: ConvTransposeParam
<P>
) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision, convertToNHWC: false, withTranspose: true) + if computePrecision == .Float32 { + if param.stride == [2, 2] && param.stride == [2, 2] { + super.init(device: device, inFunctionName: "conv_transpose2x2_stride2") + } else { + fatalError(" -- conv transpose unsupported yet -- ") + } + } else if computePrecision == .Float16 { + if param.stride == [2, 2] && param.stride == [2, 2] { + super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half") + } else { + fatalError(" -- conv transpose unsupported yet -- ") + } + } else { + fatalError() + } + +// let filter: [Float32] = param.filter.buffer.array() +// print(" conv transpose filter") +// print(filter) + let kernelWidth = UInt16(param.filter.width) + let kernelHeight = UInt16(param.filter.height) + + let strideX = UInt16(param.stride[0]) + let strideY = UInt16(param.stride[1]) + let paddingX = UInt16(param.paddings[0]) + let paddingY = UInt16(param.paddings[1]) + let dilationX = UInt16(param.dilations[0]) + let dilationY = UInt16(param.dilations[1]) + + metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY) + + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam
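// For reference, a transposed convolution with these parameters produces, per
// spatial axis, out = (in - 1) * stride - 2 * padding + dilation * (kernel - 1) + 1;
// the 2x2/stride-2 specialisation therefore exactly doubles width and height when
// padding is 0. Sketch (hypothetical helper):
func convTransposeOutExtent(inExtent: Int, kernel: Int, stride: Int,
                            padding: Int, dilation: Int) -> Int {
    return (inExtent - 1) * stride - 2 * padding + dilation * (kernel - 1) + 1
}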
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..16774a85492d2e21ca5575ed661674824319db28 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct ElementwiseAddMetalParam { + var fast: Int32 = 0 + var axis: Int32 = 0 + var ylen: Int32 = 0 + var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) + var xtrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) + var ytrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) +} + +class ElementwiseAddKernel: Kernel, Computable { + var metalParam: ElementwiseAddMetalParam + required init(device: MTLDevice, param: ElementwiseAddParam
<P>
) { + param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) + + metalParam = ElementwiseAddMetalParam.init() + + let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } + let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } + let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } + let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } + + metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) + metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) + metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) + metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) + if param.axis == -1 { + metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) + } else { + metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) + } + metalParam.ylen = Int32(param.inputY.tensorDim.cout()) + if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { + // print("===> elementwise_add fast!!!") + metalParam.fast = 1 + } + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "elementwise_add") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "elementwise_add_half") + } else { + fatalError() + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam
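// Both inputs are padded to 4-D, so the broadcast axis is rebased before it is
// handed to the shader: axis == -1 aligns y with the trailing dims of x
// (4 - rank(y)); otherwise the explicit axis shifts by 4 - rank(x). Sketch
// (hypothetical helper):
func rebasedBroadcastAxis(axis: Int, xRank: Int, yRank: Int) -> Int {
    return axis == -1 ? 4 - yRank : 4 - xRank + axis
}
// e.g. x [N, C, H, W], y [C], axis 1 -> 4 - 4 + 1 = 1, so y broadcasts along C.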
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.inputX.metalTexture, index: 0) + encoder.setTexture(param.inputY.metalTexture, index: 1) + encoder.setTexture(param.output.metalTexture, index: 2) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddPreluKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..91589864b07f10754c860d038e754e09874db54e --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddPreluKernel.swift @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + + +class ElementwiseAddPreluKernel: Kernel, Computable { + var metalParam: ElementwiseAddMetalParam + required init(device: MTLDevice, param: ElementwiseAddPreluParam
<P>
) { + param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) + param.alpha.initBuffer(device: device, precision: computePrecision) + + metalParam = ElementwiseAddMetalParam.init() + + let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } + let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } + let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } + let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } + + metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) + metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) + metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) + metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) + if param.axis == -1 { + metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) + } else { + metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) + } + metalParam.ylen = Int32(param.inputY.tensorDim.cout()) + if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { + // print("===> elementwise_add fast!!!") + metalParam.fast = 1 + } + + if computePrecision == .Float32 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "elementwise_add_channel_float") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "elementwise_add_element_float") + } else { + super.init(device: device, inFunctionName: "elementwise_add_prelu_float") + } + } else if computePrecision == .Float16 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "elementwise_add_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "elementwise_add_channel_half") + } else { + super.init(device: device, inFunctionName: "elementwise_add_channel_half") + } + } else { + fatalError() + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddPreluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.inputX.metalTexture, index: 0) + encoder.setTexture(param.inputY.metalTexture, index: 1) + encoder.setTexture(param.output.metalTexture, index: 2) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/FlattenKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/FlattenKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..090c55b16160dca19bfcdc4f3467cacdbc9a20c2 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/FlattenKernel.swift @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct FlattenMetalParam { + var idim: (Int32, Int32, Int32, Int32) + var itrans: (Int32, Int32, Int32, Int32) + var odim: (Int32, Int32, Int32, Int32) + var otrans: (Int32, Int32, Int32, Int32) +} + + +class FlattenKernel: Kernel, Computable{ + + var metalParam: FlattenMetalParam + + required init(device: MTLDevice, param: FlattenParam
<P>
) { + param.output.initTexture(device: device, computePrecision: computePrecision) + var id: [Int32] = [1, 1, 1, 1] + for i in 0..) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..3f78efb89e47197ae0af6a1bb53955bc4a937eda --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class MulticlassNMSKernel: Kernel, Computable{ + let pipline1: MTLComputePipelineState + + required init(device: MTLDevice, param: MulticlassNMSParam
<P>
) { + + param.middleOutput.initBuffer(device: device) + param.bboxOutput.initBuffer(device: device) + if computePrecision == .Float32 { + pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", inPaddleMobileLib: true) + super.init(device: device, inFunctionName: "nms_fetch_result") + } else if computePrecision == .Float16 { + pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", inPaddleMobileLib: true) + super.init(device: device, inFunctionName: "nms_fetch_result_half") + } else { + fatalError( " unsupport precision " ) + } + + } + + func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.scores.metalTexture, index: 0) + encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture) + encoder.endEncoding() + + guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoderBox.setTexture(param.bboxes.metalTexture, index: 0) + encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0) + encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture) + encoderBox.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..1d66e420e236f2e0a7734838a293215807caa968 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct PoolMetalParam { + let ksizeX: Int32 + let ksizeY: Int32 + let strideX: Int32 + let strideY: Int32 + let paddingX: Int32 + let paddingY: Int32 + let poolType: Int32 +} + +class PoolKernel: Kernel, Computable{ + var metalParam: PoolMetalParam + required init(device: MTLDevice, param: PoolParam
<P>
) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + + var poolType: Int32 + switch param.poolType { + case "max": + poolType = 0 + case "avg": + poolType = 1 + default: + fatalError() + } + metalParam = PoolMetalParam.init( + ksizeX: param.ksize[0], + ksizeY: param.ksize[1], + strideX: param.stride[0], + strideY: param.stride[1], + paddingX: param.padding[0], + paddingY: param.padding[1], + poolType: poolType + ) + + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "pool") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "pool_half") + } else { + fatalError() + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: PoolParam
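// poolType selects the reduction: 0 = max, 1 = avg. For reference, the usual
// floor-mode output extent per axis is (in + 2 * padding - ksize) / stride + 1
// (ceil-mode pooling rounds up instead). Sketch (hypothetical helper):
func poolOutExtent(inExtent: Int, ksize: Int, stride: Int, padding: Int) -> Int {
    return (inExtent + 2 * padding - ksize) / stride + 1
}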
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..4ee25888f06048bfe696028ea2338a56fd06053e --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class PreluKernel: Kernel, Computable{ + required init(device: MTLDevice, param: PreluParam
<P>
) { + param.alpha.initBuffer(device: device, precision: computePrecision) + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "prelu_channel") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "prelu_element") + } else { + super.init(device: device, inFunctionName: "prelu_other") + } + } else if computePrecision == .Float16 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "prelu_element_half") + } else { + super.init(device: device, inFunctionName: "prelu_other_half") + } + } else { + fatalError() + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: PreluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..be18c4411ffbef704dff61bb2aa82bc338daf163 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct PriorBoxMetalParam { + let offset: Float32 + let stepWidth: Float32 + let stepHeight: Float32 + let minSize: Float32 + let maxSize: Float32 + let imageWidth: Float32 + let imageHeight: Float32 + let clip: Bool + let numPriors: uint + let aspecRatiosSize: uint + let minSizeSize: uint + let maxSizeSize: uint +} + +class PriorBoxKernel: Kernel, Computable{ + var metalParam: PriorBoxMetalParam! + + required init(device: MTLDevice, param: PriorBoxParam
<P>
) { + + let originDim = param.output.tensorDim; + + param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]]) + param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]]) + + param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: computePrecision) + param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision) + + + if computePrecision == .Float32 { + if param.min_max_aspect_ratios_order { + super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder") + } else { + super.init(device: device, inFunctionName: "prior_box") + } + + } else if computePrecision == .Float16 { + if param.min_max_aspect_ratios_order { + super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half") + } else { + super.init(device: device, inFunctionName: "prior_box_half") + } + } else { + fatalError() + } + + + guard param.minSizes.count == 1 else { + fatalError(" need implement ") + } + +// let n = 1 +// let h = param.output.dim[1] +// let w = param.output.dim[2] +// let c = param.output.dim[3] * param.output.dim[0] +// +// param.output.dim = Dim.init(inDim: [n, h, w, c]) +// param.output.transpose = [0, 1, 2, 3] + + let imageWidth = Float32(param.inputImage.padToFourDim[3]) + let imageHeight = Float32(param.inputImage.padToFourDim[2]) + + let featureWidth = param.input.padToFourDim[3] + let featureHeight = param.input.padToFourDim[2] + + if param.stepW == 0 || param.stepH == 0 { + param.stepW = Float32(imageWidth) / Float32(featureWidth) + param.stepH = Float32(imageHeight) / Float32(featureHeight) + } + + var outputAspectRatior: [Float32] = [] + outputAspectRatior.append(1.0) + + let epsilon = 1e-6 + for ar in param.aspectRatios { + var alreadyExist = false + for outputAr in outputAspectRatior { + if fabs(Double(ar) - Double(outputAr)) < Double(epsilon) { + alreadyExist = true + break + } + } + + if !alreadyExist { + outputAspectRatior.append(ar) + } + if param.flip { + outputAspectRatior.append(1.0 / ar) + } + } + + if computePrecision == .Float16 { + let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout.size) + float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count) + param.newAspectRatios = buffer + + } else if computePrecision == .Float32 { + let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout.size, options: []) + param.newAspectRatios = buffer + } else { + fatalError() + } + + let aspectRatiosSize = uint(outputAspectRatior.count) + + let maxSizeSize: uint = uint(param.maxSizes.count) + let minSizeSize: uint = uint(param.minSizes.count) + + let numPriors = aspectRatiosSize * minSizeSize + maxSizeSize + + let minSize = param.minSizes.last ?? 0.0 + let maxSize = param.maxSizes.last ?? 0.0 + + metalParam = PriorBoxMetalParam.init(offset: param.offset, stepWidth: param.stepW, stepHeight: param.stepH, minSize: minSize, maxSize: maxSize, imageWidth: imageWidth, imageHeight: imageHeight, clip: param.clip, numPriors: numPriors, aspecRatiosSize: aspectRatiosSize, minSizeSize: minSizeSize, maxSizeSize: maxSizeSize) + + } + + func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam
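// Each feature-map cell emits numPriors = aspectRatios.count * minSizes.count
// + maxSizes.count boxes, after ratios are deduplicated within epsilon and, with
// flip on, extended by their reciprocals; stepW/stepH default to the
// image-to-feature-map size ratio when given as 0. Sketch (hypothetical helper):
func priorBoxCount(aspectRatios: Int, minSizes: Int, maxSizes: Int) -> Int {
    return aspectRatios * minSizes + maxSizes
}
// e.g. ratios [1, 2, 0.5] with one min and one max size -> 3 * 1 + 1 = 4 priors per cell.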
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setTexture(param.outputVariances.metalTexture, index: 2) + + encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 1) + + encoder.setBytes(param.variances, length: MemoryLayout.size * param.variances.count, index: 2) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..18f279e9f3c5226d6eea5b5e6f0a42502173071e --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ReluKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: ReluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: ReluParam
<P>
) { + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "relu") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "relu_half") + } else { + fatalError() + } + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..4114d3c3c62054235cd57fe37fe9cd83c5bb58cb --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct ReshapeMetalParam { + var idim: (Int32, Int32, Int32, Int32) + var itrans: (Int32, Int32, Int32, Int32) + var odim: (Int32, Int32, Int32, Int32) + var otrans: (Int32, Int32, Int32, Int32) +} + +struct ReshapeTestParam: TestParam { + let inputTexture: MTLTexture + let outputTexture: MTLTexture + let param: ReshapeMetalParam +} + +class ReshapeKernel: Kernel, Computable{ + + var metalParam: ReshapeMetalParam + + required init(device: MTLDevice, param: ReshapeParam
<P>
) { + param.output.initTexture(device: device, computePrecision: computePrecision) + var id: [Int32] = [1, 1, 1, 1] + for i in 0..) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + +// func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) { +// guard let encoder = commandBuffer.makeComputeCommandEncoder() else { +// fatalError() +// } +// encoder.setTexture(testParam.inputTexture, index: 0) +// encoder.setTexture(testParam.outputTexture, index: 1) +// var pm: ReshapeMetalParam = testParam.param +// encoder.setBytes(&pm, length: MemoryLayout.size, index: 0) +// encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture) +// encoder.endEncoding() +// } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeBilinearKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeBilinearKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..e5cbce1d1e196f88bb7a3b38d3e92c330774f3ba --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeBilinearKernel.swift @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct ResizeBilinearMetalParam { + var ratio_h: Float32 + var ratio_w: Float32 +} + +class ResizeBilinearKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: ResizeBilinearParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2]) + let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3]) + var p = ResizeBilinearMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) + encoder.setBytes(&p, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: ResizeBilinearParam
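// ratio_h/ratio_w map an output pixel back into the input: src = dst * in/out.
// The shader (BilinearInterp.inc.metal further down) then blends the four
// neighbours with weights (1-dx)(1-dy), dx(1-dy), (1-dx)dy and dx*dy. One-axis
// sketch of the index/weight split (hypothetical helper):
func bilinearAxis(dst: Int, ratio: Float) -> (lo: Int, hi: Int, hiWeight: Float) {
    let src = Float(dst) * ratio
    let lo = Int(src)                     // lower neighbour index
    return (lo, lo + 1, src - Float(lo))  // hiWeight is the upper-neighbour lambda
}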
<P>
) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "resize_bilinear") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "resize_bilinear_half") + } else { + fatalError() + } + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ShapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ShapeKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..feb052a44fdc7c6134cc90f07f3fc94ad0a497df --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ShapeKernel.swift @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct ShapeMetalParam { +} + +class ShapeKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam
<P>
) throws { +// print("shape compute") +// guard let encoder = commandBuffer.makeComputeCommandEncoder() else { +// throw PaddleMobileError.predictError(message: " encode is nil") +// } +// encoder.setTexture(param.output.metalTexture, index: 0) +// encoder.endEncoding() + } + + required init(device: MTLDevice, param: ShapeParam
<P>
) { + param.output.initTexture(device: device, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "shape") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "shape_half") + } else { + fatalError() + } + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..5d6874da151b64fd58c2016865515778d6267551 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift @@ -0,0 +1,51 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct SoftmaxMetalParam { + let N: Int32 + let K: Int32 +} + +class SoftmaxKernel: Kernel, Computable{ + + var metalParam: SoftmaxMetalParam + required init(device: MTLDevice, param: SoftmaxParam
<P>
) { + param.output.initTexture(device: device, computePrecision: computePrecision) + metalParam = SoftmaxMetalParam.init( + N: Int32(param.input.tensorDim[0]), + K: Int32(param.input.tensorDim[1]) + ) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "softmax_float") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "softmax_half") + } else { + fatalError() + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam
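// SoftmaxMetalParam treats the input as an [N, K] matrix: N independent rows,
// with softmax taken over the K classes of each row. Numerically stable CPU
// reference for one row (illustrative only; not a claim about the shader's
// internals):
import Foundation
func softmaxRow(_ x: [Float]) -> [Float] {
    let m = x.max() ?? 0
    let e = x.map { exp($0 - m) }  // subtract the row max to avoid overflow
    let s = e.reduce(0, +)
    return e.map { $0 / s }
}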
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SplitKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SplitKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..67e1cd9ab85c3c60d89846bab89ef10bbe513305 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SplitKernel.swift @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct SplitMetalParam { + var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) + var axis: Int32 = 0 + var offset: Int32 = 0 + var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) +} + +class SplitKernel: Kernel, Computable{ + var smp: SplitMetalParam + func compute(commandBuffer: MTLCommandBuffer, param: SplitParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + for i in 0...size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: SplitParam
<P>
) { + // param.output.initTexture(device: device, computePrecision: computePrecision) + let num = param.outputList.count + let rank = param.input.tensorDim.cout() + assert(num >= 2 && num <= 4) + for output in param.outputList { + output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + } + smp = SplitMetalParam.init() + smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3])) + smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout()) + for i in 0..<4 { + if param.input.transpose[i] == smp.axis { + smp.axis = Int32(i) + break + } + } + smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3])) + var vdim: [Int32] = [0, 0, 0, 0] + for i in 0..: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: FeedParam
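// The split axis is rebased from tensorDim space into the padded 4-D space
// (axis + dim.cout() - tensorDim.cout()) and then located inside the input's
// transpose so it indexes the in-texture layout. Sketch (hypothetical helper):
func textureSplitAxis(axis: Int, paddedRank: Int, tensorRank: Int, transpose: [Int]) -> Int {
    let padded = axis + paddedRank - tensorRank
    return transpose.firstIndex(of: padded) ?? padded
}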
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.mtlTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: FeedParam
<P>
) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "texture2d_to_2d_array_half") + } else if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "texture2d_to_2d_array") + } else { + fatalError() + } + + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift new file mode 100644 index 0000000000000000000000000000000000000000..7b872283d45bca4adb5e90a531c936f2ad5534f8 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +struct TransposeMetalParam { + var iC: Int32 = 0 + var oC: Int32 = 0 + var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) +} + +class TransposeKernel: Kernel, Computable { + var metalParam: TransposeMetalParam = TransposeMetalParam.init() + required init(device: MTLDevice, param: TransposeParam
<P>
) { + param.output.initTexture(device: device, computePrecision: computePrecision) + let rank = param.input.tensorDim.cout() + var axis: [Int] = [0, 1, 2, 3] + for i in 0..", kernelFunc) + print(metalParam) + super.init(device: device, inFunctionName: kernelFunc) + } + + func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..96333a07a9669ecb2b5bfe901d71be729e37b533 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormKernel.metal @@ -0,0 +1,42 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +using namespace metal; + +kernel void batchnorm(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device float4 * nscale [[buffer(0)]], + const device float4 * nbias [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + const float4 input = inTexture.read(gid.xy, gid.z); + float4 output = input * nscale[gid.z] + nbias[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + +kernel void batchnorm_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half4 * newScale [[buffer(0)]], + const device half4 * newBias [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + const half4 input = inTexture.read(gid.xy, gid.z); + half4 output = input * newScale[gid.z] + newBias[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormRelu.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormRelu.metal new file mode 100644 index 0000000000000000000000000000000000000000..eb94408c8ac664be5cf62bc28bfb02825856ebd4 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BatchNormRelu.metal @@ -0,0 +1,36 @@ +// +// BatchNormRelu.metal +// paddle-mobile +// + +#include +using namespace metal; + +struct MetalConvParam { + short offsetX; + short offsetY; + short offsetZ; + ushort strideX; + ushort strideY; +}; + +kernel void batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device float4 *new_scale [[buffer(0)]], + const device float4 *new_biase [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) 
{ + return; + } + + float4 input; + float4 output; + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + input = inTexture.sample(sample, gid.x, gid.y, gid.z); + output = fmax(input * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); + +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..a590f8089890f2fab1af4c1f736f3bfc5708aecf --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.inc.metal @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b + +#define FUNC(f, p) CONCAT2_(f, p) +#define VECTOR(p, n) CONCAT2(p, n) + +kernel void FUNC(bilinear_interp, P)(texture2d_array input [[texture(0)]], + texture2d_array output [[texture(1)]], + constant bilinear_interp_param & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + VECTOR(P, 4) r; + if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { + r = input.read(gid.xy, gid.z); + } else { + P w = gid.x * pm.ratio_w; + P h = gid.y * pm.ratio_h; + uint w0 = w, h0 = h; + uint w1 = w0 + 1, h1 = h0 + 1; + P w1lambda = w - w0, h1lambda = h - h0; + P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; + if (w1 >= input.get_width()) w1 = w0; + if (h1 >= input.get_height()) h1 = h0; + VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z); + VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z); + VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z); + VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z); + r = h2lambda * (w2lambda * r0 + w1lambda * r1) + + h1lambda * (w2lambda * r2 + w1lambda * r3); + } + output.write(r, gid.xy, gid.z); +} + +#endif diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.m b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.metal similarity index 71% rename from ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.m rename to metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.metal index 369e90039d37f62545a20ae30b5ce47d1c27dc95..394cf89db09d47b0d3c87ff124c21a93962c0972 100644 --- a/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.m +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BilinearInterp.metal @@ -12,22 +12,18 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#import "ViewController.h" +#include +using namespace metal; -@interface ViewController () +struct bilinear_interp_param { + float ratio_h; + float ratio_w; +}; -@end +#define P float +#include "BilinearInterp.inc.metal" +#undef P -@implementation ViewController - -- (void)viewDidLoad { - [super viewDidLoad]; -} - - -- (void)didReceiveMemoryWarning { - [super didReceiveMemoryWarning]; -} - - -@end +#define P half +#include "BilinearInterp.inc.metal" +#undef P diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..918fbac1a713d7b0442a1eb1f07abea3616bec96 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.inc.metal @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b + +#define FUNC(f, p) CONCAT2_(f, p) +#define VECTOR(p, n) CONCAT2(p, n) +kernel void FUNC(boxcoder, P)(texture2d_array priorBox [[texture(0)]], + texture2d_array priorBoxVar [[texture(1)]], + texture2d_array targetBox [[texture(2)]], + texture2d_array output[[texture(3)]], + uint3 gid [[thread_position_in_grid]]) { + VECTOR(P, 4) p = priorBox.read(uint2(0, gid.x), gid.z); + VECTOR(P, 4) pv = priorBoxVar.read(uint2(0, gid.x), gid.z); + VECTOR(P, 4) t; + t[0] = targetBox.read(uint2(0, gid.x), gid.z)[0]; + t[1] = targetBox.read(uint2(1, gid.x), gid.z)[0]; + t[2] = targetBox.read(uint2(2, gid.x), gid.z)[0]; + t[3] = targetBox.read(uint2(3, gid.x), gid.z)[0]; + + P px = (p.x + p.z) / 2; + P py = (p.y + p.w) / 2; + P pw = p.z - p.x; + P ph = p.w - p.y; + + P tx = pv.x * t.x * pw + px; + P ty = pv.y * t.y * ph + py; + P tw = exp(pv.z * t.z) * pw; + P th = exp(pv.w * t.w) * ph; + + VECTOR(P, 4) r; + r.x = tx - tw / 2; + r.y = ty - th / 2; + r.z = tx + tw / 2; + r.w = ty + th / 2; + + output.write(r, gid.xy, gid.z); +} + +#endif diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.h b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal similarity index 79% rename from ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.h rename to metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal index eb789ffbb2d394d9c45651b48a931d3759a7687b..4009e213d51d0a9c33c70aea22b015df49e347dc 100644 --- a/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.h +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal @@ -12,12 +12,12 @@ See the License for the specific language governing permissions and limitations under the License. 
diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.h b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
similarity index 79%
rename from ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.h
rename to metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
index eb789ffbb2d394d9c45651b48a931d3759a7687b..4009e213d51d0a9c33c70aea22b015df49e347dc 100644
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/AppDelegate.h
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal
@@ -12,12 +12,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
-#import <UIKit/UIKit.h>
-
-@interface AppDelegate : UIResponder <UIApplicationDelegate>
-
-@property (strong, nonatomic) UIWindow *window;
-
-
-@end
+#include <metal_stdlib>
+using namespace metal;
+#define P float
+#include "BoxCoder.inc.metal"
+#undef P
+#define P half
+#include "BoxCoder.inc.metal"
+#undef P
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
new file mode 100644
index 0000000000000000000000000000000000000000..40bae035c097b5ab386d78520b6b04f074eb2fee
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal
@@ -0,0 +1,120 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+
+inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) {
+  abcd[0] = abcd[1] = abcd[2] = 0;
+  abcd[3] = xyzn[0] * 4 + xyzn[3];
+}
+inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) {
+  abcd[0] = abcd[1] = 0;
+  abcd[2] = xyzn[1];
+  abcd[3] = xyzn[0] * 4 + xyzn[3];
+}
+inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) {
+  abcd[0] = 0;
+  abcd[3] = xyzn[0];
+  abcd[2] = xyzn[1];
+  abcd[1] = xyzn[2] * 4 + xyzn[3];
+}
+inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) {
+  abcd[2] = xyzn[0];
+  abcd[1] = xyzn[1];
+  uint t = xyzn[2] * 4 + xyzn[3];
+  abcd[0] = t / C;
+  abcd[3] = t % C;
+}
+
+inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) {
+  xyzn[1] = xyzn[2] = 0;
+  xyzn[0] = abcd[3] / 4;
+  xyzn[3] = abcd[3] % 4;
+}
+inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) {
+  xyzn[2] = 0;
+  xyzn[1] = abcd[2];
+  xyzn[0] = abcd[3] / 4;
+  xyzn[3] = abcd[3] % 4;
+}
+inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) {
+  xyzn[0] = abcd[3];
+  xyzn[1] = abcd[2];
+  xyzn[2] = abcd[1] / 4;
+  xyzn[3] = abcd[1] % 4;
+}
+inline void abcd2xyzn_4(int C, int abcd[4], int xyzn[4]) {
+  xyzn[0] = abcd[2];
+  xyzn[1] = abcd[1];
+  uint t = abcd[0] * C + abcd[3];
+  xyzn[2] = t / 4;
+  xyzn[3] = t % 4;
+}
+
+inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) {
+  abcd[2] = xyzn[0];
+  abcd[1] = xyzn[1];
+  uint t = xyzn[2] * 4 + xyzn[3];
+  abcd[0] = t / C;
+  abcd[3] = t % C;
+}
+
+inline void abcd2xyzn(int C, int abcd[4], int xyzn[4]) {
+  xyzn[0] = abcd[2];
+  xyzn[1] = abcd[1];
+  uint t = abcd[0] * C + abcd[3];
+  xyzn[2] = t / 4;
+  xyzn[3] = t % 4;
+}
+
+inline int32_t abcd2index(int32_t dim[4], int32_t abcd[4]) {
+  int32_t r = abcd[0];
+  r = r * dim[1] + abcd[1];
+  r = r * dim[2] + abcd[2];
+  r = r * dim[3] + abcd[3];
+  return r;
+}
+
+inline void index2abcd(int32_t dim[4], int32_t ind, int32_t abcd[4]) {
+  abcd[3] = ind % dim[3]; ind /= dim[3];
+  abcd[2] = ind % dim[2]; ind /= dim[2];
+  abcd[1] = ind % dim[1]; ind /= dim[1];
+  abcd[0] = ind;
+}
+
+inline void trans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
+  for (int i = 0; i < 4; i++) {
+    opos[i] = ipos[trans[i]];
+  }
+}
+
+inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
+  for (int i = 0; i < 4; i++) {
+    opos[trans[i]] = ipos[i];
+  }
+}
+
+
+struct MetalConvParam {
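+  // gid.xy * (strideX, strideY) + (offsetX, offsetY) is the sampling origin in the
+  // padded input; dilations spread the kernel taps. Values mirror the op's attributes.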
+  short offsetX;
+  short offsetY;
+  short offsetZ;
+  ushort strideX;
+  ushort strideY;
+  ushort dilationX;
+  ushort dilationY;
+};
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.inc.metal
new file mode 100644
index 0000000000000000000000000000000000000000..2b070fc48b78391e96b93823eeff7f936de2ff7d
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.inc.metal
@@ -0,0 +1,318 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#ifdef P
+
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
+#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
+
+#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+#define FUNC_R(f, r) CONCAT2_(f, r)
+
+#if V == VX
+#define VV x
+#elif V == VY
+#define VV y
+#elif V == VZ
+#define VV z
+#else
+#define VV normal
+#endif
+
+#if V == VNORMAL
+//kernel void FUNC(concat, R, N, normal, P)(array<texture2d_array<P, access::read>, N> in [[texture(0)]],
+//                                          texture2d_array<P, access::read> out_x [[texture(N)]],
+//                                          texture2d_array<P, access::write> out [[texture(N+1)]],
+//                                          constant ConcatParam & pm [[buffer(0)]],
+//                                          uint3 gid [[thread_position_in_grid]]) {
+//}
+kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
+                                      texture2d_array<P, access::read> in1 [[texture(1)]],
+#if N >= 3
+                                      texture2d_array<P, access::read> in2 [[texture(2)]],
+#endif
+#if N >= 4
+                                      texture2d_array<P, access::read> in3 [[texture(3)]],
+#endif
+#if N >= 5
+                                      texture2d_array<P, access::read> in4 [[texture(4)]],
+#endif
+#if N >= 6
+                                      texture2d_array<P, access::read> in5 [[texture(5)]],
+#endif
+                                      texture2d_array<P, access::read> inx [[texture(N)]],
+                                      texture2d_array<P, access::write> out [[texture(N+1)]],
+                                      constant ConcatParam & pm [[buffer(0)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+
+  ConcatParam cp = pm;
+  int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4];
+  VECTOR(P, 4) r = inx.read(gid.xy, gid.z);
+  for (int i = 0; i < 4; i++) {
+    xyzn[3] = i;
+#if R == 4
+    xyzn2abcd_4(cp.odim[3], xyzn, abcd);
+#else
+    FUNC_R(xyzn2abcd, R)(xyzn, abcd);
+#endif
+    int k = abcd[cp.axis] - cp.offset;
+    if (k < 0) continue;
+    int j = 0;
+    for (; j < N; j++) {
+      if (k < cp.vdim[j]) {
+        break;
+      }
+      k -= cp.vdim[j];
+    }
+    if (j == N) {
+      continue;
+    }
+    int ta = cp.odim[cp.axis];
+    abcd[cp.axis] = k;
+    cp.odim[cp.axis] = cp.vdim[j];
+#if R == 4
+    abcd2xyzn_4(cp.odim[3], abcd, oxyzn);
+#else
+    FUNC_R(abcd2xyzn, R)(abcd, oxyzn);
+#endif
+    cp.odim[cp.axis] = ta;
+    switch (j) {
+      case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+      case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+#if N >= 3
+      case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+#endif
+#if N >= 4
+      case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+#endif
+#if N >= 5
+      case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+#endif
+#if N >= 6
+      case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break;
+#endif
+    }
+  }
+  out.write(r, gid.xy, gid.z);
+}
+
+#endif // V == NORMAL
+
+
+
+#if V == VX
+kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
+                                      texture2d_array<P, access::read> in1 [[texture(1)]],
+#if N >= 3
+                                      texture2d_array<P, access::read> in2 [[texture(2)]],
+#endif // N >= 3
+#if N >= 4
+                                      texture2d_array<P, access::read> in3 [[texture(3)]],
+#endif // N >= 4
+#if N >= 5
+                                      texture2d_array<P, access::read> in4 [[texture(4)]],
+#endif // N >= 5
+#if N >= 6
+                                      texture2d_array<P, access::read> in5 [[texture(5)]],
+#endif // N >= 6
+                                      texture2d_array<P, access::write> out [[texture(N)]],
+                                      constant ConcatParam & pm [[buffer(0)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+  int x = gid.x - pm.offset;
+  if (x < 0) return;
+  if (x < pm.vdim[0]) {
+    VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+  x -= pm.vdim[0];
+  if (x < pm.vdim[1]) {
+    VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#if N >= 3
+  x -= pm.vdim[1];
+  if (x < pm.vdim[2]) {
+    VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 3
+#if N >= 4
+  x -= pm.vdim[2];
+  if (x < pm.vdim[3]) {
+    VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 4
+#if N >= 5
+  x -= pm.vdim[3];
+  if (x < pm.vdim[4]) {
+    VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 5
+#if N >= 6
+  x -= pm.vdim[4];
+  if (x < pm.vdim[5]) {
+    VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 6
+}
+#endif // V == VX
+
+#if V == VY
+kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
+                                      texture2d_array<P, access::read> in1 [[texture(1)]],
+#if N >= 3
+                                      texture2d_array<P, access::read> in2 [[texture(2)]],
+#endif // N >= 3
+#if N >= 4
+                                      texture2d_array<P, access::read> in3 [[texture(3)]],
+#endif // N >= 4
+#if N >= 5
+                                      texture2d_array<P, access::read> in4 [[texture(4)]],
+#endif // N >= 5
+#if N >= 6
+                                      texture2d_array<P, access::read> in5 [[texture(5)]],
+#endif // N >= 6
+                                      texture2d_array<P, access::write> out [[texture(N)]],
+                                      constant ConcatParam & pm [[buffer(0)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+  int y = gid.y - pm.offset;
+  if (y < 0) return;
+  if (y < pm.vdim[0]) {
+    VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+  y -= pm.vdim[0];
+  if (y < pm.vdim[1]) {
+    VECTOR(P, 4) r = in1.read(uint2(gid.x, y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#if N >= 3
+  y -= pm.vdim[1];
+  if (y < pm.vdim[2]) {
+    VECTOR(P, 4) r = in2.read(uint2(gid.x, y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 3
+#if N >= 4
+  y -= pm.vdim[2];
+  if (y < pm.vdim[3]) {
+    VECTOR(P, 4) r = in3.read(uint2(gid.x, y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 4
+#if N >= 5
+  y -= pm.vdim[3];
+  if (y < pm.vdim[4]) {
+    VECTOR(P, 4) r = in4.read(uint2(gid.x, y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 5
+#if N >= 6
+  y -= pm.vdim[4];
+  if (y < pm.vdim[5]) {
+    VECTOR(P, 4) r = in5.read(uint2(gid.x, y), gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 6
+}
+#endif // V == VY
+
+#if V == VZ
+kernel void FUNC(concat, R, N, VV, P)(texture2d_array<P, access::read> in0 [[texture(0)]],
+                                      texture2d_array<P, access::read> in1 [[texture(1)]],
+#if N >= 3
+                                      texture2d_array<P, access::read> in2 [[texture(2)]],
+#endif // N >= 3
+#if N >= 4
+                                      texture2d_array<P, access::read> in3 [[texture(3)]],
+#endif // N >= 4
+#if N >= 5
+                                      texture2d_array<P, access::read> in4 [[texture(4)]],
+#endif // N >= 5
+#if N >= 6
+                                      texture2d_array<P, access::read> in5 [[texture(5)]],
+#endif // N >= 6
+                                      texture2d_array<P, access::write> out [[texture(N)]],
+                                      constant ConcatParam & pm [[buffer(0)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+  int z = gid.z - pm.offset;
+  if (z < 0) return;
+  if (z < pm.vdim[0]) {
+    VECTOR(P, 4) r = in0.read(gid.xy, gid.z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+  z -= pm.vdim[0];
+  if (z < pm.vdim[1]) {
+    VECTOR(P, 4) r = in1.read(gid.xy, z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#if N >= 3
+  z -= pm.vdim[1];
+  if (z < pm.vdim[2]) {
+    VECTOR(P, 4) r = in2.read(gid.xy, z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 3
+#if N >= 4
+  z -= pm.vdim[2];
+  if (z < pm.vdim[3]) {
+    VECTOR(P, 4) r = in3.read(gid.xy, z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 4
+#if N >= 5
+  z -= pm.vdim[3];
+  if (z < pm.vdim[4]) {
+    VECTOR(P, 4) r = in4.read(gid.xy, z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 5
+#if N >= 6
+  z -= pm.vdim[4];
+  if (z < pm.vdim[5]) {
+    VECTOR(P, 4) r = in5.read(gid.xy, z);
+    out.write(r, gid.xy, gid.z);
+    return;
+  }
+#endif // N >= 6
+}
+#endif // V == VZ
+
+
+#undef VV
+#endif // #ifdef P
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..b7d17f2d25de544e4ce938c577e0d04f536da9af
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConcatKernel.metal
@@ -0,0 +1,171 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+
+using namespace metal;
+
+struct ConcatParam {
+  int32_t odim[4];
+  int32_t axis;
+  int32_t offset;
+  int32_t trans[4];
+  int32_t vdim[6];
+};
+
+#define VNORMAL 1
+#define VX 2
+#define VY 3
+#define VZ 4
+
+// >> fast mode
+// only support concat_{2,3,4}_{2,3,4,5,6}_y_{float,half}
+// only support concat_{3,4}_{2,3,4,5,6}_x_{float,half}
+// only support concat_{1,2,3,4}_{2,3,4,5,6}_z_{float,half}
+// >> normal mode (loop mode)
+// ssd-ar: (R=4, N=3, V=z), (R=3, N=2, V=y), (R=2, N=5, V=x), (R=3, N=5, V=x)
+// ssd: (R=2, N=6, V=y), (R=3, N=6, V=y)
+// genet: (R=4, N=2, V=normal)
+
+// ssd-ar: (R=3, N=5, V=x)
+#define V VX
+  #define R 3
+  #define N 5
+  #define P float
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #define P half
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #undef N
+  #undef R
+#undef V
+
+// ssd-ar: (R=2, N=5, V=x)
+#define V VX
+  #define R 2
+  #define N 5
+  #define P float
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #define P half
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #undef N
+  #undef R
+#undef V
+
+
+// ssd-ar: (R=3, N=2, V=y)
+#define V VY
+  #define R 3
+  #define N 2
+  #define P float
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #define P half
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #undef N
+  #undef R
+#undef V
+
+// ssd-ar: (R=4, N=3, V=z)
+#define V VZ
+  #define R 4
+  #define N 3
+  #define P float
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #define P half
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #undef N
+  #undef R
+#undef V
+
+
+// ssd: (R=2, N=6, V=y)
+#define V VY
+  #define R 2
+  #define N 6
+  #define P float
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #define P half
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #undef N
+  #undef R
+#undef V
+
+// ssd: (R=3, N=6, V=y)
+#define V VY
+  #define R 3
+  #define N 6
+  #define P float
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #define P half
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #undef N
+  #undef R
+#undef V
+
+#define V VNORMAL
+  #define R 4
+  #define N 2
+  #define P float
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #define P half
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #undef N
+  #undef R
+#undef V
+
+
+#define V VY
+  #define R 2
+  #define N 2
+  #define P float
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #define P half
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #undef N
+  #undef R
+#undef V
+
+
+#define V VY
+  #define R 2
+  #define N 5
+  #define P float
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #define P half
+  #include "ConcatKernel.inc.metal"
+  #undef P
+  #undef N
+  #undef R
+#undef V
+
+
+
+
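Each (V, R, N, P) block above stamps out one specialized entry point named concat_R_N_axis_P, e.g. concat_3_5_x_float for a rank-3 concat of five inputs along x in float. A sketch of how a host might assemble that name when selecting a pipeline (illustrative helper, not the runtime's actual API):

    #include <string>

    // e.g. concatKernelName(3, 5, "x", true) -> "concat_3_5_x_float"
    std::string concatKernelName(int rank, int inputs, const std::string &axis, bool isFloat) {
      return "concat_" + std::to_string(rank) + "_" + std::to_string(inputs) +
             "_" + axis + "_" + (isFloat ? "float" : "half");
    }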
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..87b60a64fc48ab89af274e0b24897e0b411599e0
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal
@@ -0,0 +1,310 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+
+
+kernel void conv_add_batch_norm_relu_1x1_half(
+    texture2d_array<half, access::sample> inTexture [[texture(0)]],
+    texture2d_array<half, access::write> outTexture [[texture(1)]],
+    constant MetalConvParam &param [[buffer(0)]],
+    const device half4 *weights [[buffer(1)]],
+    const device half4 *biase [[buffer(2)]],
+    const device half4 *new_scale [[buffer(3)]],
+    const device half4 *new_biase [[buffer(4)]],
+    uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  float4 output = float4(0.0);
+
+  half4 input;
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+    output.x += dot(input, weight_x);
+
+    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+    output.y += dot(input, weight_y);
+
+    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+    output.z += dot(input, weight_z);
+
+    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(input, weight_w);
+  }
+  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+kernel void conv_add_batch_norm_relu_3x3_half(
+    texture2d_array<half, access::sample> inTexture [[texture(0)]],
+    texture2d_array<half, access::write> outTexture [[texture(1)]],
+    constant MetalConvParam &param [[buffer(0)]],
+    const device half4 *weights [[buffer(1)]],
+    const device half4 *biase [[buffer(2)]],
+    const device half4 *new_scale [[buffer(3)]],
+    const device half4 *new_biase [[buffer(4)]],
+    uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  float4 output = float4(0.0);
+
+  half4 input[9];
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
+    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
+    input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
+    input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
+    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
+    input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
+    input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
+    input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
+    for (int j = 0; j < 9; ++j) {
+      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+
+      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+
+      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+
+      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_add_batch_norm_relu_3x3_half(
+    texture2d_array<half, access::sample> inTexture [[texture(0)]],
+    texture2d_array<half, access::write> outTexture [[texture(1)]],
+    constant MetalConvParam &param [[buffer(0)]],
+    const device half *weights [[buffer(1)]],
+    const device half4 *biase [[buffer(2)]],
+    const device half4 *new_scale [[buffer(3)]],
+    const device half4 *new_biase [[buffer(4)]],
+    uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  uint output_slice = gid.z;
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint weithTo = gid.z * kernelHXW * 4;
+  float4 output = float4(0.0);
+  half4 inputs[9];
+  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+  inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+  inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+  inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+  for (int j = 0; j < 9; ++j) {
+    half4 input = inputs[j];
+    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+  }
+  output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0);
+  outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+
+
+/*---------------------------------------------*/
+
+
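These fused kernels consume per-channel new_scale/new_biase values that the host has presumably already folded out of the batch-norm parameters, so the GPU only applies y = max((conv + biase) * new_scale + new_biase, 0). A hedged C++ sketch of that folding, assuming the usual gamma/beta/mean/variance inputs (names illustrative, not part of this diff):

    #include <cmath>
    #include <vector>

    // Fold BN into per-channel scale/bias: scale = gamma / sqrt(var + eps),
    // bias = beta - mean * scale.
    void foldBatchNorm(const std::vector<float> &gamma, const std::vector<float> &beta,
                       const std::vector<float> &mean, const std::vector<float> &var,
                       float eps, std::vector<float> &newScale, std::vector<float> &newBias) {
      newScale.resize(gamma.size());
      newBias.resize(gamma.size());
      for (size_t c = 0; c < gamma.size(); ++c) {
        newScale[c] = gamma[c] / std::sqrt(var[c] + eps);
        newBias[c] = beta[c] - mean[c] * newScale[c];
      }
    }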
+kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                         constant MetalConvParam &param [[buffer(0)]],
+                                         const device float4 *weights [[buffer(1)]],
+                                         const device float4 *biase [[buffer(2)]],
+                                         const device float4 *new_scale [[buffer(3)]],
+                                         const device float4 *new_biase [[buffer(4)]],
+                                         uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  float4 output = float4(0.0);
+
+  float4 input;
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+    output.x += dot(input, weight_x);
+
+    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+    output.y += dot(input, weight_y);
+
+    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+    output.z += dot(input, weight_z);
+
+    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(input, weight_w);
+  }
+  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                         constant MetalConvParam &param [[buffer(0)]],
+                                         const device float4 *weights [[buffer(1)]],
+                                         const device float4 *biase [[buffer(2)]],
+                                         const device float4 *new_scale [[buffer(3)]],
+                                         const device float4 *new_biase [[buffer(4)]],
+                                         uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  float4 output = float4(0.0);
+
+  float4 input[9];
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
+    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
+    input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
+    input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
+    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
+    input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
+    input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
+    input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
+    for (int j = 0; j < 9; ++j) {
+      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+
+      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+
+      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+
+      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                                   constant MetalConvParam &param [[buffer(0)]],
+                                                   const device float *weights [[buffer(1)]],
+                                                   const device float4 *biase [[buffer(2)]],
+                                                   const device float4 *new_scale [[buffer(3)]],
+                                                   const device float4 *new_biase [[buffer(4)]],
+                                                   uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  uint output_slice = gid.z;
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint weithTo = gid.z * kernelHXW * 4;
+  float4 output = float4(0.0);
+  float4 inputs[9];
+  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+  inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+  inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+  inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+  for (int j = 0; j < 9; ++j) {
+    float4 input = inputs[j];
+    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+  }
+  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
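All of these conv kernels index the weight buffer the same way: each output slice owns kernelHXW * input_arr_size * 4 vectors, grouped first by output lane, then by kernel tap, then by input slice. The weithTo arithmetic reduces to the flat index below (an illustrative restatement, same math as the kernels):

    // outSlice = gid.z, lane = 0..3 (channel within the slice),
    // tap = 0..kernelHXW-1, inSlice = 0..inSlices-1.
    inline unsigned weightIndex(unsigned outSlice, unsigned lane, unsigned tap,
                                unsigned inSlice, unsigned kernelHXW, unsigned inSlices) {
      return outSlice * kernelHXW * inSlices * 4   // weithTo
           + lane * kernelHXW * inSlices
           + tap * inSlices
           + inSlice;
    }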
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
new file mode 100644
index 0000000000000000000000000000000000000000..274e416576743a473ba8931bcd538e9c39415f3c
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
@@ -0,0 +1,622 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+
+using namespace metal;
+
+#pragma mark - convAdd
+kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  float4 output = biase[gid.z];
+
+  float4 input;
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+    output.x += dot(input, weight_x);
+
+    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+    output.y += dot(input, weight_y);
+
+    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+    output.z += dot(input, weight_z);
+
+    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(input, weight_w);
+  }
+// output = output + biase[gid.z];
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  const uint kernelHXW = 9;
+
+  uint input_arr_size = inTexture.get_array_size();
+
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  float4 output = biase[gid.z];
+
+  ushort dilation_x = param.dilationX;
+  ushort dilation_y = param.dilationY;
+
+  float4 input[9];
+
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+
+    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+
+    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+
+    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+
+    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+
+    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+
+    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
+
+    input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+
+    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
+
+    for (int j = 0; j < 9; ++j) {
+      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+
+      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+
+      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+
+      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+// output = output + biase[gid.z];
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_add_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  const uint kernelHXW = 5;
+
+  uint input_arr_size = inTexture.get_array_size();
+
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  float4 output = biase[gid.z];
+
+  ushort dilation_y = param.dilationY;
+  float4 input[5];
+
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
+
+    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+
+    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+
+    input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+
+    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
+
+    for (int j = 0; j < 5; ++j) {
+      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+
+      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+
+      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+
+      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+// output = output + biase[gid.z];
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+
+kernel void conv_add_1x5(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  const uint kernelHXW = 5;
+
+  uint input_arr_size = inTexture.get_array_size();
+
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  float4 output = biase[gid.z];
+
+  ushort dilation_x = param.dilationX;
+  float4 input[5];
+
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
+
+    input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+
+    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+
+    input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+
+    input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
+
+    for (int j = 0; j < 5; ++j) {
+      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+
+      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+
+      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+
+      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+// output = output + biase[gid.z];
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+
+kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                   constant MetalConvParam &param [[buffer(0)]],
+                                   const device float *weights [[buffer(1)]],
+                                   const device float4 *biase [[buffer(2)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  uint output_slice = gid.z;
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint weithTo = gid.z * kernelHXW * 4;
+  float4 output = biase[gid.z];
+  float4 inputs[9];
+  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+  inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+  inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+  inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+  for (int j = 0; j < 9; ++j) {
+    float4 input = inputs[j];
+    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+  }
+// output = output + biase[gid.z];
+  outTexture.write(output, gid.xy, gid.z);
+}
+
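The half variants that follow mirror the float kernels with half4 weights and activations to halve bandwidth. Note the asymmetry: conv_add_1x1_half accumulates directly in half, while conv_add_3x3_half widens each operand to float4 before its dot products, presumably to limit rounding error over the 9 * input_arr_size accumulations.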
+
+#pragma mark - half
+
+kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  half4 output = biase[gid.z];
+
+  half4 input;
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+    output.x += dot(input, weight_x);
+
+    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+    output.y += dot(input, weight_y);
+
+    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+    output.z += dot(input, weight_z);
+
+    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(input, weight_w);
+  }
+// output = output + float4(biase[gid.z]);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  half4 output = biase[gid.z];
+
+  ushort dilation_x = param.dilationX;
+  ushort dilation_y = param.dilationY;
+
+  half4 input[9];
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
+    input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
+    for (int j = 0; j < 9; ++j) {
+      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(float4(input[j]), float4(weight_x));
+
+      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(float4(input[j]), float4(weight_y));
+
+      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(float4(input[j]), float4(weight_z));
+
+      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(float4(input[j]), float4(weight_w));
+    }
+  }
+// output = output + float4(biase[gid.z]);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                        texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                        constant MetalConvParam &param [[buffer(0)]],
+                                        const device half *weights [[buffer(1)]],
+                                        const device half4 *biase [[buffer(2)]],
+                                        uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  uint output_slice = gid.z;
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint weithTo = gid.z * kernelHXW * 4;
+  half4 output = biase[gid.z];
+  half4 inputs[9];
+  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+  inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+  inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
+  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
+  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
+  inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
+  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
+  for (int j = 0; j < 9; ++j) {
+    half4 input = inputs[j];
+    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+  }
+// output = output + float4(biase[gid.z]);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+
+kernel void conv_add_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  const uint kernelHXW = 5;
+
+  uint input_arr_size = inTexture.get_array_size();
+
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  half4 output = biase[gid.z];
+
+  ushort dilation_y = param.dilationY;
+  half4 input[5];
+
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
+
+    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+
+    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+
+    input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+
+    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
+
+    for (int j = 0; j < 5; ++j) {
+      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+
+      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+
+      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+
+      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+// output = output + float4(biase[gid.z]);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+
+kernel void conv_add_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  const uint kernelHXW = 5;
+
+  uint input_arr_size = inTexture.get_array_size();
+
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  half4 output = biase[gid.z];
+
+  ushort dilation_x = param.dilationX;
+  half4 input[5];
+
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
+
+    input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+
+    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+
+    input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+
+    input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
+
+    for (int j = 0; j < 5; ++j) {
+      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+
+      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+
+      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+
+      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+// output = output + float4(biase[gid.z]);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+
+kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<float, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device float4 *weights [[buffer(1)]],
+                              const device float4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+  const uint kernelHXW = 9;
+
+  uint input_arr_size = inTexture.get_array_size();
+
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  float4 output = float4(0.0);
+
+  ushort dilation_x = param.dilationX;
+  ushort dilation_y = param.dilationY;
+
+  float4 input[9];
+
+  for (uint i = 0; i < input_arr_size; ++i) {
+
+    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+
+    input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+
+    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+
+    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+
+    input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+
+    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+
+    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
+
+    input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+
+    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
+
+    for (int j = 0; j < 9; ++j) {
+      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+
+      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+
+      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+
+      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+  // output = output + biase[gid.z];
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal
new file mode 100644
index 0000000000000000000000000000000000000000..069daa20e875eb00c0d518e0463987248ca8dce5
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal
@@ -0,0 +1,447 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#ifdef P
+
+#include "Macro.metal"
+
+
+#pragma mark - convAdd
+kernel void FUNC3_(conv_add_1x1, PRELU_TYPE, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]],
+                                                texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                                constant MetalConvParam &param [[buffer(0)]],
+                                                const device VECTOR(P, 4) *weights [[buffer(1)]],
+                                                const device VECTOR(P, 4) *biase [[buffer(2)]],
+#ifdef PRELU_CHANNEL
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_ELEMENT
+                                                const device VECTOR(P, 4) *alpha [[buffer(3)]],
+#endif
+#ifdef PRELU_OTHER
+                                                const device P *alpha [[buffer(3)]],
+#endif
+                                                uint3 gid [[thread_position_in_grid]]) {
+
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  VECTOR(P, 4) output = biase[gid.z];
+
+  VECTOR(P, 4) input;
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+    output.x += dot(input, weight_x);
+
+    VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+    output.y += dot(input, weight_y);
+
+    VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+    output.z += dot(input, weight_z);
+
+    VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(input, weight_w);
+  }
+
+// output = output + float4(biase[gid.z]);
+
+#ifdef PRELU_CHANNEL
+  VECTOR(P, 4) alpha_value = alpha[gid.z];
+  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+#endif
+#ifdef PRELU_ELEMENT
+  int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+  VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z];
+  output.x = output.x > 0 ? output.x : (alpha_value.x * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value.y * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value.z * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value.w * output.w);
+#endif
+#ifdef PRELU_OTHER
+  P alpha_value = alpha[0];
+  output.x = output.x > 0 ? output.x : (alpha_value * output.x);
+  output.y = output.y > 0 ? output.y : (alpha_value * output.y);
+  output.z = output.z > 0 ? output.z : (alpha_value * output.z);
+  output.w = output.w > 0 ? output.w : (alpha_value * output.w);
+#endif
+  outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z);
+}
+
output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + +kernel void FUNC3_(conv_add_3x3, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 9; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + VECTOR(P, 4) input[9]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + + for (int j = 0; j < 9; ++j) { + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } +// output = output + float4(biase[gid.z]); + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? 
output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + +kernel void FUNC3_(conv_add_5x1, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z];; + + ushort dilation_y = param.dilationY; + VECTOR(P, 4) input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); + + for (int j = 0; j < 5; ++j) { + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? 
output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + + +kernel void FUNC3_(conv_add_1x5, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + VECTOR(P, 4) input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); + + for (int j = 0; j < 5; ++j) { + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? 
output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + +kernel void FUNC3_(depthwise_conv_add_3x3, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device P *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + VECTOR(P, 4) output = biase[gid.z]; + VECTOR(P, 4) inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + VECTOR(P, 4) input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? 
output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + +#endif + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPreluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..f03a1d5b625cf01f1f1bc5ac23bebf7dabd968d9 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPreluKernel.metal @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" +using namespace metal; + +#define P float + + #define PRELU_CHANNEL prelu_channel + #define PRELU_TYPE prelu_channel + #include "ConvAddPrelu.inc.metal" + #undef PRELU_TYPE + #undef PRELU_CHANNEL + + #define PRELU_ELEMENT prelu_element + #define PRELU_TYPE prelu_element + #include "ConvAddPrelu.inc.metal" + #undef PRELU_TYPE + #undef PRELU_ELEMENT + + #define PRELU_OTHER prelu_other + #define PRELU_TYPE prelu_other + #include "ConvAddPrelu.inc.metal" + #undef PRELU_TYPE + #undef PRELU_OTHER + +#undef P + +#define P half + + #define PRELU_CHANNEL prelu_channel + #define PRELU_TYPE prelu_channel + #include "ConvAddPrelu.inc.metal" + #undef PRELU_TYPE + #undef PRELU_CHANNEL + + #define PRELU_ELEMENT prelu_element + #define PRELU_TYPE prelu_element + #include "ConvAddPrelu.inc.metal" + #undef PRELU_TYPE + #undef PRELU_ELEMENT + + #define PRELU_OTHER prelu_other + #define PRELU_TYPE prelu_other + #include "ConvAddPrelu.inc.metal" + #undef PRELU_TYPE + #undef PRELU_OTHER + +#undef P + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..4b97b7829a1fba27704fe7b60a03b2672f4f5953 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal @@ -0,0 +1,297 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "Common.metal" + +using namespace metal; + +#pragma mark - conv bn relu +kernel void conv_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + 
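+      // The flat index into `weights` used here (and in the other conv kernels
+      // of this patch) decomposes as follows, given the RGBA slice packing
+      // where each texture slice carries 4 channels:
+      //   gid.z * kernelHXW * input_arr_size * 4   start of this output slice's filters (== weithTo)
+      //   + c * kernelHXW * input_arr_size         c in 0..3: output channel within the slice
+      //   + j * input_arr_size                     j in 0..8: 3x3 kernel tap, row-major
+      //   + i                                      input slice index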
output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +#pragma mark - half +kernel void conv_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = 
inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(float4(input), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(float4(input), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(float4(input), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(float4(input), float4(weight_w)); + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + 
constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..c07515c13da54c7f8bf698f976e47f7cda6de32b --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal @@ -0,0 +1,280 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "Common.metal" +using namespace metal; + +// conv +#pragma mark -- conv +kernel void conv_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + outTexture.write(output, gid.xy, gid.z); +} + +kernel void depthwise_conv_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, 
posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_1x1(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void conv_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), 
i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]); + output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]); + output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]); + output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]); + } + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr 
sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+  float4 output = float4(0.0);
+
+  half4 input;
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+    output.x += dot(float4(input), float4(weight_x));
+
+    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+    output.y += dot(float4(input), float4(weight_y));
+
+    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+    output.z += dot(float4(input), float4(weight_z));
+
+    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(float4(input), float4(weight_w));
+  }
+  outTexture.write(half4(output), gid.xy, gid.z);
+}
+
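ConvTransposeKernel.metal, added next, only implements the 2x2-kernel, stride-2 transposed convolution as live code; the padded general case exists solely as the commented-out conv_transpose kernel at the end of the file. In the 2x2/stride-2 case every output pixel receives exactly one contribution, so the kernel runs one thread per output pixel: output (x, y) reads input (x/2, y/2) through kernel tap (y%2)*2 + (x%2), and `param` is never consulted (the geometry is hard-coded). A worked example of that mapping:

    // 4x4 output from a 2x2 input, kernel 2x2, stride 2:
    //   output(3, 2) -> kernel_index_x = 3 % 2 = 1, kernel_index_y = 2 % 2 = 0
    //                -> kernel_index  = 0 * 2 + 1 = 1
    //                -> reads only input(3 / 2, 2 / 2) = input(1, 1)
    // No accumulation across kernel taps is needed, unlike the general path
    // sketched in the commented-out conv_transpose kernel below.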
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..baf3f31157a472412bb08ccb3c803f5ec9e25d9c
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal
@@ -0,0 +1,174 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+struct MetalConvTransposeParam{
+  ushort kernelW;
+  ushort kernelH;
+
+  ushort strideX;
+  ushort strideY;
+
+  ushort paddingX;
+  ushort paddingY;
+
+  ushort dilationX;
+  ushort dilationY;
+};
+
+kernel void conv_transpose2x2_stride2(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                      texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                      constant MetalConvTransposeParam &param [[buffer(0)]],
+                                      const device float4 *weights [[buffer(1)]],
+                                      uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  int input_array_size = inTexture.get_array_size();
+  int kernel_index_x = gid.x % 2;
+  int kernel_index_y = gid.y % 2;
+  int kernel_index = kernel_index_y * 2 + kernel_index_x;
+  int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
+  int input_x = gid.x / 2;
+  int input_y = gid.y / 2;
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  float4 output = float4(0.0);
+  for (int i = 0; i < input_array_size; ++i) {
+
+    float4 input = inTexture.sample(sample, float2(input_x, input_y), i);
+
+    float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
+    float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
+    float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
+    float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
+
+    output.x += dot(input, kernel_slice0);
+
+    output.y += dot(input, kernel_slice1);
+
+    output.z += dot(input, kernel_slice2);
+
+    output.w += dot(input, kernel_slice3);
+  }
+
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_transpose2x2_stride2_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                           texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                           constant MetalConvTransposeParam &param [[buffer(0)]],
+                                           const device half4 *weights [[buffer(1)]],
+                                           uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  int input_array_size = inTexture.get_array_size();
+  int kernel_index_x = gid.x % 2;
+  int kernel_index_y = gid.y % 2;
+  int kernel_index = kernel_index_y * 2 + kernel_index_x;
+  int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
+  int input_x = gid.x / 2;
+  int input_y = gid.y / 2;
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  float4 output = float4(0.0);
+  for (int i = 0; i < input_array_size; ++i) {
+
+    half4 input = inTexture.sample(sample, float2(input_x, input_y), i);
+
+    half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
+    half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
+    half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
+    half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
+
+    output.x += dot(float4(input), float4(kernel_slice0));
+
+    output.y += dot(float4(input), float4(kernel_slice1));
+
+    output.z += dot(float4(input), float4(kernel_slice2));
+
+    output.w += dot(float4(input), float4(kernel_slice3));
+  }
+
+  outTexture.write(half4(output), gid.xy, gid.z);
+}
+
+//kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+//                           texture2d_array<float, access::write> outTexture [[texture(1)]],
+//                           constant MetalConvTransposeParam &param [[buffer(0)]],
+//                           const device float4 *weights [[buffer(1)]],
+//                           uint3 gid
[[thread_position_in_grid]]){ +// if (gid.x >= outTexture.get_width() || +// gid.y >= outTexture.get_height() || +// gid.z >= outTexture.get_array_size()) { +// return; +// } +// +// int input_array_size = inTexture.get_array_size(); +// +// uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH; +// +// uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice); +// +// constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); +// +// float4 output; +// +// for (int w = 0; w < param.kernelW; ++w) { +// int top = gid.x - w * param.dilationX + param.paddingX; +// int input_x = top / param.strideX; +// if (top < 0 || input_x >= int(inTexture.get_width())) { +// continue; +// } +// +// for (int h = 0; h < param.kernelH; ++h) { +// int top_y = gid.y - h * param.dilationY + param.paddingY; +// int input_y = top_y / param.strideY; +// if (top_y < 0 || input_y >= int(inTexture.get_height())) { +// continue; +// } +// +// uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size(); +// +// for (int slice = 0; slice < input_array_size; ++slice) { +// +// float4 input; +// float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice]; +// float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice]; +// +// float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice]; +// +// float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice]; +// +// input = inTexture.sample(sample, float2(input_x, input_y), slice); +// output.x += dot(input, kernel_slice); +// output.y += dot(input, kernel_slice1); +// output.z += dot(input, kernel_slice2); +// output.w += dot(input, kernel_slice3); +// } +// } +// } +// +// outTexture.write(output, gid.xy, gid.z); +//} +// diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal new file mode 100644 index 0000000000000000000000000000000000000000..b152df828106acd96171a89f4f636f308e0e9e39 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "Common.metal" + +using namespace metal; + +struct ElementwiseAddParam { + int32_t fast; + int32_t axis; + int32_t ylen; + int32_t xdim[4]; + int32_t xtrans[4]; + int32_t ydim[4]; + int32_t ytrans[4]; +}; + +kernel void elementwise_add(texture2d_array inputX [[texture(0)]], + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + float4 rx, ry; + + if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); + } else { + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + int32_t yshift = 4 - pm.ylen - pm.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + float4 r = rx + ry; + outTexture.write(r, gid.xy, gid.z); +} + +kernel void elementwise_add_half(texture2d_array inputX [[texture(0)]], + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + half4 rx, ry; + + if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); + } else { + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + int32_t yshift = 4 - pm.ylen - pm.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + half4 r = rx + ry; + outTexture.write(r, gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..b1d68d680962c53778d624ab15bfcfeb1d1a3142 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#include +#include "Macro.metal" + +using namespace metal; + +kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array inputX [[texture(0)]], + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(1)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(1)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(1)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + VECTOR(P, 4) rx, ry; + + if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); + } else { + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + int32_t yshift = 4 - pm.ylen - pm.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + VECTOR(P, 4) output = rx + ry; + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? 
output.w : (alpha_value * output.w); +#endif + + outTexture.write(output, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..8fd1a9fdab8c86fbc52f6dab9c448b7b0f27d403 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" +using namespace metal; + +struct ElementwiseAddParam { + int32_t fast; + int32_t axis; + int32_t ylen; + int32_t xdim[4]; + int32_t xtrans[4]; + int32_t ydim[4]; + int32_t ytrans[4]; +}; + +#define P float + +#define PRELU_CHANNEL prelu_channel +#define PRELU_TYPE channel +#include "ElementwiseAddPreluKernel.inc.metal" +#undef PRELU_TYPE +#undef PRELU_CHANNEL + +#define PRELU_ELEMENT element +#define PRELU_TYPE prelu_element +#include "ElementwiseAddPreluKernel.inc.metal" +#undef PRELU_TYPE +#undef PRELU_ELEMENT + +#define PRELU_OTHER other +#define PRELU_TYPE prelu_other +#include "ElementwiseAddPreluKernel.inc.metal" +#undef PRELU_TYPE +#undef PRELU_OTHER + +#undef P + +#define P half + +#define PRELU_CHANNEL channel +#define PRELU_TYPE channel +#include "ElementwiseAddPreluKernel.inc.metal" +#undef PRELU_TYPE +#undef PRELU_CHANNEL + +#define PRELU_ELEMENT element +#define PRELU_TYPE prelu_element +#include "ElementwiseAddPreluKernel.inc.metal" +#undef PRELU_TYPE +#undef PRELU_ELEMENT + +#define PRELU_OTHER other +#define PRELU_TYPE prelu_other +#include "ElementwiseAddPreluKernel.inc.metal" +#undef PRELU_TYPE +#undef PRELU_OTHER + +#undef P + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..b7d7028d46356e0dae21b352161de31b0820ff1a --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..b7d7028d46356e0dae21b352161de31b0820ff1a
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/FetchKernel.metal
@@ -0,0 +1,71 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void fetch(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                  device float *output [[buffer(0)]],
+                  uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= inTexture.get_width() ||
+      gid.y >= inTexture.get_height() ||
+      gid.z >= inTexture.get_array_size()) {
+    return;
+  }
+
+  int input_width = inTexture.get_width();
+  int input_height = inTexture.get_height();
+  const float4 input = inTexture.read(gid.xy, gid.z);
+  // each array slice owns up to four channel planes of size width * height
+  int output_to = 4 * input_width * input_height;
+  output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
+  output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
+//  output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
+//  output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
+}
+
+kernel void fetch_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                       device float *output [[buffer(0)]],
+                       uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= inTexture.get_width() ||
+      gid.y >= inTexture.get_height() ||
+      gid.z >= inTexture.get_array_size()) {
+    return;
+  }
+
+  int input_width = inTexture.get_width();
+  int input_height = inTexture.get_height();
+  const half4 input = inTexture.read(gid.xy, gid.z);
+  int output_to = 4 * input_width * input_height;
+  output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
+  output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
+//  output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
+//  output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
+}
+
+kernel void fetch_placeholder(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                              device float *output [[buffer(0)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+}
+
+kernel void fetch_placeholder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                                   device float *output [[buffer(0)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
+}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal
new file mode 100644
index 0000000000000000000000000000000000000000..368509f001aca6361b81b9b7839cf24b2efc5c12
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal
@@ -0,0 +1,69 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+
+// placeholder kernel, does nothing
+kernel void place_holder(texture2d<float, access::read> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+}
+
+struct OutputDim {
+  ushort width;
+  ushort height;
+  ushort strideX;
+  ushort strideY;
+};
+
+kernel void resize(texture2d<half, access::read> inTexture [[texture(0)]],
+                   texture2d_array<half, access::write> outTexture [[texture(1)]],
+                   constant OutputDim &params [[buffer(0)]],
+                   uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+
+  constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint2 pos = gid.xy * uint2(params.strideX, params.strideY);
+  const half4 input = inTexture.read(pos);
+  outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z);
+}
+
+kernel void texture2d_to_2d_array(texture2d<float, access::read> inTexture [[texture(0)]],
+                                  texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                  uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= inTexture.get_width() ||
+      gid.y >= inTexture.get_height()) {
+    return;
+  }
+  const float4 input = inTexture.read(gid.xy);
+  outTexture.write(input, gid.xy, 0);
+}
+
+kernel void texture2d_to_2d_array_half(texture2d<half, access::read> inTexture [[texture(0)]],
+                                       texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                       uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= inTexture.get_width() ||
+      gid.y >= inTexture.get_height()) {
+    return;
+  }
+  const half4 input = inTexture.read(gid.xy);
+  outTexture.write(input, gid.xy, 0);
+}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Macro.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Macro.metal
new file mode 100644
index 0000000000000000000000000000000000000000..950d7d5f0555b841da57554ff61f2f5cdbcae7aa
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Macro.metal
@@ -0,0 +1,29 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
+#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
+
+#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+
+#define FUNC3_(a, b, c) CONCAT3_(a, b, c)
+
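Note for reviewers: Macro.metal stitches kernel names out of token-pasted segments joined by underscores, so FUNC3_(elementwise_add, channel, float) expands to elementwise_add_channel_float; only the PRELU_TYPE value (not PRELU_CHANNEL/ELEMENT/OTHER, which are mere #ifdef switches) lands in the name. The Swift host has to assemble the same string when it looks a function up. A hedged sketch of that lookup; the MTLLibrary/MTLDevice calls are the real Metal API, the name-building helper is illustrative:

import Metal

// Mirror of the FUNC*/CONCAT* macros: kernel names are segments joined by "_".
func kernelName(_ segments: String...) -> String {
    return segments.joined(separator: "_")
}

func makePipeline(device: MTLDevice, library: MTLLibrary) throws -> MTLComputePipelineState {
    // e.g. "elementwise_add_channel_float", matching FUNC3_(elementwise_add, channel, float)
    let name = kernelName("elementwise_add", "channel", "float")
    guard let fn = library.makeFunction(name: name) else {
        fatalError("kernel \(name) not found in the metallib")
    }
    return try device.makeComputePipelineState(function: fn)
}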
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/NMSFetchResultKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/NMSFetchResultKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..44c57440e1ec138717ad1bc569fd772e0d7ede1a
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/NMSFetchResultKernel.metal
@@ -0,0 +1,80 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void nms_fetch_result(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                             device float *output [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= inTexture.get_width() ||
+      gid.y >= inTexture.get_height() ||
+      gid.z >= inTexture.get_array_size()) {
+    return;
+  }
+
+  int input_width = inTexture.get_width();
+  const float4 input = inTexture.read(gid.xy, gid.z);
+  output[gid.y * input_width + gid.x] = input.x;
+}
+
+kernel void nms_fetch_result_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                                  device float *output [[buffer(0)]],
+                                  uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= inTexture.get_width() ||
+      gid.y >= inTexture.get_height() ||
+      gid.z >= inTexture.get_array_size()) {
+    return;
+  }
+
+  int input_width = inTexture.get_width();
+  const half4 input = inTexture.read(gid.xy, gid.z);
+  output[gid.y * input_width + gid.x] = input.x;
+}
+
+kernel void nms_fetch_bbox(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                           device float4 *output [[buffer(0)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= inTexture.get_width() ||
+      gid.y >= inTexture.get_height() ||
+      gid.z >= inTexture.get_array_size()) {
+    return;
+  }
+
+  int input_width = inTexture.get_width();
+//  int input_height = inTexture.get_height();
+  const float4 input = inTexture.read(gid.xy, gid.z);
+  output[gid.y * input_width + gid.x] = input;
+}
+
+kernel void nms_fetch_bbox_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                                device float4 *output [[buffer(0)]],
+                                uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= inTexture.get_width() ||
+      gid.y >= inTexture.get_height() ||
+      gid.z >= inTexture.get_array_size()) {
+    return;
+  }
+
+  int input_width = inTexture.get_width();
+//  int input_height = inTexture.get_height();
+  const half4 input = inTexture.read(gid.xy, gid.z);
+  output[gid.y * input_width + gid.x] = float4(input);
+}
+
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..1f2f7240db2ba716090001ed539bddb87dff5117
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal
@@ -0,0 +1,93 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+
+struct PoolParam {
+  int ksizeX;
+  int ksizeY;
+  int strideX;
+  int strideY;
+  int paddingX;
+  int paddingY;
+  int poolType;  // 0 = max pooling, 1 = average pooling
+};
+
+kernel void pool(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                 texture2d_array<float, access::write> outTexture [[texture(1)]],
+                 constant PoolParam &pm [[buffer(0)]],
+                 uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  int xmin = gid.x * pm.strideX - pm.paddingX;
+  int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
+  xmin = max(xmin, 0);
+  // the y window must use the y-axis stride/padding/kernel size
+  int ymin = gid.y * pm.strideY - pm.paddingY;
+  int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height()));
+  ymin = max(ymin, 0);
+
+  float4 r = 0;
+  if (pm.poolType == 0) {
+    r = inTexture.read(uint2(xmin, ymin), gid.z);
+    for (int x = xmin; x < xmax; x++) {
+      for (int y = ymin; y < ymax; y++) {
+        r = fmax(r, inTexture.read(uint2(x, y), gid.z));
+      }
+    }
+  } else if (pm.poolType == 1) {
+    for (int x = xmin; x < xmax; x++) {
+      for (int y = ymin; y < ymax; y++) {
+        r += inTexture.read(uint2(x, y), gid.z);
+      }
+    }
+    r /= pm.ksizeX * pm.ksizeY;
+  }
+  outTexture.write(r, gid.xy, gid.z);
+}
+
+kernel void pool_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                      texture2d_array<half, access::write> outTexture [[texture(1)]],
+                      constant PoolParam &pm [[buffer(0)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  int xmin = gid.x * pm.strideX - pm.paddingX;
+  int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
+  xmin = max(xmin, 0);
+  int ymin = gid.y * pm.strideY - pm.paddingY;
+  int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height()));
+  ymin = max(ymin, 0);
+
+  half4 r = 0;
+  if (pm.poolType == 0) {
+    r = inTexture.read(uint2(xmin, ymin), gid.z);
+    for (int x = xmin; x < xmax; x++) {
+      for (int y = ymin; y < ymax; y++) {
+        r = fmax(r, inTexture.read(uint2(x, y), gid.z));
+      }
+    }
+  } else if (pm.poolType == 1) {
+    for (int x = xmin; x < xmax; x++) {
+      for (int y = ymin; y < ymax; y++) {
+        r += inTexture.read(uint2(x, y), gid.z);
+      }
+    }
+    r /= pm.ksizeX * pm.ksizeY;
+  }
+  outTexture.write(r, gid.xy, gid.z);
+}
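Note for reviewers: a minimal Swift reference of the same pooling rule, useful for diffing the kernel's output on the CPU (names are illustrative, not part of this PR; like the kernel, average pooling divides by the full window size even at the border):

// Reference pooling over one channel plane, same window arithmetic as the kernel.
func pool(_ input: [[Float]], ksize: (x: Int, y: Int),
          stride: (x: Int, y: Int), padding: (x: Int, y: Int),
          poolType: Int) -> [[Float]] {
    let h = input.count, w = input[0].count
    let outH = (h + 2 * padding.y - ksize.y) / stride.y + 1
    let outW = (w + 2 * padding.x - ksize.x) / stride.x + 1
    var out = [[Float]](repeating: [Float](repeating: 0, count: outW), count: outH)
    for oy in 0..<outH {
        for ox in 0..<outW {
            let xmin = max(ox * stride.x - padding.x, 0)
            let ymin = max(oy * stride.y - padding.y, 0)
            let xmax = min(ox * stride.x - padding.x + ksize.x, w)
            let ymax = min(oy * stride.y - padding.y + ksize.y, h)
            var acc: Float = (poolType == 0) ? -Float.greatestFiniteMagnitude : 0
            for y in ymin..<ymax {
                for x in xmin..<xmax {
                    acc = (poolType == 0) ? max(acc, input[y][x]) : acc + input[y][x]
                }
            }
            out[oy][ox] = (poolType == 0) ? acc : acc / Float(ksize.x * ksize.y)
        }
    }
    return out
}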
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..597804137743dd253d05d91a5008f558dcaf42e7
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
@@ -0,0 +1,151 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void prelu_channel(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                          texture2d_array<float, access::write> outTexture [[texture(1)]],
+                          const device float4 *alpha [[buffer(0)]],
+                          uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+  float4 alpha_value = alpha[gid.z];
+  float4 output;
+  output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
+  output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
+  output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
+  output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void prelu_element(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                          texture2d_array<float, access::write> outTexture [[texture(1)]],
+                          const device float4 *alpha [[buffer(0)]],
+                          uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+
+  int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
+  float4 alpha_value = alpha[alpha_to + gid.z];
+
+  float4 output;
+  output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
+  output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
+  output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
+  output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void prelu_other(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                        texture2d_array<float, access::write> outTexture [[texture(1)]],
+                        const device float *alpha [[buffer(0)]],
+                        uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+  float alpha_value = alpha[0];
+  float4 output;
+  output.x = input.x > 0 ? input.x : (alpha_value * input.x);
+  output.y = input.y > 0 ? input.y : (alpha_value * input.y);
+  output.z = input.z > 0 ? input.z : (alpha_value * input.z);
+  output.w = input.w > 0 ? input.w : (alpha_value * input.w);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void prelu_channel_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                               texture2d_array<half, access::write> outTexture [[texture(1)]],
+                               const device half4 *alpha [[buffer(0)]],
+                               uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+  half4 alpha_value = alpha[gid.z];
+  half4 output;
+  output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
+  output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
+  output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
+  output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void prelu_element_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                               texture2d_array<half, access::write> outTexture [[texture(1)]],
+                               const device half4 *alpha [[buffer(0)]],
+                               uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+
+  int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
+  half4 alpha_value = alpha[alpha_to + gid.z];
+
+  half4 output;
+  output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
+  output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
+  output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
+  output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void prelu_other_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                             texture2d_array<half, access::write> outTexture [[texture(1)]],
+                             const device half *alpha [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
+  half alpha_value = alpha[0];
+  half4 output;
+  output.x = input.x > 0 ? input.x : (alpha_value * input.x);
+  output.y = input.y > 0 ? input.y : (alpha_value * input.y);
+  output.z = input.z > 0 ? input.z : (alpha_value * input.z);
+  output.w = input.w > 0 ? input.w : (alpha_value * input.w);
+  outTexture.write(output, gid.xy, gid.z);
+}
+
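Note for reviewers: all six variants above are the same activation, out = x > 0 ? x : alpha * x; they differ only in how alpha is indexed (one value per channel, one per element, or a single shared scalar). A tiny Swift sketch of that indexing contract, assuming a channels-last flattening; names are illustrative:

enum PreluMode { case channel, element, other }

// alpha layout: .channel -> one value per channel, .element -> one per element,
// .other -> a single shared scalar (alpha[0]).
func prelu(_ x: [Float], alpha: [Float], mode: PreluMode, channels: Int) -> [Float] {
    return x.enumerated().map { i, v in
        let a: Float
        switch mode {
        case .channel: a = alpha[i % channels]
        case .element: a = alpha[i]
        case .other:   a = alpha[0]
        }
        return v > 0 ? v : a * v
    }
}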
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..7630febf77210bb364f0191e8b10a5a6923d6c95
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal
@@ -0,0 +1,367 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+struct PriorBoxMetalParam {
+  float offset;
+  float stepWidth;
+  float stepHeight;
+  float minSize;
+  float maxSize;
+  float imageWidth;
+  float imageHeight;
+
+  bool clip;
+
+  uint numPriors;
+  uint aspecRatiosSize;
+  uint minSizeSize;
+  uint maxSizeSize;
+};
+
+kernel void prior_box(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                      texture2d_array<float, access::write> outBoxTexture [[texture(1)]],
+                      texture2d_array<float, access::write> varianceTexture [[texture(2)]],
+                      const device float *aspect_ratios [[buffer(0)]],
+                      constant PriorBoxMetalParam &param [[buffer(1)]],
+                      const device float4 *variances [[buffer(2)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outBoxTexture.get_width() ||
+      gid.y >= outBoxTexture.get_height() ||
+      gid.z >= outBoxTexture.get_array_size()) return;
+
+  float center_x = (gid.x + param.offset) * param.stepWidth;
+  float center_y = (gid.y + param.offset) * param.stepHeight;
+
+  float box_width, box_height;
+
+  if (gid.z < param.aspecRatiosSize) {
+    float ar = aspect_ratios[gid.z];
+    box_width = param.minSize * sqrt(ar) / 2;
+    box_height = param.minSize / sqrt(ar) / 2;
+    float4 box;
+    box.x = (center_x - box_width) / param.imageWidth;
+    box.y = (center_y - box_height) / param.imageHeight;
+    box.z = (center_x + box_width) / param.imageWidth;
+    box.w = (center_y + box_height) / param.imageHeight;
+
+    float4 res;
+    if (param.clip) {
+      res = fmin(fmax(box, 0.0), 1.0);
+    } else {
+      res = box;
+    }
+
+    outBoxTexture.write(res, gid.xy, gid.z);
+  } else if (gid.z >= param.aspecRatiosSize) {
+    if (param.maxSizeSize > 0) {
+      box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+      float4 max_box;
+      max_box.x = (center_x - box_width) / param.imageWidth;
+      max_box.y = (center_y - box_height) / param.imageHeight;
+      max_box.z = (center_x + box_width) / param.imageWidth;
+      max_box.w = (center_y + box_height) / param.imageHeight;
+
+      float4 res;
+      if (param.clip) {
+        res = min(max(max_box, 0.0), 1.0);
+      } else {
+        res = max_box;
+      }
+      // write the clipped result, not the raw max_box
+      outBoxTexture.write(res, gid.xy, gid.z);
+    }
+  }
+
+  float4 variance = variances[0];
+  if (gid.z < param.numPriors) {
+    float4 variances_output;
+    variances_output.x = variance.x;
+    variances_output.y = variance.y;
+    variances_output.z = variance.z;
+    variances_output.w = variance.w;
+    varianceTexture.write(variances_output, gid.xy, gid.z);
+  }
+}
+
+kernel void prior_box_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                           texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
+                           texture2d_array<half, access::write> varianceTexture [[texture(2)]],
+                           const device half *aspect_ratios [[buffer(0)]],
+                           constant PriorBoxMetalParam &param [[buffer(1)]],
+                           const device float4 *variances [[buffer(2)]],
+                           uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outBoxTexture.get_width() ||
+      gid.y >= outBoxTexture.get_height() ||
+      gid.z >= outBoxTexture.get_array_size()) return;
+
+  float center_x = (gid.x + param.offset) * param.stepWidth;
+  float center_y = (gid.y + param.offset) * param.stepHeight;
+
+  float box_width, box_height;
+
+  if (gid.z < param.aspecRatiosSize) {
+    half ar = aspect_ratios[gid.z];
+    box_width = param.minSize * sqrt(ar) / 2;
+    box_height = param.minSize / sqrt(ar) / 2;
+    float4 box;
+    box.x = (center_x - box_width) / param.imageWidth;
+    box.y = (center_y - box_height) / param.imageHeight;
+    box.z = (center_x + box_width) / param.imageWidth;
+    box.w = (center_y + box_height) / param.imageHeight;
+
+    float4 res;
+    if (param.clip) {
+      res = fmin(fmax(box, 0.0), 1.0);
+    } else {
+      res = box;
+    }
+
+    outBoxTexture.write(half4(res), gid.xy, gid.z);
+  } else if (gid.z >= param.aspecRatiosSize) {
+    if (param.maxSizeSize > 0) {
+      box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+      float4 max_box;
+      max_box.x = (center_x - box_width) / param.imageWidth;
+      max_box.y = (center_y - box_height) / param.imageHeight;
+      max_box.z = (center_x + box_width) / param.imageWidth;
+      max_box.w = (center_y + box_height) / param.imageHeight;
+
+      float4 res;
+      if (param.clip) {
+        res = min(max(max_box, 0.0), 1.0);
+      } else {
+        res = max_box;
+      }
+      // write the clipped result, not the raw max_box
+      outBoxTexture.write(half4(res), gid.xy, gid.z);
+    }
+  }
+
+  float4 variance = variances[0];
+  if (gid.z < param.numPriors) {
+    float4 variances_output;
+    variances_output.x = variance.x;
+    variances_output.y = variance.y;
+    variances_output.z = variance.z;
+    variances_output.w = variance.w;
+    varianceTexture.write(half4(variances_output), gid.xy, gid.z);
+  }
+}
+
+kernel void prior_box_MinMaxAspectRatiosOrder(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                                              texture2d_array<float, access::write> outBoxTexture [[texture(1)]],
+                                              texture2d_array<float, access::write> varianceTexture [[texture(2)]],
+                                              const device float *aspect_ratios [[buffer(0)]],
+                                              constant PriorBoxMetalParam &param [[buffer(1)]],
+                                              const device float4 *variances [[buffer(2)]],
+                                              uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outBoxTexture.get_width() ||
+      gid.y >= outBoxTexture.get_height() ||
+      gid.z >= outBoxTexture.get_array_size()) return;
+
+  float center_x = (gid.x + param.offset) * param.stepWidth;
+  float center_y = (gid.y + param.offset) * param.stepHeight;
+
+  float box_width, box_height;
+
+  if (gid.z == 0) {
+    box_width = box_height = param.minSize / 2;
+
+    float4 box;
+    box.x = (center_x - box_width) / param.imageWidth;
+    box.y = (center_y - box_height) / param.imageHeight;
+    box.z = (center_x + box_width) / param.imageWidth;
+    box.w = (center_y + box_height) / param.imageHeight;
+
+    float4 res;
+    if (param.clip) {
+      res = fmin(fmax(box, 0.0), 1.0);
+    } else {
+      res = box;
+    }
+
+    outBoxTexture.write(res, gid.xy, gid.z);
+  }
+
+  if (gid.z == 1 && param.maxSizeSize > 0) {
+    box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+    float4 max_box;
+    max_box.x = (center_x - box_width) / param.imageWidth;
+    max_box.y = (center_y - box_height) / param.imageHeight;
+    max_box.z = (center_x + box_width) / param.imageWidth;
+    max_box.w = (center_y + box_height) / param.imageHeight;
+
+    float4 res;
+    if (param.clip) {
+      res = min(max(max_box, 0.0), 1.0);
+    } else {
+      res = max_box;
+    }
+    outBoxTexture.write(res, gid.xy, gid.z);
+  }
+
+  int aspect_to = 0;
+  if (param.maxSizeSize > 0) {
+    aspect_to = gid.z - 2;
+  } else {
+    aspect_to = gid.z - 1;
+  }
+
+  if (aspect_to >= 0 && aspect_to < int(param.aspecRatiosSize)) {
+    // skip ratios equal to 1; the min box at gid.z == 0 already covers them
+    int skip = 0;
+    for (int i = 0; i < aspect_to + 1; ++i) {
+      if (fabs(aspect_ratios[i] - 1.) < 1e-6) {
+        skip += 1;
+      }
+    }
+    aspect_to += skip;
+
+    float ar = aspect_ratios[aspect_to];
+
+    box_width = param.minSize * sqrt(ar) / 2;
+    box_height = param.minSize / sqrt(ar) / 2;
+    float4 box;
+    box.x = (center_x - box_width) / param.imageWidth;
+    box.y = (center_y - box_height) / param.imageHeight;
+    box.z = (center_x + box_width) / param.imageWidth;
+    box.w = (center_y + box_height) / param.imageHeight;
+
+    float4 res;
+    if (param.clip) {
+      res = fmin(fmax(box, 0.0), 1.0);
+    } else {
+      res = box;
+    }
+
+    outBoxTexture.write(res, gid.xy, gid.z);
+  }
+
+  float4 variance = variances[0];
+  if (gid.z < param.numPriors) {
+    float4 variances_output;
+    variances_output.x = variance.x;
+    variances_output.y = variance.y;
+    variances_output.z = variance.z;
+    variances_output.w = variance.w;
+    varianceTexture.write(variances_output, gid.xy, gid.z);
+  }
+}
+
+kernel void prior_box_MinMaxAspectRatiosOrder_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                                                   texture2d_array<half, access::write> outBoxTexture [[texture(1)]],
+                                                   texture2d_array<half, access::write> varianceTexture [[texture(2)]],
+                                                   const device half *aspect_ratios [[buffer(0)]],
+                                                   constant PriorBoxMetalParam &param [[buffer(1)]],
+                                                   const device float4 *variances [[buffer(2)]],
+                                                   uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outBoxTexture.get_width() ||
+      gid.y >= outBoxTexture.get_height() ||
+      gid.z >= outBoxTexture.get_array_size()) return;
+
+  float center_x = (gid.x + param.offset) * param.stepWidth;
+  float center_y = (gid.y + param.offset) * param.stepHeight;
+
+  float box_width, box_height;
+
+  if (gid.z == 0) {
+    box_width = box_height = param.minSize / 2;
+
+    float4 box;
+    box.x = (center_x - box_width) / param.imageWidth;
+    box.y = (center_y - box_height) / param.imageHeight;
+    box.z = (center_x + box_width) / param.imageWidth;
+    box.w = (center_y + box_height) / param.imageHeight;
+
+    float4 res;
+    if (param.clip) {
+      res = fmin(fmax(box, 0.0), 1.0);
+    } else {
+      res = box;
+    }
+
+    outBoxTexture.write(half4(res), gid.xy, gid.z);
+  }
+
+  if (gid.z == 1 && param.maxSizeSize > 0) {
+    box_width = box_height = sqrt(param.minSize * param.maxSize) / 2;
+    float4 max_box;
+    max_box.x = (center_x - box_width) / param.imageWidth;
+    max_box.y = (center_y - box_height) / param.imageHeight;
+    max_box.z = (center_x + box_width) / param.imageWidth;
+    max_box.w = (center_y + box_height) / param.imageHeight;
+
+    float4 res;
+    if (param.clip) {
+      res = min(max(max_box, 0.0), 1.0);
+    } else {
+      res = max_box;
+    }
+    outBoxTexture.write(half4(res), gid.xy, gid.z);
+  }
+
+  int aspect_to = 0;
+  if (param.maxSizeSize > 0) {
+    aspect_to = gid.z - 2;
+  } else {
+    aspect_to = gid.z - 1;
+  }
+
+  if (aspect_to > 0 && aspect_to < int(param.aspecRatiosSize) && fabs(aspect_ratios[aspect_to] - 1.) > 1e-6) {
+    float ar = aspect_ratios[aspect_to];
+
+    box_width = param.minSize * sqrt(ar) / 2;
+    box_height = param.minSize / sqrt(ar) / 2;
+    float4 box;
+    box.x = (center_x - box_width) / param.imageWidth;
+    box.y = (center_y - box_height) / param.imageHeight;
+    box.z = (center_x + box_width) / param.imageWidth;
+    box.w = (center_y + box_height) / param.imageHeight;
+
+    float4 res;
+    if (param.clip) {
+      res = fmin(fmax(box, 0.0), 1.0);
+    } else {
+      res = box;
+    }
+
+    outBoxTexture.write(half4(res), gid.xy, gid.z);
+  }
+
+  float4 variance = variances[0];
+  if (gid.z < param.numPriors) {
+    float4 variances_output;
+    variances_output.x = variance.x;
+    variances_output.y = variance.y;
+    variances_output.z = variance.z;
+    variances_output.w = variance.w;
+    varianceTexture.write(half4(variances_output), gid.xy, gid.z);
+  }
+}
+
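Note for reviewers: every branch of the prior-box kernels emits the same box shape, centered at ((x + offset) * stepWidth, (y + offset) * stepHeight) with a half-extent derived from min/max size and aspect ratio, normalized by the image size. A compact Swift reference of that arithmetic (illustrative names, not part of this PR):

// One prior box in normalized [0,1] coordinates, matching the kernels' arithmetic.
// Aspect-ratio boxes use boxW = minSize * sqrt(ar) / 2, boxH = minSize / sqrt(ar) / 2;
// the max-size box uses boxW = boxH = sqrt(minSize * maxSize) / 2.
func priorBox(x: Int, y: Int, offset: Float, stepW: Float, stepH: Float,
              boxW: Float, boxH: Float, imageW: Float, imageH: Float,
              clip: Bool) -> (xmin: Float, ymin: Float, xmax: Float, ymax: Float) {
    let cx = (Float(x) + offset) * stepW
    let cy = (Float(y) + offset) * stepH
    var box = ((cx - boxW) / imageW, (cy - boxH) / imageH,
               (cx + boxW) / imageW, (cy + boxH) / imageH)
    if clip {
        box = (min(max(box.0, 0), 1), min(max(box.1, 0), 1),
               min(max(box.2, 0), 1), min(max(box.3, 0), 1))
    }
    return (box.0, box.1, box.2, box.3)
}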
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReluKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..e725440bbe997d571f1860bce323516144a94da8
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReluKernel.metal
@@ -0,0 +1,41 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void relu_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                      texture2d_array<half, access::write> outTexture [[texture(1)]],
+                      uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  const half4 input = inTexture.read(gid.xy, gid.z);
+  const float4 relu = fmax((float4)input, 0.0);
+  outTexture.write(half4(relu), gid.xy, gid.z);
+}
+
+kernel void relu(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                 texture2d_array<float, access::write> outTexture [[texture(1)]],
+                 uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+  const float4 input = inTexture.read(gid.xy, gid.z);
+  const float4 relu = fmax(input, 0.0);
+  outTexture.write(relu, gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.inc.metal
new file mode 100644
index 0000000000000000000000000000000000000000..7583537c2b404b7a95eeedfb4c69793a608f18ac
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.inc.metal
@@ -0,0 +1,66 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#ifdef P
+
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
+
+#define FUNC(f, r1, r2, p) CONCAT4_(f, r1, r2, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+#define FUNC_R(f, r) CONCAT2_(f, r)
+
+kernel void FUNC(reshape, RIN, ROUT, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
+                                        texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                        constant ReshapeParam &rp [[buffer(0)]],
+                                        uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+
+  int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4];
+  ReshapeParam lrp = rp;
+  int oC = lrp.odim[lrp.otrans[3]];
+  int iC = lrp.idim[lrp.itrans[3]];
+  int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3];
+  VECTOR(P, 4) r;
+  for (int n = 0; n < 4; n++) {
+    oxyzn[3] = n;
+#if ROUT == 4
+    xyzn2abcd_4(oC, oxyzn, oabcd);
+#else
+    FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd);
+#endif
+    int tabcd[4];
+    invtrans(lrp.otrans, oabcd, tabcd);
+    int index = abcd2index(lrp.odim, tabcd);
+    if (index < count) {
+      index2abcd(lrp.idim, index, tabcd);
+      trans(lrp.itrans, tabcd, iabcd);
+#if RIN == 4
+      abcd2xyzn_4(iC, iabcd, ixyzn);
+#else
+      FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn);
+#endif
+      r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
+    } else {
+      r[n] = 0;
+    }
+  }
+  outTexture.write(r, gid.xy, gid.z);
+}
+
+#endif
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..d2f5815d422ec8c4f3e1e3c1992855547e002264
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal
@@ -0,0 +1,150 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+
+using namespace metal;
+
+struct ReshapeParam {
+  int32_t idim[4];
+  int32_t itrans[4];
+  int32_t odim[4];
+  int32_t otrans[4];
+};
+
+#define P float
+#define RIN 4
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+
+#define RIN 3
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+
+#define RIN 2
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+
+#define RIN 1
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+
+#undef P
+
+#define P half
+#define RIN 4
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+
+#define RIN 3
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+
+#define RIN 2
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+
+#define RIN 1
+#define ROUT 4
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 3
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 2
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#define ROUT 1
+#include "ReshapeKernel.inc.metal"
+#undef ROUT
+#undef RIN
+#undef P
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ResizeBilinear.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ResizeBilinear.metal
new file mode 100644
index 0000000000000000000000000000000000000000..fbb4e12cb82c12f8dc5b94c397e43b8c8c5ae518
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ResizeBilinear.metal
@@ -0,0 +1,75 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+struct resize_bilinear_param {
+//  int32_t out_h;
+//  int32_t out_w;
+  float ratio_h;
+  float ratio_w;
+};
+
+kernel void resize_bilinear(texture2d_array<float, access::read> input [[texture(0)]],
+                            texture2d_array<float, access::write> output [[texture(2)]],
+                            constant resize_bilinear_param &pm [[buffer(0)]],
+                            uint3 gid [[thread_position_in_grid]]) {
+  float4 r;
+  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
+    r = input.read(gid.xy, gid.z);
+  } else {
+    float w = gid.x * pm.ratio_w;
+    float h = gid.y * pm.ratio_h;
+    uint w0 = w, h0 = h;
+    uint w1 = w0 + 1, h1 = h0 + 1;
+    float w1lambda = w - w0, h1lambda = h - h0;
+    float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
+    if (w1 >= input.get_width()) w1 = w0;
+    if (h1 >= input.get_height()) h1 = h0;
+    float4 r0 = input.read(uint2(w0, h0), gid.z);
+    float4 r1 = input.read(uint2(w1, h0), gid.z);
+    float4 r2 = input.read(uint2(w0, h1), gid.z);
+    float4 r3 = input.read(uint2(w1, h1), gid.z);
+    r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
+  }
+  output.write(r, gid.xy, gid.z);
+}
+
+kernel void resize_bilinear_half(texture2d_array<half, access::read> input [[texture(0)]],
+                                 texture2d_array<half, access::write> output [[texture(2)]],
+                                 constant resize_bilinear_param &pm [[buffer(0)]],
+                                 uint3 gid [[thread_position_in_grid]]) {
+  half4 r;
+  if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) {
+    r = input.read(gid.xy, gid.z);
+  } else {
+    half w = gid.x * pm.ratio_w;
+    half h = gid.y * pm.ratio_h;
+    uint w0 = w, h0 = h;
+    uint w1 = w0 + 1, h1 = h0 + 1;
+    half w1lambda = w - w0, h1lambda = h - h0;
+    half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda;
+    if (w1 >= input.get_width()) w1 = w0;
+    if (h1 >= input.get_height()) h1 = h0;
+    half4 r0 = input.read(uint2(w0, h0), gid.z);
+    half4 r1 = input.read(uint2(w1, h0), gid.z);
+    half4 r2 = input.read(uint2(w0, h1), gid.z);
+    half4 r3 = input.read(uint2(w1, h1), gid.z);
+    r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3);
+  }
+  output.write(r, gid.xy, gid.z);
+}
diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.h b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Shape.metal
similarity index 85%
rename from ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.h
rename to metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Shape.metal
index 092a369366b4f28e88eaac6032b60f18a332ba84..b50d5547193ccc9a1bef1b3ed6bbd1b7a64c3527 100644
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/ViewController.h
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Shape.metal
@@ -12,10 +12,10 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#import <UIKit/UIKit.h>
-
-@interface ViewController : UIViewController
-
-
-@end
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void shape() {
+}
+
+kernel void shape_half() {
+}
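Note for reviewers: the two kernels above are textbook bilinear sampling; each output texel interpolates the four source neighbors around (x * ratio_w, y * ratio_h), clamping the +1 neighbor at the border. A Swift reference of the weight math (illustrative, single channel):

// Bilinear sample of a single-channel image at fractional source coordinates,
// clamping the +1 neighbors at the border exactly like the kernel does.
func bilinear(_ img: [[Float]], x: Float, y: Float) -> Float {
    let h = img.count, w = img[0].count
    let x0 = Int(x), y0 = Int(y)
    let x1 = min(x0 + 1, w - 1), y1 = min(y0 + 1, h - 1)
    let lx = x - Float(x0), ly = y - Float(y0)
    return (1 - ly) * ((1 - lx) * img[y0][x0] + lx * img[y0][x1])
         + ly * ((1 - lx) * img[y1][x0] + lx * img[y1][x1])
}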
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.inc.metal
new file mode 100644
index 0000000000000000000000000000000000000000..455cf1471b5c369fc27040e03b57812e8d6bf0e8
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.inc.metal
@@ -0,0 +1,61 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#ifdef P
+
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+
+#define FUNC(f, p) CONCAT2_(f, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+
+kernel void FUNC(softmax, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
+                             texture2d_array<P, access::write> outTexture [[texture(1)]],
+                             constant SoftmaxParam &sp [[buffer(0)]],
+                             uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) return;
+//  int zsize = inTexture.get_array_size();
+  P maxv = inTexture.read(uint2(0, gid.y), 0)[0];
+  int group = sp.K / 4;
+  int remain = sp.K % 4;
+  for (int x = 0; x < group; x++) {
+    VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0);
+    maxv = max(maxv, max(r[0], max(r[1], max(r[2], r[3]))));
+  }
+  if (remain > 0) {
+    VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0);
+    for (int i = 0; i < remain; i++) {
+      maxv = max(maxv, r[i]);
+    }
+  }
+  VECTOR(P, 4) rsum = {0, 0, 0, 0};
+  for (int x = 0; x < group; x++) {
+    VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0);
+    rsum += exp(r - maxv);
+  }
+  P sum = rsum[0] + rsum[1] + rsum[2] + rsum[3];
+  if (remain > 0) {
+    VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0);
+    for (int i = 0; i < remain; i++) {
+      sum += exp(r[i] - maxv);
+    }
+  }
+  VECTOR(P, 4) rr = inTexture.read(gid.xy, gid.z);
+  rr = exp(rr - maxv) / sum;
+  outTexture.write(rr, gid.xy, gid.z);
+}
+
+#endif
diff --git a/ios/PaddleMobileDemo/PaddleMobileDemo/main.m b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal
similarity index 74%
rename from ios/PaddleMobileDemo/PaddleMobileDemo/main.m
rename to metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal
index 8429e87bd1fba3a2e7070db56b458f2656b9bfa6..67c279a4441095e710985c65d85aac589b7d0f54 100644
--- a/ios/PaddleMobileDemo/PaddleMobileDemo/main.m
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal
@@ -12,11 +12,18 @@
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#import <UIKit/UIKit.h>
-#import "AppDelegate.h"
+#include <metal_stdlib>
+using namespace metal;
 
-int main(int argc, char * argv[]) {
-  @autoreleasepool {
-    return UIApplicationMain(argc, argv, nil, NSStringFromClass([AppDelegate class]));
-  }
-}
+struct SoftmaxParam {
+  int N;
+  int K;
+};
+
+#define P float
+#include "Softmax.inc.metal"
+#undef P
+
+#define P half
+#include "Softmax.inc.metal"
+#undef P
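Note for reviewers: the softmax kernel subtracts the row maximum before exponentiating, the standard overflow-safe formulation; this matters doubly for the half instantiation, where exp(x) overflows half's ~65504 range once x exceeds roughly 11. A Swift reference for checking one row on the CPU:

import Foundation

// Numerically stable softmax: exp(x - max) / sum(exp(x - max)).
func softmax(_ row: [Float]) -> [Float] {
    let maxv = row.max() ?? 0
    let exps = row.map { exp($0 - maxv) }
    let sum = exps.reduce(0, +)
    return exps.map { $0 / sum }
}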
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.inc.metal
new file mode 100644
index 0000000000000000000000000000000000000000..54e3f21e793a9c1474f13fed61857211cb7d117f
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.inc.metal
@@ -0,0 +1,122 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#ifdef P
+
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d
+#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e
+
+#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+#define FUNC_R(f, r) CONCAT2_(f, r)
+
+#if V == VX
+#define VV x
+#elif V == VY
+#define VV y
+#elif V == VZ
+#define VV z
+#else
+#define VV normal
+#endif
+
+#if V == VY
+kernel void FUNC(split, R, N, VV, P)(texture2d_array<P, access::read> input [[texture(0)]],
+                                     texture2d_array<P, access::write> out1 [[texture(1)]],
+                                     texture2d_array<P, access::write> out2 [[texture(2)]],
+#if N >= 3
+                                     texture2d_array<P, access::write> out3 [[texture(3)]],
+#endif // N >= 3
+#if N >= 4
+                                     texture2d_array<P, access::write> out4 [[texture(4)]],
+#endif // N >= 4
+                                     constant SplitParam &sp [[buffer(0)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+  VECTOR(P, 4) r = input.read(gid.xy, gid.z);
+  int y = gid.y - sp.offset;
+  if (y < sp.vdim[0]) {
+    out1.write(r, gid.xy, gid.z);
+    return;
+  }
+  y -= sp.vdim[0];
+  if (y < sp.vdim[1]) {
+    out2.write(r, uint2(gid.x, y), gid.z);
+    return;
+  }
+#if N >= 3
+  y -= sp.vdim[1];
+  if (y < sp.vdim[2]) {
+    out3.write(r, uint2(gid.x, y), gid.z);
+    return;
+  }
+#endif // N >= 3
+#if N >= 4
+  y -= sp.vdim[2];
+  if (y < sp.vdim[3]) {
+    out4.write(r, uint2(gid.x, y), gid.z);
+    return;
+  }
+#endif // N >= 4
+}
+#endif // V == VY
+
+#if V == VX
+kernel void FUNC(split, R, N, VV, P)(texture2d_array<P, access::read> input [[texture(0)]],
+                                     texture2d_array<P, access::write> out1 [[texture(1)]],
+                                     texture2d_array<P, access::write> out2 [[texture(2)]],
+#if N >= 3
+                                     texture2d_array<P, access::write> out3 [[texture(3)]],
+#endif // N >= 3
+#if N >= 4
+                                     texture2d_array<P, access::write> out4 [[texture(4)]],
+#endif // N >= 4
+                                     constant SplitParam &sp [[buffer(0)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+  VECTOR(P, 4) r = input.read(gid.xy, gid.z);
+  int x = gid.x;
+  if (x < sp.vdim[0]) {
+    out1.write(r, gid.xy, gid.z);
+    return;
+  }
+  x -= sp.vdim[0];
+  if (x < sp.vdim[1]) {
+    out2.write(r, uint2(x, gid.y), gid.z);
+    return;
+  }
+#if N >= 3
+  x -= sp.vdim[1];
+  if (x < sp.vdim[2]) {
+    out3.write(r, uint2(x, gid.y), gid.z);
+    return;
+  }
+#endif // N >= 3
+#if N >= 4
+  x -= sp.vdim[2];
+  if (x < sp.vdim[3]) {
+    out4.write(r, uint2(x, gid.y), gid.z);
+    return;
+  }
+#endif // N >= 4
+}
+#endif // V == VX
+
+#undef VV
+#endif
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.metal
new file mode 100644
index 0000000000000000000000000000000000000000..4c1e818d2bf5c7266169f406fbfaf8e322685dc4
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Split.metal
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+
+using namespace metal;
+
+struct SplitParam {
+  int32_t idim[4];
+  int32_t axis;
+  int32_t offset;
+  int32_t trans[4];
+  int32_t vdim[4];
+};
+
+#define VNORMAL 1
+#define VX 2
+#define VY 3
+#define VZ 4
+
+// only support split_{2, 3, 4}_{2, 3, 4}_y_{float, half}
+// only support split_{3, 4}_{2, 3, 4}_x_{float, half}
+
+//// ssd-ar: (R=3, N=2, V=y)
+#define V VY
+  #define R 3
+  #define N 2
+  #define P float
+  #include "Split.inc.metal"
+  #undef P
+  #define P half
+  #include "Split.inc.metal"
+  #undef P
+  #undef N
+  #undef R
+#undef V
+
+//// ssd-ar: (R=2, N=2, V=y)
+#define V VY
+  #define R 2
+  #define N 2
+  #define P float
+  #include "Split.inc.metal"
+  #undef P
+  #define P half
+  #include "Split.inc.metal"
+  #undef P
+  #undef N
+  #undef R
+#undef V
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.inc.metal
new file mode 100644
index 0000000000000000000000000000000000000000..534166e45fc3db49cc5de526ec0d5179ca3f9899
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.inc.metal
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#ifdef P
+
+#define CONCAT2(a, b) a ## b
+#define CONCAT2_(a, b) a ## _ ## b
+#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c
+
+#define FUNC(f, r, p) CONCAT3_(f, r, p)
+#define VECTOR(p, n) CONCAT2(p, n)
+
+kernel void FUNC(transpose, R, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
+                                  texture2d_array<P, access::write> outTexture [[texture(1)]],
+                                  constant TransposeParam &pm [[buffer(0)]],
+                                  uint3 gid [[thread_position_in_grid]]) {
+  VECTOR(P, 4) r;
+  int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0};
+  int iabcd[4], oabcd[4], ixyzn[4];
+  for (int n = 0; n < 4; n++) {
+    oxyzn[3] = n;
+    // unravel the output texel into output-space coordinates oabcd
+#if R == 4
+    xyzn2abcd_4(pm.oC, oxyzn, oabcd);
+#endif // R == 4
+#if R == 3
+    xyzn2abcd_3(oxyzn, oabcd);
+#endif // R == 3
+#if R == 2
+    xyzn2abcd_2(oxyzn, oabcd);
+#endif // R == 2
+    // permute into input-space coordinates via the axis map
+    iabcd[pm.axis[0]] = oabcd[0];
+    iabcd[pm.axis[1]] = oabcd[1];
+    iabcd[pm.axis[2]] = oabcd[2];
+    iabcd[pm.axis[3]] = oabcd[3];
+#if R == 4
+    abcd2xyzn_4(pm.iC, iabcd, ixyzn);
+#endif // R == 4
+#if R == 3
+    abcd2xyzn_3(iabcd, ixyzn);
+#endif // R == 3
+#if R == 2
+    abcd2xyzn_2(iabcd, ixyzn);
+#endif // R == 2
+    r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]];
+  }
+  outTexture.write(r, gid.xy, gid.z);
+}
+
+#endif
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal
new file mode 100644
index 0000000000000000000000000000000000000000..321663b9b7f09eba2041cb0932215d291e44aba6
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal
@@ -0,0 +1,63 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+
+struct TransposeParam {
+  int iC;
+  int oC;
+  int axis[4];
+};
+
+kernel void transpose_copy_float(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                                 texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                 constant TransposeParam &pm [[buffer(0)]],
+                                 uint3 gid [[thread_position_in_grid]]) {
+  outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);
+}
+
+kernel void transpose_copy_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
+                                texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                constant TransposeParam &pm [[buffer(0)]],
+                                uint3 gid [[thread_position_in_grid]]) {
+  outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z);
+}
+
+#define R 4
+  #define P float
+  #include "TransposeKernel.inc.metal"
+  #undef P
+  #define P half
+  #include "TransposeKernel.inc.metal"
+  #undef P
+#undef R
+
+#define R 3
+  #define P float
+  #include "TransposeKernel.inc.metal"
+  #undef P
+  #define P half
+  #include "TransposeKernel.inc.metal"
+  #undef P
+#undef R
+
+#define R 2
+  #define P float
+  #include "TransposeKernel.inc.metal"
+  #undef P
+  #define P half
+  #include "TransposeKernel.inc.metal"
+  #undef P
#undef R
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift
new file mode 100644
index 0000000000000000000000000000000000000000..fc1b3164c9cf623a1bc4d350cc8a5f72c369bae4
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift
@@ -0,0 +1,69 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+import Foundation
+
+class MulticlassNMSParam<P: PrecisionType>: OpParam {
+  typealias ParamPrecisionType = P
+  required init(opDesc: OpDesc, inScope: Scope) throws {
+    do {
+      scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope)
+      bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope)
+      output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope)
+
+      middleOutput = FetchHolder.init(inCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim.dims)
+
+      bboxOutput = FetchHolder.init(inCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim.dims)
+    } catch let error {
+      throw error
+    }
+  }
+  var bboxOutput: FetchHolder
+  var middleOutput: FetchHolder
+  let scores: Texture<P>
+  let bboxes: Texture<P>
+  var output: Texture<P>
+}
+
+class MulticlassNMSOp<P: PrecisionType>: Operator<MulticlassNMSKernel<P>, MulticlassNMSParam<P>
>, Runable, Creator, InferShaperable{ + + func inputVariant() -> [String : [Variant]] { + return ["Scores" : [para.middleOutput], "BBoxes" : [para.bboxOutput]] + } + + func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let _ { + fatalError() + } + } + + func inferShape() { + // para.output.dim = para.input.dim + } + + typealias OpType = MulticlassNMSOp
<P>
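Note that multiclass NMS is the one op in this diff that does not run as a Metal kernel: inputVariant and computeMiddleResult above fetch the scores and boxes back into FetchHolders so the suppression itself can run on the CPU. For orientation, a minimal sketch of the greedy IoU suppression this implies; plain illustrative Swift, with Box and the 0.45 threshold assumed rather than taken from this diff:

    struct Box { let x0, y0, x1, y1, score: Float }

    // Intersection-over-union of two axis-aligned boxes.
    func iou(_ a: Box, _ b: Box) -> Float {
      let iw = max(0, min(a.x1, b.x1) - max(a.x0, b.x0))
      let ih = max(0, min(a.y1, b.y1) - max(a.y0, b.y0))
      let inter = iw * ih
      let union = (a.x1 - a.x0) * (a.y1 - a.y0) + (b.x1 - b.x0) * (b.y1 - b.y0) - inter
      return union > 0 ? inter / union : 0
    }

    // Greedy NMS: keep a box only if it overlaps no already-kept box too much.
    func nms(_ boxes: [Box], threshold: Float = 0.45) -> [Box] {
      var kept: [Box] = []
      for box in boxes.sorted(by: { $0.score > $1.score }) where kept.allSatisfy({ iou($0, box) < threshold }) {
        kept.append(box)
      }
      return kept
    }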
+ func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + + } + + func delogOutput() { + print(" nms - output: ") + print(para.bboxes.metalTexture.float32Array().strideArray()) + } +} + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..6f42f2aa9f8d0515946ace625ed16c5040fd3099 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class PoolParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try PoolParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try PoolParam.outputOut(outputs: opDesc.outputs, from: inScope) + poolType = try PoolParam.getAttr(key: "pooling_type", attrs: opDesc.attrs) + ksize = try PoolParam.getAttr(key: "ksize", attrs: opDesc.attrs) + stride = try PoolParam.getAttr(key: "strides", attrs: opDesc.attrs) + padding = try PoolParam.getAttr(key: "paddings", attrs: opDesc.attrs) + ceilMode = try PoolParam.getAttr(key: "ceil_mode", attrs: opDesc.attrs) + globalPooling = try PoolParam.getAttr(key: "global_pooling", attrs: opDesc.attrs) + assert(input.transpose == [0, 2, 3, 1]) + } catch let error { + throw error + } + // let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self) + } + let input: Texture
<P> + var output: Texture<P> + var ksize: [Int32] + var stride: [Int32] + var padding: [Int32] + var poolType: String + var ceilMode: Bool + var globalPooling: Bool +} + +class PoolOp<P: PrecisionType>: Operator<PoolKernel<P>, PoolParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = PoolOp<P>
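The attributes read above fully determine the output extent. A reference for the arithmetic, using the standard pooling formula; this is a hand-written check, not code from the diff:

    // Pooled extent of one spatial dimension:
    // (in + 2 * pad - ksize) / stride + 1, rounded down or up per ceil_mode.
    func pooledExtent(input: Int, ksize: Int, stride: Int, pad: Int, ceilMode: Bool) -> Int {
      let numerator = input + 2 * pad - ksize
      return (ceilMode ? (numerator + stride - 1) / stride : numerator / stride) + 1
    }
    // pooledExtent(input: 7, ksize: 3, stride: 2, pad: 0, ceilMode: false) == 3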
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + + +// print("pool2d delog") +// let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true) +// print(para.ksize) +// print(para.stride) +// print(para.padding) +// print(para.poolType) +// let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true) + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..2d7987e937b9ddf6410ebb0d23bb89c76c1a13ce --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class PreluParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try PreluParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try PreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + alpha = try PreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try PreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + } catch let error { + throw error + } + } + let mode: String + let alpha: Tensor
<P> + let input: Texture<P> + var output: Texture<P> +} + +class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = PreluOp<P>
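The mode attribute controls how alpha is broadcast: "all" shares a single slope, "channel" uses one slope per channel, "element" one per element (the three standard PReLU modes; the kernel side is not shown here). The activation itself reduces to:

    // PReLU: identity for non-negative inputs, alpha-scaled otherwise.
    func prelu(_ x: Float, alpha: Float) -> Float {
      return x >= 0 ? x : alpha * x
    }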
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) input: ") + print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray()) + + print(" \(type) Alpha: ") + let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false) + + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) + } + +// print("softmax delog") +// let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false) +// let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false) +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..2a9f18463483a024545300661e1db33cedce585b --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift @@ -0,0 +1,124 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class PriorBoxParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + min_max_aspect_ratios_order = try PriorBoxParam.getAttr(key: "min_max_aspect_ratios_order", attrs: opDesc.attrs) + } catch _ { + } + + do { + input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope) + output = try PriorBoxParam.outputBoxes(outputs: opDesc.outputs, from: inScope) + inputImage = try PriorBoxParam.inputImage(inputs: opDesc.inputs, from: inScope) + outputVariances = try PriorBoxParam.outputVariances(outputs: opDesc.outputs, from: inScope) + minSizes = try PriorBoxParam.getAttr(key: "min_sizes", attrs: opDesc.attrs) + maxSizes = try PriorBoxParam.getAttr(key: "max_sizes", attrs: opDesc.attrs) + aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs) + variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs) + flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs) + clip = try PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs) + stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs) + stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs) + offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs) + } catch let error { + throw error + } + } + + var min_max_aspect_ratios_order: Bool = false + let minSizes: [Float32] + let maxSizes: [Float32] + let aspectRatios: [Float32] + var newAspectRatios: MTLBuffer? 
+ let variances: [Float32] + let flip: Bool + let clip: Bool + var stepW: Float32 + var stepH: Float32 + let offset: Float32 + + let input: Texture
<P> + let inputImage: Texture<P> + var output: Texture<P> + let outputVariances: Texture<P> +} + +class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = PriorBoxOp<P>
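Each feature-map cell emits one prior per (min size, aspect ratio) pair plus one extra per max size, which is what min_sizes, max_sizes, aspect_ratios and flip feed into. A sketch of the count under the usual SSD convention; treat it as an assumption rather than a restatement of the Metal kernel:

    // Priors per cell, SSD-style: flip adds the reciprocal of each ratio
    // (de-duplication of ratio 1.0 is ignored in this sketch).
    func priorsPerCell(minSizes: [Float], maxSizes: [Float], aspectRatios: [Float], flip: Bool) -> Int {
      let ratioCount = flip ? aspectRatios.count * 2 : aspectRatios.count
      return minSizes.count * ratioCount + maxSizes.count
    }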
+ + func inferShape() { + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + + print(" \(type) output: ") + // output +// let outputArray = para.output.metalTexture.float32Array() +// print(outputArray.strideArray()) +// let device = para.input.metalTexture!.device +// let boxes:[Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2,0,1,3]) +// let variances:[Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2,0,1,3]) +// print("boxes: ") +// print(boxes.strideArray()) +// print("variances: ") +// print(variances.strideArray()) + // output + print(" \(type) output: ") + + let box = para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3])) + print(" dim: \(para.output.dim)") + print(box.strideArray()) +// print((0.. Float32 in +// return o +// } +// +// print(" output variance: \(outputVarianceArray)") + +// writeToLibrary(fileName: "variance_out", array: outputVarianceArray) + + } +} + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..7748df75fef3a2280a51dda159ead0392e146443 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + + +import Foundation + +class ReluParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try ReluParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ReluParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error + } + } + let input: Texture
<P> + var output: Texture<P> +} + +class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = ReluOp<P>
+ + func inferShape() { + para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + } + +} + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..ac46baca91bd6eedab9241da68a05d08391ec931 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ReshapeParam<P: PrecisionType>: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try ReshapeParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ReshapeParam.outputOut(outputs: opDesc.outputs, from: inScope) + shape = try ReshapeParam.getAttr(key: "shape", attrs: opDesc.attrs) + + var s: [Int] = shape.map { Int($0) } + + var di = -1 + var ml = 1 + for i in 0..<s.count { + if s[i] == -1 { + di = i + continue + } + ml *= s[i] + } + + if di >= 0 { + s[di] = input.dim.numel() / ml + } + output.tensorDim = Dim.init(inDim: s) + var dim: [Int] = [1, 1, 1, 1] + for i in 0..<s.count { + dim[4 - s.count + i] = s[i] + } + output.padToFourDim = Dim.init(inDim: dim) + output.dim = output.padToFourDim + } catch let error { + throw error + } + } + let input: Texture<P> + let shape: [Int32] + var output: Texture<P>
+} + +class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = ReshapeOp<P>
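The init above infers a single -1 entry in shape from the input's element count, then right-aligns the result into four dims. A standalone restatement of that logic, handy for checking shapes by hand:

    // Mirrors ReshapeParam's shape inference: resolve -1, then pad to 4 dims.
    func inferReshape(numel: Int, shape: [Int]) -> [Int] {
      var s = shape
      var di = -1
      var ml = 1
      for (i, v) in s.enumerated() {
        if v == -1 { di = i } else { ml *= v }
      }
      if di >= 0 { s[di] = numel / ml }
      var dim = [1, 1, 1, 1]                      // assumes s.count <= 4
      for i in 0..<s.count { dim[4 - s.count + i] = s[i] }
      return dim
    }
    // inferReshape(numel: 6144, shape: [-1, 512]) == [1, 1, 12, 512]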
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + func delogOutput() { + print("reshape delog") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) +// print(outputArray) + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ResizeBilinearOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ResizeBilinearOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..e0e699cdb8b3a17eb109877f1a7bd986b5e07403 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ResizeBilinearOp.swift @@ -0,0 +1,64 @@ +///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. */ + +import Foundation + +class ResizeBilinearParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try ResizeBilinearParam.inputX(inputs: opDesc.inputs, from: inScope) +// if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) { +// fatalError() +// } + output = try ResizeBilinearParam.outputOut(outputs: opDesc.outputs, from: inScope) + out_h = try ResizeBilinearParam.getAttr(key: "out_h", attrs: opDesc.attrs) + out_w = try ResizeBilinearParam.getAttr(key: "out_w", attrs: opDesc.attrs) + } catch let error { + throw error + } + } + let input: Texture
<P> + var output: Texture<P> + let out_h: Int32 + let out_w: Int32 +} + +class ResizeBilinearOp<P: PrecisionType>: Operator<ResizeBilinearKernel<P>, ResizeBilinearParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = ResizeBilinearOp<P>
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + } + +} + + + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ShapeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ShapeOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..b37eed0a9d398923bb866444cf224cb79bb2fecc --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/ShapeOp.swift @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class ShapeParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try ShapeParam.input(inputs: opDesc.inputs, from: inScope) + output = try ShapeParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error + } + } + var output: Texture
<P> + let input: Texture<P> +} + +class ShapeOp<P: PrecisionType>: Operator<ShapeKernel<P>, ShapeParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = ShapeOp<P>
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + } + +} + + + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..66b5c7b3146d4c433e12b846a971e4b5ae579f79 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class SoftmaxParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try SoftmaxParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try SoftmaxParam.outputOut(outputs: opDesc.outputs, from: inScope) + + assert(input.tensorDim.dims.count == 2) + assert(input.transpose == [0, 1, 2, 3]) + + output.dim = input.dim + output.tensorDim = input.tensorDim + output.padToFourDim = input.padToFourDim + } catch let error { + throw error + } + } + let input: Texture
<P> + var output: Texture<P> +} + +class SoftmaxOp<P: PrecisionType>: Operator<SoftmaxKernel<P>, SoftmaxParam<P>>, Runable, Creator, InferShaperable{ + typealias OpType = SoftmaxOp<P>
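SoftmaxParam asserts a 2-D input with an identity transpose, i.e. a plain [batch, classes] matrix. A CPU reference implementation is useful when eyeballing the values printed by delogOutput; illustrative Swift, not part of the diff:

    import Foundation

    // Numerically stable softmax over one row of logits.
    func softmax(_ logits: [Float]) -> [Float] {
      let m = logits.max() ?? 0
      let exps = logits.map { expf($0 - m) }   // subtract the max to avoid overflow
      let sum = exps.reduce(0, +)
      return exps.map { $0 / sum }
    }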
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print("softmax delog") + print(para.input) + + print(para.output) + let padToFourDim = para.output.padToFourDim + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/SplitOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/SplitOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..4495902a46426e2a866ba81a2aa761951605f940 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/SplitOp.swift @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class SplitParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope) + output = Texture
<P>.init(device: input.metalTexture!.device, inDim: input.dim) + axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs) + sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs) + if axis < 0 { + axis = input.tensorDim.cout() + axis + } + guard let outlist = opDesc.outputs["Out"] else { + fatalError() + } + for out in outlist { + guard let variant = inScope[out], let v = variant as? Texture<P> else { + fatalError() + } + outputList.append(v) + sections.append(Int32(v.tensorDim.dims[axis])) + } + } catch let error { + throw error + } + } + + var axis: Int + let input: Texture<P> + var output: Texture<P> + var outputList: [Texture<P>] = [] + var sections: [Int32] = [] +} + +class SplitOp<P: PrecisionType>: Operator<SplitKernel<P>, SplitParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = SplitOp<P>
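For every output tensor, the init above appends that tensor's extent along the (negative-normalized) split axis to sections. The rule in isolation, with illustrative dims:

    // Each section is the receiving tensor's extent along the split axis.
    func sections(forOutputDims outputDims: [[Int]], axis: Int) -> [Int32] {
      return outputDims.map { Int32($0[axis]) }
    }
    // sections(forOutputDims: [[1, 38, 4], [1, 22, 4]], axis: 1) == [38, 22]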
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.input.metalTexture!.device + for out in para.outputList { + let arr: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose) + print(arr.strideArray()) + } + } + +} + + + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift new file mode 100644 index 0000000000000000000000000000000000000000..8b695ec76fcd46b46f503e21e70f8aac52cee717 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class TransposeParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: OpDesc, inScope: Scope) throws { + do { + input = try TransposeParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try TransposeParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try TransposeParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } + } + let input: Texture
<P> + var output: Texture<P> + let axis: [Int32] +} + +class TransposeOp<P: PrecisionType>: Operator<TransposeKernel<P>, TransposeParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = TransposeOp<P>
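axis here is the standard permutation attribute: output dimension i is input dimension axis[i], which is exactly the mapping the transpose kernels at the top of this diff apply per texel. In isolation:

    // Permute a dim vector: out[i] = dims[axis[i]].
    func permute(_ dims: [Int], axis: [Int]) -> [Int] {
      return axis.map { dims[$0] }
    }
    // NCHW -> NHWC: permute([1, 3, 224, 224], axis: [0, 2, 3, 1]) == [1, 224, 224, 3]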
+ + func inferShape() { + //para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + } +} + + + diff --git a/metal/paddle-mobile/paddle-mobile/PaddleMobile.swift b/metal/paddle-mobile/paddle-mobile/PaddleMobile.swift new file mode 100644 index 0000000000000000000000000000000000000000..a6ed8d400ede11a09c4e10ac4dd84273dcf079dc --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/PaddleMobile.swift @@ -0,0 +1,209 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Metal +import MetalKit +import Foundation + +@objc public enum Platform: Int{ + case CPU, GPU +} + +class ScaleKernel: CusomKernel { + init(device: MTLDevice, shape: Shape) { + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "scale", outputDim: shape, usePaddleMobileLib: false) + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "scale_half", outputDim: shape, usePaddleMobileLib: false) + } else { + fatalError(" unsupport ") + } + } + +} + +public class Runner: NSObject { + var program: Program? + var executor: Executor? + var queue: MTLCommandQueue? + var textureLoader: MTKTextureLoader? + public let net: Net + let device: MTLDevice? + let platform: Platform + var cpuPaddleMobile: PaddleMobileCPU? 
+ let numel: Int + let meansNumber: [NSNumber] + + // dims num nchw + let dimsNum: [NSNumber] + /** + * inNet: the net to run + * commandQueue: required when running on the GPU + * inPlatform: the platform to use, GPU or CPU + */ + @objc public init(inNet: Net, commandQueue: MTLCommandQueue?, inPlatform: Platform) { + net = inNet + queue = commandQueue + device = queue?.device + platform = inPlatform + if let inDevice = device { + textureLoader = MTKTextureLoader.init(device: inDevice) + } + if platform == .CPU { + cpuPaddleMobile = PaddleMobileCPU.init() + } + numel = net.dim.n * net.dim.c * net.dim.h * net.dim.w + meansNumber = net.means.map { NSNumber.init(value: $0) } + dimsNum = [NSNumber.init(value: net.dim.n), + NSNumber.init(value: net.dim.c), + NSNumber.init(value: net.dim.h), + NSNumber.init(value: net.dim.w)] + } + + /** + * load the model; returns true when prediction can proceed + */ + @objc public func load() -> Bool { + if platform == .GPU { + guard let inDevice = device, let inQueue = queue else { + print(" paddle mobile gpu load error, need MTLCommandQueue") + return false + } + let loader = Loader.init() + do { +// program = try loader.load(device: inDevice, paramPointer: net.paramPointer!, paramSize: net.paramSize,modePointer:net.modelPointer!,modelSize:net.modelSize) + program = try loader.load(device: inDevice, modelPath: net.modelPath, paraPath: net.paramPath) + net.updateProgram(program: program!) + + executor = try Executor.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!) + } catch let error { + print(error) + return false + } + } else { + return cpuPaddleMobile?.load(net.modelPath, andWeightsPath: net.paramPath) ?? false + } + return true + } + + @objc public func predict(inputPointer: UnsafeMutablePointer<Float32>, completion: @escaping ( _ success: Bool, _ result: PaddleMobileCPUResult?) -> Void) { + + guard let res = cpuPaddleMobile?.predictInput(inputPointer, dim: dimsNum) else { + completion(false, nil) + return + } + completion(true, res) + } + + /** + * GPU version of predict + * texture: the texture to run prediction on; it must already be preprocessed + * ( _ success: Bool, _ time:TimeInterval, _ resultArray: [Float32]) -> Void : callback closure; the three parameters are: whether prediction succeeded, time taken, and the result array + */ + @objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: ResultHolder?) 
-> Void) { + do { + try self.executor?.predict(input: texture, dim: [self.net.dim.n, self.net.dim.h, self.net.dim.w, self.net.dim.c], completionHandle: { [weak self] (res) in + guard let SSelf = self else { + fatalError( " self nil " ) + } + let result = SSelf.net.fetchResult(paddleMobileRes: res) + completion(true, result) + }, preProcessKernle: self.net.preprocessKernel, except: self.net.except) + } catch let error { + print(error) + completion(false, nil) + return + } + } + + /** + * predict shared by CPU and GPU + * cgImage: the image to run prediction on + * ( _ success: Bool, _ time:TimeInterval, _ resultArray: [Float32]) -> Void : callback closure; the three parameters are: whether prediction succeeded, time taken, and the result array + */ +// @objc public func predict(cgImage: CGImage, completion: @escaping ( _ success: Bool, _ resultArray: [Float32]) -> Void) { +// if platform == .GPU { +// getTexture(image: cgImage) { [weak self] (texture) in +// guard let SSelf = self else { +// fatalError( "" ) +// } +// SSelf.predict(texture: texture, completion: completion) +// } +// } else if platform == .CPU { +// let input = preproccess(image: cgImage) +// predict(inputPointer: input, completion: completion) +// input.deinitialize(count: numel) +// input.deallocate() +// } +// } + + /* + * Release memory. After calling this the runner can no longer be used; call load again first. + */ + @objc public func clear() { + if platform == .GPU { + executor?.clear() + executor = nil + program = nil + } else if platform == .CPU { + cpuPaddleMobile?.clear() + } + } + + @objc public func preproccess(image: CGImage) -> UnsafeMutablePointer<Float32> { + let output = UnsafeMutablePointer<Float32>.allocate(capacity: numel) + let means = net.means.map { NSNumber.init(value: $0) } + let dims = [NSNumber.init(value: net.dim.n), + NSNumber.init(value: net.dim.c), + NSNumber.init(value: net.dim.h), + NSNumber.init(value: net.dim.w)] + cpuPaddleMobile?.preprocess(image, output: output, means: means, scale: net.scale, dim: dims) + return output + } + + /* + * Obtain a texture and preprocess it; used for GPU prediction. + */ + @objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) { + let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! " texture loader error" + scaleTexture(input: texture!, complete: getTexture) + } + + public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) { + + guard let inQueue = queue, let inDevice = device else { + fatalError( " queue or device nil " ) + } + + guard let buffer = inQueue.makeCommandBuffer() else { + fatalError( " make buffer error" ) + } + + let scaleKernel = ScaleKernel.init(device: inDevice, shape: CusomKernel.Shape.init(inWidth: net.dim.w, inHeight: net.dim.h, inChannel: 3)) + + do { + try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer) + } catch let error { + print(error) + fatalError() + } + + buffer.addCompletedHandler { (buffer) in + complete(scaleKernel.outputTexture) + } + buffer.commit() + } +} + + diff --git a/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.h b/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.h new file mode 100644 index 0000000000000000000000000000000000000000..00149053dfe6891f07f816feef524db35474a18b --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
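Runner above is the whole Swift-side flow: construct with a net and a queue, load(), convert the image with getTexture, then predict. A hedged usage sketch; the Genet initializer shown and someCGImage are placeholders, and only the Runner calls themselves come from this diff:

    import Metal

    let device = MTLCreateSystemDefaultDevice()!
    let queue = device.makeCommandQueue()!
    // Assumes a concrete Net subclass; Genet appears later in this diff,
    // though its Swift initializer is not shown there.
    let runner = Runner(inNet: Genet(device: device), commandQueue: queue, inPlatform: .GPU)

    guard runner.load() else { fatalError("load failed") }

    runner.getTexture(image: someCGImage) { texture in
      runner.predict(texture: texture) { success, result in
        // result is a ResultHolder when success is true
      }
    }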
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#import <Foundation/Foundation.h> +#import <Metal/Metal.h> + +typedef enum : NSUInteger { + MobileNetType, + MobileNetSSDType, + GenetType, +} NetType; + +@interface PaddleMobileGPUResult: NSObject + +@property (assign, nonatomic) float *output; + +@property (assign, nonatomic) int outputSize; + +-(void)releaseOutput; + +@end + +@interface ModelConfig: NSObject + +/* + * values needed for preprocessing (three of them) + */ +@property (strong, nonatomic) NSArray *means; +/* + * scale value needed for preprocessing + */ +@property (assign, nonatomic) float scale; + +/* + * output dimension info [n c h w] + */ +@property (strong, nonatomic) NSArray *dims; + + +/* + * memory address of the model parameters + */ +@property (assign, nonatomic) void *paramPointer; + +/* + * memory size of the model parameters (KB) + */ +@property (assign, nonatomic) int paramSize; + +/* + * memory address of the model + */ +@property (assign, nonatomic) void *modelPointer; + +/* + * memory size of the model (KB) + */ +@property (assign, nonatomic) int modelSize; + +@end + +@interface PaddleMobileGPU: NSObject + +/* + * initializer + */ +-(instancetype)initWithCommandQueue:(id<MTLCommandQueue>)queue net:(NetType)netType modelConfig:(ModelConfig *)config; + +/* + * paramPointer memory address of the model parameters + * paramSize memory size of the model parameters (KB) + * modelPointer memory address of the model + * modelSize memory size of the model (KB) + */ +-(BOOL)load; + +/* + * texture: the texture converted from the image to run prediction on + * completion: callback invoked when prediction finishes + */ +-(void)predict:(id<MTLTexture>)texture withCompletion:(void (^)(BOOL, NSArray *))completion; + +/* + * texture: the texture converted from the image to run prediction on + * completion: callback invoked when prediction finishes + */ +-(void)predict:(id<MTLTexture>)texture withResultCompletion:(void (^)(BOOL, PaddleMobileGPUResult *))completion; + +/* + * release memory + */ +-(void)clear; + +@end + + diff --git a/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.m b/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.m new file mode 100644 index 0000000000000000000000000000000000000000..4e56bf2f98db9cda0d36587bef576e90b3ee6553 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/PaddleMobileGPU.m @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#import <Foundation/Foundation.h> +#import "PaddleMobileGPU.h" +#import "paddle_mobile.h" +#import <Metal/Metal.h> + +@implementation ModelConfig +@end + +@interface PaddleMobileGPUResult () + +@property (strong, nonatomic) ResultHolder *resultHolder; + +- (void)setOutputResult:(ResultHolder *)resultHolder; + +@end + +@implementation PaddleMobileGPUResult +- (void)setOutputResult:(ResultHolder *)resultHolder { + self.resultHolder = resultHolder; + self.output = resultHolder.result; + self.outputSize = resultHolder.capacity; +} + +-(void)releaseOutput { + [self.resultHolder releasePointer]; +} +@end + +@interface PaddleMobileGPU () +{ + Runner *runner; +} +@end + +@implementation PaddleMobileGPU + +-(instancetype)initWithCommandQueue:(id<MTLCommandQueue>)queue net:(NetType)netType modelConfig:(ModelConfig *)config { + self = [super init]; + if (self) { + Net *net = nil; + if (netType == GenetType) { + net = [[Genet alloc] initWithDevice:queue.device paramPointer:config.paramPointer paramSize:config.paramSize modePointer:config.modelPointer modelSize:config.modelSize]; + } else if (netType == MobileNetSSDType) { + net = [[MobileNet_ssd_AR alloc] initWithDevice:queue.device paramPointer:config.paramPointer paramSize:config.paramSize modePointer:config.modelPointer modelSize:config.modelSize]; + } else if (netType == MobileNetType) { + + } + runner = [[Runner alloc] initInNet:net commandQueue:queue inPlatform:PlatformGPU]; + } + return self; +} + +-(BOOL)load { + return [runner load]; +} + +-(void)predict:(id<MTLTexture>)texture withCompletion:(void (^)(BOOL, NSArray *))completion { + [runner predictWithTexture:texture completion:^(BOOL success, ResultHolder * _Nullable result) { + NSMutableArray *resultArray = [NSMutableArray arrayWithCapacity:result.capacity]; + for (int i = 0; i < result.capacity; ++i) { + [resultArray addObject:[NSNumber numberWithFloat:result.result[i]]]; + } + completion(success, resultArray); + [result releasePointer]; + + }]; +} + +-(void)predict:(id<MTLTexture>)texture withResultCompletion:(void (^)(BOOL, PaddleMobileGPUResult *))completion { + [runner predictWithTexture:texture completion:^(BOOL success, ResultHolder * _Nullable result) { + PaddleMobileGPUResult *gpuResult = [[PaddleMobileGPUResult alloc] init]; + [gpuResult setOutputResult:result]; + completion(success, gpuResult); + }]; +} + +-(void)clear { + [runner clear]; +} + +@end diff --git a/metal/paddle-mobile/paddle-mobile/Program/Attribute.swift b/metal/paddle-mobile/paddle-mobile/Program/Attribute.swift new file mode 100644 index 0000000000000000000000000000000000000000..c26fd2132e6134dbbd05af08835229a31c231b9d --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Program/Attribute.swift @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +import Foundation + +protocol Attr { +} + +extension Bool: Attr { +} + +extension Int: Attr { +} + +extension Float: Attr { +} + +extension Int64: Attr { +} + +extension Array: Attr { +} + +extension String: Attr { +} + +func attrWithProtoDesc(attrDesc: PaddleMobile_Framework_Proto_OpDesc.Attr) -> Attr { + switch attrDesc.type { + case .boolean: + return attrDesc.b + case .int: + return Int(attrDesc.i) + case .string: + return attrDesc.s + case .long: + return attrDesc.l + case .float: + return attrDesc.f + case .booleans: + return attrDesc.bools + case .floats: + return attrDesc.floats + case .ints: + return attrDesc.ints + case .strings: + return attrDesc.strings + default: + fatalError(" not support this attr type: \(attrDesc.type)") + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift b/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift new file mode 100644 index 0000000000000000000000000000000000000000..98dd7ff39a71cadfe6cc33f3d468448ac5155242 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift @@ -0,0 +1,69 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class BlockDesc { + let index: Int + let parentIndex: Int + let vars: [VarDesc] + let ops: [OpDesc] + init(block: PaddleMobile_Framework_Proto_BlockDesc) { + index = Int(block.idx) + parentIndex = Int(block.parentIdx) + var vars: [VarDesc] = [] + for varOfBlock in block.vars { + vars.append(VarDesc.init(protoVarDesc: varOfBlock)) + } + vars.sort { $0.name < $1.name } + self.vars = vars + var ops: [OpDesc] = [] + for op in block.ops { + ops.append(OpDesc.init(protoOpDesc: op)) + } + self.ops = ops + } + + init(inVars: [VarDesc], inOps: [OpDesc]) { + vars = inVars + ops = inOps + index = 0 + parentIndex = 0 + } + +} + +extension BlockDesc: CustomStringConvertible, CustomDebugStringConvertible { + var description: String { + var str = "" + + for i in 0.. Bool) -> [String : [String]] in + var map: [String : [String]] = [:] + for opDescVar in vars { + if (canAdd(opDescVar.parameter)) { + map[opDescVar.parameter] = opDescVar.arguments + } + } + return map + } + + inputs = creator(protoOpDesc.inputs) { + opInfos[protoOpDesc.type]?.inputs.contains($0) ?? false + } + + paraInputs = creator(protoOpDesc.inputs) { + !(opInfos[protoOpDesc.type]?.inputs.contains($0) ?? false) + } + + outputs = creator(protoOpDesc.outputs) { + opInfos[protoOpDesc.type]?.outputs.contains($0) ?? false + } + + unusedOutputs = creator(protoOpDesc.outputs) { + !(opInfos[protoOpDesc.type]?.outputs.contains($0) ?? 
false) + } + + for attr in protoOpDesc.attrs { + if (attr.type != .block) { + attrs[attr.name] = attrWithProtoDesc(attrDesc: attr) + } + } + } +} + +extension OpDesc: CustomStringConvertible, CustomDebugStringConvertible { + var description: String { + var str = "" + str += "op type: \(type): \n" + str += " op inputs: \n" + str += " \(inputs) \n" + str += " op para inputs: \n" + str += " \(paraInputs) \n" + str += " op para outputs: \n" + str += " \(outputs) \n" + str += " op attrs: \n" + str += " \(attrs) \n" + + return str + } + + var debugDescription: String { + return description + } + + +} diff --git a/metal/paddle-mobile/paddle-mobile/Program/Program.swift b/metal/paddle-mobile/paddle-mobile/Program/Program.swift new file mode 100644 index 0000000000000000000000000000000000000000..464705d6db2b87945029de1bfcebddb1bfb4d092 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Program/Program.swift @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +public class Program { + let paramPath: String + let programDesc: ProgramDesc + let scope: Scope + init(inProgramDesc: ProgramDesc, inParamPath: String, inScope: Scope) { + programDesc = inProgramDesc + paramPath = inParamPath + scope = inScope + } + init(inProgramDesc: ProgramDesc, inScope: Scope) { + programDesc = inProgramDesc + scope = inScope + paramPath = "" + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Program/ProgramDesc.swift b/metal/paddle-mobile/paddle-mobile/Program/ProgramDesc.swift new file mode 100644 index 0000000000000000000000000000000000000000..ad472e5a7d1fe9db248e47f4417d7c61fb01eaa9 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Program/ProgramDesc.swift @@ -0,0 +1,44 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +public class ProgramDesc { + var blocks: [BlockDesc] = [] + init(protoProgram: PaddleMobile_Framework_Proto_ProgramDesc) { + for block in protoProgram.blocks { + self.blocks.append(BlockDesc.init(block: block)) + } + } + + init() { + } +} + +extension ProgramDesc: CustomStringConvertible, CustomDebugStringConvertible { + public var description: String { + var str: String = "" + for i in 0.. : ChainNode + +class Node { + var inputs: [Node] = [] + var outputs: [Node] = [] + var type: String + var opDesc: OpDesc? 
+ init(inOpDesc: OpDesc) { + type = inOpDesc.type + opDesc = inOpDesc + } + + init(inType: String) { + type = inType + } + + subscript(index: Int) -> [Node] { + var nodes: [Node] = [] + getNodesWithLocation(index: index, nowIndex: 0, nodes: &nodes) + return nodes + } + + func getNodesWithLocation(index: Int, nowIndex: Int, nodes: inout [Node]) { + if index == nowIndex { + nodes.append(self) + } + + for output in outputs { + output.getNodesWithLocation(index: index, nowIndex: nowIndex + 1, nodes: &nodes) + } + } + + static func -->(lNode: Node, rNode: Node) -> Node { + lNode.outputs.append(rNode) + rNode.inputs.append(lNode) + return rNode + } + + func depth(begin: UInt = 1) -> UInt { + var beginMax: UInt = 1 + for output in outputs { + let subDepth = output.depth(begin: begin + 1) + beginMax = max(begin, subDepth) + } + beginMax = max(begin, beginMax) + return beginMax + } + + func to(depth: UInt) -> Node { + let beginNode = Node.init(inType: type) + beginNode.opDesc = opDesc + to(depth: depth - 1, withNode: beginNode) + return beginNode + } + + func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) { + let fusionNode = fusion.fusionNode() + let change = fusion.change() + let inOutputs = outputs + outputs.removeAll() + opDesc?.outputs.removeAll() + for i in 0.. [String : Node]{ + var map: [String : Node] = [:] + relationship(map: &map) + return map + } + + private func relationship(map: inout [String : Node]) { + guard let inOpDesc = opDesc else { + return + } + + for output in inOpDesc.outputs { + for outputKey in output.value { + map[outputKey] = self + } + } + + for output in outputs { + output.relationship(map: &map) + } + } + +} + +extension Node: Equatable { + static func == (lhs: Node, rhs: Node) -> Bool { + if lhs.outputs.count != rhs.outputs.count { + return false + } + + if lhs.type != rhs.type { + return false + } + + for i in 0.. { + // register fusion + let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp
<P>.self, +// ConvAddAddPreluOp<P>.self, + ConvAddPreluOp<P>.self, + ConvAddOp<P>.self, + ConvBNReluOp<P>.self, + DwConvBNReluOp<P>.self, + ElementwiseAddPreluOp<P>
.self + ] + + func optimize(originProgramDesc: ProgramDesc) -> ProgramDesc { + + guard originProgramDesc.blocks.count == 1 else { + fatalError(" not support yet") + } + + var mapForNodeChain: [String : Node] = [:] + var nodes: [Node] = [] + var typeMapNodes: [String : [(node: Node, output: [String : Node])]] = [:] + let block = originProgramDesc.blocks[0] + for opDesc in block.ops { + guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else { + fatalError() + } + + let node = Node.init(inOpDesc: opDesc) + for inputKey in opInputKeys { + if let inputs = opDesc.inputs[inputKey] { + for input in inputs { + if let inputNode = mapForNodeChain[input] { + _ = inputNode --> node + } + } + } + } + + for outputKey in outputKeys { + if let outputs = opDesc.outputs[outputKey] { + for output in outputs { + mapForNodeChain[output] = node + } + } + } + + nodes.append(node) + + if var inNodes = typeMapNodes[opDesc.type] { + inNodes.append((node, mapForNodeChain)) + typeMapNodes[opDesc.type] = inNodes + } else { + typeMapNodes[opDesc.type] = [(node, mapForNodeChain)] + } + } + + for fusion in fusionOps { + let fusionNode = fusion.fusionNode() + let depth = fusionNode.depth() + if let toMatchNodes = typeMapNodes[fusionNode.type] { + for node in toMatchNodes { + + let toNode = node.node.to(depth: depth) + if toNode == fusionNode { // match + var canFolder = true + let relationshipMap = toNode.relationship() + + for toCheck in fusion.needCheck() { + // let nodes = toCheck + let checkNodes = toNode[toCheck.0] + + for checkNode in checkNodes { + let inputToChecks = checkNode.opDesc?.inputs[toCheck.1] ?? [] + for inputToCheck in inputToChecks { + if node.output[inputToCheck] == nil { + if relationshipMap[inputToCheck] == nil { + canFolder = false + } + } + } + + let paramInputToChecks = checkNode.opDesc?.paraInputs[toCheck.1] ?? [] + for paramInputToCheck in paramInputToChecks { + if node.output[paramInputToCheck] == nil { + if relationshipMap[paramInputToCheck] == nil { + canFolder = false + } + } + } + } + } + + if !canFolder { + continue + } + + var removeNodes: [Node] = [] + node.node.folderWith(fusion: fusion, removedNodes: &removeNodes) + for removeNode in removeNodes { + nodes.remove(element: removeNode) + } + } + } + } + } + + var ops: [OpDesc] = [] + for node in nodes { + ops.append(node.opDesc!) + } + + var newProgramDesc = ProgramDesc.init() + let newBlock = BlockDesc.init(inVars: block.vars, inOps: ops) + newProgramDesc.blocks.append(newBlock) + return newProgramDesc + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Program/Scope.swift b/metal/paddle-mobile/paddle-mobile/Program/Scope.swift new file mode 100644 index 0000000000000000000000000000000000000000..77e32908b30ad7a843a583b47c2a11b76d19f3b9 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Program/Scope.swift @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
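ProgramOptimize matches each registered fusion by building a pattern of Nodes, lowering the real graph to the same depth with to(depth:), and comparing with ==. The --> operator declared in Node.swift keeps such patterns readable; a sketch of what a fusion's fusionNode() might construct (the op type strings are illustrative):

    // conv2d -> elementwise_add -> relu, chained with the custom --> operator;
    // each --> appends rhs to lhs.outputs and returns rhs, so chains read left to right.
    func convAddReluPattern() -> Node {
      let begin = Node(inType: "conv2d")
      _ = begin --> Node(inType: "elementwise_add") --> Node(inType: "relu")
      return begin
    }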
*/ + +import Foundation + +class Scope { + let feedKey: String + let fetchKey: String + func setInput(input: Variant) { + vars[feedKey] = input + } + + func setOutput(output: Variant) { + vars[fetchKey] = output + } + + func input() -> Variant? { + return vars[feedKey]; + } + + func output() -> Variant? { + return vars[fetchKey]; + } + + init(inFeedKey: String, inFetchKey: String) { + feedKey = inFeedKey + fetchKey = inFetchKey + } + + var vars: [String : Variant] = [:] + subscript(key: String) -> Variant?{ + get { + return vars[key] + } + set { + vars[key] = newValue + } + + } + + func clear(){ + vars.removeAll() + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Program/TensorDesc.swift b/metal/paddle-mobile/paddle-mobile/Program/TensorDesc.swift new file mode 100644 index 0000000000000000000000000000000000000000..1a72f5ef717063136c4708c881befd789a57219c --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Program/TensorDesc.swift @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +import Foundation + +class TensorDesc { + let dims: [Int] + let dataType: VarTypeType + let dataLayout: DataLayout = DataLayout.NCHW() + var NCHWDim: [Int] { + get { + if dims.count != 4 { + return dims + } + if dataLayout == DataLayout.NCHW() { + return dims + } else if dataLayout == DataLayout.NHWC() { + var resultDims = dims + resultDims.swapAt(1, 3) + return resultDims + } else { + fatalError(" not support other layout") + } + } + } + + var NHWCDim: [Int] { + get { + if dims.count != 4 { + return dims + } + if dataLayout == DataLayout.NHWC() { + return dims + } else if dataLayout == DataLayout.NCHW() { + var resultDims = dims + resultDims.swapAt(1, 3) + return resultDims + } else { + fatalError(" not support other layout") + } + } + } + + init(protoTensorDesc: PaddleMobile_Framework_Proto_VarType.TensorDesc) { + dims = protoTensorDesc.dims.map{ Int($0) > 0 ? Int($0) : abs(Int($0)) } + dataType = VarTypeType.init(rawValue: protoTensorDesc.dataType.rawValue) ?? .ErrorType + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/Program/VarDesc.swift b/metal/paddle-mobile/paddle-mobile/Program/VarDesc.swift new file mode 100644 index 0000000000000000000000000000000000000000..f29169598f69ec568bd9d08af8fa4738fe8f5eea --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Program/VarDesc.swift @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
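Scope, defined above, is a plain name-to-Variant table with reserved feed/fetch keys; ops resolve their inputs and outputs through its subscript. Minimal usage, with the variable names invented for illustration:

    let scope = Scope(inFeedKey: "feed", inFetchKey: "fetch")
    scope.setInput(input: inputTextureVariant)   // stored under the feed key
    let conv0 = scope["conv2d_0.tmp_0"]          // any intermediate, looked up by name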
*/ + +import Foundation + +enum VarTypeType: Int { + case ErrorType = -1, + Bool = 0, + Int16 = 1, + Int32 = 2, + Int64 = 3, + FP16 = 4, + FP32 = 5, + FP64 = 6, + LodTensor = 7, + SelectedRows = 8, + FeedMiniBatch = 9, + FetchList = 10, + StepScopes = 11, + StepLodRankTable = 12, + StepLodTensorArray = 13, + StepPlaceList = 14, + Reader = 15, + Channel = 16, + Raw = 17, + Tuple = 18 + + func dataTypeSize() throws -> Int { + switch self { + case .FP16: + return 2 + case .FP32: + return 4 + case .FP64: + return 8 + case .Int32: + return 4 + case .Int64: + return 8 + case .Bool: + return 1 + default: + throw PaddleMobileError.memoryError(message: "not support \(self) type to get size ") + } + } +} + +class VarDesc { + let name: String + let persistable: Bool + let type: VarTypeType + let tensorDesc: TensorDesc? + init(protoVarDesc: PaddleMobile_Framework_Proto_VarDesc) { + type = VarTypeType.init(rawValue: protoVarDesc.type.type.rawValue) ?? .ErrorType + name = protoVarDesc.name + persistable = protoVarDesc.persistable + switch type { + case .SelectedRows: + tensorDesc = TensorDesc.init(protoTensorDesc: protoVarDesc.type.selectedRows) + case .LodTensor: + tensorDesc = TensorDesc.init(protoTensorDesc: protoVarDesc.type.lodTensor.tensor) + case .StepLodTensorArray: + tensorDesc = TensorDesc.init(protoTensorDesc: protoVarDesc.type.tensorArray.tensor); + default: + tensorDesc = .none + } + } +} + +extension VarDesc: CustomStringConvertible, CustomDebugStringConvertible { + var description: String { + var str = "" + str += "var name \(name): \n" + if let inTensorDesc = tensorDesc { + str += " dim size: \(inTensorDesc.dims.count) \n" + str += " dim: \(inTensorDesc.dims) \n" + } else { + str += " no dim info" + } + + return str + } + + var debugDescription: String { + return description + } +} diff --git a/metal/paddle-mobile/paddle-mobile/Program/framework.pb.swift b/metal/paddle-mobile/paddle-mobile/Program/framework.pb.swift new file mode 100644 index 0000000000000000000000000000000000000000..df4af3bcc91853e507321d46d3edfd04045f29ab --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Program/framework.pb.swift @@ -0,0 +1,1821 @@ +// DO NOT EDIT. +// +// Generated by the Swift generator plugin for the protocol buffer compiler. +// Source: framework.proto +// +// For information on using the generated types, please see the documenation: +// https://github.com/apple/swift-protobuf/ + +// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +// +//Licensed under the Apache License, Version 2.0 (the "License"); +//you may not use this file except in compliance with the License. +//You may obtain a copy of the License at +// +//http://www.apache.org/licenses/LICENSE-2.0 +// +//Unless required by applicable law or agreed to in writing, software +//distributed under the License is distributed on an "AS IS" BASIS, +//WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//See the License for the specific language governing permissions and +//limitations under the License. + +import Foundation +import SwiftProtobuf + +// If the compiler emits an error on this type, it is because this file +// was generated by a version of the `protoc` Swift plug-in that is +// incompatible with the version of SwiftProtobuf to which you are linking. +// Please ensure that your are building against the same version of the API +// that was used to generate this file. 
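The generated framework.pb.swift below is what attrWithProtoDesc in Attribute.swift consumes: it switches on attrDesc.type and returns the matching field as an Attr. The call site in OpDesc construction (shown earlier in this diff) boils down to the following, where protoOp stands in for a decoded PaddleMobile_Framework_Proto_OpDesc:

    var attrs: [String : Attr] = [:]
    for attr in protoOp.attrs where attr.type != .block {  // block attrs are skipped
      attrs[attr.name] = attrWithProtoDesc(attrDesc: attr)
    }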
+fileprivate struct _GeneratedWithProtocGenSwiftVersion: SwiftProtobuf.ProtobufAPIVersionCheck { + struct _2: SwiftProtobuf.ProtobufAPIVersion_2 {} + typealias Version = _2 +} + +enum PaddleMobile_Framework_Proto_AttrType: SwiftProtobuf.Enum { + typealias RawValue = Int + case int // = 0 + case float // = 1 + case string // = 2 + case ints // = 3 + case floats // = 4 + case strings // = 5 + case boolean // = 6 + case booleans // = 7 + case block // = 8 + case long // = 9 + + init() { + self = .int + } + + init?(rawValue: Int) { + switch rawValue { + case 0: self = .int + case 1: self = .float + case 2: self = .string + case 3: self = .ints + case 4: self = .floats + case 5: self = .strings + case 6: self = .boolean + case 7: self = .booleans + case 8: self = .block + case 9: self = .long + default: return nil + } + } + + var rawValue: Int { + switch self { + case .int: return 0 + case .float: return 1 + case .string: return 2 + case .ints: return 3 + case .floats: return 4 + case .strings: return 5 + case .boolean: return 6 + case .booleans: return 7 + case .block: return 8 + case .long: return 9 + } + } + +} + +/// OpDesc describes an instance of a C++ framework::OperatorBase +/// derived class type. +struct PaddleMobile_Framework_Proto_OpDesc { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var type: String { + get {return _type ?? String()} + set {_type = newValue} + } + /// Returns true if `type` has been explicitly set. + var hasType: Bool {return self._type != nil} + /// Clears the value of `type`. Subsequent reads from it will return its default value. + mutating func clearType() {self._type = nil} + + var inputs: [PaddleMobile_Framework_Proto_OpDesc.Var] = [] + + var outputs: [PaddleMobile_Framework_Proto_OpDesc.Var] = [] + + var attrs: [PaddleMobile_Framework_Proto_OpDesc.Attr] = [] + + var isTarget: Bool { + get {return _isTarget ?? false} + set {_isTarget = newValue} + } + /// Returns true if `isTarget` has been explicitly set. + var hasIsTarget: Bool {return self._isTarget != nil} + /// Clears the value of `isTarget`. Subsequent reads from it will return its default value. + mutating func clearIsTarget() {self._isTarget = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + struct Attr { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var name: String { + get {return _name ?? String()} + set {_name = newValue} + } + /// Returns true if `name` has been explicitly set. + var hasName: Bool {return self._name != nil} + /// Clears the value of `name`. Subsequent reads from it will return its default value. + mutating func clearName() {self._name = nil} + + var type: PaddleMobile_Framework_Proto_AttrType { + get {return _type ?? .int} + set {_type = newValue} + } + /// Returns true if `type` has been explicitly set. + var hasType: Bool {return self._type != nil} + /// Clears the value of `type`. Subsequent reads from it will return its default value. + mutating func clearType() {self._type = nil} + + var i: Int32 { + get {return _i ?? 0} + set {_i = newValue} + } + /// Returns true if `i` has been explicitly set. + var hasI: Bool {return self._i != nil} + /// Clears the value of `i`. Subsequent reads from it will return its default value. 
+ mutating func clearI() {self._i = nil} + + var f: Float { + get {return _f ?? 0} + set {_f = newValue} + } + /// Returns true if `f` has been explicitly set. + var hasF: Bool {return self._f != nil} + /// Clears the value of `f`. Subsequent reads from it will return its default value. + mutating func clearF() {self._f = nil} + + var s: String { + get {return _s ?? String()} + set {_s = newValue} + } + /// Returns true if `s` has been explicitly set. + var hasS: Bool {return self._s != nil} + /// Clears the value of `s`. Subsequent reads from it will return its default value. + mutating func clearS() {self._s = nil} + + var ints: [Int32] = [] + + var floats: [Float] = [] + + var strings: [String] = [] + + var b: Bool { + get {return _b ?? false} + set {_b = newValue} + } + /// Returns true if `b` has been explicitly set. + var hasB: Bool {return self._b != nil} + /// Clears the value of `b`. Subsequent reads from it will return its default value. + mutating func clearB() {self._b = nil} + + var bools: [Bool] = [] + + var blockIdx: Int32 { + get {return _blockIdx ?? 0} + set {_blockIdx = newValue} + } + /// Returns true if `blockIdx` has been explicitly set. + var hasBlockIdx: Bool {return self._blockIdx != nil} + /// Clears the value of `blockIdx`. Subsequent reads from it will return its default value. + mutating func clearBlockIdx() {self._blockIdx = nil} + + var l: Int64 { + get {return _l ?? 0} + set {_l = newValue} + } + /// Returns true if `l` has been explicitly set. + var hasL: Bool {return self._l != nil} + /// Clears the value of `l`. Subsequent reads from it will return its default value. + mutating func clearL() {self._l = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _name: String? = nil + fileprivate var _type: PaddleMobile_Framework_Proto_AttrType? = nil + fileprivate var _i: Int32? = nil + fileprivate var _f: Float? = nil + fileprivate var _s: String? = nil + fileprivate var _b: Bool? = nil + fileprivate var _blockIdx: Int32? = nil + fileprivate var _l: Int64? = nil + } + + struct Var { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var parameter: String { + get {return _parameter ?? String()} + set {_parameter = newValue} + } + /// Returns true if `parameter` has been explicitly set. + var hasParameter: Bool {return self._parameter != nil} + /// Clears the value of `parameter`. Subsequent reads from it will return its default value. + mutating func clearParameter() {self._parameter = nil} + + var arguments: [String] = [] + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _parameter: String? = nil + } + + init() {} + + fileprivate var _type: String? = nil + fileprivate var _isTarget: Bool? = nil +} + +/// OpProto describes a C++ framework::OperatorBase derived class. +struct PaddleMobile_Framework_Proto_OpProto { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var type: String { + get {return _type ?? String()} + set {_type = newValue} + } + /// Returns true if `type` has been explicitly set. + var hasType: Bool {return self._type != nil} + /// Clears the value of `type`. Subsequent reads from it will return its default value. 
+ mutating func clearType() {self._type = nil} + + var inputs: [PaddleMobile_Framework_Proto_OpProto.Var] = [] + + var outputs: [PaddleMobile_Framework_Proto_OpProto.Var] = [] + + var attrs: [PaddleMobile_Framework_Proto_OpProto.Attr] = [] + + var comment: String { + get {return _comment ?? String()} + set {_comment = newValue} + } + /// Returns true if `comment` has been explicitly set. + var hasComment: Bool {return self._comment != nil} + /// Clears the value of `comment`. Subsequent reads from it will return its default value. + mutating func clearComment() {self._comment = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + /// VarProto describes the C++ type framework::Variable. + struct Var { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var name: String { + get {return _name ?? String()} + set {_name = newValue} + } + /// Returns true if `name` has been explicitly set. + var hasName: Bool {return self._name != nil} + /// Clears the value of `name`. Subsequent reads from it will return its default value. + mutating func clearName() {self._name = nil} + + var comment: String { + get {return _comment ?? String()} + set {_comment = newValue} + } + /// Returns true if `comment` has been explicitly set. + var hasComment: Bool {return self._comment != nil} + /// Clears the value of `comment`. Subsequent reads from it will return its default value. + mutating func clearComment() {self._comment = nil} + + var duplicable: Bool { + get {return _duplicable ?? false} + set {_duplicable = newValue} + } + /// Returns true if `duplicable` has been explicitly set. + var hasDuplicable: Bool {return self._duplicable != nil} + /// Clears the value of `duplicable`. Subsequent reads from it will return its default value. + mutating func clearDuplicable() {self._duplicable = nil} + + var intermediate: Bool { + get {return _intermediate ?? false} + set {_intermediate = newValue} + } + /// Returns true if `intermediate` has been explicitly set. + var hasIntermediate: Bool {return self._intermediate != nil} + /// Clears the value of `intermediate`. Subsequent reads from it will return its default value. + mutating func clearIntermediate() {self._intermediate = nil} + + var dispensable: Bool { + get {return _dispensable ?? false} + set {_dispensable = newValue} + } + /// Returns true if `dispensable` has been explicitly set. + var hasDispensable: Bool {return self._dispensable != nil} + /// Clears the value of `dispensable`. Subsequent reads from it will return its default value. + mutating func clearDispensable() {self._dispensable = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _name: String? = nil + fileprivate var _comment: String? = nil + fileprivate var _duplicable: Bool? = nil + fileprivate var _intermediate: Bool? = nil + fileprivate var _dispensable: Bool? = nil + } + + /// AttrProto describes the C++ type Attribute. + struct Attr { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var name: String { + get {return _name ?? String()} + set {_name = newValue} + } + /// Returns true if `name` has been explicitly set. + var hasName: Bool {return self._name != nil} + /// Clears the value of `name`. 
Subsequent reads from it will return its default value. + mutating func clearName() {self._name = nil} + + var type: PaddleMobile_Framework_Proto_AttrType { + get {return _type ?? .int} + set {_type = newValue} + } + /// Returns true if `type` has been explicitly set. + var hasType: Bool {return self._type != nil} + /// Clears the value of `type`. Subsequent reads from it will return its default value. + mutating func clearType() {self._type = nil} + + var comment: String { + get {return _comment ?? String()} + set {_comment = newValue} + } + /// Returns true if `comment` has been explicitly set. + var hasComment: Bool {return self._comment != nil} + /// Clears the value of `comment`. Subsequent reads from it will return its default value. + mutating func clearComment() {self._comment = nil} + + /// If that attribute is generated, it means the Paddle third + /// language binding has responsibility to fill that + /// attribute. End-User should not set that attribute. + var generated: Bool { + get {return _generated ?? false} + set {_generated = newValue} + } + /// Returns true if `generated` has been explicitly set. + var hasGenerated: Bool {return self._generated != nil} + /// Clears the value of `generated`. Subsequent reads from it will return its default value. + mutating func clearGenerated() {self._generated = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _name: String? = nil + fileprivate var _type: PaddleMobile_Framework_Proto_AttrType? = nil + fileprivate var _comment: String? = nil + fileprivate var _generated: Bool? = nil + } + + init() {} + + fileprivate var _type: String? = nil + fileprivate var _comment: String? = nil +} + +struct PaddleMobile_Framework_Proto_VarType { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var type: PaddleMobile_Framework_Proto_VarType.TypeEnum { + get {return _storage._type ?? .bool} + set {_uniqueStorage()._type = newValue} + } + /// Returns true if `type` has been explicitly set. + var hasType: Bool {return _storage._type != nil} + /// Clears the value of `type`. Subsequent reads from it will return its default value. + mutating func clearType() {_storage._type = nil} + + var selectedRows: PaddleMobile_Framework_Proto_VarType.TensorDesc { + get {return _storage._selectedRows ?? PaddleMobile_Framework_Proto_VarType.TensorDesc()} + set {_uniqueStorage()._selectedRows = newValue} + } + /// Returns true if `selectedRows` has been explicitly set. + var hasSelectedRows: Bool {return _storage._selectedRows != nil} + /// Clears the value of `selectedRows`. Subsequent reads from it will return its default value. + mutating func clearSelectedRows() {_storage._selectedRows = nil} + + var lodTensor: PaddleMobile_Framework_Proto_VarType.LoDTensorDesc { + get {return _storage._lodTensor ?? PaddleMobile_Framework_Proto_VarType.LoDTensorDesc()} + set {_uniqueStorage()._lodTensor = newValue} + } + /// Returns true if `lodTensor` has been explicitly set. + var hasLodTensor: Bool {return _storage._lodTensor != nil} + /// Clears the value of `lodTensor`. Subsequent reads from it will return its default value. + mutating func clearLodTensor() {_storage._lodTensor = nil} + + var tensorArray: PaddleMobile_Framework_Proto_VarType.LoDTensorArrayDesc { + get {return _storage._tensorArray ?? 
PaddleMobile_Framework_Proto_VarType.LoDTensorArrayDesc()} + set {_uniqueStorage()._tensorArray = newValue} + } + /// Returns true if `tensorArray` has been explicitly set. + var hasTensorArray: Bool {return _storage._tensorArray != nil} + /// Clears the value of `tensorArray`. Subsequent reads from it will return its default value. + mutating func clearTensorArray() {_storage._tensorArray = nil} + + var reader: PaddleMobile_Framework_Proto_VarType.ReaderDesc { + get {return _storage._reader ?? PaddleMobile_Framework_Proto_VarType.ReaderDesc()} + set {_uniqueStorage()._reader = newValue} + } + /// Returns true if `reader` has been explicitly set. + var hasReader: Bool {return _storage._reader != nil} + /// Clears the value of `reader`. Subsequent reads from it will return its default value. + mutating func clearReader() {_storage._reader = nil} + + var channel: PaddleMobile_Framework_Proto_VarType.ChannelDesc { + get {return _storage._channel ?? PaddleMobile_Framework_Proto_VarType.ChannelDesc()} + set {_uniqueStorage()._channel = newValue} + } + /// Returns true if `channel` has been explicitly set. + var hasChannel: Bool {return _storage._channel != nil} + /// Clears the value of `channel`. Subsequent reads from it will return its default value. + mutating func clearChannel() {_storage._channel = nil} + + var tuple: PaddleMobile_Framework_Proto_VarType.Tuple { + get {return _storage._tuple ?? PaddleMobile_Framework_Proto_VarType.Tuple()} + set {_uniqueStorage()._tuple = newValue} + } + /// Returns true if `tuple` has been explicitly set. + var hasTuple: Bool {return _storage._tuple != nil} + /// Clears the value of `tuple`. Subsequent reads from it will return its default value. + mutating func clearTuple() {_storage._tuple = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + enum TypeEnum: SwiftProtobuf.Enum { + typealias RawValue = Int + + /// Pod Types + case bool // = 0 + case int16 // = 1 + case int32 // = 2 + case int64 // = 3 + case fp16 // = 4 + case fp32 // = 5 + case fp64 // = 6 + + /// Other types that may need additional descriptions + case lodTensor // = 7 + case selectedRows // = 8 + case feedMinibatch // = 9 + case fetchList // = 10 + case stepScopes // = 11 + case lodRankTable // = 12 + case lodTensorArray // = 13 + case placeList // = 14 + case reader // = 15 + case channel // = 16 + + /// Any runtime decided variable type is raw + /// raw variables should manage their own allocations + /// in operators like nccl_op + case raw // = 17 + case tuple // = 18 + + init() { + self = .bool + } + + init?(rawValue: Int) { + switch rawValue { + case 0: self = .bool + case 1: self = .int16 + case 2: self = .int32 + case 3: self = .int64 + case 4: self = .fp16 + case 5: self = .fp32 + case 6: self = .fp64 + case 7: self = .lodTensor + case 8: self = .selectedRows + case 9: self = .feedMinibatch + case 10: self = .fetchList + case 11: self = .stepScopes + case 12: self = .lodRankTable + case 13: self = .lodTensorArray + case 14: self = .placeList + case 15: self = .reader + case 16: self = .channel + case 17: self = .raw + case 18: self = .tuple + default: return nil + } + } + + var rawValue: Int { + switch self { + case .bool: return 0 + case .int16: return 1 + case .int32: return 2 + case .int64: return 3 + case .fp16: return 4 + case .fp32: return 5 + case .fp64: return 6 + case .lodTensor: return 7 + case .selectedRows: return 8 + case .feedMinibatch: return 9 + case .fetchList: return 10 + case .stepScopes: return 11 + case .lodRankTable: return 12 + case 
.lodTensorArray: return 13 + case .placeList: return 14 + case .reader: return 15 + case .channel: return 16 + case .raw: return 17 + case .tuple: return 18 + } + } + + } + + struct TensorDesc { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + /// Should only be PODType. Is enforced in C++ + var dataType: PaddleMobile_Framework_Proto_VarType.TypeEnum { + get {return _dataType ?? .bool} + set {_dataType = newValue} + } + /// Returns true if `dataType` has been explicitly set. + var hasDataType: Bool {return self._dataType != nil} + /// Clears the value of `dataType`. Subsequent reads from it will return its default value. + mutating func clearDataType() {self._dataType = nil} + + /// [UNK, 640, 480] is saved as [-1, 640, 480] + var dims: [Int64] = [] + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _dataType: PaddleMobile_Framework_Proto_VarType.TypeEnum? = nil + } + + struct LoDTensorDesc { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var tensor: PaddleMobile_Framework_Proto_VarType.TensorDesc { + get {return _storage._tensor ?? PaddleMobile_Framework_Proto_VarType.TensorDesc()} + set {_uniqueStorage()._tensor = newValue} + } + /// Returns true if `tensor` has been explicitly set. + var hasTensor: Bool {return _storage._tensor != nil} + /// Clears the value of `tensor`. Subsequent reads from it will return its default value. + mutating func clearTensor() {_storage._tensor = nil} + + var lodLevel: Int32 { + get {return _storage._lodLevel ?? 0} + set {_uniqueStorage()._lodLevel = newValue} + } + /// Returns true if `lodLevel` has been explicitly set. + var hasLodLevel: Bool {return _storage._lodLevel != nil} + /// Clears the value of `lodLevel`. Subsequent reads from it will return its default value. + mutating func clearLodLevel() {_storage._lodLevel = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _storage = _StorageClass.defaultInstance + } + + struct LoDTensorArrayDesc { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var tensor: PaddleMobile_Framework_Proto_VarType.TensorDesc { + get {return _storage._tensor ?? PaddleMobile_Framework_Proto_VarType.TensorDesc()} + set {_uniqueStorage()._tensor = newValue} + } + /// Returns true if `tensor` has been explicitly set. + var hasTensor: Bool {return _storage._tensor != nil} + /// Clears the value of `tensor`. Subsequent reads from it will return its default value. + mutating func clearTensor() {_storage._tensor = nil} + + var lodLevel: Int32 { + get {return _storage._lodLevel ?? 0} + set {_uniqueStorage()._lodLevel = newValue} + } + /// Returns true if `lodLevel` has been explicitly set. + var hasLodLevel: Bool {return _storage._lodLevel != nil} + /// Clears the value of `lodLevel`. Subsequent reads from it will return its default value. 
+ mutating func clearLodLevel() {_storage._lodLevel = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _storage = _StorageClass.defaultInstance + } + + struct ReaderDesc { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var lodTensor: [PaddleMobile_Framework_Proto_VarType.LoDTensorDesc] = [] + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + } + + struct ChannelDesc { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var dataType: PaddleMobile_Framework_Proto_VarType.TypeEnum { + get {return _dataType ?? .bool} + set {_dataType = newValue} + } + /// Returns true if `dataType` has been explicitly set. + var hasDataType: Bool {return self._dataType != nil} + /// Clears the value of `dataType`. Subsequent reads from it will return its default value. + mutating func clearDataType() {self._dataType = nil} + + var capacity: Int64 { + get {return _capacity ?? 0} + set {_capacity = newValue} + } + /// Returns true if `capacity` has been explicitly set. + var hasCapacity: Bool {return self._capacity != nil} + /// Clears the value of `capacity`. Subsequent reads from it will return its default value. + mutating func clearCapacity() {self._capacity = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _dataType: PaddleMobile_Framework_Proto_VarType.TypeEnum? = nil + fileprivate var _capacity: Int64? = nil + } + + struct Tuple { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var elementType: [PaddleMobile_Framework_Proto_VarType.TypeEnum] = [] + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + } + + init() {} + + fileprivate var _storage = _StorageClass.defaultInstance +} + +struct PaddleMobile_Framework_Proto_VarDesc { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var name: String { + get {return _storage._name ?? String()} + set {_uniqueStorage()._name = newValue} + } + /// Returns true if `name` has been explicitly set. + var hasName: Bool {return _storage._name != nil} + /// Clears the value of `name`. Subsequent reads from it will return its default value. + mutating func clearName() {_storage._name = nil} + + var type: PaddleMobile_Framework_Proto_VarType { + get {return _storage._type ?? PaddleMobile_Framework_Proto_VarType()} + set {_uniqueStorage()._type = newValue} + } + /// Returns true if `type` has been explicitly set. + var hasType: Bool {return _storage._type != nil} + /// Clears the value of `type`. Subsequent reads from it will return its default value. + mutating func clearType() {_storage._type = nil} + + var persistable: Bool { + get {return _storage._persistable ?? false} + set {_uniqueStorage()._persistable = newValue} + } + /// Returns true if `persistable` has been explicitly set. + var hasPersistable: Bool {return _storage._persistable != nil} + /// Clears the value of `persistable`. Subsequent reads from it will return its default value. 
+ mutating func clearPersistable() {_storage._persistable = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _storage = _StorageClass.defaultInstance +} + +struct PaddleMobile_Framework_Proto_BlockDesc { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var idx: Int32 { + get {return _idx ?? 0} + set {_idx = newValue} + } + /// Returns true if `idx` has been explicitly set. + var hasIdx: Bool {return self._idx != nil} + /// Clears the value of `idx`. Subsequent reads from it will return its default value. + mutating func clearIdx() {self._idx = nil} + + var parentIdx: Int32 { + get {return _parentIdx ?? 0} + set {_parentIdx = newValue} + } + /// Returns true if `parentIdx` has been explicitly set. + var hasParentIdx: Bool {return self._parentIdx != nil} + /// Clears the value of `parentIdx`. Subsequent reads from it will return its default value. + mutating func clearParentIdx() {self._parentIdx = nil} + + var vars: [PaddleMobile_Framework_Proto_VarDesc] = [] + + var ops: [PaddleMobile_Framework_Proto_OpDesc] = [] + + var forwardBlockIdx: Int32 { + get {return _forwardBlockIdx ?? -1} + set {_forwardBlockIdx = newValue} + } + /// Returns true if `forwardBlockIdx` has been explicitly set. + var hasForwardBlockIdx: Bool {return self._forwardBlockIdx != nil} + /// Clears the value of `forwardBlockIdx`. Subsequent reads from it will return its default value. + mutating func clearForwardBlockIdx() {self._forwardBlockIdx = nil} + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} + + fileprivate var _idx: Int32? = nil + fileprivate var _parentIdx: Int32? = nil + fileprivate var _forwardBlockIdx: Int32? = nil +} + +/// Please refer to +/// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md +/// for more details. +/// TODO(panyx0718): A model can have multiple programs. Need a +/// way to distinguish them. Maybe ID or name? +struct PaddleMobile_Framework_Proto_ProgramDesc { + // SwiftProtobuf.Message conformance is added in an extension below. See the + // `Message` and `Message+*Additions` files in the SwiftProtobuf library for + // methods supported on all messages. + + var blocks: [PaddleMobile_Framework_Proto_BlockDesc] = [] + + var unknownFields = SwiftProtobuf.UnknownStorage() + + init() {} +} + +// MARK: - Code below here is support for the SwiftProtobuf runtime. 
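With the message structs in place, loading a model reduces to one SwiftProtobuf call. A minimal usage sketch, assuming a serialized ProgramDesc on disk (`modelURL` is hypothetical; error handling elided):

import Foundation

let modelData = try Data(contentsOf: modelURL)
let program = try PaddleMobile_Framework_Proto_ProgramDesc(serializedData: modelData)
for block in program.blocks {
  // wrap each persistable proto VarDesc in the hand-written VarDesc class
  for protoVar in block.vars where protoVar.persistable {
    print(VarDesc.init(protoVarDesc: protoVar).description)
  }
}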
+ +fileprivate let _protobuf_package = "paddle_mobile.framework.proto" + +extension PaddleMobile_Framework_Proto_AttrType: SwiftProtobuf._ProtoNameProviding { + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 0: .same(proto: "INT"), + 1: .same(proto: "FLOAT"), + 2: .same(proto: "STRING"), + 3: .same(proto: "INTS"), + 4: .same(proto: "FLOATS"), + 5: .same(proto: "STRINGS"), + 6: .same(proto: "BOOLEAN"), + 7: .same(proto: "BOOLEANS"), + 8: .same(proto: "BLOCK"), + 9: .same(proto: "LONG"), + ] +} + +extension PaddleMobile_Framework_Proto_OpDesc: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".OpDesc" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 3: .same(proto: "type"), + 1: .same(proto: "inputs"), + 2: .same(proto: "outputs"), + 4: .same(proto: "attrs"), + 5: .standard(proto: "is_target"), + ] + + public var isInitialized: Bool { + if self._type == nil {return false} + if !SwiftProtobuf.Internal.areAllInitialized(self.inputs) {return false} + if !SwiftProtobuf.Internal.areAllInitialized(self.outputs) {return false} + if !SwiftProtobuf.Internal.areAllInitialized(self.attrs) {return false} + return true + } + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeRepeatedMessageField(value: &self.inputs) + case 2: try decoder.decodeRepeatedMessageField(value: &self.outputs) + case 3: try decoder.decodeSingularStringField(value: &self._type) + case 4: try decoder.decodeRepeatedMessageField(value: &self.attrs) + case 5: try decoder.decodeSingularBoolField(value: &self._isTarget) + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.inputs.isEmpty { + try visitor.visitRepeatedMessageField(value: self.inputs, fieldNumber: 1) + } + if !self.outputs.isEmpty { + try visitor.visitRepeatedMessageField(value: self.outputs, fieldNumber: 2) + } + if let v = self._type { + try visitor.visitSingularStringField(value: v, fieldNumber: 3) + } + if !self.attrs.isEmpty { + try visitor.visitRepeatedMessageField(value: self.attrs, fieldNumber: 4) + } + if let v = self._isTarget { + try visitor.visitSingularBoolField(value: v, fieldNumber: 5) + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_OpDesc) -> Bool { + if self._type != other._type {return false} + if self.inputs != other.inputs {return false} + if self.outputs != other.outputs {return false} + if self.attrs != other.attrs {return false} + if self._isTarget != other._isTarget {return false} + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_OpDesc.Attr: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = PaddleMobile_Framework_Proto_OpDesc.protoMessageName + ".Attr" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "name"), + 2: .same(proto: "type"), + 3: .same(proto: "i"), + 4: .same(proto: "f"), + 5: .same(proto: "s"), + 6: .same(proto: "ints"), + 7: .same(proto: "floats"), + 8: .same(proto: "strings"), + 10: .same(proto: "b"), + 11: .same(proto: "bools"), + 12: .standard(proto: "block_idx"), + 13: .same(proto: "l"), + ] + + public var isInitialized: Bool { + if self._name == nil {return false} + if self._type == nil 
{return false} + return true + } + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeSingularStringField(value: &self._name) + case 2: try decoder.decodeSingularEnumField(value: &self._type) + case 3: try decoder.decodeSingularInt32Field(value: &self._i) + case 4: try decoder.decodeSingularFloatField(value: &self._f) + case 5: try decoder.decodeSingularStringField(value: &self._s) + case 6: try decoder.decodeRepeatedInt32Field(value: &self.ints) + case 7: try decoder.decodeRepeatedFloatField(value: &self.floats) + case 8: try decoder.decodeRepeatedStringField(value: &self.strings) + case 10: try decoder.decodeSingularBoolField(value: &self._b) + case 11: try decoder.decodeRepeatedBoolField(value: &self.bools) + case 12: try decoder.decodeSingularInt32Field(value: &self._blockIdx) + case 13: try decoder.decodeSingularInt64Field(value: &self._l) + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if let v = self._name { + try visitor.visitSingularStringField(value: v, fieldNumber: 1) + } + if let v = self._type { + try visitor.visitSingularEnumField(value: v, fieldNumber: 2) + } + if let v = self._i { + try visitor.visitSingularInt32Field(value: v, fieldNumber: 3) + } + if let v = self._f { + try visitor.visitSingularFloatField(value: v, fieldNumber: 4) + } + if let v = self._s { + try visitor.visitSingularStringField(value: v, fieldNumber: 5) + } + if !self.ints.isEmpty { + try visitor.visitRepeatedInt32Field(value: self.ints, fieldNumber: 6) + } + if !self.floats.isEmpty { + try visitor.visitRepeatedFloatField(value: self.floats, fieldNumber: 7) + } + if !self.strings.isEmpty { + try visitor.visitRepeatedStringField(value: self.strings, fieldNumber: 8) + } + if let v = self._b { + try visitor.visitSingularBoolField(value: v, fieldNumber: 10) + } + if !self.bools.isEmpty { + try visitor.visitRepeatedBoolField(value: self.bools, fieldNumber: 11) + } + if let v = self._blockIdx { + try visitor.visitSingularInt32Field(value: v, fieldNumber: 12) + } + if let v = self._l { + try visitor.visitSingularInt64Field(value: v, fieldNumber: 13) + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_OpDesc.Attr) -> Bool { + if self._name != other._name {return false} + if self._type != other._type {return false} + if self._i != other._i {return false} + if self._f != other._f {return false} + if self._s != other._s {return false} + if self.ints != other.ints {return false} + if self.floats != other.floats {return false} + if self.strings != other.strings {return false} + if self._b != other._b {return false} + if self.bools != other.bools {return false} + if self._blockIdx != other._blockIdx {return false} + if self._l != other._l {return false} + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_OpDesc.Var: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = PaddleMobile_Framework_Proto_OpDesc.protoMessageName + ".Var" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "parameter"), + 2: .same(proto: "arguments"), + ] + + public var isInitialized: Bool { + if self._parameter == nil {return false} + return true + } + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try 
decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeSingularStringField(value: &self._parameter) + case 2: try decoder.decodeRepeatedStringField(value: &self.arguments) + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if let v = self._parameter { + try visitor.visitSingularStringField(value: v, fieldNumber: 1) + } + if !self.arguments.isEmpty { + try visitor.visitRepeatedStringField(value: self.arguments, fieldNumber: 2) + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_OpDesc.Var) -> Bool { + if self._parameter != other._parameter {return false} + if self.arguments != other.arguments {return false} + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_OpProto: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".OpProto" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "type"), + 2: .same(proto: "inputs"), + 3: .same(proto: "outputs"), + 4: .same(proto: "attrs"), + 5: .same(proto: "comment"), + ] + + public var isInitialized: Bool { + if self._type == nil {return false} + if self._comment == nil {return false} + if !SwiftProtobuf.Internal.areAllInitialized(self.inputs) {return false} + if !SwiftProtobuf.Internal.areAllInitialized(self.outputs) {return false} + if !SwiftProtobuf.Internal.areAllInitialized(self.attrs) {return false} + return true + } + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeSingularStringField(value: &self._type) + case 2: try decoder.decodeRepeatedMessageField(value: &self.inputs) + case 3: try decoder.decodeRepeatedMessageField(value: &self.outputs) + case 4: try decoder.decodeRepeatedMessageField(value: &self.attrs) + case 5: try decoder.decodeSingularStringField(value: &self._comment) + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if let v = self._type { + try visitor.visitSingularStringField(value: v, fieldNumber: 1) + } + if !self.inputs.isEmpty { + try visitor.visitRepeatedMessageField(value: self.inputs, fieldNumber: 2) + } + if !self.outputs.isEmpty { + try visitor.visitRepeatedMessageField(value: self.outputs, fieldNumber: 3) + } + if !self.attrs.isEmpty { + try visitor.visitRepeatedMessageField(value: self.attrs, fieldNumber: 4) + } + if let v = self._comment { + try visitor.visitSingularStringField(value: v, fieldNumber: 5) + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_OpProto) -> Bool { + if self._type != other._type {return false} + if self.inputs != other.inputs {return false} + if self.outputs != other.outputs {return false} + if self.attrs != other.attrs {return false} + if self._comment != other._comment {return false} + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_OpProto.Var: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = PaddleMobile_Framework_Proto_OpProto.protoMessageName + ".Var" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "name"), + 2: .same(proto: "comment"), + 3: .same(proto: 
"duplicable"), + 4: .same(proto: "intermediate"), + 5: .same(proto: "dispensable"), + ] + + public var isInitialized: Bool { + if self._name == nil {return false} + if self._comment == nil {return false} + return true + } + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeSingularStringField(value: &self._name) + case 2: try decoder.decodeSingularStringField(value: &self._comment) + case 3: try decoder.decodeSingularBoolField(value: &self._duplicable) + case 4: try decoder.decodeSingularBoolField(value: &self._intermediate) + case 5: try decoder.decodeSingularBoolField(value: &self._dispensable) + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if let v = self._name { + try visitor.visitSingularStringField(value: v, fieldNumber: 1) + } + if let v = self._comment { + try visitor.visitSingularStringField(value: v, fieldNumber: 2) + } + if let v = self._duplicable { + try visitor.visitSingularBoolField(value: v, fieldNumber: 3) + } + if let v = self._intermediate { + try visitor.visitSingularBoolField(value: v, fieldNumber: 4) + } + if let v = self._dispensable { + try visitor.visitSingularBoolField(value: v, fieldNumber: 5) + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_OpProto.Var) -> Bool { + if self._name != other._name {return false} + if self._comment != other._comment {return false} + if self._duplicable != other._duplicable {return false} + if self._intermediate != other._intermediate {return false} + if self._dispensable != other._dispensable {return false} + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_OpProto.Attr: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = PaddleMobile_Framework_Proto_OpProto.protoMessageName + ".Attr" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "name"), + 2: .same(proto: "type"), + 3: .same(proto: "comment"), + 4: .same(proto: "generated"), + ] + + public var isInitialized: Bool { + if self._name == nil {return false} + if self._type == nil {return false} + if self._comment == nil {return false} + return true + } + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeSingularStringField(value: &self._name) + case 2: try decoder.decodeSingularEnumField(value: &self._type) + case 3: try decoder.decodeSingularStringField(value: &self._comment) + case 4: try decoder.decodeSingularBoolField(value: &self._generated) + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if let v = self._name { + try visitor.visitSingularStringField(value: v, fieldNumber: 1) + } + if let v = self._type { + try visitor.visitSingularEnumField(value: v, fieldNumber: 2) + } + if let v = self._comment { + try visitor.visitSingularStringField(value: v, fieldNumber: 3) + } + if let v = self._generated { + try visitor.visitSingularBoolField(value: v, fieldNumber: 4) + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_OpProto.Attr) -> Bool { + if self._name != other._name {return false} + if self._type != other._type {return false} + if self._comment != 
other._comment {return false} + if self._generated != other._generated {return false} + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_VarType: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".VarType" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "type"), + 2: .standard(proto: "selected_rows"), + 3: .standard(proto: "lod_tensor"), + 4: .standard(proto: "tensor_array"), + 5: .same(proto: "reader"), + 6: .same(proto: "channel"), + 7: .same(proto: "tuple"), + ] + + fileprivate class _StorageClass { + var _type: PaddleMobile_Framework_Proto_VarType.TypeEnum? = nil + var _selectedRows: PaddleMobile_Framework_Proto_VarType.TensorDesc? = nil + var _lodTensor: PaddleMobile_Framework_Proto_VarType.LoDTensorDesc? = nil + var _tensorArray: PaddleMobile_Framework_Proto_VarType.LoDTensorArrayDesc? = nil + var _reader: PaddleMobile_Framework_Proto_VarType.ReaderDesc? = nil + var _channel: PaddleMobile_Framework_Proto_VarType.ChannelDesc? = nil + var _tuple: PaddleMobile_Framework_Proto_VarType.Tuple? = nil + + static let defaultInstance = _StorageClass() + + private init() {} + + init(copying source: _StorageClass) { + _type = source._type + _selectedRows = source._selectedRows + _lodTensor = source._lodTensor + _tensorArray = source._tensorArray + _reader = source._reader + _channel = source._channel + _tuple = source._tuple + } + } + + fileprivate mutating func _uniqueStorage() -> _StorageClass { + if !isKnownUniquelyReferenced(&_storage) { + _storage = _StorageClass(copying: _storage) + } + return _storage + } + + public var isInitialized: Bool { + return withExtendedLifetime(_storage) { (_storage: _StorageClass) in + if _storage._type == nil {return false} + if let v = _storage._selectedRows, !v.isInitialized {return false} + if let v = _storage._lodTensor, !v.isInitialized {return false} + if let v = _storage._tensorArray, !v.isInitialized {return false} + if let v = _storage._reader, !v.isInitialized {return false} + if let v = _storage._channel, !v.isInitialized {return false} + return true + } + } + + mutating func decodeMessage(decoder: inout D) throws { + _ = _uniqueStorage() + try withExtendedLifetime(_storage) { (_storage: _StorageClass) in + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeSingularEnumField(value: &_storage._type) + case 2: try decoder.decodeSingularMessageField(value: &_storage._selectedRows) + case 3: try decoder.decodeSingularMessageField(value: &_storage._lodTensor) + case 4: try decoder.decodeSingularMessageField(value: &_storage._tensorArray) + case 5: try decoder.decodeSingularMessageField(value: &_storage._reader) + case 6: try decoder.decodeSingularMessageField(value: &_storage._channel) + case 7: try decoder.decodeSingularMessageField(value: &_storage._tuple) + default: break + } + } + } + } + + func traverse(visitor: inout V) throws { + try withExtendedLifetime(_storage) { (_storage: _StorageClass) in + if let v = _storage._type { + try visitor.visitSingularEnumField(value: v, fieldNumber: 1) + } + if let v = _storage._selectedRows { + try visitor.visitSingularMessageField(value: v, fieldNumber: 2) + } + if let v = _storage._lodTensor { + try visitor.visitSingularMessageField(value: v, fieldNumber: 3) + } + if let v = _storage._tensorArray { + try visitor.visitSingularMessageField(value: v, 
fieldNumber: 4) + } + if let v = _storage._reader { + try visitor.visitSingularMessageField(value: v, fieldNumber: 5) + } + if let v = _storage._channel { + try visitor.visitSingularMessageField(value: v, fieldNumber: 6) + } + if let v = _storage._tuple { + try visitor.visitSingularMessageField(value: v, fieldNumber: 7) + } + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_VarType) -> Bool { + if _storage !== other._storage { + let storagesAreEqual: Bool = withExtendedLifetime((_storage, other._storage)) { (_args: (_StorageClass, _StorageClass)) in + let _storage = _args.0 + let other_storage = _args.1 + if _storage._type != other_storage._type {return false} + if _storage._selectedRows != other_storage._selectedRows {return false} + if _storage._lodTensor != other_storage._lodTensor {return false} + if _storage._tensorArray != other_storage._tensorArray {return false} + if _storage._reader != other_storage._reader {return false} + if _storage._channel != other_storage._channel {return false} + if _storage._tuple != other_storage._tuple {return false} + return true + } + if !storagesAreEqual {return false} + } + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_VarType.TypeEnum: SwiftProtobuf._ProtoNameProviding { + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 0: .same(proto: "BOOL"), + 1: .same(proto: "INT16"), + 2: .same(proto: "INT32"), + 3: .same(proto: "INT64"), + 4: .same(proto: "FP16"), + 5: .same(proto: "FP32"), + 6: .same(proto: "FP64"), + 7: .same(proto: "LOD_TENSOR"), + 8: .same(proto: "SELECTED_ROWS"), + 9: .same(proto: "FEED_MINIBATCH"), + 10: .same(proto: "FETCH_LIST"), + 11: .same(proto: "STEP_SCOPES"), + 12: .same(proto: "LOD_RANK_TABLE"), + 13: .same(proto: "LOD_TENSOR_ARRAY"), + 14: .same(proto: "PLACE_LIST"), + 15: .same(proto: "READER"), + 16: .same(proto: "CHANNEL"), + 17: .same(proto: "RAW"), + 18: .same(proto: "TUPLE"), + ] +} + +extension PaddleMobile_Framework_Proto_VarType.TensorDesc: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = PaddleMobile_Framework_Proto_VarType.protoMessageName + ".TensorDesc" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .standard(proto: "data_type"), + 2: .same(proto: "dims"), + ] + + public var isInitialized: Bool { + if self._dataType == nil {return false} + return true + } + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeSingularEnumField(value: &self._dataType) + case 2: try decoder.decodeRepeatedInt64Field(value: &self.dims) + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if let v = self._dataType { + try visitor.visitSingularEnumField(value: v, fieldNumber: 1) + } + if !self.dims.isEmpty { + try visitor.visitRepeatedInt64Field(value: self.dims, fieldNumber: 2) + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_VarType.TensorDesc) -> Bool { + if self._dataType != other._dataType {return false} + if self.dims != other.dims {return false} + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_VarType.LoDTensorDesc: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, 
SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = PaddleMobile_Framework_Proto_VarType.protoMessageName + ".LoDTensorDesc" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "tensor"), + 2: .standard(proto: "lod_level"), + ] + + fileprivate class _StorageClass { + var _tensor: PaddleMobile_Framework_Proto_VarType.TensorDesc? = nil + var _lodLevel: Int32? = nil + + static let defaultInstance = _StorageClass() + + private init() {} + + init(copying source: _StorageClass) { + _tensor = source._tensor + _lodLevel = source._lodLevel + } + } + + fileprivate mutating func _uniqueStorage() -> _StorageClass { + if !isKnownUniquelyReferenced(&_storage) { + _storage = _StorageClass(copying: _storage) + } + return _storage + } + + public var isInitialized: Bool { + return withExtendedLifetime(_storage) { (_storage: _StorageClass) in + if _storage._tensor == nil {return false} + if let v = _storage._tensor, !v.isInitialized {return false} + return true + } + } + + mutating func decodeMessage(decoder: inout D) throws { + _ = _uniqueStorage() + try withExtendedLifetime(_storage) { (_storage: _StorageClass) in + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeSingularMessageField(value: &_storage._tensor) + case 2: try decoder.decodeSingularInt32Field(value: &_storage._lodLevel) + default: break + } + } + } + } + + func traverse(visitor: inout V) throws { + try withExtendedLifetime(_storage) { (_storage: _StorageClass) in + if let v = _storage._tensor { + try visitor.visitSingularMessageField(value: v, fieldNumber: 1) + } + if let v = _storage._lodLevel { + try visitor.visitSingularInt32Field(value: v, fieldNumber: 2) + } + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_VarType.LoDTensorDesc) -> Bool { + if _storage !== other._storage { + let storagesAreEqual: Bool = withExtendedLifetime((_storage, other._storage)) { (_args: (_StorageClass, _StorageClass)) in + let _storage = _args.0 + let other_storage = _args.1 + if _storage._tensor != other_storage._tensor {return false} + if _storage._lodLevel != other_storage._lodLevel {return false} + return true + } + if !storagesAreEqual {return false} + } + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_VarType.LoDTensorArrayDesc: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = PaddleMobile_Framework_Proto_VarType.protoMessageName + ".LoDTensorArrayDesc" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "tensor"), + 2: .standard(proto: "lod_level"), + ] + + fileprivate class _StorageClass { + var _tensor: PaddleMobile_Framework_Proto_VarType.TensorDesc? = nil + var _lodLevel: Int32? 
= nil + + static let defaultInstance = _StorageClass() + + private init() {} + + init(copying source: _StorageClass) { + _tensor = source._tensor + _lodLevel = source._lodLevel + } + } + + fileprivate mutating func _uniqueStorage() -> _StorageClass { + if !isKnownUniquelyReferenced(&_storage) { + _storage = _StorageClass(copying: _storage) + } + return _storage + } + + public var isInitialized: Bool { + return withExtendedLifetime(_storage) { (_storage: _StorageClass) in + if _storage._tensor == nil {return false} + if let v = _storage._tensor, !v.isInitialized {return false} + return true + } + } + + mutating func decodeMessage(decoder: inout D) throws { + _ = _uniqueStorage() + try withExtendedLifetime(_storage) { (_storage: _StorageClass) in + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeSingularMessageField(value: &_storage._tensor) + case 2: try decoder.decodeSingularInt32Field(value: &_storage._lodLevel) + default: break + } + } + } + } + + func traverse(visitor: inout V) throws { + try withExtendedLifetime(_storage) { (_storage: _StorageClass) in + if let v = _storage._tensor { + try visitor.visitSingularMessageField(value: v, fieldNumber: 1) + } + if let v = _storage._lodLevel { + try visitor.visitSingularInt32Field(value: v, fieldNumber: 2) + } + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_VarType.LoDTensorArrayDesc) -> Bool { + if _storage !== other._storage { + let storagesAreEqual: Bool = withExtendedLifetime((_storage, other._storage)) { (_args: (_StorageClass, _StorageClass)) in + let _storage = _args.0 + let other_storage = _args.1 + if _storage._tensor != other_storage._tensor {return false} + if _storage._lodLevel != other_storage._lodLevel {return false} + return true + } + if !storagesAreEqual {return false} + } + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_VarType.ReaderDesc: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = PaddleMobile_Framework_Proto_VarType.protoMessageName + ".ReaderDesc" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .standard(proto: "lod_tensor"), + ] + + public var isInitialized: Bool { + if !SwiftProtobuf.Internal.areAllInitialized(self.lodTensor) {return false} + return true + } + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeRepeatedMessageField(value: &self.lodTensor) + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.lodTensor.isEmpty { + try visitor.visitRepeatedMessageField(value: self.lodTensor, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_VarType.ReaderDesc) -> Bool { + if self.lodTensor != other.lodTensor {return false} + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_VarType.ChannelDesc: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = PaddleMobile_Framework_Proto_VarType.protoMessageName + ".ChannelDesc" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .standard(proto: "data_type"), + 
2: .same(proto: "capacity"), + ] + + public var isInitialized: Bool { + if self._dataType == nil {return false} + if self._capacity == nil {return false} + return true + } + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeSingularEnumField(value: &self._dataType) + case 2: try decoder.decodeSingularInt64Field(value: &self._capacity) + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if let v = self._dataType { + try visitor.visitSingularEnumField(value: v, fieldNumber: 1) + } + if let v = self._capacity { + try visitor.visitSingularInt64Field(value: v, fieldNumber: 2) + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_VarType.ChannelDesc) -> Bool { + if self._dataType != other._dataType {return false} + if self._capacity != other._capacity {return false} + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_VarType.Tuple: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = PaddleMobile_Framework_Proto_VarType.protoMessageName + ".Tuple" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .standard(proto: "element_type"), + ] + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeRepeatedEnumField(value: &self.elementType) + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.elementType.isEmpty { + try visitor.visitRepeatedEnumField(value: self.elementType, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_VarType.Tuple) -> Bool { + if self.elementType != other.elementType {return false} + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_VarDesc: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".VarDesc" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "name"), + 2: .same(proto: "type"), + 3: .same(proto: "persistable"), + ] + + fileprivate class _StorageClass { + var _name: String? = nil + var _type: PaddleMobile_Framework_Proto_VarType? = nil + var _persistable: Bool? 
= nil + + static let defaultInstance = _StorageClass() + + private init() {} + + init(copying source: _StorageClass) { + _name = source._name + _type = source._type + _persistable = source._persistable + } + } + + fileprivate mutating func _uniqueStorage() -> _StorageClass { + if !isKnownUniquelyReferenced(&_storage) { + _storage = _StorageClass(copying: _storage) + } + return _storage + } + + public var isInitialized: Bool { + return withExtendedLifetime(_storage) { (_storage: _StorageClass) in + if _storage._name == nil {return false} + if _storage._type == nil {return false} + if let v = _storage._type, !v.isInitialized {return false} + return true + } + } + + mutating func decodeMessage(decoder: inout D) throws { + _ = _uniqueStorage() + try withExtendedLifetime(_storage) { (_storage: _StorageClass) in + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeSingularStringField(value: &_storage._name) + case 2: try decoder.decodeSingularMessageField(value: &_storage._type) + case 3: try decoder.decodeSingularBoolField(value: &_storage._persistable) + default: break + } + } + } + } + + func traverse(visitor: inout V) throws { + try withExtendedLifetime(_storage) { (_storage: _StorageClass) in + if let v = _storage._name { + try visitor.visitSingularStringField(value: v, fieldNumber: 1) + } + if let v = _storage._type { + try visitor.visitSingularMessageField(value: v, fieldNumber: 2) + } + if let v = _storage._persistable { + try visitor.visitSingularBoolField(value: v, fieldNumber: 3) + } + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_VarDesc) -> Bool { + if _storage !== other._storage { + let storagesAreEqual: Bool = withExtendedLifetime((_storage, other._storage)) { (_args: (_StorageClass, _StorageClass)) in + let _storage = _args.0 + let other_storage = _args.1 + if _storage._name != other_storage._name {return false} + if _storage._type != other_storage._type {return false} + if _storage._persistable != other_storage._persistable {return false} + return true + } + if !storagesAreEqual {return false} + } + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_BlockDesc: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".BlockDesc" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "idx"), + 2: .standard(proto: "parent_idx"), + 3: .same(proto: "vars"), + 4: .same(proto: "ops"), + 5: .standard(proto: "forward_block_idx"), + ] + + public var isInitialized: Bool { + if self._idx == nil {return false} + if self._parentIdx == nil {return false} + if !SwiftProtobuf.Internal.areAllInitialized(self.vars) {return false} + if !SwiftProtobuf.Internal.areAllInitialized(self.ops) {return false} + return true + } + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeSingularInt32Field(value: &self._idx) + case 2: try decoder.decodeSingularInt32Field(value: &self._parentIdx) + case 3: try decoder.decodeRepeatedMessageField(value: &self.vars) + case 4: try decoder.decodeRepeatedMessageField(value: &self.ops) + case 5: try decoder.decodeSingularInt32Field(value: &self._forwardBlockIdx) + default: break + } + } + } + + func traverse(visitor: inout V) throws 
{ + if let v = self._idx { + try visitor.visitSingularInt32Field(value: v, fieldNumber: 1) + } + if let v = self._parentIdx { + try visitor.visitSingularInt32Field(value: v, fieldNumber: 2) + } + if !self.vars.isEmpty { + try visitor.visitRepeatedMessageField(value: self.vars, fieldNumber: 3) + } + if !self.ops.isEmpty { + try visitor.visitRepeatedMessageField(value: self.ops, fieldNumber: 4) + } + if let v = self._forwardBlockIdx { + try visitor.visitSingularInt32Field(value: v, fieldNumber: 5) + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_BlockDesc) -> Bool { + if self._idx != other._idx {return false} + if self._parentIdx != other._parentIdx {return false} + if self.vars != other.vars {return false} + if self.ops != other.ops {return false} + if self._forwardBlockIdx != other._forwardBlockIdx {return false} + if unknownFields != other.unknownFields {return false} + return true + } +} + +extension PaddleMobile_Framework_Proto_ProgramDesc: SwiftProtobuf.Message, SwiftProtobuf._MessageImplementationBase, SwiftProtobuf._ProtoNameProviding { + static let protoMessageName: String = _protobuf_package + ".ProgramDesc" + static let _protobuf_nameMap: SwiftProtobuf._NameMap = [ + 1: .same(proto: "blocks"), + ] + + public var isInitialized: Bool { + if !SwiftProtobuf.Internal.areAllInitialized(self.blocks) {return false} + return true + } + + mutating func decodeMessage(decoder: inout D) throws { + while let fieldNumber = try decoder.nextFieldNumber() { + switch fieldNumber { + case 1: try decoder.decodeRepeatedMessageField(value: &self.blocks) + default: break + } + } + } + + func traverse(visitor: inout V) throws { + if !self.blocks.isEmpty { + try visitor.visitRepeatedMessageField(value: self.blocks, fieldNumber: 1) + } + try unknownFields.traverse(visitor: &visitor) + } + + func _protobuf_generated_isEqualTo(other: PaddleMobile_Framework_Proto_ProgramDesc) -> Bool { + if self.blocks != other.blocks {return false} + if unknownFields != other.unknownFields {return false} + return true + } +} diff --git a/metal/paddle-mobile/paddle-mobile/framework/Dim.swift b/metal/paddle-mobile/paddle-mobile/framework/Dim.swift new file mode 100644 index 0000000000000000000000000000000000000000..7e4a05a8dcfc17be10f183de36575342383bb560 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/framework/Dim.swift @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/

+import Foundation

+public struct Dim {
+    public init(inDim: [Int]) {
+        dims = inDim
+    }
+    
+    mutating func swapeDimAt(index1: Int, index2: Int) {
+        dims.swapAt(index1, index2)
+    }
+    
+    func cout() -> Int {
+        return dims.count
+    }
+    
+    func numel() -> Int {
+        return dims.reduce(1) { $0 * $1 }
+    }
+    
+    public static func ==(left: Dim, right: Dim) -> Bool {
+        return left.dims == right.dims;
+    }
+    
+    public subscript(index: Int) -> Int {
+        return dims[index];
+    }
+    
+    private(set) var dims: [Int]
+    private init(){
+        fatalError()
+    }
+}
+
+extension Dim: CustomStringConvertible {
+    public var description: String {
+        return "\(dims)"
+    }
+}
diff --git a/metal/paddle-mobile/paddle-mobile/framework/Executor.swift b/metal/paddle-mobile/paddle-mobile/framework/Executor.swift
new file mode 100644
index 0000000000000000000000000000000000000000..bdaf8d0973ad3fa6c70e04ad84fd1b14bcb8b39a
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/framework/Executor.swift
@@ -0,0 +1,201 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+
+let testTo = 81
+
+var isTest = false
+
+let computePrecision: ComputePrecision = .Float16
+
+public class GPUResultHolder {
+    public let dim: [Int]
+    public let capacity: Int
+    public var resultPointer: UnsafeMutablePointer<Float32>?
+    public var intermediateResults: [String : [Variant]]?
+    public let elapsedTime: Double
+    public init(inDim: [Int], inPointer: UnsafeMutablePointer<Float32>?, inCapacity: Int, inElapsedTime: Double, inIntermediateResults: [String : [Variant]]?
= nil) {
+        dim = inDim
+        capacity = inCapacity
+        
+        if let inInPointer = inPointer {
+            resultPointer = UnsafeMutablePointer<Float32>.allocate(capacity: inCapacity)
+            resultPointer?.initialize(from: inInPointer, count: inCapacity)
+        }
+        
+        elapsedTime = inElapsedTime
+        intermediateResults = inIntermediateResults
+    }
+    
+}
+
+extension GPUResultHolder: CustomDebugStringConvertible, CustomStringConvertible {
+    public var debugDescription: String {
+//        var str = ""
+//        str += "Dim: \(dim) \n value:[ "
+//        if resultArr.count < 20 {
+//            for d in resultArr {
+//                str += " \(d) "
+//            }
+//        } else {
+//            for d in stride(from: 0, to: resultArr.count, by: resultArr.count/20) {
+//                str += " \(resultArr[d]) "
+//            }
+//        }
+//        str += " ]"
+//        return str
+        fatalError()
+    }
+    
+    public var description: String {
+        return debugDescription
+    }
+}
+
+public class Executor<P: PrecisionType> {
+    var ops: [Runable & InferShaperable] = []
+    let program: Program
+    let device: MTLDevice
+    let inflightSemaphore: DispatchSemaphore
+    let queue: MTLCommandQueue
+    public init(inDevice: MTLDevice, inQueue: MTLCommandQueue, inProgram: Program) throws {
+        self.inflightSemaphore = DispatchSemaphore(value: 3)
+        program = inProgram
+        device = inDevice
+        queue = inQueue
+//        print("before for ")
+//        print(program.scope.vars["fea_pyramid1_mbox_conf_flat.Flatten.output.1.tmp_0"])
+        
+        
+        for block in inProgram.programDesc.blocks {
+            //block.ops.count
+            for i in 0..<block.ops.count {
+                let opDesc = block.ops[i]
+                do {
+                    let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope)
+                    ops.append(op)
+                } catch let error {
+                    throw error
+                }
+            }
+        }
+    }
+    
+    public func predict(input: MTLTexture, dim: [Int], completionHandle: @escaping (GPUResultHolder) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
+        guard let buffer = queue.makeCommandBuffer() else {
+            throw PaddleMobileError.predictError(message: "CommandBuffer is nil")
+        }
+        inflightSemaphore.wait()
+        
+        let resInput: MTLTexture
+        if let inPre = preProcessKernle {
+            do {
+                try inPre.compute(inputTexuture: input, commandBuffer: buffer)
+                resInput = inPre.outputTexture
+            } catch let error {
+                throw error
+            }
+        } else {
+            resInput = input
+        }
+        
+        let beforeDate = Date.init()
+        let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: Dim.init(inDim: dim))
+        program.scope.setInput(input: inputTexture)
+        //(ops.count - except)
+        for i in 0..<(ops.count - except) {
+            let op = ops[i]
+            do {
+                try op.run(device: device, buffer: buffer)
+            } catch let error {
+                throw error
+            }
+        }
+        
+        var outputTextures: [String : [Variant]]?
+        if except > 0 {
+            ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer)
+            outputTextures = ops[ops.count - except].inputVariant()
+        }
+        
+        buffer.addCompletedHandler { [weak self] (commandbuffer) in
+//            let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
+//            print(inputArr.strideArray())
+//
+//// print(dim)
+//            writeToLibrary(fileName: "test_image_ssd_ar", array: inputArr)
+//            print(" write done ")
+
+//            print("write to library done")
+//            return
+//            print(inputArr)
+//
+//            let stridableInput: [(index: Int, value: Float)] = input.stridableFloatArray()
+//            print(stridableInput)
+//
+//            let _: Flo? = input.logDesc(header: "input: ", stridable: true)
+//            for i in 0..<ops.count { ... }
+            
+            guard let SSelf = self else {
+                fatalError()
+            }
+            
+            let afterDate = Date.init()
+            var resultHolder: GPUResultHolder
+            if except > 0 {
+                resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0, inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures)
+            } else {
+                let outputVar: Variant = SSelf.program.scope.output()!
+                let output: FetchHolder = outputVar as! FetchHolder
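+                // The fetch op's FetchHolder owns the raw result buffer; its
+                // contents are copied into a GPUResultHolder below, together with
+                // the wall-clock time measured around the whole command buffer.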
+//                let beforeToTensorDate = Date.init()
+                
+                resultHolder = GPUResultHolder.init(inDim: output.dim, inPointer: output.result, inCapacity: output.capacity, inElapsedTime: afterDate.timeIntervalSince(beforeDate))
+                
+//                let timeToTensor = Date.init().timeIntervalSince(beforeToTensorDate)
+//                print(timeToTensor)
+            }
+            
+            completionHandle(resultHolder)
+            SSelf.inflightSemaphore.signal()
+        }
+        buffer.commit()
+    }
+    
+    public func clear() {
+        program.scope.clear()
+    }
+    
+}
diff --git a/metal/paddle-mobile/paddle-mobile/framework/Loader.swift b/metal/paddle-mobile/paddle-mobile/framework/Loader.swift
new file mode 100644
index 0000000000000000000000000000000000000000..ee640ddf1163bb1f41da49fe9089964321792d9f
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/framework/Loader.swift
@@ -0,0 +1,259 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+import SwiftProtobuf
+
+public class Loader<P: PrecisionType> {
+    class ParaLoader {
+        let file: UnsafeMutablePointer<FILE>
+        let fileSize: Int
+        var nowIndex: Int
+        init(paramPath: String) throws {
+            guard let tmpFile = fopen(paramPath, "rb") else {
+                throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
+            }
+            file = tmpFile
+            fseek(file, 0, SEEK_END)
+            fileSize = ftell(file)
+            guard fileSize > 0 else {
+                throw PaddleMobileError.loaderError(message: "param file size is too small")
+            }
+            rewind(file)
+            nowIndex = 0
+        }
+        
+        func read(tensor: Tensor<P>
) throws {
+            guard nowIndex <= fileSize else {
+                throw PaddleMobileError.loaderError(message: "out of the file range")
+            }
+            
+            func pointerReader<T>(type: T.Type) -> T {
+                let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
+                fread(ptr, 1, MemoryLayout<T>.size, file)
+                nowIndex += MemoryLayout<T>.size
+                let pointee = ptr.pointee
+                ptr.deinitialize(count: MemoryLayout<T>.size)
+                ptr.deallocate()
+                return pointee
+            }
+            
+            let _ = pointerReader(type: UInt32.self)
+            let lodLevel = pointerReader(type: UInt64.self)
+            for _ in 0..<lodLevel {
+                let size = pointerReader(type: UInt64.self)
+                for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)) {
+                    _ = pointerReader(type: size_t.self)
+                }
+            }
+            
+            let _ = pointerReader(type: UInt32.self)
+            
+            let tensorDescSize = pointerReader(type: Int32.self)
+            
+            fseek(file, Int(tensorDescSize), SEEK_CUR)
+            nowIndex += Int(tensorDescSize)
+            
+            /*
+             The precision is not decided from the Data Type field here;
+             it is fixed by the generic parameter supplied from outside.
+             */
+            
+            // The model currently ships Float values; this should really follow the model's data type.
+            // let tmpCapacity = MemoryLayout<Float32>.size * tensor.numel()
+            // let tmpPointer = UnsafeMutablePointer<Float32>.allocate(capacity: tmpCapacity);
+            let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
+            
+            guard bytesRead == tensor.data.size else {
+                throw PaddleMobileError.loaderError(message: "param read size error")
+            }
+            
+            // TODO: use script to convert
+            // let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
+            // for i in 0..<tensor.numel() { ... }
+            nowIndex += bytesRead
+        }
+        
+        deinit {
+            fclose(file)
+        }
+    }
+    
+    class ParaLoaderWithPointer {
+        var paramPointer: UnsafeMutableRawPointer
+        let paramSize: Int
+        var nowIndex: Int
+        init(pPointer: UnsafeMutableRawPointer, pSize: Int) throws {
+            paramPointer = pPointer
+            paramSize = pSize
+            nowIndex = 0
+        }
+        
+        func read(tensor: Tensor<P>) throws {
+            guard nowIndex <= paramSize else {
+                throw PaddleMobileError.loaderError(message: "out of the file range")
+            }
+            var readerIndex: Int = 0
+            func pointerReader<T>(type: T.Type) -> T {
+                let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
+                memcpy(ptr, paramPointer.advanced(by: Int(readerIndex)), MemoryLayout<T>.size)
+                nowIndex += MemoryLayout<T>.size
+                readerIndex += MemoryLayout<T>.size
+                let pointee = ptr.pointee
+                ptr.deinitialize(count: MemoryLayout<T>.size)
+                ptr.deallocate()
+                
+                return pointee
+            }
+            let _ = pointerReader(type: UInt32.self)
+            let lodLevel = pointerReader(type: UInt64.self)
+            for _ in 0..<lodLevel {
+                let size = pointerReader(type: UInt64.self)
+                for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)) {
+                    _ = pointerReader(type: size_t.self)
+                }
+            }
+            
+            let _ = pointerReader(type: UInt32.self)
+            let tensorDescSize = pointerReader(type: Int32.self)
+            
+            paramPointer = paramPointer.advanced(by: Int(readerIndex))
+            paramPointer = paramPointer.advanced(by: Int(tensorDescSize))
+            nowIndex += Int(tensorDescSize)
+            
+            let _ = memcpy(tensor.data.pointer, paramPointer, tensor.data.size)
+            paramPointer = paramPointer.advanced(by: Int(tensor.data.size))
+            nowIndex += tensor.data.size
+        }
+        deinit {
+        }
+    }
+    public init(){}
+    
+    func loadModelandParam(_ device: MTLDevice, _ modelData: Data, _ paraLoaderPointer: ParaLoaderWithPointer?, _ paraLoader: ParaLoader?) throws -> Program {
+        do {
+            let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init(
+                serializedData: modelData)
+            
+            let originProgramDesc = ProgramDesc.init(protoProgram: protoProgram)
+            // ProgramOptimize rewrites the parsed op graph (e.g. by fusing ops) into an optimized ProgramDesc before execution.
+            let programDesc = ProgramOptimize<P>
.init().optimize(originProgramDesc: originProgramDesc)
+            print(programDesc)
+            
+            guard programDesc.blocks.count > 0 else {
+                throw PaddleMobileError.loaderError(message: "count of blocks must be greater than 0")
+            }
+            
+            // to get feed key and fetch key
+            let block = programDesc.blocks[0]
+            guard let firstOp = block.ops.first, let lastOp = block.ops.last else {
+                throw PaddleMobileError.loaderError(message: "need at least two operators")
+            }
+            
+            guard firstOp.type == gFeedType, lastOp.type == gFetchType else {
+                throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch")
+            }
+            
+            guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else {
+                throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found")
+            }
+            guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else {
+                throw PaddleMobileError.loaderError(message: "feed key or fetch key not found")
+            }
+            
+            let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey)
+            
+            // to load memory
+            for block in programDesc.blocks {
+                for varDesc in block.vars {
+                    if (varDesc.type == .LodTensor) {
+                        guard let tensorDesc = varDesc.tensorDesc else {
+                            throw PaddleMobileError.loaderError(message: "get tensor desc failed")
+                        }
+                        
+                        if (varDesc.persistable
+                            && varDesc.type != .FeedMiniBatch
+                            && varDesc.type != .FetchList) {
+                            let dimArr = tensorDesc.dims
+                            
+                            guard dimArr.count > 0 else {
+                                throw PaddleMobileError.loaderError(message: "tensor desc dim size error")
+                            }
+                            
+                            let dim = Dim.init(inDim: dimArr)
+                            let tensor = Tensor<P>
.init(inDim: dim, inLayout: tensorDesc.dataLayout)
+                            do {
+                                if paraLoaderPointer != nil {
+                                    try paraLoaderPointer!.read(tensor: tensor)
+                                }
+                                
+                                if paraLoader != nil {
+                                    try paraLoader!.read(tensor: tensor)
+                                }
+                            } catch let error {
+                                throw error
+                            }
+                            // tensor.convert(to: DataLayout.NHWC())
+                            // tensor.initBuffer(device: device)
+                            scope[varDesc.name] = tensor
+                        } else {
+                            let dim = Dim.init(inDim: tensorDesc.dims)
+                            scope[varDesc.name] = Texture<P>
.init(device: device, inDim: dim)
+                        }
+                    } else {
+                        if varDesc.name == fetchKey {
+//                            scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inCapacity: <#Int#>, inElapsedTime: 0.0)
+                        } else if varDesc.name == feedKey {
+                        }
+                    }
+                }
+            }
+            
+            let program = Program.init(inProgramDesc: programDesc, inScope: scope)
+            
+            return program
+        } catch _ {
+            throw PaddleMobileError.loaderError(message: "protobuf decoder error")
+        }
+    }
+    
+    public func load(device: MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize: Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) throws -> Program {
+        let modelData = Data.init(bytes: modePointer, count: modelSize)
+        guard let paraLoader = try? ParaLoaderWithPointer.init(pPointer: paramPointer, pSize: paramSize) else {
+            throw PaddleMobileError.loaderError(message: "load para error")
+        }
+        do {
+            let program = try loadModelandParam(device, modelData, paraLoader, nil)
+            return program
+        } catch let error {
+            throw error
+        }
+    }
+    
+    public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program {
+        guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else {
+            throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !")
+        }
+        guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else {
+            throw PaddleMobileError.loaderError(message: "load para error")
+        }
+        
+        do {
+            let program = try loadModelandParam(device, modelData, nil, paraLoader)
+            return program
+        } catch let error {
+            throw error
+        }
+    }
+}
diff --git a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
new file mode 100644
index 0000000000000000000000000000000000000000..c5ee1414521e7eb92011d4f4b608ad326b005531
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
@@ -0,0 +1,319 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+protocol Tensorial: CustomStringConvertible, CustomDebugStringConvertible {
+    var dim: Dim { get set }
+    func numel() -> Int
+    var layout: DataLayout { get }
+}
+
+extension Tensorial {
+    func numel() -> Int {
+        return dim.numel()
+    }
+}
+
+public enum ComputePrecision {
+    case Float32, Float16
+}
+
+class Tensor<P: PrecisionType>: Tensorial {
+    
+    var data: Data
+    var dim: Dim
+    var buffer: MTLBuffer!
+    private(set) var layout: DataLayout
+    
+    class Data {
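+        // Host-side backing store for the tensor: an element count plus an
+        // UnsafeMutablePointer<P> that the loaders fill and that initBuffer()
+        // later copies into an MTLBuffer before releasing it.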
+        init(inSize: Int, inPointer: UnsafeMutablePointer<P>) {
+            size = inSize
+            pointer = inPointer
+        }
+        let size: Int
+        var pointer: UnsafeMutablePointer<P>
+        subscript(index: Int) -> P {
+            get {
+                return pointer[index]
+            }
+            set {
+                pointer[index] = newValue
+            }
+        }
+        func release() {
+            pointer.deinitialize(count: size)
+            pointer.deallocate()
+        }
+        deinit {
+            // release()
+        }
+    }
+    
+    required init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) {
+        dim = inDim
+        let size = inDim.numel() * MemoryLayout<P>
.size
+        let pointer = UnsafeMutablePointer<P>
.allocate(capacity: size)
+        data = Data.init(inSize: size, inPointer: pointer)
+        layout = inLayout
+    }
+    
+    func convert(to: DataLayout) {
+        guard to != layout else {
+            return
+        }
+        
+        guard dim.cout() == 4 else {
+            return
+        }
+        
+        guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else {
+            // other not support
+            return
+        }
+        let newPointer = UnsafeMutablePointer<P>
.allocate(capacity: data.size)
+        
+        if layout == DataLayout.NCHW() {
+            NCHW2NHWC(newPtr: newPointer)
+        }
+        
+        data.release()
+        data.pointer = newPointer
+        layout = to
+    }
+    
+    func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, convertToNHWC: Bool = true, withTranspose: Bool = false) {
+        if convertToNHWC {
+//            print(layout)
+            convert(to: DataLayout.NHWC())
+        }
+        
+        if withTranspose {
+            let transposePointer = UnsafeMutablePointer<P>
.allocate(capacity: numel())
+            let n = dim[0]
+            let hwc = numel()/n
+            for j in 0..<hwc {
+                for i in 0..<n {
+                    transposePointer[j * n + i] = data.pointer[i * hwc + j]
+                }
+            }
+            data.release()
+            data.pointer = transposePointer
+        }
+        
+        guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
+            fatalError(" not support yet ")
+        }
+        
+        let precisionSize: Int
+        switch precision {
+        case .Float32:
+            precisionSize = 4
+        case .Float16:
+            precisionSize = 2
+        }
+        
+        if dim.cout() == 4 {
+            if layout == DataLayout.NHWC() {
+                let C = dim[3]
+                let cSlices = (C + 3) / 4
+                let paddedC = cSlices * 4
+                let count = paddedC * dim[0] * dim[1] * dim[2]
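+                // Metal texel data is RGBA, so the channel count is padded up to
+                // a multiple of 4; "count" is the element count of the padded
+                // buffer. For example C = 3 pads to paddedC = 4, so a
+                // [1, 224, 224, 3] NHWC tensor needs 1 * 224 * 224 * 4 elements.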
+                if C == paddedC {
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
+                    }
+                } else if C == 1 {
+                    buffer = device.makeBuffer(length: numel() * precisionSize)
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>
.stride)
+                    case .Float16:
+                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
+                    }
+                } else {
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
+                    var tmpPointer = floatPointer
+                    var dstPtr = convertedPointer
+                    for _ in 0..<dim[0] * dim[1] * dim[2] {
+                        for j in 0..<C {
+                            dstPtr[j] = tmpPointer[j]
+                        }
+                        tmpPointer += C
+                        dstPtr += paddedC
+                    }
+                    
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<Float32>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
+                    }
+                    
+                    convertedPointer.deinitialize(count: count)
+                    convertedPointer.deallocate()
+                }
+            } else {
+                let C = dim[3]
+                let cSlices = (C + 3) / 4
+                let paddedC = cSlices * 4
+                let count = paddedC * dim[0] * dim[1] * dim[2]
+                if C == paddedC {
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>
.stride)
+                    case .Float16:
+                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
+                    }
+                } else if C == 1 {
+                    fatalError(" not support ")
+                } else {
+                    buffer = device.makeBuffer(length: count * precisionSize)
+                    let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
+                    var tmpPointer = floatPointer
+                    var dstPtr = convertedPointer
+                    for _ in 0..<dim[0] * dim[1] * dim[2] {
+                        for j in 0..<C {
+                            dstPtr[j] = tmpPointer[j]
+                        }
+                        tmpPointer += C
+                        dstPtr += paddedC
+                    }
+                    
+                    switch precision {
+                    case .Float32:
+                        buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<Float32>.stride)
+                    case .Float16:
+                        float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
+                    }
+                    convertedPointer.deinitialize(count: count)
+                    convertedPointer.deallocate()
+                }
+            }
+        } else if dim.cout() == 1 {
+            let num = ((numel() + 3) / 4) * 4
+            buffer = device.makeBuffer(length: num * precisionSize)
+            switch precision {
+            case .Float32:
+                buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout<P>
.stride)
+            case .Float16:
+                float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num)
+            }
+        } else {
+            fatalError(" not support !")
+        }
+        //TODO: release
+        data.release()
+    }
+    
+    var width: Int {
+        get {
+            if dim.cout() == 4 {
+                return dim[1]
+            } else {
+                fatalError()
+            }
+        }
+    }
+    
+    var height: Int {
+        get {
+            if dim.cout() == 4 {
+                return dim[2]
+            } else {
+                fatalError()
+            }
+        }
+    }
+    
+    var channel: Int {
+        get {
+            if dim.cout() == 4 {
+                return dim[3]
+            } else {
+                fatalError()
+            }
+        }
+    }
+    
+    func NCHW2NHWC(newPtr: UnsafeMutablePointer<P>
) {
+        let N = dim[0]
+        let C = dim[1]
+        let H = dim[2]
+        let W = dim[3]
+        let HXW = H * W
+        let CXHXW = C * H * W
+        
+        var index: Int = 0
+        for n in 0..<N {
+            for h in 0..<H {
+                for w in 0..<W {
+                    for c in 0..<C {
+                        newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * W + w]
+                        index += 1
+                    }
+                }
+            }
+        }
+    }
+}
+
+extension Tensor {
+    var debugDescription: String {
+        var str = "dim: \(dim) \n"
+        for i in 0..<buffer.length/MemoryLayout<P>.size {
+            str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])"
+        }
+        return str
+    }
+    
+    var description: String {
+        return debugDescription
+    }
+    
+    func logDataPointer(header: String = "") {
+        print(header)
+        var str = ""
+        str += "data size: \(data.size) \n"
+        str += "dim: \(dim) \n"
+        for i in 0..<numel() {
+            str += " \(data.pointer[i])"
+        }
+        print(str)
+    }
+}
diff --git a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
new file mode 100644
--- /dev/null
+++ b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/*
+ Padding and transpose conventions:
+// [a, b] -> [1, 1, a, b] transpose must be [0, 1, x, x]
+// [a] -> [1, 1, 1, a] transpose must be [0, 1, 2, 3]
+// [a, b, c] -> [1, a, b, c] transpose must be [0, x, x, x]
+
+A 3-D tensor [a, b, c] maps to a texture_2darray with
+.width = c
+.height = b
+.len = (a + 3) / 4
+
+ A 2-D tensor [a, b] maps to a texture_2darray with
+ .width = (b + 3) / 4
+ .height = a
+ .len = 1
+
+ A 1-D tensor [a] maps to a texture_2darray with
+ .width = (a + 3) / 4
+ .height = 1
+ .len = 1
+ */
+
+
+public class Texture<P: PrecisionType>: Tensorial {
+    var dim: Dim
+    public var tensorDim: Dim
+    public var padToFourDim: Dim
+    private var textureDesc: MTLTextureDescriptor!
+    public var metalTexture: MTLTexture!
+    var transpose: [Int] = [0, 1, 2, 3]
+    
+    func toTensor() -> [Float32] {
+        guard padToFourDim.cout() == 4 else {
+            fatalError("- not support -")
+        }
+        return metalTexture.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
+    }
+    
+    func realNHWC() -> [Float32] {
+        guard padToFourDim.cout() == 4 else {
+            fatalError(" - not support - ")
+        }
+        return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
+    }
+    
+    func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) {
+        transpose = inTranspose
+        for i in 0..<(4 - tensorDim.cout()) {
+            if i != inTranspose[i] {
+                fatalError()
+            }
+        }
+        let newDim = transpose.map { padToFourDim[$0] }
+        
+        let newLayout = transpose.map { layout.layoutWithDim[$0] }
+        
+        layout = DataLayout.init(newLayout)
+        dim = Dim.init(inDim: newDim)
+        
+        let tmpTextureDes = MTLTextureDescriptor.init()
+        tmpTextureDes.textureType = .type2DArray
+        tmpTextureDes.depth = 1
+        
+        switch tensorDim.cout() {
+        case 4:
+            tmpTextureDes.width = newDim[2]
+            tmpTextureDes.height = newDim[1]
+            tmpTextureDes.arrayLength = ((newDim[0]) * (newDim[3]) + 3) / 4
+        case 3:
+            tmpTextureDes.width = newDim[3]
+            tmpTextureDes.height = newDim[2]
+            tmpTextureDes.arrayLength = (newDim[1] + 3) / 4
+        case 2, 1:
+            tmpTextureDes.width = (newDim[3] + 3) / 4
+            tmpTextureDes.height = newDim[2]
+            tmpTextureDes.arrayLength = 1
+        default:
+            fatalError("unreachable")
+        }
+        
+        if computePrecision == .Float16 {
+            tmpTextureDes.pixelFormat = .rgba16Float
+        } else if computePrecision == .Float32 {
+            tmpTextureDes.pixelFormat = .rgba32Float
+        }
+        
+        tmpTextureDes.usage = [.shaderRead, .shaderWrite]
+        tmpTextureDes.storageMode = .shared
+        textureDesc = tmpTextureDes
+        metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! " texture nil "
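+        // "?!" is a custom unwrap-or-crash operator defined elsewhere in this
+        // project: it returns the unwrapped value on the left, or stops with the
+        // message on the right when makeTexture returns nil.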
" texture nil " + } + + init(device: MTLDevice, inDim: Dim) { + var fourDim: Dim + if inDim.cout() == 4 { + fourDim = inDim + } else if inDim.cout() < 4 { + var fourDimNum: [Int] = [] + for _ in 0..<(4 - inDim.cout()) { + fourDimNum.append(1) + } + fourDimNum.append(contentsOf: inDim.dims) + fourDim = Dim.init(inDim: fourDimNum) + } else { + fatalError(" not support ") + } + tensorDim = inDim + dim = fourDim + padToFourDim = fourDim + layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])]) + } + + private(set) var layout: DataLayout +} + +extension Texture { + public var description: String { + return debugDescription + } + + public var debugDescription: String{ + var str = "" + str += "Dim: \(dim) \n value:[ " + str += "\(metalTexture)" + str += " ]" + return str + } + +} diff --git a/metal/paddle-mobile/paddle-mobile/paddle_mobile.h b/metal/paddle-mobile/paddle-mobile/paddle_mobile.h new file mode 100644 index 0000000000000000000000000000000000000000..50b60e9fe6c973b675a97e16c3c15af2b72e3fc4 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/paddle_mobile.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#import "PaddleMobileCPU.h" +#import "CPUCompute.h" +#import "PaddleMobileGPU.h" +#import + +//! Project version number for paddle_mobile. +//FOUNDATION_EXPORT double paddle_mobileVersionNumber; + +//! Project version string for paddle_mobile. +//FOUNDATION_EXPORT const unsigned char paddle_mobileVersionString[]; + + diff --git a/python/tools/imagetools/imagetools.py b/python/tools/imagetools/imagetools.py new file mode 100644 index 0000000000000000000000000000000000000000..2a4432858007d6858f2728815670cfd1ed5ec786 --- /dev/null +++ b/python/tools/imagetools/imagetools.py @@ -0,0 +1,61 @@ +# coding=utf-8 +import cv2 +from array import array + + +def resize_take_rgbs(path, shape_h_w): + print '--------------resize_take_rgbs-----------------begin' + image = cv2.imread(path) + # print image.shape + cv2.imshow("before", image) + + print_rgb(image[0, 0]) + # image len may be for .just check it + # image.resize(shape_h_w) + + image = cv2.resize(image, (shape_h_w[0], shape_h_w[1])) + + cv2.imshow("after", image) + print image.shape + height = shape_h_w[0] + width = shape_h_w[1] + + rs_ = [] + gs_ = [] + bs_ = [] + for h in range(0, height): + for w in range(0, width): + bs_.append(image[h, w, 0]) + gs_.append(image[h, w, 1]) + rs_.append(image[h, w, 2]) + + # print image[2, 2, 0]/255. 
+    print len(bs_)
+    print len(gs_)
+    print len(rs_)
+    print '--------------resize_take_rgbs-----------------end'
+    return bs_, gs_, rs_
+
+
+def print_rgb((b, g, r)):
+    print "pixel - R:%d, G:%d, B:%d" % (r, g, b)  # show the pixel value
+    #
+    # image[0, 0] = (100, 150, 200)  # overwrite the pixel at (0, 0)
+    #
+    # (b, g, r) = image[0, 0]  # read the (0, 0) pixel again
+    # print "pixel at (0, 0) - R:%d, G:%d, B:%d" % (r, g, b)  # show the updated value
+    #
+    # corner = image[0:100, 0:100]  # read a block of pixels
+    # cv2.imshow("Corner", corner)  # show the block
+    #
+    # image[0:100, 0:100] = (0, 255, 0);  # overwrite the block
+    #
+    # cv2.imshow("Updated", image)  # show the image
+    #
+    # cv2.waitKey(0)  # pause the program
+
+
+def save_to_file(to_file_name, array):
+    to_file = open(to_file_name, "wb")
+    array.tofile(to_file)
+    to_file.close()
diff --git a/python/tools/imagetools/img2nchw.py b/python/tools/imagetools/img2nchw.py
new file mode 100644
index 0000000000000000000000000000000000000000..70ca456a1b1b5d20b92d0aaa51b01abb352c1d54
--- /dev/null
+++ b/python/tools/imagetools/img2nchw.py
@@ -0,0 +1,69 @@
+# coding=utf-8
+import cv2
+from array import array
+import imagetools as tools
+from enum import Enum
+
+
+class ChannelType(Enum):
+    RGB = 0
+    BGR = 1
+
+
+def combine_bgrs_nchw(bgrs, means_b_g_r, scale, channel_type=ChannelType.BGR):
+    print '--------------combine_bgrs_nchw-----------------begin'
+    print "scale: %f" % scale
+    print means_b_g_r
+    # print len(bgrs)
+    bs = bgrs[0]
+    gs = bgrs[1]
+    rs = bgrs[2]
+
+    assert len(bs) == len(gs) == len(rs)
+    print len(bs)
+    bgrs_float_array = array('f')
+
+    if channel_type == ChannelType.BGR:
+        print 'bgr'
+        for i in range(0, len(bs)):
+            bgrs_float_array.append((bs[i] - means_b_g_r[0]) * scale)  # b
+        for i in range(0, len(gs)):
+            bgrs_float_array.append((gs[i] - means_b_g_r[1]) * scale)  # g
+        for i in range(0, len(rs)):
+            bgrs_float_array.append((rs[i] - means_b_g_r[2]) * scale)  # r
+    elif channel_type == ChannelType.RGB:
+        print 'rgb'
+
+        for i in range(0, len(rs)):
+            bgrs_float_array.append((rs[i] - means_b_g_r[2]) * scale)  # r
+        for i in range(0, len(gs)):
+            bgrs_float_array.append((gs[i] - means_b_g_r[1]) * scale)  # g
+        for i in range(0, len(bs)):
+            bgrs_float_array.append((bs[i] - means_b_g_r[0]) * scale)  # b
+
+    print len(bgrs_float_array)
+
+    print '------------------'
+    print bgrs_float_array[0]
+    print bgrs_float_array[416 * 416 * 2 + 416 * 2 + 2]
+
+    # for i in range(0, 9):
+    #     print 'bs %d' % i
+    #     print bs[i] / 255.
+
+    print bs[416 * 2 + 2] / 255.
+    print '--------------combine_bgrs_nchw-----------------end'
+
+    return bgrs_float_array
+
+
+# bgrs = tools.resize_take_rgbs('banana.jpeg', (224, 224, 3))
+# array = combine_bgrs_nchw(bgrs, (103.94, 116.78, 123.68), 0.017, array, ChannelType.BGR)
+# tools.save_to_file('banana_1_3_224_224_nchw_float')

+# cv2.waitKey(0)
+
+
+bgrs = tools.resize_take_rgbs('datas/newyolo.jpg', (416, 416, 3))
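+# For ChannelType.RGB this writes the planes back to back (all R, then all G,
+# then all B), i.e. a flat 1 x 3 x 416 x 416 NCHW float array, matching the
+# 'desktop_1_3_416_416_nchw_float' output name below.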
+array = combine_bgrs_nchw(bgrs, (0, 0, 0), 1. / 255, ChannelType.RGB)
+tools.save_to_file('datas/desktop_1_3_416_416_nchw_float', array)
diff --git a/python/tools/imagetools/img2nhwc.py b/python/tools/imagetools/img2nhwc.py
new file mode 100644
index 0000000000000000000000000000000000000000..c982fe303ecde08a9de1827ca67024567322d47f
--- /dev/null
+++ b/python/tools/imagetools/img2nhwc.py
@@ -0,0 +1,34 @@
+# coding=utf-8
+import cv2
+from array import array
+import imagetools as tools
+
+
+def combine_bgrs_nhwc(bgrs, means_b_g_r, scale):
+    print "scale: %f" % scale
+    print means_b_g_r
+    # print len(bgrs)
+    bs = bgrs[0]
+    gs = bgrs[1]
+    rs = bgrs[2]
+    assert len(bs) == len(gs) == len(rs)
+    # print len(bs)
+    bgrs_float_array = array('f')
+    for i in range(0, len(bs)):
+        bgrs_float_array.append((rs[i] - means_b_g_r[2]) * scale)  # r
+        bgrs_float_array.append((gs[i] - means_b_g_r[1]) * scale)  # g
+        bgrs_float_array.append((bs[i] - means_b_g_r[0]) * scale)  # b
+
+    print len(bgrs_float_array)
+
+    print '------------------'
+    print bgrs_float_array[0]
+    print bgrs_float_array[999]
+    return bgrs_float_array
+
+
+bgrs = tools.resize_take_rgbs('newyolo_1.jpg', (416, 416, 3))
+array = combine_bgrs_nhwc(bgrs, (0, 0, 0), 1.0 / 255)
+tools.save_to_file('desktop_1_3_416_416_nhwc_float', array)
+
+cv2.waitKey(0)
diff --git a/python/tools/imagetools/numpy2binary.py b/python/tools/imagetools/numpy2binary.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd4bc6e10074183b8dcee4122860c4140ff54229
--- /dev/null
+++ b/python/tools/imagetools/numpy2binary.py
@@ -0,0 +1,47 @@
+# coding=utf-8
+
+# This script packs numpy data into a binary file
+import cv2
+import numpy as np
+import imagetools as tools
+from array import array
+
+#
+# image = cv2.imread(path)
+# print image.shape
+#
+# print_rgb(image[0, 0])
+# # the image size may differ; just check it
+# image.resize(shape_h_w)
+
+
+data = np.fromfile('datas/img.res')
+print data.size
+print data[0]
+
+data.reshape(1, 3, 416, 416)  # note: the reshape result is discarded; data stays flat for the raw indexing below
+out_array = array('f')
+print '--------------------'
+print data.size
+print data[0]
+
+print 'if nhwc --------'
+# rgb rgb rgb rgb rgb
+print data[416 * 3 * 2 + 3 * 2 + 2]
+# print data[2]
+
+print 'if nchw --------'
+# rgb rgb rgb rgb rgb
+print data[416 * 416 * 2 + 416 * 2 + 2]
+# print data[2]
+
+# it is clearly nchw
+
+for i in range(0, data.size):
+    out_array.append(data[i])
+
+print len(out_array)
+
+print out_array[416 * 416 * 2 + 416 * 2 + 2]
+
+tools.save_to_file('datas/in_put_1_3_416_416_2', out_array)
diff --git a/python/tools/mdl2fluid/float2halffloat.py b/python/tools/mdl2fluid/float2halffloat.py
new file mode 100644
index 0000000000000000000000000000000000000000..3df8d43f9548429cef5d49f72fb07f3cef264834
--- /dev/null
+++ b/python/tools/mdl2fluid/float2halffloat.py
@@ -0,0 +1,70 @@
+# encoding:utf-8
+import math
+import re
+
+
+def Real2HalfFloat(data):
+    MINNUM = -65536
+    MAXNUM = 65535
+    FloatVal = 0
+    if data:
+        if data < MINNUM:
+            data = MINNUM
+        if data > MAXNUM:
+            data = MAXNUM
+
+        sign = 0
+        if data < 0:
+            sign = 1
+            data = -data
+
+        exp = math.floor((math.log2(data)))
+        expout = exp + 16
+
+        Mantial = round(data / pow(2, exp - 10)) - 1024
+
+        if expout <= 0:
+            FloatVal = 0
+        else:
+            FloatVal = sign * 32768 + expout * 1024 + Mantial
+    return FloatVal
+
+
+def ReadCfloatData(sourcefile):
+    input = []
+    with open(sourcefile, 'r') as f:
+        for line in f.readlines():
+            line = line.strip()
+            line = re.sub('\s+', ' ', line)  # collapse the multiple spaces between the two numbers
+            input.append(line.split(' '))
+    destfile = sourcefile.replace('.dat', '')
+    destfile = destfile.replace('.txt', '')
+    destfile += 'Out.dat'
+    with
open(destfile, 'w') as fw: + for i in range(len(input)): + if len(input[i]) == 2: + real = Real2HalfFloat(float(input[i][0])) + imag = Real2HalfFloat(float(input[i][1])) + result = real * 65536 + imag + if imag and not real: + fw.write('0x0000' + "%X" % result + '\n') + elif not imag and not real: + fw.write('0x00000000' + '\n') + else: + fw.write('0x' + "%X" % result + '\n') + elif len(input[i]) == 1: + result = Real2HalfFloat(float(input[i][0])) + if result: + fw.write('0x' + "%X" % result + '\n') + else: + fw.write('0x0000' + '\n') + + +if __name__ == '__main__': + print('Tips: Input number 0 if you want to exit!\n') + while True: + sourcfile = input("input source file:\n") + if sourcfile is '0': + break + ReadCfloatData(sourcfile) + print('Transfer Success!') diff --git a/python/tools/mdl2fluid/framework.proto b/python/tools/mdl2fluid/framework.proto new file mode 100644 index 0000000000000000000000000000000000000000..07bfef1c2a69c236ac86732b2dbc00d8abb6334b --- /dev/null +++ b/python/tools/mdl2fluid/framework.proto @@ -0,0 +1,176 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; +option optimize_for = LITE_RUNTIME; +package paddle_mobile.framework.proto; + +enum AttrType { + INT = 0; + FLOAT = 1; + STRING = 2; + INTS = 3; + FLOATS = 4; + STRINGS = 5; + BOOLEAN = 6; + BOOLEANS = 7; + BLOCK = 8; + LONG = 9; +} + +// OpDesc describes an instance of a C++ framework::OperatorBase +// derived class type. +message OpDesc { + + message Attr { + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional float f = 4; + optional string s = 5; + repeated int32 ints = 6; + repeated float floats = 7; + repeated string strings = 8; + optional bool b = 10; + repeated bool bools = 11; + optional int32 block_idx = 12; + optional int64 l = 13; + }; + + message Var { + required string parameter = 1; + repeated string arguments = 2; + }; + + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; + +// OpProto describes a C++ framework::OperatorBase derived class. +message OpProto { + + // VarProto describes the C++ type framework::Variable. + message Var { + required string name = 1; + required string comment = 2; + + optional bool duplicable = 3 [ default = false ]; + optional bool intermediate = 4 [ default = false ]; + optional bool dispensable = 5 [ default = false ]; + } + + // AttrProto describes the C++ type Attribute. + message Attr { + required string name = 1; + required AttrType type = 2; + required string comment = 3; + // If that attribute is generated, it means the Paddle third + // language binding has responsibility to fill that + // attribute. End-User should not set that attribute. 
+ optional bool generated = 4 [ default = false ]; + } + + required string type = 1; + repeated Var inputs = 2; + repeated Var outputs = 3; + repeated Attr attrs = 4; + required string comment = 5; +} + +message VarType { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + + // Other types that may need additional descriptions + LOD_TENSOR = 7; + SELECTED_ROWS = 8; + FEED_MINIBATCH = 9; + FETCH_LIST = 10; + STEP_SCOPES = 11; + LOD_RANK_TABLE = 12; + LOD_TENSOR_ARRAY = 13; + PLACE_LIST = 14; + READER = 15; + CHANNEL = 16; + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW = 17; + TUPLE = 18; + } + + required Type type = 1; + + message TensorDesc { + // Should only be PODType. Is enforced in C++ + required Type data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] + } + optional TensorDesc selected_rows = 2; + + message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorDesc lod_tensor = 3; + + message LoDTensorArrayDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorArrayDesc tensor_array = 4; + + message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } + optional ReaderDesc reader = 5; + + message ChannelDesc { + required Type data_type = 1; + required int64 capacity = 2; + } + optional ChannelDesc channel = 6; + + message Tuple { repeated Type element_type = 1; } + optional Tuple tuple = 7; +} + +message VarDesc { + required string name = 1; + required VarType type = 2; + optional bool persistable = 3 [ default = false ]; +} + +message BlockDesc { + required int32 idx = 1; + required int32 parent_idx = 2; + repeated VarDesc vars = 3; + repeated OpDesc ops = 4; + optional int32 forward_block_idx = 5 [ default = -1 ]; +} + +// Please refer to +// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md +// for more details. +// TODO(panyx0718): A model can have multiple programs. Need a +// way to distinguish them. Maybe ID or name? +message ProgramDesc { repeated BlockDesc blocks = 1; } diff --git a/python/tools/mdl2fluid/framework_pb2.py b/python/tools/mdl2fluid/framework_pb2.py new file mode 100644 index 0000000000000000000000000000000000000000..3a43deebc91d42e9eb38cf9940020238041d81da --- /dev/null +++ b/python/tools/mdl2fluid/framework_pb2.py @@ -0,0 +1,1141 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: framework.proto + +import sys +_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='framework.proto', + package='paddle_mobile.framework.proto', + syntax='proto2', + serialized_pb=_b('\n\x0f\x66ramework.proto\x12\x1dpaddle_mobile.framework.proto\"\xe5\x03\n\x06OpDesc\x12\x0c\n\x04type\x18\x03 \x02(\t\x12\x39\n\x06inputs\x18\x01 \x03(\x0b\x32).paddle_mobile.framework.proto.OpDesc.Var\x12:\n\x07outputs\x18\x02 \x03(\x0b\x32).paddle_mobile.framework.proto.OpDesc.Var\x12\x39\n\x05\x61ttrs\x18\x04 \x03(\x0b\x32*.paddle_mobile.framework.proto.OpDesc.Attr\x12\x18\n\tis_target\x18\x05 \x01(\x08:\x05\x66\x61lse\x1a\xd3\x01\n\x04\x41ttr\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x35\n\x04type\x18\x02 \x02(\x0e\x32\'.paddle_mobile.framework.proto.AttrType\x12\t\n\x01i\x18\x03 \x01(\x05\x12\t\n\x01\x66\x18\x04 \x01(\x02\x12\t\n\x01s\x18\x05 \x01(\t\x12\x0c\n\x04ints\x18\x06 \x03(\x05\x12\x0e\n\x06\x66loats\x18\x07 \x03(\x02\x12\x0f\n\x07strings\x18\x08 \x03(\t\x12\t\n\x01\x62\x18\n \x01(\x08\x12\r\n\x05\x62ools\x18\x0b \x03(\x08\x12\x11\n\tblock_idx\x18\x0c \x01(\x05\x12\t\n\x01l\x18\r \x01(\x03\x1a+\n\x03Var\x12\x11\n\tparameter\x18\x01 \x02(\t\x12\x11\n\targuments\x18\x02 \x03(\t\"\xcf\x03\n\x07OpProto\x12\x0c\n\x04type\x18\x01 \x02(\t\x12:\n\x06inputs\x18\x02 \x03(\x0b\x32*.paddle_mobile.framework.proto.OpProto.Var\x12;\n\x07outputs\x18\x03 \x03(\x0b\x32*.paddle_mobile.framework.proto.OpProto.Var\x12:\n\x05\x61ttrs\x18\x04 \x03(\x0b\x32+.paddle_mobile.framework.proto.OpProto.Attr\x12\x0f\n\x07\x63omment\x18\x05 \x02(\t\x1ax\n\x03Var\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x0f\n\x07\x63omment\x18\x02 \x02(\t\x12\x19\n\nduplicable\x18\x03 \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0cintermediate\x18\x04 \x01(\x08:\x05\x66\x61lse\x12\x1a\n\x0b\x64ispensable\x18\x05 \x01(\x08:\x05\x66\x61lse\x1av\n\x04\x41ttr\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x35\n\x04type\x18\x02 \x02(\x0e\x32\'.paddle_mobile.framework.proto.AttrType\x12\x0f\n\x07\x63omment\x18\x03 \x02(\t\x12\x18\n\tgenerated\x18\x04 \x01(\x08:\x05\x66\x61lse\"\xb9\n\n\x07VarType\x12\x39\n\x04type\x18\x01 \x02(\x0e\x32+.paddle_mobile.framework.proto.VarType.Type\x12H\n\rselected_rows\x18\x02 \x01(\x0b\x32\x31.paddle_mobile.framework.proto.VarType.TensorDesc\x12H\n\nlod_tensor\x18\x03 \x01(\x0b\x32\x34.paddle_mobile.framework.proto.VarType.LoDTensorDesc\x12O\n\x0ctensor_array\x18\x04 \x01(\x0b\x32\x39.paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc\x12\x41\n\x06reader\x18\x05 \x01(\x0b\x32\x31.paddle_mobile.framework.proto.VarType.ReaderDesc\x12\x43\n\x07\x63hannel\x18\x06 \x01(\x0b\x32\x32.paddle_mobile.framework.proto.VarType.ChannelDesc\x12;\n\x05tuple\x18\x07 \x01(\x0b\x32,.paddle_mobile.framework.proto.VarType.Tuple\x1aZ\n\nTensorDesc\x12>\n\tdata_type\x18\x01 \x02(\x0e\x32+.paddle_mobile.framework.proto.VarType.Type\x12\x0c\n\x04\x64ims\x18\x02 \x03(\x03\x1ah\n\rLoDTensorDesc\x12\x41\n\x06tensor\x18\x01 \x02(\x0b\x32\x31.paddle_mobile.framework.proto.VarType.TensorDesc\x12\x14\n\tlod_level\x18\x02 
\x01(\x05:\x01\x30\x1am\n\x12LoDTensorArrayDesc\x12\x41\n\x06tensor\x18\x01 \x02(\x0b\x32\x31.paddle_mobile.framework.proto.VarType.TensorDesc\x12\x14\n\tlod_level\x18\x02 \x01(\x05:\x01\x30\x1aV\n\nReaderDesc\x12H\n\nlod_tensor\x18\x01 \x03(\x0b\x32\x34.paddle_mobile.framework.proto.VarType.LoDTensorDesc\x1a_\n\x0b\x43hannelDesc\x12>\n\tdata_type\x18\x01 \x02(\x0e\x32+.paddle_mobile.framework.proto.VarType.Type\x12\x10\n\x08\x63\x61pacity\x18\x02 \x02(\x03\x1aJ\n\x05Tuple\x12\x41\n\x0c\x65lement_type\x18\x01 \x03(\x0e\x32+.paddle_mobile.framework.proto.VarType.Type\"\x8e\x02\n\x04Type\x12\x08\n\x04\x42OOL\x10\x00\x12\t\n\x05INT16\x10\x01\x12\t\n\x05INT32\x10\x02\x12\t\n\x05INT64\x10\x03\x12\x08\n\x04\x46P16\x10\x04\x12\x08\n\x04\x46P32\x10\x05\x12\x08\n\x04\x46P64\x10\x06\x12\x0e\n\nLOD_TENSOR\x10\x07\x12\x11\n\rSELECTED_ROWS\x10\x08\x12\x12\n\x0e\x46\x45\x45\x44_MINIBATCH\x10\t\x12\x0e\n\nFETCH_LIST\x10\n\x12\x0f\n\x0bSTEP_SCOPES\x10\x0b\x12\x12\n\x0eLOD_RANK_TABLE\x10\x0c\x12\x14\n\x10LOD_TENSOR_ARRAY\x10\r\x12\x0e\n\nPLACE_LIST\x10\x0e\x12\n\n\x06READER\x10\x0f\x12\x0b\n\x07\x43HANNEL\x10\x10\x12\x07\n\x03RAW\x10\x11\x12\t\n\x05TUPLE\x10\x12\"i\n\x07VarDesc\x12\x0c\n\x04name\x18\x01 \x02(\t\x12\x34\n\x04type\x18\x02 \x02(\x0b\x32&.paddle_mobile.framework.proto.VarType\x12\x1a\n\x0bpersistable\x18\x03 \x01(\x08:\x05\x66\x61lse\"\xb5\x01\n\tBlockDesc\x12\x0b\n\x03idx\x18\x01 \x02(\x05\x12\x12\n\nparent_idx\x18\x02 \x02(\x05\x12\x34\n\x04vars\x18\x03 \x03(\x0b\x32&.paddle_mobile.framework.proto.VarDesc\x12\x32\n\x03ops\x18\x04 \x03(\x0b\x32%.paddle_mobile.framework.proto.OpDesc\x12\x1d\n\x11\x66orward_block_idx\x18\x05 \x01(\x05:\x02-1\"G\n\x0bProgramDesc\x12\x38\n\x06\x62locks\x18\x01 \x03(\x0b\x32(.paddle_mobile.framework.proto.BlockDesc*}\n\x08\x41ttrType\x12\x07\n\x03INT\x10\x00\x12\t\n\x05\x46LOAT\x10\x01\x12\n\n\x06STRING\x10\x02\x12\x08\n\x04INTS\x10\x03\x12\n\n\x06\x46LOATS\x10\x04\x12\x0b\n\x07STRINGS\x10\x05\x12\x0b\n\x07\x42OOLEAN\x10\x06\x12\x0c\n\x08\x42OOLEANS\x10\x07\x12\t\n\x05\x42LOCK\x10\x08\x12\x08\n\x04LONG\x10\tB\x02H\x03') +) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +_ATTRTYPE = _descriptor.EnumDescriptor( + name='AttrType', + full_name='paddle_mobile.framework.proto.AttrType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='INT', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FLOAT', index=1, number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='STRING', index=2, number=2, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='INTS', index=3, number=3, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FLOATS', index=4, number=4, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='STRINGS', index=5, number=5, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='BOOLEAN', index=6, number=6, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='BOOLEANS', index=7, number=7, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='BLOCK', index=8, number=8, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='LONG', index=9, number=9, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=2708, + serialized_end=2833, +) +_sym_db.RegisterEnumDescriptor(_ATTRTYPE) + +AttrType = enum_type_wrapper.EnumTypeWrapper(_ATTRTYPE) +INT = 0 +FLOAT = 1 +STRING = 2 +INTS = 3 +FLOATS = 4 
+STRINGS = 5 +BOOLEAN = 6 +BOOLEANS = 7 +BLOCK = 8 +LONG = 9 + + +_VARTYPE_TYPE = _descriptor.EnumDescriptor( + name='Type', + full_name='paddle_mobile.framework.proto.VarType.Type', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='BOOL', index=0, number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='INT16', index=1, number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='INT32', index=2, number=2, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='INT64', index=3, number=3, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FP16', index=4, number=4, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FP32', index=5, number=5, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FP64', index=6, number=6, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='LOD_TENSOR', index=7, number=7, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='SELECTED_ROWS', index=8, number=8, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FEED_MINIBATCH', index=9, number=9, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='FETCH_LIST', index=10, number=10, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='STEP_SCOPES', index=11, number=11, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='LOD_RANK_TABLE', index=12, number=12, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='LOD_TENSOR_ARRAY', index=13, number=13, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PLACE_LIST', index=14, number=14, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='READER', index=15, number=15, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='CHANNEL', index=16, number=16, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='RAW', index=17, number=17, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='TUPLE', index=18, number=18, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=2072, + serialized_end=2342, +) +_sym_db.RegisterEnumDescriptor(_VARTYPE_TYPE) + + +_OPDESC_ATTR = _descriptor.Descriptor( + name='Attr', + full_name='paddle_mobile.framework.proto.OpDesc.Attr', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='paddle_mobile.framework.proto.OpDesc.Attr.name', index=0, + number=1, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='type', full_name='paddle_mobile.framework.proto.OpDesc.Attr.type', index=1, + number=2, type=14, cpp_type=8, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='i', full_name='paddle_mobile.framework.proto.OpDesc.Attr.i', index=2, + number=3, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='f', 
full_name='paddle_mobile.framework.proto.OpDesc.Attr.f', index=3, + number=4, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='s', full_name='paddle_mobile.framework.proto.OpDesc.Attr.s', index=4, + number=5, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ints', full_name='paddle_mobile.framework.proto.OpDesc.Attr.ints', index=5, + number=6, type=5, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='floats', full_name='paddle_mobile.framework.proto.OpDesc.Attr.floats', index=6, + number=7, type=2, cpp_type=6, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='strings', full_name='paddle_mobile.framework.proto.OpDesc.Attr.strings', index=7, + number=8, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='b', full_name='paddle_mobile.framework.proto.OpDesc.Attr.b', index=8, + number=10, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='bools', full_name='paddle_mobile.framework.proto.OpDesc.Attr.bools', index=9, + number=11, type=8, cpp_type=7, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='block_idx', full_name='paddle_mobile.framework.proto.OpDesc.Attr.block_idx', index=10, + number=12, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='l', full_name='paddle_mobile.framework.proto.OpDesc.Attr.l', index=11, + number=13, type=3, cpp_type=2, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=280, + serialized_end=491, +) + +_OPDESC_VAR = _descriptor.Descriptor( + name='Var', + full_name='paddle_mobile.framework.proto.OpDesc.Var', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='parameter', full_name='paddle_mobile.framework.proto.OpDesc.Var.parameter', index=0, + number=1, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, 
containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='arguments', full_name='paddle_mobile.framework.proto.OpDesc.Var.arguments', index=1, + number=2, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=493, + serialized_end=536, +) + +_OPDESC = _descriptor.Descriptor( + name='OpDesc', + full_name='paddle_mobile.framework.proto.OpDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='type', full_name='paddle_mobile.framework.proto.OpDesc.type', index=0, + number=3, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='inputs', full_name='paddle_mobile.framework.proto.OpDesc.inputs', index=1, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='outputs', full_name='paddle_mobile.framework.proto.OpDesc.outputs', index=2, + number=2, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='attrs', full_name='paddle_mobile.framework.proto.OpDesc.attrs', index=3, + number=4, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='is_target', full_name='paddle_mobile.framework.proto.OpDesc.is_target', index=4, + number=5, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[_OPDESC_ATTR, _OPDESC_VAR, ], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=51, + serialized_end=536, +) + + +_OPPROTO_VAR = _descriptor.Descriptor( + name='Var', + full_name='paddle_mobile.framework.proto.OpProto.Var', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='paddle_mobile.framework.proto.OpProto.Var.name', index=0, + number=1, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='comment', full_name='paddle_mobile.framework.proto.OpProto.Var.comment', index=1, + number=2, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + 
_descriptor.FieldDescriptor( + name='duplicable', full_name='paddle_mobile.framework.proto.OpProto.Var.duplicable', index=2, + number=3, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='intermediate', full_name='paddle_mobile.framework.proto.OpProto.Var.intermediate', index=3, + number=4, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dispensable', full_name='paddle_mobile.framework.proto.OpProto.Var.dispensable', index=4, + number=5, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=762, + serialized_end=882, +) + +_OPPROTO_ATTR = _descriptor.Descriptor( + name='Attr', + full_name='paddle_mobile.framework.proto.OpProto.Attr', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='paddle_mobile.framework.proto.OpProto.Attr.name', index=0, + number=1, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='type', full_name='paddle_mobile.framework.proto.OpProto.Attr.type', index=1, + number=2, type=14, cpp_type=8, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='comment', full_name='paddle_mobile.framework.proto.OpProto.Attr.comment', index=2, + number=3, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='generated', full_name='paddle_mobile.framework.proto.OpProto.Attr.generated', index=3, + number=4, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=884, + serialized_end=1002, +) + +_OPPROTO = _descriptor.Descriptor( + name='OpProto', + full_name='paddle_mobile.framework.proto.OpProto', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='type', full_name='paddle_mobile.framework.proto.OpProto.type', index=0, + number=1, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='inputs', 
full_name='paddle_mobile.framework.proto.OpProto.inputs', index=1, + number=2, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='outputs', full_name='paddle_mobile.framework.proto.OpProto.outputs', index=2, + number=3, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='attrs', full_name='paddle_mobile.framework.proto.OpProto.attrs', index=3, + number=4, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='comment', full_name='paddle_mobile.framework.proto.OpProto.comment', index=4, + number=5, type=9, cpp_type=9, label=2, + has_default_value=False, default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[_OPPROTO_VAR, _OPPROTO_ATTR, ], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=539, + serialized_end=1002, +) + + +_VARTYPE_TENSORDESC = _descriptor.Descriptor( + name='TensorDesc', + full_name='paddle_mobile.framework.proto.VarType.TensorDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='data_type', full_name='paddle_mobile.framework.proto.VarType.TensorDesc.data_type', index=0, + number=1, type=14, cpp_type=8, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dims', full_name='paddle_mobile.framework.proto.VarType.TensorDesc.dims', index=1, + number=2, type=3, cpp_type=2, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1501, + serialized_end=1591, +) + +_VARTYPE_LODTENSORDESC = _descriptor.Descriptor( + name='LoDTensorDesc', + full_name='paddle_mobile.framework.proto.VarType.LoDTensorDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='tensor', full_name='paddle_mobile.framework.proto.VarType.LoDTensorDesc.tensor', index=0, + number=1, type=11, cpp_type=10, label=2, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='lod_level', full_name='paddle_mobile.framework.proto.VarType.LoDTensorDesc.lod_level', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + 
options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1593, + serialized_end=1697, +) + +_VARTYPE_LODTENSORARRAYDESC = _descriptor.Descriptor( + name='LoDTensorArrayDesc', + full_name='paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='tensor', full_name='paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc.tensor', index=0, + number=1, type=11, cpp_type=10, label=2, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='lod_level', full_name='paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc.lod_level', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1699, + serialized_end=1808, +) + +_VARTYPE_READERDESC = _descriptor.Descriptor( + name='ReaderDesc', + full_name='paddle_mobile.framework.proto.VarType.ReaderDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='lod_tensor', full_name='paddle_mobile.framework.proto.VarType.ReaderDesc.lod_tensor', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1810, + serialized_end=1896, +) + +_VARTYPE_CHANNELDESC = _descriptor.Descriptor( + name='ChannelDesc', + full_name='paddle_mobile.framework.proto.VarType.ChannelDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='data_type', full_name='paddle_mobile.framework.proto.VarType.ChannelDesc.data_type', index=0, + number=1, type=14, cpp_type=8, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='capacity', full_name='paddle_mobile.framework.proto.VarType.ChannelDesc.capacity', index=1, + number=2, type=3, cpp_type=2, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1898, + serialized_end=1993, +) + +_VARTYPE_TUPLE = _descriptor.Descriptor( + name='Tuple', + full_name='paddle_mobile.framework.proto.VarType.Tuple', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='element_type', full_name='paddle_mobile.framework.proto.VarType.Tuple.element_type', index=0, + number=1, type=14, cpp_type=8, label=3, + has_default_value=False, default_value=[], + 
message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1995, + serialized_end=2069, +) + +_VARTYPE = _descriptor.Descriptor( + name='VarType', + full_name='paddle_mobile.framework.proto.VarType', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='type', full_name='paddle_mobile.framework.proto.VarType.type', index=0, + number=1, type=14, cpp_type=8, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='selected_rows', full_name='paddle_mobile.framework.proto.VarType.selected_rows', index=1, + number=2, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='lod_tensor', full_name='paddle_mobile.framework.proto.VarType.lod_tensor', index=2, + number=3, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='tensor_array', full_name='paddle_mobile.framework.proto.VarType.tensor_array', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='reader', full_name='paddle_mobile.framework.proto.VarType.reader', index=4, + number=5, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='channel', full_name='paddle_mobile.framework.proto.VarType.channel', index=5, + number=6, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='tuple', full_name='paddle_mobile.framework.proto.VarType.tuple', index=6, + number=7, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[_VARTYPE_TENSORDESC, _VARTYPE_LODTENSORDESC, _VARTYPE_LODTENSORARRAYDESC, _VARTYPE_READERDESC, _VARTYPE_CHANNELDESC, _VARTYPE_TUPLE, ], + enum_types=[ + _VARTYPE_TYPE, + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1005, + serialized_end=2342, +) + + +_VARDESC = _descriptor.Descriptor( + name='VarDesc', + full_name='paddle_mobile.framework.proto.VarDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='paddle_mobile.framework.proto.VarDesc.name', index=0, + number=1, type=9, cpp_type=9, label=2, + has_default_value=False, 
default_value=_b("").decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='type', full_name='paddle_mobile.framework.proto.VarDesc.type', index=1, + number=2, type=11, cpp_type=10, label=2, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='persistable', full_name='paddle_mobile.framework.proto.VarDesc.persistable', index=2, + number=3, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2344, + serialized_end=2449, +) + + +_BLOCKDESC = _descriptor.Descriptor( + name='BlockDesc', + full_name='paddle_mobile.framework.proto.BlockDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='idx', full_name='paddle_mobile.framework.proto.BlockDesc.idx', index=0, + number=1, type=5, cpp_type=1, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='parent_idx', full_name='paddle_mobile.framework.proto.BlockDesc.parent_idx', index=1, + number=2, type=5, cpp_type=1, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='vars', full_name='paddle_mobile.framework.proto.BlockDesc.vars', index=2, + number=3, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ops', full_name='paddle_mobile.framework.proto.BlockDesc.ops', index=3, + number=4, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='forward_block_idx', full_name='paddle_mobile.framework.proto.BlockDesc.forward_block_idx', index=4, + number=5, type=5, cpp_type=1, label=1, + has_default_value=True, default_value=-1, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2452, + serialized_end=2633, +) + + +_PROGRAMDESC = _descriptor.Descriptor( + name='ProgramDesc', + full_name='paddle_mobile.framework.proto.ProgramDesc', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='blocks', full_name='paddle_mobile.framework.proto.ProgramDesc.blocks', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, 
extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[ + ], + serialized_start=2635, + serialized_end=2706, +) + +_OPDESC_ATTR.fields_by_name['type'].enum_type = _ATTRTYPE +_OPDESC_ATTR.containing_type = _OPDESC +_OPDESC_VAR.containing_type = _OPDESC +_OPDESC.fields_by_name['inputs'].message_type = _OPDESC_VAR +_OPDESC.fields_by_name['outputs'].message_type = _OPDESC_VAR +_OPDESC.fields_by_name['attrs'].message_type = _OPDESC_ATTR +_OPPROTO_VAR.containing_type = _OPPROTO +_OPPROTO_ATTR.fields_by_name['type'].enum_type = _ATTRTYPE +_OPPROTO_ATTR.containing_type = _OPPROTO +_OPPROTO.fields_by_name['inputs'].message_type = _OPPROTO_VAR +_OPPROTO.fields_by_name['outputs'].message_type = _OPPROTO_VAR +_OPPROTO.fields_by_name['attrs'].message_type = _OPPROTO_ATTR +_VARTYPE_TENSORDESC.fields_by_name['data_type'].enum_type = _VARTYPE_TYPE +_VARTYPE_TENSORDESC.containing_type = _VARTYPE +_VARTYPE_LODTENSORDESC.fields_by_name['tensor'].message_type = _VARTYPE_TENSORDESC +_VARTYPE_LODTENSORDESC.containing_type = _VARTYPE +_VARTYPE_LODTENSORARRAYDESC.fields_by_name['tensor'].message_type = _VARTYPE_TENSORDESC +_VARTYPE_LODTENSORARRAYDESC.containing_type = _VARTYPE +_VARTYPE_READERDESC.fields_by_name['lod_tensor'].message_type = _VARTYPE_LODTENSORDESC +_VARTYPE_READERDESC.containing_type = _VARTYPE +_VARTYPE_CHANNELDESC.fields_by_name['data_type'].enum_type = _VARTYPE_TYPE +_VARTYPE_CHANNELDESC.containing_type = _VARTYPE +_VARTYPE_TUPLE.fields_by_name['element_type'].enum_type = _VARTYPE_TYPE +_VARTYPE_TUPLE.containing_type = _VARTYPE +_VARTYPE.fields_by_name['type'].enum_type = _VARTYPE_TYPE +_VARTYPE.fields_by_name['selected_rows'].message_type = _VARTYPE_TENSORDESC +_VARTYPE.fields_by_name['lod_tensor'].message_type = _VARTYPE_LODTENSORDESC +_VARTYPE.fields_by_name['tensor_array'].message_type = _VARTYPE_LODTENSORARRAYDESC +_VARTYPE.fields_by_name['reader'].message_type = _VARTYPE_READERDESC +_VARTYPE.fields_by_name['channel'].message_type = _VARTYPE_CHANNELDESC +_VARTYPE.fields_by_name['tuple'].message_type = _VARTYPE_TUPLE +_VARTYPE_TYPE.containing_type = _VARTYPE +_VARDESC.fields_by_name['type'].message_type = _VARTYPE +_BLOCKDESC.fields_by_name['vars'].message_type = _VARDESC +_BLOCKDESC.fields_by_name['ops'].message_type = _OPDESC +_PROGRAMDESC.fields_by_name['blocks'].message_type = _BLOCKDESC +DESCRIPTOR.message_types_by_name['OpDesc'] = _OPDESC +DESCRIPTOR.message_types_by_name['OpProto'] = _OPPROTO +DESCRIPTOR.message_types_by_name['VarType'] = _VARTYPE +DESCRIPTOR.message_types_by_name['VarDesc'] = _VARDESC +DESCRIPTOR.message_types_by_name['BlockDesc'] = _BLOCKDESC +DESCRIPTOR.message_types_by_name['ProgramDesc'] = _PROGRAMDESC +DESCRIPTOR.enum_types_by_name['AttrType'] = _ATTRTYPE + +OpDesc = _reflection.GeneratedProtocolMessageType('OpDesc', (_message.Message,), dict( + + Attr = _reflection.GeneratedProtocolMessageType('Attr', (_message.Message,), dict( + DESCRIPTOR = _OPDESC_ATTR, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpDesc.Attr) + )) + , + + Var = _reflection.GeneratedProtocolMessageType('Var', (_message.Message,), dict( + DESCRIPTOR = _OPDESC_VAR, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpDesc.Var) + )) + , + DESCRIPTOR = _OPDESC, + __module__ = 'framework_pb2' + # 
@@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpDesc) + )) +_sym_db.RegisterMessage(OpDesc) +_sym_db.RegisterMessage(OpDesc.Attr) +_sym_db.RegisterMessage(OpDesc.Var) + +OpProto = _reflection.GeneratedProtocolMessageType('OpProto', (_message.Message,), dict( + + Var = _reflection.GeneratedProtocolMessageType('Var', (_message.Message,), dict( + DESCRIPTOR = _OPPROTO_VAR, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpProto.Var) + )) + , + + Attr = _reflection.GeneratedProtocolMessageType('Attr', (_message.Message,), dict( + DESCRIPTOR = _OPPROTO_ATTR, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpProto.Attr) + )) + , + DESCRIPTOR = _OPPROTO, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.OpProto) + )) +_sym_db.RegisterMessage(OpProto) +_sym_db.RegisterMessage(OpProto.Var) +_sym_db.RegisterMessage(OpProto.Attr) + +VarType = _reflection.GeneratedProtocolMessageType('VarType', (_message.Message,), dict( + + TensorDesc = _reflection.GeneratedProtocolMessageType('TensorDesc', (_message.Message,), dict( + DESCRIPTOR = _VARTYPE_TENSORDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.TensorDesc) + )) + , + + LoDTensorDesc = _reflection.GeneratedProtocolMessageType('LoDTensorDesc', (_message.Message,), dict( + DESCRIPTOR = _VARTYPE_LODTENSORDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.LoDTensorDesc) + )) + , + + LoDTensorArrayDesc = _reflection.GeneratedProtocolMessageType('LoDTensorArrayDesc', (_message.Message,), dict( + DESCRIPTOR = _VARTYPE_LODTENSORARRAYDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc) + )) + , + + ReaderDesc = _reflection.GeneratedProtocolMessageType('ReaderDesc', (_message.Message,), dict( + DESCRIPTOR = _VARTYPE_READERDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.ReaderDesc) + )) + , + + ChannelDesc = _reflection.GeneratedProtocolMessageType('ChannelDesc', (_message.Message,), dict( + DESCRIPTOR = _VARTYPE_CHANNELDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.ChannelDesc) + )) + , + + Tuple = _reflection.GeneratedProtocolMessageType('Tuple', (_message.Message,), dict( + DESCRIPTOR = _VARTYPE_TUPLE, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType.Tuple) + )) + , + DESCRIPTOR = _VARTYPE, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarType) + )) +_sym_db.RegisterMessage(VarType) +_sym_db.RegisterMessage(VarType.TensorDesc) +_sym_db.RegisterMessage(VarType.LoDTensorDesc) +_sym_db.RegisterMessage(VarType.LoDTensorArrayDesc) +_sym_db.RegisterMessage(VarType.ReaderDesc) +_sym_db.RegisterMessage(VarType.ChannelDesc) +_sym_db.RegisterMessage(VarType.Tuple) + +VarDesc = _reflection.GeneratedProtocolMessageType('VarDesc', (_message.Message,), dict( + DESCRIPTOR = _VARDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.VarDesc) + )) +_sym_db.RegisterMessage(VarDesc) + +BlockDesc = _reflection.GeneratedProtocolMessageType('BlockDesc', (_message.Message,), dict( + 
DESCRIPTOR = _BLOCKDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.BlockDesc) + )) +_sym_db.RegisterMessage(BlockDesc) + +ProgramDesc = _reflection.GeneratedProtocolMessageType('ProgramDesc', (_message.Message,), dict( + DESCRIPTOR = _PROGRAMDESC, + __module__ = 'framework_pb2' + # @@protoc_insertion_point(class_scope:paddle_mobile.framework.proto.ProgramDesc) + )) +_sym_db.RegisterMessage(ProgramDesc) + + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('H\003')) +# @@protoc_insertion_point(module_scope) diff --git a/python/tools/mdl2fluid/loader.py b/python/tools/mdl2fluid/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..ef2258e365a84003b7b90ac480abbd9798f48f59 --- /dev/null +++ b/python/tools/mdl2fluid/loader.py @@ -0,0 +1,18 @@ +import datetime +import json +import os + +import google.protobuf as pbg +import framework_pb2 as framework_pb2 + + +def loadmdl(json_path): + print('mdl json path : ' + json_path) + with open(json_path, 'r') as f: + json_dick = json.load(f) + # print(json_dick) + layers = (json_dick['layer']) + for layer in layers: + print(layer) + + diff --git a/python/tools/mdl2fluid/mdl2fluid.py b/python/tools/mdl2fluid/mdl2fluid.py new file mode 100644 index 0000000000000000000000000000000000000000..a57a01d09eaf236fd9f890dcb9e8eead19aa7868 --- /dev/null +++ b/python/tools/mdl2fluid/mdl2fluid.py @@ -0,0 +1,335 @@ +import json +import os + +import framework_pb2 as framework_pb2 +import op_types as types +from swicher import Swichter +import shutil + + +def load_mdl(mdl_json_path): + # print('mdl json path : ' + mdl_json_path) + with open(mdl_json_path, 'r') as f: + return json.load(f) + + +class Converter: + 'convert mdlmodel to fluidmodel' + + def __init__(self, mdl_json_path): + self.mdl_json_path = mdl_json_path + print mdl_json_path + self.mdl_json = load_mdl(self.mdl_json_path) + self.program_desc = framework_pb2.ProgramDesc() + self.weight_list_ = [] + self.deepwise_weight_list_ = [] + # print(json_dick) + # layers = (json_dick['layer']) + # for layer in layers: + # print(layer) + + def convert(self): + print 'convert begin.....' + # add block_desc + block_desc = self.program_desc.blocks.add() + block_desc.idx = 0 + block_desc.parent_idx = -1 + self.package_ops(block_desc) + self.package_vars(block_desc) + print 'blocks: ' + print self.program_desc.blocks + print 'convert end.....' 
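+        # write-out step: serialize the ProgramDesc, rebuild newyolo/ from the converted nchw weights, then write the bytes out as the fluid __model__ file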
+        desc_serialize_to_string = self.program_desc.SerializeToString()
+        shutil.rmtree('newyolo/')
+        shutil.copytree('multiobjects/float32s_nchw_with_head', 'newyolo/')
+
+        f = open("newyolo/__model__", "wb")
+        f.write(desc_serialize_to_string)
+        f.close()
+
+    def package_ops(self, block_desc):
+
+        self.add_op_feed(block_desc)
+
+        # add ops with layer
+        if 'layer' in self.mdl_json:
+
+            layers_ = self.mdl_json['layer']
+            for layer in layers_:
+                desc_ops_add = block_desc.ops.add()
+
+                # print layer
+                # for i in layer:
+                #     print i
+                if 'name' in layer:
+                    l_name = layer['name']
+                if 'type' in layer:
+                    self.package_ops_type(desc_ops_add, layer)
+
+                if 'weight' in layer:
+                    self.package_ops_weight2inputs(desc_ops_add, layer)
+
+                if 'output' in layer:
+                    self.package_ops_outputs(desc_ops_add, layer)
+
+                if 'input' in layer:
+                    self.package_ops_inputs(desc_ops_add, layer)
+
+                self.package_ops_attrs(desc_ops_add, layer)
+
+        self.add_op_fetch(block_desc)
+
+    def add_op_feed(self, block_desc):
+        desc_ops_add = block_desc.ops.add()
+        inputs_add = desc_ops_add.inputs.add()
+        inputs_add.parameter = 'X'
+        inputs_add.arguments.append('feed')
+        desc_ops_add.type = 'feed'
+        outputs_add = desc_ops_add.outputs.add()
+        outputs_add.parameter = 'Out'
+        outputs_add.arguments.append('data')
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'col'
+        # int
+        attrs_add.type = 0
+        attrs_add.i = 0
+
+    def add_op_fetch(self, block_desc):
+        desc_ops_add = block_desc.ops.add()
+        inputs_add = desc_ops_add.inputs.add()
+        inputs_add.parameter = 'X'
+        inputs_add.arguments.append('conv_pred_87')
+        desc_ops_add.type = 'fetch'
+        outputs_add = desc_ops_add.outputs.add()
+        outputs_add.parameter = 'Out'
+        outputs_add.arguments.append('fetch')
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'col'
+        # int
+        attrs_add.type = 0
+        attrs_add.i = 0
+
+    @staticmethod
+    def package_ops_attrs(desc_ops_add, layer):
+        # print l_params
+        # print desc_ops_add.type
+        if desc_ops_add.type == types.op_fluid_fusion_conv_add:
+            Converter.pack_fusion_conv_add_attr(desc_ops_add, layer)
+        elif desc_ops_add.type == types.op_fluid_relu:
+            # relu : attrs
+            attrs_add = desc_ops_add.attrs.add()
+            attrs_add.name = 'use_mkldnn'
+            # boolean
+            attrs_add.type = 6
+            attrs_add.b = 0
+
+    @staticmethod
+    def pack_fusion_conv_add_attr(desc_ops_add, layer):
+
+        # fusion_conv_add : attrs
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'workspace_size_MB'
+        # 0-->INT
+        attrs_add.type = 0
+        attrs_add.i = 4096
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'data_format'
+        # 2-->STRING
+        attrs_add.type = 2
+        attrs_add.s = 'AnyLayout'
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'use_mkldnn'
+        # boolean
+        attrs_add.type = 6
+        attrs_add.b = 0
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'use_cudnn'
+        # boolean
+        attrs_add.type = 6
+        attrs_add.b = 1
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'dilations'
+        # ints
+        attrs_add.type = 3
+        attrs_add.ints.append(1)
+        attrs_add.ints.append(1)
+
+        attrs_add = desc_ops_add.attrs.add()
+        attrs_add.name = 'axis'
+        # int
+        attrs_add.type = 0
+        attrs_add.i = 1
+
+        if 'param' in layer:
+            l_params = layer['param']
+
+            attrs_add = desc_ops_add.attrs.add()
+            attrs_add.name = 'paddings'
+            # ints
+            attrs_add.type = 3
+            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')])
+            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')])
+
+            attrs_add = desc_ops_add.attrs.add()
+            attrs_add.name = 'strides'
+            # ints
+            attrs_add.type = 3
+            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')])
+            attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')])
+
+            attrs_add = desc_ops_add.attrs.add()
+            attrs_add.name = 'groups'
+            # int
+            attrs_add.type = 0
+            attrs_add.i = l_params[types.fusion_conv_add_attrs_dict.get('groups')]
+            # attrs_add.i = 1
+
+        #
+        # op_attrs_tupl = types.op_io_dict.get(desc_ops_add.type) \
+        #     .get(types.mdl_attrs_key)
+        #
+        #
+        #
+        #
+        # # group stride padding
+        # print '----------------------'
+        # for i, val in enumerate(op_attrs_tupl):
+        #     attrs_add = desc_ops_add.attrs.add()
+        #     attr_name = op_attrs_tupl[i]
+        #     print attr_name
+        #     attrs_add.name = attr_name
+        #     attrs_add.type = types.fluid_attrs_type_dict.get(attr_name)
+        #     attrs_add.
+        #     print l_params[types.fusion_conv_add_attrs_dict.get(attr_name)]
+
+        # for p in l_params:
+        #     attrs_add = desc_ops_add.attrs.add()
+
+    @staticmethod
+    def package_ops_inputs(desc_ops_add, layer):
+        l_inputs = layer['input']
+        for i in l_inputs:
+            inputs_add = desc_ops_add.inputs.add()
+            # print i
+            inputs_add.parameter = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_inputs_key)
+            inputs_add.arguments.append(i)
+
+    @staticmethod
+    def package_ops_outputs(desc_ops_add, layer):
+        l_outputs = layer['output']
+        for o in l_outputs:
+            # print o
+            outputs_add = desc_ops_add.outputs.add()
+            outputs_add.parameter = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_outputs_key)
+            outputs_add.arguments.append(o)
+
+    def package_ops_weight2inputs(self, desc_ops_add, layer):
+        l_weights = layer['weight']
+        for w in l_weights:
+            self.weight_list_.append(w)
+
+        if layer['type'] == 'DepthwiseConvolutionLayer':
+            # print l_weights[0]
+            self.deepwise_weight_list_.append(l_weights[0])
+
+        op_weight_tup = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_weight_key)
+        # print len(op_weight_tup)
+        for i, val in enumerate(op_weight_tup):
+            # print i
+            # print val
+            inputs_add = desc_ops_add.inputs.add()
+            inputs_add.parameter = op_weight_tup[i]
+            inputs_add.arguments.append(l_weights[i])
+
+        # for w in l_weights:
+        #     inputs_add = desc_ops_add.inputs.add()
+        #     # print w
+        #     inputs_add.parameter = op_weight_tup[0]
+        #     inputs_add.arguments.append(w)
+
+    @staticmethod
+    def package_ops_type(desc_ops_add, layer):
+        l_type = layer['type']
+        # print l_type
+        # print mdl2fluid_op_layer_dict.get(l_type)
+        desc_ops_add.type = types.mdl2fluid_op_layer_dict.get(l_type)
+
+    def package_vars(self, block_desc):
+        vars_add = block_desc.vars.add()
+        vars_add.name = 'feed'
+        vars_add.type.type = 9  # 9 is FEED_MINIBATCH
+        vars_add.persistable = 1
+        # fetch
+        vars_add = block_desc.vars.add()
+        vars_add.name = 'fetch'
+        vars_add.type.type = 10  # 10 is FETCH_LIST
+        vars_add.persistable = 1
+
+        json_matrix_ = self.mdl_json['matrix']
+        # print json_matrix_
+        for j in json_matrix_:
+            vars_add = block_desc.vars.add()
+            vars_add.name = j
+            vars_add.type.type = 7  # 7 is LOD_TENSOR
+            # print j
+            tensor = vars_add.type.lod_tensor.tensor
+            tensor.data_type = 5  # 5 is FP32
+
+            # print json_matrix_
+
+            dims_of_matrix = json_matrix_.get(j)
+            # dims_size = len(dims_of_matrix)
+            # print dims_size
+
+            # if dims_size == 4:
+            #     tensor.dims.append(dims_of_matrix[0])  # N
+            #     tensor.dims.append(dims_of_matrix[3])  # C
+            #     tensor.dims.append(dims_of_matrix[1])  # H
+            #     tensor.dims.append(dims_of_matrix[2])  # W
+            # else:
+
+            # issue in the mdl model: depthwise filters have N and C switched
+            if j in self.deepwise_weight_list_ and len(dims_of_matrix) == 4:
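+                # the first two dims are switched in the mdl file, so emit them in (N, C, H, W) order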
+                print j
+                tensor.dims.append(dims_of_matrix[1])
+                tensor.dims.append(dims_of_matrix[0])
+                tensor.dims.append(dims_of_matrix[2])
+                tensor.dims.append(dims_of_matrix[3])
+                print tensor.dims
+            else:
+                for dims in dims_of_matrix:
+                    # print dims
+                    tensor.dims.append(dims)
+
+            if j in self.weight_list_:
+                vars_add.persistable = 1
+                dims_size = len(dims_of_matrix)
+                # print dims_size
+                if dims_size == 4:
+                    # convert weight from nhwc to nchw
+                    Swichter().nhwc2nchw_one_slice_add_head(
+                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/' + j + '.bin',
+                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/' + j,
+                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/' + j + '.tmp',
+                        dims_of_matrix[0],
+                        dims_of_matrix[1],
+                        dims_of_matrix[2],
+                        dims_of_matrix[3]
+                    )
+                else:
+                    Swichter().copy_add_head(
+                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/' + j + '.bin',
+                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/' + j,
+                        '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/' + j + '.tmp'
+                    )
+            else:
+                vars_add.persistable = 0
+
+
+mdl_path = "/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/YOLO_Universal.json"
+converter = Converter(mdl_path)
+converter.convert()
diff --git a/python/tools/mdl2fluid/model_combine.py b/python/tools/mdl2fluid/model_combine.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae3ca8a786dc0d4032deda35c33f44d3d96e983d
--- /dev/null
+++ b/python/tools/mdl2fluid/model_combine.py
@@ -0,0 +1,19 @@
+# coding=utf-8
+import os
+
+path = "yolo_v2_tofile_source/"  # source directory
+to_file_path = "yolo_v2_tofile_combined/params"
+files = os.listdir(path)  # get all file names under the directory
+files.sort(cmp=None, key=str.lower)
+to_file = open(to_file_path, "wb")
+
+for file in files:  # iterate over the files in the directory
+    if not os.path.isdir(file):  # skip sub-directories; only open regular files
+        f = open(path + "/" + file)  # open the file
+        name = f.name
+        print 'name: ' + name
+        from_file = open(name, "rb")
+        to_file.write(from_file.read())
+        from_file.close()
+
+to_file.close()
diff --git a/python/tools/mdl2fluid/model_reader.py b/python/tools/mdl2fluid/model_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d53350db20739526b77663f791942299d4bc149
--- /dev/null
+++ b/python/tools/mdl2fluid/model_reader.py
@@ -0,0 +1,30 @@
+import os
+
+import framework_pb2 as framework_pb2
+
+
+def read_model(model_path):
+    print('read_model.')
+    path_8 = unicode(model_path, 'utf8')
+
+    try:
+        with open(path_8, "rb") as f_model:
+            print get_file_size(model_path)
+            desc = framework_pb2.ProgramDesc()
+            desc.ParseFromString(f_model.read())
+            print desc
+            # print desc.blocks
+
+    except IOError:
+        print "read_model: file not found."
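+# usage sketch: read_model("newyolo/__model__") parses the ProgramDesc written by mdl2fluid.py and prints it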
+
+
+def get_file_size(file_path):
+    file_path = unicode(file_path, 'utf8')
+    fsize = os.path.getsize(file_path)
+    fsize = fsize / float(1024 * 1024)
+    return round(fsize, 2)
+
+
+path = "newyolo/__model__"
+read_model(path)
diff --git a/python/tools/mdl2fluid/op_types.py b/python/tools/mdl2fluid/op_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff7d78d20835c605dc581ef14ad2d7d5171fea1d
--- /dev/null
+++ b/python/tools/mdl2fluid/op_types.py
@@ -0,0 +1,123 @@
+# coding=utf-8
+
+# mdl layers
+layer_mdl_conv = 'ConvolutionLayer'
+layer_mdl_deepwise_conv = 'DepthwiseConvolutionLayer'
+layer_mdl_relu = 'ReluLayer'
+layer_mdl_pointwise_add = 'PointwiseConvolutionLayer'
+
+# fluid ops
+op_fluid_fusion_conv_add = 'fusion_conv_add'
+op_fluid_relu = 'relu'
+
+# dict mdl layer --- fluid op
+mdl2fluid_op_layer_dict = {
+    layer_mdl_conv: op_fluid_fusion_conv_add,
+    layer_mdl_deepwise_conv: op_fluid_fusion_conv_add,
+    layer_mdl_relu: op_fluid_relu,
+    layer_mdl_pointwise_add: op_fluid_fusion_conv_add
+}
+
+mdl_outputs_key = "outputs"
+mdl_inputs_key = "inputs"
+mdl_weight_key = "weights"
+mdl_attrs_key = "params"
+
+# map mdl input/output/weight/param keys to fluid op input/output/attr names
+fusion_conv_add_dict = {
+    mdl_inputs_key: 'Input',
+    mdl_outputs_key: 'Out',
+    mdl_weight_key: ('Filter', 'Y'),
+    mdl_attrs_key: (
+        # 'workspace_size_MB', 'use_mkldnn', 'use_cudnn', 'data_format','dilations',
+        # dilations = [1,1]
+        'groups', 'paddings', 'strides'
+        # 'axis'
+    )
+}
+
+relu_dict = {
+    mdl_inputs_key: 'X',
+    mdl_outputs_key: 'Out',
+    mdl_weight_key: ()
+}
+# fluid op name --- its io dict
+op_io_dict = {
+    'fusion_conv_add': fusion_conv_add_dict,
+    'relu': relu_dict
+}
+
+# fluid attr key --- mdl params key
+fusion_conv_add_attrs_dict = {
+    'paddings': 'pad',
+    'strides': 'stride',
+    'groups': 'group'
+}
+# fluid attr key --- fluid attr type
+fluid_attrs_type_dict = {
+    'paddings': 0,
+    'strides': 6,
+    'groups': 6
+}
+
+# '': "bias_term",  should this be added? for now all yolo models have bias_term = 1
+
+
+# attrs {
+#     name: "axis"
+#     type: INT
+#     i: 1
+# }
+
+
+# attrs_name = {
+#     'name': "workspace_size_MB",
+#     'type': 'INT',
+#     'i': '4096'
+# }
+# attrs
+# {
+#     name: "data_format"
+#     type: STRING
+#     s: "AnyLayout"
+# }
+# attrs
+# {
+#     name: "use_mkldnn"
+#     type: BOOLEAN
+#     b: false
+# }
+# attrs
+# {
+#     name: "use_cudnn"
+#     type: BOOLEAN
+#     b: true
+# }
+# attrs
+# {
+#     name: "dilations"
+#     type: INTS
+#     ints: 1
+#     ints: 1
+# }
+# attrs
+# {
+#     name: "groups"
+#     type: INT
+#     i: 1
+# }
+# attrs
+# {
+#     name: "paddings"
+#     type: INTS
+#     ints: 0
+#     ints: 0
+# }
+# attrs
+# {
+#     name: "strides"
+#     type: INTS
+#     ints: 1
+#     ints: 1
+# }
diff --git a/python/tools/mdl2fluid/swicher.py b/python/tools/mdl2fluid/swicher.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfe0360fd5b32f5e6fa61f6f05a0a384fb3a1e9b
--- /dev/null
+++ b/python/tools/mdl2fluid/swicher.py
@@ -0,0 +1,115 @@
+from array import array
+
+
+class Swichter:
+    def __init__(self):
+        pass
+
+    def nhwc2nchw_one_slice(self, from_file_name, to_file_name, batch, channel, height, width):
+        from_file = open(from_file_name, "rb")
+        to_file = open(to_file_name, "wb")
+
+        float_array = array("f")
+        float_array.fromfile(from_file, width * height * batch * channel)
+        float_write_array = array("f")
+
+        for b in range(batch):
+            for c in range(channel):
+                for h in range(height):
+                    for w in range(width):
+                        float_value = float_array[b * channel * width * height +
+                                                  channel * (h * width + w) + c]
+
+                        float_write_array.append(float_value)
+
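+        # write the reordered floats back out; to_file now holds the same data in NCHW order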
float_write_array.tofile(to_file) + from_file.close() + to_file.close() + + def copy(self, from_file_name, to_file_name): + from_file = open(from_file_name, "rb") + to_file = open(to_file_name, "wb") + + to_file.write(from_file.read()) + from_file.close() + to_file.close() + + def nhwc2nchw_one_slice_add_head(self, from_file_name, to_file_name, tmp_file_name, batch, channel, height, width): + from_file = open(from_file_name, "rb") + tmp_file = open(tmp_file_name, "wb+") + float_array = array("f") + float_array.fromfile(from_file, width * height * batch * channel) + float_write_array = array("f") + + for b in range(batch): + for c in range(channel): + for h in range(height): + for w in range(width): + float_value = float_array[b * channel * width * height + + channel * (h * width + w) + c] + + float_write_array.append(float_value) + + float_write_array.tofile(tmp_file) + tmp_file.close() + from_file.close() + + tmp_file = open(tmp_file_name, "rb") + to_file = open(to_file_name, "wb") + + tmp = tmp_file.read() + head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + to_file.write(head) + to_file.write(tmp) + tmp_file.close() + to_file.close() + + def read_head(self, head_file): + from_file = open(head_file, "rb") + read = from_file.read(24) + # print read + from_file.close() + # print read + return read + + def copy_add_head(self, from_file_name, to_file_name, tmp_file_name): + from_file = open(from_file_name, "rb") + to_file = open(to_file_name, "wb") + # tmp_file = open(tmp_file_name, "wb") + + head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + to_file.write(head) + to_file.write(from_file.read()) + from_file.close() + to_file.close() + pass + + def copy_padding_add_head(self, from_file_name, to_file_name, tmp_file_name, padding): + print'padding = %d' % padding + from_file = open(from_file_name, "rb") + # print len(from_file.read()) + from_file.seek(padding, 0) + + read = from_file.read() + print len(read) + + to_file = open(to_file_name, "wb") + # tmp_file = open(tmp_file_name, "wb") + + head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + to_file.write(head) + to_file.write(read) + from_file.close() + to_file.close() + pass + +# Swichter().nhwc2nchw_one_slice_add_head( +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/conv1_0.bin', +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/conv1_0', +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/.tmp', +# 32, +# 3, 3, 3) + +# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + +# Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '') diff --git a/src/common/enforce.h b/src/common/enforce.h index 51d2110e32433686d1b3353bc63b92a564a13e9d..aebe2a58031cb1341596f07dbf653be4a5e01900 100644 --- a/src/common/enforce.h +++ b/src/common/enforce.h @@ -61,7 +61,14 @@ struct PaddleMobileException : public std::exception { } #else #define PADDLE_MOBILE_THROW_EXCEPTION(...) -#define PADDLE_MOBILE_ENFORCE(stat, ...) + +#define PADDLE_MOBILE_ENFORCE(stat, ...) 
\ + { \ + if (stat) { \ + } else { \ + } \ + } + #endif } // namespace paddle_mobile diff --git a/src/common/types.cpp b/src/common/types.cpp index cea42171f0205e0d40b2703d5c90f0b9fc253e68..a0a3b6954ebd3cf32519fa3d91012d4e3be170fa 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -17,36 +17,59 @@ limitations under the License. */ namespace paddle_mobile { -const std::string G_OP_TYPE_CONV = "conv2d"; -const std::string G_OP_TYPE_BATCHNORM = "batch_norm"; -const std::string G_OP_TYPE_BOX_CODER = "box_coder"; -const std::string G_OP_TYPE_CONCAT = "concat"; -const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; -const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu"; -const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu"; -const std::string G_OP_TYPE_FC = "fusion_fc"; -const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add"; -const std::string G_OP_TYPE_LRN = "lrn"; -const std::string G_OP_TYPE_MUL = "mul"; -const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms"; -const std::string G_OP_TYPE_POOL2D = "pool2d"; -const std::string G_OP_TYPE_PRIOR_BOX = "prior_box"; -const std::string G_OP_TYPE_RELU = "relu"; -const std::string G_OP_TYPE_RESHAPE = "reshape"; -const std::string G_OP_TYPE_SIGMOID = "sigmoid"; -const std::string G_OP_TYPE_SOFTMAX = "softmax"; -const std::string G_OP_TYPE_TRANSPOSE = "transpose"; -const std::string G_OP_TYPE_SPLIT = "split"; -const std::string G_OP_TYPE_FEED = "feed"; -const std::string G_OP_TYPE_FETCH = "fetch"; -const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d"; -const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence"; -const std::string G_OP_TYPE_DROPOUT = "dropout"; +const char *G_OP_TYPE_CONV = "conv2d"; +const char *G_OP_TYPE_BATCHNORM = "batch_norm"; +const char *G_OP_TYPE_BOX_CODER = "box_coder"; +const char *G_OP_TYPE_CONCAT = "concat"; +const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; +const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu"; +const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu"; +const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu"; +const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu"; +const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu"; +const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu"; +const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu"; +const char *G_OP_TYPE_FC = "fusion_fc"; +const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add"; +const char *G_OP_TYPE_LRN = "lrn"; +const char *G_OP_TYPE_MUL = "mul"; +const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms"; +const char *G_OP_TYPE_POOL2D = "pool2d"; +const char *G_OP_TYPE_PRIOR_BOX = "prior_box"; +const char *G_OP_TYPE_RELU = "relu"; +const char *G_OP_TYPE_RESHAPE = "reshape"; +const char *G_OP_TYPE_SIGMOID = "sigmoid"; +const char *G_OP_TYPE_SOFTMAX = "softmax"; +const char *G_OP_TYPE_TRANSPOSE = "transpose"; +const char *G_OP_TYPE_SPLIT = "split"; +const char *G_OP_TYPE_FEED = "feed"; +const char *G_OP_TYPE_FETCH = "fetch"; +const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d"; +const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence"; +const char *G_OP_TYPE_DROPOUT = "dropout"; +const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn"; +const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn"; +const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU = + "fusion_elementwise_add_relu"; +const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu"; 
+const char *G_OP_TYPE_REGION = "region"; +const char *G_OP_TYPE_FUSION_CONV_BN = "fusion_conv_bn"; +const char *G_OP_TYPE_CONV_TRANSPOSE = "conv2d_transpose"; +const char *G_OP_TYPE_PRELU = "prelu"; +const char *G_OP_TYPE_LOOKUP_TABLE = "lookup_table"; +const char *G_OP_TYPE_GRU = "gru"; +const char *G_OP_TYPE_CRF = "crf_decoding"; +const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp"; +const char *G_OP_TYPE_FLATTEN = "flatten"; +const char *G_OP_TYPE_SHAPE = "shape"; std::unordered_map< std::string, std::pair, std::vector>> op_input_output_key = { {G_OP_TYPE_CONV, {{"Input"}, {"Output"}}}, + {G_OP_TYPE_FUSION_DWCONV_BN_RELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_BN_RELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_PRELU, {{"X", "Alpha"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}}, {G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}}, @@ -63,13 +86,31 @@ std::unordered_map< {G_OP_TYPE_BOX_CODER, {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}}, {G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_BN_ADD_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}}, {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}}, - {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}}; + {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_ADD_BN, {{"Input"}, {"Y"}}}, + {G_OP_TYPE_FUSION_POOL_BN, {{"X"}, {"Y"}}}, + {G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU, {{"X", "Y"}, {"Out"}}}, + {G_OP_TYPE_FUSION_FC_RELU, {{"X", "Y", "Z"}, {"Out"}}}, + {G_OP_TYPE_REGION, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FUSION_CONV_BN, {{"Input"}, {"Y"}}}, + {G_OP_TYPE_LOOKUP_TABLE, {{"W", "Ids"}, {"Out"}}}, + {G_OP_TYPE_GRU, + {{"Input", "H0", "Weight", "Bias"}, + {"BatchGate", "BatchResetHiddenPrev", "BatchHidden", "Hidden"}}}, + {G_OP_TYPE_CRF, {{"Emission", "Transition", "Label"}, {"ViterbiPath"}}}, + {G_OP_TYPE_BILINEAR_INTERP, {{"OutSize", "X"}, {"Out"}}}, + {G_OP_TYPE_FLATTEN, {{"X"}, {"Out"}}}, + {G_OP_TYPE_SHAPE, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}}}; } // namespace paddle_mobile diff --git a/src/common/types.h b/src/common/types.h index ec428b9911f64d7ccc8c6f5dc4be7f970e855d3c..6d38e4178907aa30968a6760a6ae5d69f4b61167 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -16,10 +16,13 @@ limitations under the License. */ #include #include +#include #include namespace paddle_mobile { -enum class Precision : int { FP32 = 0 }; +enum class Precision : int { FP32 = 0, FP16 = 1 }; + +typedef int16_t half; template struct PrecisionTrait { @@ -30,6 +33,10 @@ template <> struct PrecisionTrait { typedef float ptype; }; +template <> +struct PrecisionTrait { + typedef half ptype; +}; //! device type enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; @@ -72,32 +79,46 @@ enum PMStatus { PMWrongDevice = 0x08 /*!< un-correct device. 
*/ }; -extern const std::string G_OP_TYPE_CONV; -extern const std::string G_OP_TYPE_BATCHNORM; -extern const std::string G_OP_TYPE_BOX_CODER; -extern const std::string G_OP_TYPE_CONCAT; -extern const std::string G_OP_TYPE_ELEMENTWISE_ADD; -extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU; -extern const std::string G_OP_TYPE_FC; -extern const std::string G_OP_TYPE_FUSION_CONV_ADD; -extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; - -extern const std::string G_OP_TYPE_LRN; -extern const std::string G_OP_TYPE_MUL; -extern const std::string G_OP_TYPE_MULTICLASS_NMS; -extern const std::string G_OP_TYPE_POOL2D; -extern const std::string G_OP_TYPE_PRIOR_BOX; -extern const std::string G_OP_TYPE_RELU; -extern const std::string G_OP_TYPE_RESHAPE; -extern const std::string G_OP_TYPE_SIGMOID; -extern const std::string G_OP_TYPE_SOFTMAX; -extern const std::string G_OP_TYPE_TRANSPOSE; -extern const std::string G_OP_TYPE_SPLIT; -extern const std::string G_OP_TYPE_FEED; -extern const std::string G_OP_TYPE_FETCH; -extern const std::string G_OP_TYPE_DEPTHWISE_CONV; -extern const std::string G_OP_TYPE_IM2SEQUENCE; -extern const std::string G_OP_TYPE_DROPOUT; +extern const char *G_OP_TYPE_CONV; +extern const char *G_OP_TYPE_BATCHNORM; +extern const char *G_OP_TYPE_BOX_CODER; +extern const char *G_OP_TYPE_CONCAT; +extern const char *G_OP_TYPE_ELEMENTWISE_ADD; +extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU; +extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU; +extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; +extern const char *G_OP_TYPE_FC; +extern const char *G_OP_TYPE_FUSION_CONV_ADD; +extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; +extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU; +extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU; +extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU; + +extern const char *G_OP_TYPE_LRN; +extern const char *G_OP_TYPE_MUL; +extern const char *G_OP_TYPE_MULTICLASS_NMS; +extern const char *G_OP_TYPE_POOL2D; +extern const char *G_OP_TYPE_PRIOR_BOX; +extern const char *G_OP_TYPE_RELU; +extern const char *G_OP_TYPE_RESHAPE; +extern const char *G_OP_TYPE_SIGMOID; +extern const char *G_OP_TYPE_SOFTMAX; +extern const char *G_OP_TYPE_TRANSPOSE; +extern const char *G_OP_TYPE_SPLIT; +extern const char *G_OP_TYPE_FEED; +extern const char *G_OP_TYPE_FETCH; +extern const char *G_OP_TYPE_DEPTHWISE_CONV; +extern const char *G_OP_TYPE_IM2SEQUENCE; +extern const char *G_OP_TYPE_DROPOUT; + +extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN; +extern const char *G_OP_TYPE_FUSION_POOL_BN; +extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU; +extern const char *G_OP_TYPE_FUSION_FC_RELU; +extern const char *G_OP_TYPE_REGION; +extern const char *G_OP_TYPE_FUSION_CONV_BN; +extern const char *G_OP_TYPE_CONV_TRANSPOSE; +extern const char *G_OP_TYPE_PRELU; extern std::unordered_map< std::string, std::pair, std::vector>> diff --git a/src/common/variant.h b/src/common/variant.h index b87a5e67a76f4c616f2c450ef4527bcf6c16286b..00b8eb985d8f7fc22bb93a3e229aa387c358e257 100644 --- a/src/common/variant.h +++ b/src/common/variant.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include + #include "common/enforce.h" #include "common/log.h" @@ -82,7 +84,7 @@ struct Variant { if (type_id == typeid(T).hash_code()) { return *const_cast(reinterpret_cast(&data)); } else { - PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant "); + PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant"); exit(0); } } diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f10aee5014d8f377ecc8e1735276aebf6418436f --- /dev/null +++ b/src/fpga/api.cpp @@ -0,0 +1,424 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "api.h" +#include +#include +#include +#include +#include +#include "bias_scale.h" +#include "filter.h" +#include "image.h" +#define FPGA_TEST_MODE +#define PADDLE_MOBILE_OS_LINUX + +namespace paddle_mobile { +namespace fpga { + +static int fd = -1; +static const char *device_path = "/dev/fpgadrv0"; +static std::map memory_map; + +static inline int do_ioctl(int req, const void *arg) { +#ifdef PADDLE_MOBILE_OS_LINUX + int result = ioctl(fd, req, (uint64_t)arg); + PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly"); + return result; +#else + return -1; +#endif +} + +int open_device() { + if (fd == -1) { + fd = open(device_path, O_RDWR); + } + return fd; +} + +// memory management; +void *fpga_malloc(size_t size) { + static uint64_t counter = 0; + +#ifdef PADDLE_MOBILE_OS_LINUX + auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); +#else + auto ptr = malloc(size); +#endif + counter += size; + memory_map.insert(std::make_pair(ptr, size)); + DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total " + << counter << " bytes"; + return ptr; +} + +void fpga_free(void *ptr) { + static uint64_t counter = 0; + size_t size = 0; + + auto iter = memory_map.find(ptr); // std::map::iterator + if (iter != memory_map.end()) { + size = iter->second; + memory_map.erase(iter); +#ifdef PADDLE_MOBILE_OS_LINUX + munmap(ptr, size); +#else + free(ptr); +#endif + counter += size; + DLOG << "Address: " << ptr << ", " << size << " bytes freed. 
Total " + << counter << " bytes"; + } else { + DLOG << "Invalid pointer"; + } +} + +void fpga_copy(void *dest, const void *src, size_t num) { + memcpy(dest, src, num); +} + +int fpga_flush(void *address, size_t size) { + struct MemoryCacheArgs args = {nullptr}; + args.address = address; + args.size = size; + return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args); +} + +int fpga_invalidate(void *address, size_t size) { + struct MemoryCacheArgs args = {nullptr}; + args.address = address; + args.size = size; + return do_ioctl(IOCTL_MEMCACHE_INVAL, &args); +} + +int ComputeBasicConv(const struct ConvArgs &args) { + DLOG << "======Compute Basic Conv======"; + DLOG << " relu_enabled:" << args.relu_enabled + << " sb_address:" << args.sb_address + << " filter_address:" << args.filter_address + << " filter_num:" << args.filter_num + << " group_num:" << args.group_num; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; + + return do_ioctl(IOCTL_CONFIG_CONV, &args); +} + +int ComputeFpgaConv(const struct WrapperConvArgs &args) { +#ifdef FPGA_TEST_MODE + DLOG << "=============ComputeFPGAConv==========="; + DLOG << " filter_num:" << args.filter_num + << " group_num:" << args.group_num + << " split_num:" << args.split_num; +#endif + + int split_num = args.split_num; + for (int i = 0; i < split_num; i++) { + ComputeBasicConv(args.conv_args[i]); + } + + if (split_num > 1) { + ComputeFPGAConcat(args.concat_arg); + } +} + +int ComputeFpgaPool(const struct PoolingArgs &args) { +#ifdef FPGA_TEST_MODE + DLOG << "=============ComputeFpgaPool==========="; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif + + return do_ioctl(IOCTL_CONFIG_POOLING, &args); +} + +int ComputeFpgaEWAdd(const struct EWAddArgs &args) { +#ifdef FPGA_TEST_MODE + DLOG << "=============ComputeFpgaEWAdd==========="; + DLOG << " relu_enabled:" << args.relu_enabled << " const0:" << args.const0 + << " const1:" << args.const1; + DLOG << " image0_address:" << args.image0.address + << " image0_scale_address:" << args.image0.scale_address + << " image0_channels:" << args.image0.channels + << " image0_height:" << args.image0.height + << " image0_width:" << args.image0.width + << " pad0_height:" << args.image0.pad_height + << " pad0_width:" << args.image0.pad_width; + DLOG << " image1_address:" << args.image1.address + << " image1_scale_address:" << args.image1.scale_address + << " image1_channels:" << args.image1.channels + << " image1_height:" << 
args.image1.height + << " image1_width:" << args.image1.width + << " pad1_height:" << args.image1.pad_height + << " pad_width:" << args.image1.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif + + return do_ioctl(IOCTL_CONFIG_EW, &args); +} +int PerformBypass(const struct BypassArgs &args) { +#ifdef FPGA_TEST_MODE + DLOG << "=============ComputeFpgaBypass==========="; + DLOG << " input_type:" << args.input_data_type + << " output_type:" << args.output_data_type + << " input_layout_type:" << args.input_layout_type + << " output_layout_type:" << args.output_layout_type; + DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; +#endif + + return do_ioctl(IOCTL_CONFIG_BYPASS, &args); +} + +int ComputeFPGAConcat(const struct ConcatArgs &args) { +#ifdef FPGA_TEST_MODE + DLOG << "=============ComputeFpgaConcat==========="; + DLOG << " Image_num: " << args.image_num + << " out_address:" << args.image_out + << " out_scale_address:" << args.scale_out; + DLOG << " image_height:" << args.height << " image_width:" << args.width; + for (int i = 0; i < args.image_num; i++) { + DLOG << " " << i << "th: "; + DLOG << " channel_num:" << args.channel_num[i] + << " image_address:" << args.images_in[i] + << " image_scale_address:" << args.scales_in[i]; + } +#endif + + image::concat_images(args.images_in, args.scales_in, args.image_out, + args.scale_out, args.image_num, args.channel_num, + args.height, args.width); + return 0; +} + +int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); } + +void format_image(framework::Tensor *image_tensor) { + auto dims = image_tensor->dims(); + auto channel = dims[1], height = dims[2], width = dims[3]; + auto data_ptr = image_tensor->data(); + size_t memory_size = channel * height * width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); + fpga_copy(new_data, data_ptr, memory_size); + image::format_image(&new_data, channel, height, width); + image_tensor->reset_data_ptr(new_data); +} + +void format_fp16_ofm(framework::Tensor *ofm_tensor) { + auto dims = ofm_tensor->dims(); + size_t memory_size = 0; + if (dims.size() == 4) { + auto channel = dims[1], height = dims[2], width = dims[3]; + memory_size = + height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); + } else if (dims.size() == 2) { + memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half); + } else { + DLOG << "Wrong ofm dimension"; + } + auto p = fpga_malloc(memory_size); + memset(p, 0, memory_size); + ofm_tensor->reset_data_ptr(p); +} + +void format_fp32_ofm(framework::Tensor *ofm_tensor) { + auto dims = ofm_tensor->dims(); + size_t memory_size = 0; + if (dims.size() == 4) { + auto channel = dims[1], height = dims[2], width = dims[3]; + memory_size = + height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(float); + } else if (dims.size() == 2) { + memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(float); + } else { + DLOG << "Wrong ofm dimension"; + } + auto p = fpga_malloc(memory_size); + memset(p, 0, memory_size); + ofm_tensor->reset_data_ptr(p); +} + +float filter_find_max(framework::Tensor 
*filter_tensor) { + auto filter_ptr = filter_tensor->data(); + return filter::find_max(filter_ptr, filter_tensor->numel()); +} + +int get_plit_num(framework::Tensor *filter_tensor) { + auto dims = filter_tensor->dims(); + auto chw = dims[1] * dims[2] * dims[3]; + auto num = dims[0]; + int div_capacity = filter::calc_division_capacity(chw); + return filter::calc_split_num(num, div_capacity); +} + +int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) { + auto dims = filter_tensor->dims(); + auto chw = dims[1] * dims[2] * dims[3]; + auto num = dims[0]; + int div_capacity = filter::calc_division_capacity(chw); + return filter::calc_num_per_div(num, group_num, div_capacity); +} + +int get_aligned_filter_element_num(int chw) { + return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); +} + +int get_aligned_filter_num(int num) { + return align_to_x(num, FILTER_NUM_ALIGNMENT); +} + +void format_filter(framework::Tensor *filter_tensor, float max_value, + int group_num) { + filter_tensor->scale[0] = float(max_value / 127.0); + filter_tensor->scale[1] = float(127.0 / max_value); + auto dims = filter_tensor->dims(); + auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; + auto data_ptr = filter_tensor->data(); + size_t memory_size = num * channel * height * width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); + fpga_copy(new_data, data_ptr, memory_size); + filter::format_filter(&new_data, num, channel, height, width, group_num, + max_value); + filter_tensor->reset_data_ptr(new_data); +} + +void format_bias_scale_array(float **bias_scale_array, + int element_num_per_division, int num) { + bias_scale::format_bias_scale_array(bias_scale_array, + element_num_per_division, num); +} + +void format_concat_output(framework::Tensor *out, int height, int width, + int image_num, uint32_t *channel_num) { + int sum_channel = 0, sum_cw = 0; + for (int i = 0; i < image_num; i++) { + sum_channel += channel_num[i]; + } + + sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT); + auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half)); + auto ddim = framework::make_ddim({1, sum_channel, height, width}); + out->Resize(ddim); + out->reset_data_ptr(data_ptr); +} + +void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, + framework::Tensor *out, framework::Tensor *filter, + bool relu_enabled, int group_num, int stride_h, int stride_w, + int padding_h, int padding_w, float *bs_ptr) { + auto input_ptr = input->data(); + auto filter_ptr = filter->data(); + auto out_ptr = out->data(); + + arg->group_num = (uint32_t)group_num; + // Either group_num or split_num = 1; + arg->split_num = group_num == 1 ? 
(uint32_t)get_plit_num(filter) : 1; + arg->filter_num = (uint32_t)filter->dims()[0]; + arg->output.address = out_ptr; + arg->output.scale_address = out->scale; + arg->conv_args = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); + + arg->concat_arg.image_num = arg->split_num; + arg->concat_arg.image_out = out_ptr; + arg->concat_arg.scale_out = out->scale; + arg->concat_arg.height = (uint32_t)filter->dims()[2]; + arg->concat_arg.width = (uint32_t)filter->dims()[3]; + + int n = arg->split_num; + arg->concat_arg.images_in = (half **)fpga_malloc(n * sizeof(int *)); + arg->concat_arg.scales_in = (float **)fpga_malloc(n * sizeof(float *)); + arg->concat_arg.channel_num = (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); + arg->concat_arg.image_out = out_ptr; + + auto channel = (int)out->dims()[1]; + int filter_num_per_div = get_filter_num_per_div(filter, group_num); + int element_num = get_aligned_filter_element_num( + filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); + + for (int i = 0; i < n; i++) { + arg->conv_args[i].relu_enabled = relu_enabled; + arg->conv_args[i].group_num = (uint32_t)group_num; + arg->conv_args[i].kernel.stride_h = (uint32_t)stride_h; + arg->conv_args[i].kernel.stride_w = (uint32_t)stride_w; + arg->conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + arg->conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + arg->conv_args[i].image.address = input_ptr; + arg->conv_args[i].image.channels = (uint32_t)input->dims()[1]; + arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; + arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; + arg->conv_args[i].image.scale_address = input->scale; + arg->conv_args[i].image.pad_height = (uint32_t)padding_h; + arg->conv_args[i].image.pad_width = (uint32_t)padding_w; + arg->conv_args[i].filter_scale_address = filter->scale; + arg->conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; + arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; + arg->conv_args[i].filter_num = + (uint32_t)(i == n - 1 ? channel - (n - 1) * filter_num_per_div + : filter_num_per_div); + + if (n > 1) { + arg->conv_args[i].output.scale_address = + (float *)fpga_malloc(2 * sizeof(float)); + arg->conv_args[i].output.address = fpga_malloc( + input->dims()[2] * + align_to_x(input->dims()[3] * arg->conv_args[i].filter_num, + IMAGE_ALIGNMENT) * + sizeof(half)); + } + + else { + arg->conv_args[i].output.scale_address = out->scale; + arg->conv_args[i].output.address = out_ptr; + } + + arg->concat_arg.images_in[i] = (half *)arg->conv_args[i].output.address; + arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address; + arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num; + } +} + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/api.h b/src/fpga/api.h new file mode 100644 index 0000000000000000000000000000000000000000..f5fa05b6750996ee391a30d2651a69d90e357547 --- /dev/null +++ b/src/fpga/api.h @@ -0,0 +1,226 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
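
fill_conv_arg above splits one convolution into split_num hardware invocations and pre-wires the concat that stitches the partial outputs back together. A hedged sketch of how those pieces compose, assembled only from the functions declared in this hunk; the PR's real call sites are the FPGA op kernels, which are outside this diff:

#include "fpga/api.h"

namespace paddle_mobile {

// RunConvOnFpga is a hypothetical driver, not part of the PR; bs_ptr is
// assumed to be packed by format_bias_scale_array() beforehand.
void RunConvOnFpga(framework::Tensor *input, framework::Tensor *filter,
                   framework::Tensor *out, float *bs_ptr) {
  fpga::open_device();                        // opens /dev/fpgadrv0 once
  fpga::format_image(input);                  // CHW float -> aligned HWC
  float max = fpga::filter_find_max(filter);  // max |w| drives int8 scaling
  fpga::format_filter(filter, max, /*group_num=*/1);
  fpga::format_fp16_ofm(out);                 // aligned fp16 output buffer

  fpga::WrapperConvArgs arg;
  fpga::fill_conv_arg(&arg, input, out, filter, /*relu_enabled=*/true,
                      /*group_num=*/1, /*stride_h=*/1, /*stride_w=*/1,
                      /*padding_h=*/0, /*padding_w=*/0, bs_ptr);
  fpga::ComputeFpgaConv(arg);  // one ioctl per split, concat if split_num > 1
}

}  // namespace paddle_mobile
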
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "framework/tensor.h" + +namespace paddle_mobile { +namespace fpga { + +enum DataType { + DATA_TYPE_FP32 = 1, + DATA_TYPE_FP16 = 0, +}; + +enum LayoutType { + LAYOUT_CHW = 1, + LAYOUT_HWC = 0, +}; + +struct VersionArgs { + void* buffer; +}; + +struct MemoryCopyArgs { + void* src; + void* dest; + size_t size; +}; + +struct KernelArgs { + uint32_t width; + uint32_t height; + uint32_t stride_w; + uint32_t stride_h; +}; + +struct ImageInputArgs { + void* address; // input featuremap virtual address + float* scale_address; // input scale address; + uint32_t channels; + uint32_t width; // featuremap width + uint32_t height; + uint32_t pad_width; // padding width; + uint32_t pad_height; +}; + +struct ImageOutputArgs { + void* address; // output result address; + float* scale_address; // output scale address; +}; + +struct ConvArgs { + bool relu_enabled; + void* sb_address; // scale and bias are interlaced; + void* filter_address; + float* filter_scale_address; + uint32_t filter_num; + uint32_t group_num; + + struct KernelArgs kernel; + struct ImageInputArgs image; // input image; + struct ImageOutputArgs output; +}; + +struct ConcatArgs { + uint32_t image_num; + half** images_in; + float** scales_in; + void* image_out; + float* scale_out; + uint32_t* channel_num; + uint32_t height; + uint32_t width; +}; + +struct WrapperConvArgs { + uint32_t split_num; + uint32_t group_num; + uint32_t filter_num; + struct ImageOutputArgs output; + struct ConvArgs* conv_args; + struct ConcatArgs concat_arg; +}; + +struct PoolingArgs { + struct KernelArgs kernel; + struct ImageInputArgs image; // input image; + struct ImageOutputArgs output; +}; + +struct EWAddArgs { + bool relu_enabled; + + uint32_t const0; // output0 = const0 x input0 + const1 x input1; + uint32_t const1; + struct ImageInputArgs image0; + struct ImageInputArgs image1; + struct ImageOutputArgs output; +}; + +struct BypassArgs { + enum DataType input_data_type; + enum DataType output_data_type; + enum LayoutType input_layout_type; + enum LayoutType output_layout_type; + struct ImageInputArgs image; + struct ImageOutputArgs output; +}; + +struct FpgaRegWriteArgs { + uint64_t address; // + uint64_t value; +}; + +struct FpgaRegReadArgs { + uint64_t address; + uint64_t value; +}; + +struct MemoryCacheArgs { + void* address; + size_t size; +}; + +#define IOCTL_FPGA_MAGIC 'FPGA' + +#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs) + +#define IOCTL_SEPARATOR_0 10 + +#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs) +#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) +#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) + +#define IOCTL_SEPARATOR_1 20 + +#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs) +#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs) +#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs) +#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs) +#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs) +#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs) + +enum FPGA_ERR_TYPE { + ERR_IOCTL_CMD = -1, + ERR_TIMEOUT = -2, + ERR_COMPLETION_TIMEOUT = -3, + ERR_INVALID_FPGA_ADDR = -4, + ERR_NOMEM = -5, + ERR_NO_RESERVE_MEM = -6, + ERR_COPY_FROM_USER = 
-7, + ERR_COPY_TO_USER = -8, + ERR_DEL_TIMER = -9, + ERR_ENABLE_MSI = -10, + ERR_REGISTER_IRQ = -11, + ERR_PCIE_REGISTER = -12, + ERR_PCIE_PROBE = -13, + ERR_REGISTER_BLOCK = -14, + ERR_ALLOC_GENDISK = -15, + ERR_INIT_QUEUE = -16, + ERR_WAIT = -17, + ERR_ECC_ERROR = -31, + ERR_FPGA_FAIL_STOP = -64, + ERR_FPGA_DEBUG_STOP = -113, + DEV_TMP_UNAVAILABLE = -128 +}; + +//============================== API ============================= + +int open_device(); +int close_device(); + +void* fpga_malloc(size_t size); +void fpga_free(void* ptr); +void fpga_copy(void* dst, const void* src, size_t num); +int fpga_flush(void* address, size_t size); +int fpga_invalidate(void* address, size_t size); + +int PerformBypass(const struct BypassArgs& args); +int ComputeFpgaConv(const struct WrapperConvArgs& args); +int ComputeFpgaPool(const struct PoolingArgs& args); +int ComputeFpgaEWAdd(const struct EWAddArgs& args); +int ComputeFPGAConcat(const struct ConcatArgs& args); + +static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } + +int get_align_image_cw(int cw); +void format_image(framework::Tensor* image_tensor); +void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory +void format_fp32_ofm(framework::Tensor* ofm_tensor); + +float filter_find_max(framework::Tensor* filter_tensor); +int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num); +int get_plit_num(framework::Tensor* filter_tensor); +int get_aligned_filter_element_num(int chw); +int get_aligned_filter_num(int num); +void format_filter(framework::Tensor* filter_tensor, float max_value, + int group_num); +void format_bias_scale_array(float** bias_scale_array, + int element_num_per_division, int num); +void format_concat_output(framework::Tensor* out, int height, int width, + int image_num, uint32_t* channel_num); + +void fill_conv_arg(struct WrapperConvArgs* arg, framework::Tensor* input, + framework::Tensor* out, framework::Tensor* filter, + bool relu_enabled, int group_num, int stride_h, int stride_w, + int padding_h, int padding_w, float* bs_ptr); + +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/bias_scale.cpp b/src/fpga/bias_scale.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3e5c3419a0c35b5c7c81b0ee1fd89a58838b5a26 --- /dev/null +++ b/src/fpga/bias_scale.cpp @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
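
Every alignment rule in this change (IMAGE_ALIGNMENT, FILTER_ELEMENT_ALIGNMENT, FILTER_NUM_ALIGNMENT, BS_NUM_ALIGNMENT) funnels through the align_to_x one-liner declared in api.h above: round num up to the next multiple of x. A worked check of that arithmetic, nothing beyond the shown definition:

#include <cassert>

static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

int main() {
  assert(align_to_x(20, 16) == 32);  // 20 channel*width values pad to 32
  assert(align_to_x(32, 16) == 32);  // already aligned, unchanged
  assert(align_to_x(1, 8) == 8);     // one bias pads to BS_NUM_ALIGNMENT
  return 0;
}
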
*/ + +#include "bias_scale.h" +#include +#include "api.h" + +namespace paddle_mobile { +namespace fpga { +namespace bias_scale { + +void align_element(float **data_in, int num_per_div_before_alignment, int num) { + int copynum = 0; + float *ptr_unaligned = *data_in; + int div_num = + (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; + int num_per_div_after_alignment = + align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); + int num_element = + 2 * div_num * num_per_div_after_alignment; // including bias & scale + float *ptr_aligned = (float *)fpga_malloc(num_element * sizeof(float)); + + memset(ptr_aligned, 0, num_element * sizeof(float)); + + for (int i = 0; i < div_num; i++) { + if (i == div_num - 1) { + copynum = (num_per_div_after_alignment * div_num > num) + ? (num % num_per_div_after_alignment) + : (num_per_div_before_alignment); + } else { + copynum = num_per_div_before_alignment; + } + + memcpy(ptr_aligned + i * num_per_div_after_alignment, + ptr_unaligned + num_per_div_before_alignment * i, + copynum * sizeof(float)); + memcpy(ptr_aligned + (div_num + i) * num_per_div_after_alignment, + ptr_unaligned + num_per_div_before_alignment * i + num, + copynum * sizeof(float)); + } + + fpga_free(ptr_unaligned); + *data_in = ptr_aligned; +} + +void interleave(float **data_in, int num_after_alignment) { + // num_after_alignment: number of bias after alignment + + float *ptr_uninterleaved = *data_in; + float *ptr_interleaved = + (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); + int num = num_after_alignment / 4; + for (int i = 0; i < num; i++) { + memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i, + 4 * sizeof(float)); + memcpy(ptr_interleaved + 8 * i + 4, + ptr_uninterleaved + num_after_alignment + 4 * i, 4 * sizeof(float)); + } + + fpga_free(ptr_uninterleaved); + *data_in = ptr_interleaved; +} + +void format_bias_scale_array(float **bias_scale_array, + int element_num_per_division, int num) { + align_element(bias_scale_array, element_num_per_division, num); + int div_num = (num + element_num_per_division - 1) / element_num_per_division; + int element_num_after_division = + align_to_x(element_num_per_division, BS_NUM_ALIGNMENT); + interleave(bias_scale_array, div_num * element_num_after_division); + fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float)); +} + +} // namespace bias_scale +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/bias_scale.h b/src/fpga/bias_scale.h new file mode 100644 index 0000000000000000000000000000000000000000..2d1e44c5470dae02fde6956a3744edc2e371a87b --- /dev/null +++ b/src/fpga/bias_scale.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#define BS_NUM_ALIGNMENT 8 + +namespace paddle_mobile { +namespace fpga { +namespace bias_scale { + +void align_element(float** data_in, int num_per_div_before_alignment, int num); +void interleave(float** data_in, int num_after_alignment); +void format_bias_scale_array(float** bias_scale_array, + int element_num_per_division, int num); + +} // namespace bias_scale +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/filter.cpp b/src/fpga/filter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3b09ede10d10f605e69d06df2e148dd463e94d5b --- /dev/null +++ b/src/fpga/filter.cpp @@ -0,0 +1,215 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "filter.h" +#include +#include "api.h" + +namespace paddle_mobile { +namespace fpga { +namespace filter { + +int calc_division_capacity(int chw) { return 2048 / ((chw + 15) / 16) * 32; } + +int calc_split_num(int num, int division_capacity) { + return (num + division_capacity - 1) / division_capacity; +} + +int calc_division_number(int num, int group_num, int division_capacity) { + PADDLE_MOBILE_ENFORCE(num % group_num == 0, + "Filter number should be divisible by group number"); + int split_num = calc_split_num(num, division_capacity); + PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, + "Split number or group number should be 1"); + return group_num * split_num; +} + +int calc_num_per_div(int num, int group_num, int division_capacity) { + PADDLE_MOBILE_ENFORCE(num % group_num == 0, + "Filter number should be divisible by group number"); + int split_num = calc_split_num(num, division_capacity); + PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, + "Split number or group number should be 1"); + if (group_num == 1) { + if (num > division_capacity) { + return division_capacity; + } else { + return num; + } + } else { + return (num + group_num - 1) / group_num; + } +} + +void convert_to_hwc(char **data_in, int num, int channel, int height, + int width) { + char *tmp = *data_in; + int chw = channel * height * width; + char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); + for (int n = 0; n < num; n++) { + int64_t amount_per_row = width * channel; + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + int64_t offset_height = h * amount_per_row; + for (int w = 0; w < width; w++) { + *(data_tmp + n * chw + offset_height + w * channel + c) = + *((*data_in)++); + } + } + } + } + + *data_in = data_tmp; + fpga_free(tmp); +} + +float find_max(float *data_in, int data_size) { + float max = 0.0; + for (int i = 0; i < data_size; ++i) { + float value = data_in[i]; + float abs = value > 0 ? 
value : -value; + max = std::max(max, abs); + } + return max; +} + +void quantize(float **data_in, int data_size, float max) { + float *tmp = *data_in; + float fix_range = 127; + float scale = fix_range / max; + + char *tmp_data = (char *)fpga_malloc(data_size * sizeof(char)); + for (int i = 0; i < data_size; i++) { + tmp_data[i] = (char)((*data_in)[i] * scale); + } + *data_in = (float *)tmp_data; + fpga_free(tmp); +} + +void align_element(char **data_in, int num, int chw) { + int i = 0; + int j = 0; + int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); + if (align_chw != chw) { + char *tmp = *data_in; + char *data_tmp = (char *)fpga_malloc(num * align_chw * sizeof(char)); + + memset(data_tmp, 0, num * align_chw); + for (j = 0; j < num; j++) { + memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw); + } + *data_in = data_tmp; + fpga_free(tmp); + } +} + +void align_num(char **data_in, int num_per_div_before_alignment, int num, + int chw) { + int i = 0; + int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); + int num_per_div_after_alignment = + align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); + if (num_per_div_after_alignment != num_per_div_before_alignment) { + char *tmp = *data_in; + int div_num = + (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; + int num_element = div_num * num_per_div_after_alignment * align_chw; + char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); + + memset(data_tmp, 0, num_element * sizeof(char)); + + for (i = 0; i < div_num; i++) { + memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, + *data_in + num_per_div_before_alignment * align_chw * i, + num_per_div_before_alignment * align_chw); + } + + *data_in = data_tmp; + fpga_free(tmp); + } +} + +void reorder(char **data_in, int num_after_alignment, int chw) { + int index = 0; + int new_index; + + int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); + + char *data_tmp = + (char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char)); + char *tmp = *data_in; + for (index = 0; index < num_after_alignment; index++) { + new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) + + (index / 16 % 2 * 4); + memcpy(data_tmp + index * chw_align, *data_in + new_index * chw_align, + chw_align); + } + *data_in = data_tmp; + fpga_free(tmp); +} + +void interleave(char **data_in, int num_after_alignment, int chw) { + int i = 0; + int j = 0; + int k = 0; + int interleave_per_num = 16; + ; + int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); + char *data_tmp = + (char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char)); + char *tmp = *data_in; + int interleave_num = chw_align * 2 / interleave_per_num; + for (i = 0; i < num_after_alignment; i += 2) { + for (j = 0, k = 0; j < interleave_num; j += 2, k++) { + memcpy(data_tmp + i * chw_align + interleave_per_num * j, + *data_in + i * chw_align + interleave_per_num * k, + interleave_per_num); + memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1), + *data_in + (i + 1) * chw_align + interleave_per_num * k, + interleave_per_num); + } + } + *data_in = data_tmp; + fpga_free(tmp); +} + +void format_filter(float **data_in, int num, int channel, int height, int width, + int group_num, float max) { + int data_size = channel * height * width * num; + int chw = channel * height * width; + + int division_capacity = calc_division_capacity(chw); + int num_per_div_before_alignment = + calc_num_per_div(num, group_num, division_capacity); + int num_per_div_after_alignment 
= + align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); + int div_num = + (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; + int num_after_alignment = num_per_div_after_alignment * div_num; + + quantize(data_in, data_size, max); + + char **quantize_data = (char **)data_in; + + convert_to_hwc(quantize_data, num, channel, height, width); + align_element(quantize_data, num, chw); + align_num(quantize_data, num_per_div_before_alignment, num, chw); + reorder(quantize_data, num_after_alignment, chw); + interleave(quantize_data, num_after_alignment, chw); + fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * + num_after_alignment * sizeof(char)); +} + +} // namespace filter +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/filter.h b/src/fpga/filter.h new file mode 100644 index 0000000000000000000000000000000000000000..89132fabc4abee15ba8aa5e7cae8a14042cb3ad4 --- /dev/null +++ b/src/fpga/filter.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 +#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16 + +namespace paddle_mobile { +namespace fpga { +namespace filter { + +int calc_division_capacity(int chw); +int calc_split_num(int num, int division_capacity); +int calc_division_number(int num, int group_num, int division_capacity); +int calc_num_per_div(int num, int group_num, int division_capacity); +void convert_to_hwc(char** data_in, int num, int channel, int height, + int width); +float find_max(float* data_in, int data_size); +void quantize(float** data_in, int data_size, float max); +void align_element(char** data_in, int num, int chw); +void align_num(char** data_in, int num_per_div_before_alignment, int num, + int chw); +void reorder(char** data_in, int num_after_alignment, int chw); +void interleave(char** data_in, int num_after_alignment, int chw); +void format_filter(float** data_in, int num, int channel, int height, int width, + int group_num, float max); +} // namespace filter +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/image.cpp b/src/fpga/image.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ad5053f9780895d94cc3095dc694e86dbbb1abac --- /dev/null +++ b/src/fpga/image.cpp @@ -0,0 +1,113 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
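
format_filter above fixes the stage order: quantize to int8 first, then convert_to_hwc, align_element, align_num, reorder, interleave, and a final fpga_flush so the device sees the bytes. The quantization itself reduces to one multiply per weight; a minimal restatement of that math (truncating cast, scale = 127/max), assuming max > 0:

#include <cstdio>

int main() {
  float w[4] = {0.5f, -1.0f, 0.25f, 1.0f};
  float max = 1.0f;  // filter_find_max over all weights
  float scale = 127.0f / max;
  signed char q[4];
  for (int i = 0; i < 4; i++) q[i] = (signed char)(w[i] * scale);
  // q = {63, -127, 31, 127}; dequantize later with scale[0] = max/127
  for (int i = 0; i < 4; i++) printf("%d ", q[i]);
  return 0;
}
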
*/ + +#include "image.h" +#include +#include "api.h" + +namespace paddle_mobile { +namespace fpga { +namespace image { + +void convert_to_hwc(float **data_in, int channel, int height, int width) { + float *tmp = *data_in; + float *data_tmp = + (float *)fpga_malloc(channel * height * width * sizeof(float)); + int64_t amount_per_row = width * channel; + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + int64_t offset_height = h * amount_per_row; + for (int w = 0; w < width; w++) { + *(data_tmp + offset_height + w * channel + c) = *((*data_in)++); + } + } + } + *data_in = data_tmp; + fpga_free(tmp); +} + +void align_element_conv(float **data_in, int height, int cw) { + int h = 0; + int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); + if (align_cw != cw) { + float *tmp = *data_in; + float *data_tmp = (float *)fpga_malloc(height * align_cw * sizeof(float)); + + memset(data_tmp, 0, height * align_cw * sizeof(float)); + + for (h = 0; h < height; h++) { + memcpy((void *)(data_tmp + h * align_cw), (void *)(*data_in + h * cw), + cw * sizeof(float)); + } + + *data_in = data_tmp; + fpga_free(tmp); + } +} + +void format_image(float **data_in, int channel, int height, int width) { + convert_to_hwc(data_in, channel, height, width); + align_element_conv(data_in, height, channel * width); + fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height * + sizeof(float)); +} + +void concat_images(int16_t **images_in, float **scales_in, void *image_out, + float *scale_out, int image_num, uint32_t *channel_num, + int height, int width) { + int i = 0; + int j = 0; + int k = 0; + int each_out_line_channel = 0; + int align_each_out_area_cw = 0; + int align_each_in_area_cw = 0; + int align_each_out_area_cw_differ = 0; + int tmp_channel = 0; + scale_out[0] = 0.0; + scale_out[1] = 0.0; + for (i = 0; i < image_num; i++) { + each_out_line_channel += channel_num[i]; + scale_out[0] = std::max(*scale_out, scales_in[i][0]); + fpga_invalidate(images_in[i], + height * + align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * + sizeof(int16_t)); + } + scale_out[1] = 1 / scale_out[0]; + align_each_out_area_cw = + align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT); + align_each_out_area_cw_differ = + align_each_out_area_cw - each_out_line_channel * width; + + for (k = 0; k < height; k++) { + for (j = 0; j < width; j++) { + for (i = 0; i < image_num; i++) { + align_each_in_area_cw = + align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); + memcpy((int16_t *)image_out + tmp_channel + + k * align_each_out_area_cw_differ, + images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, + channel_num[i] * sizeof(int16_t)); + + tmp_channel += channel_num[i]; + } + } + } + + fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t)); +} + +} // namespace image +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/fpga/image.h b/src/fpga/image.h new file mode 100644 index 0000000000000000000000000000000000000000..7e004916118ae97d60d24e798300d66a98191211 --- /dev/null +++ b/src/fpga/image.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#define IMAGE_ALIGNMENT 16 // Aligned to 16 +namespace paddle_mobile { +namespace fpga { +namespace image { + +void convert_to_hwc(float** data_in, int channel, int height, int width); +void align_element_conv(float** data_in, int height, int cw); +void format_image(float** data_in, int channel, int height, int width); +void concat_images(int16_t** images_in, float** scales_in, void* image_out, + float* scale_out, int image_num, uint32_t* channel_num, + int height, + int width); // Concat featuremaps along channel direction +} // namespace image +} // namespace fpga +} // namespace paddle_mobile diff --git a/src/framework/attribute.h b/src/framework/attribute.h index f0519a35b3ed2a02e35f1ef0d6a718efb7b76095..ed264057be6810d8bae29e0117fa4f6d91067cc1 100644 --- a/src/framework/attribute.h +++ b/src/framework/attribute.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include diff --git a/src/framework/data_layout.h b/src/framework/data_layout.h index f1249008f088dce48ed040e47900121c2eb41af1..0ba31ef9b7016b453b34cc4a023b0841b2110540 100644 --- a/src/framework/data_layout.h +++ b/src/framework/data_layout.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include namespace paddle_mobile { diff --git a/src/framework/ddim.h b/src/framework/ddim.h index 833bc2783f855fd9d6df50d21345539fbe2ca6c4..db240b260185bb8ac2ba1fe84d3390bedac5c36d 100644 --- a/src/framework/ddim.h +++ b/src/framework/ddim.h @@ -14,9 +14,11 @@ limitations under the License. */ #pragma once +#include #include #include #include + #include "common/enforce.h" #include "common/variant.h" #include "dim.h" diff --git a/src/framework/dim.h b/src/framework/dim.h index dd7610de65d4a4c93402cf49b0fdbdc7995610c0..85e86076e1de53fa80b75f56237901da49e22eb9 100644 --- a/src/framework/dim.h +++ b/src/framework/dim.h @@ -14,6 +14,8 @@ limitations under the License. 
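
concat_images, declared in image.h above, concatenates feature maps along the channel axis, and since every input carries its own quantization scale, the merged output must adopt the most conservative one. A sketch of just that scale-merging rule as implemented in image.cpp; merge_scales is a hypothetical name and at least one nonzero input scale is assumed:

#include <algorithm>

void merge_scales(float **scales_in, int image_num, float *scale_out) {
  scale_out[0] = 0.0f;
  for (int i = 0; i < image_num; i++) {
    scale_out[0] = std::max(scale_out[0], scales_in[i][0]);
  }
  scale_out[1] = 1.0f / scale_out[0];  // reciprocal kept for quantizing back
}
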
*/ #pragma once +#include +#include #include "common/enforce.h" namespace paddle_mobile { namespace framework { diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index 36b4663cb603d29bb60cfc297899d1c300e8ca91..dd865fb27d4345f16ddca8005463986787d681be 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -28,6 +28,16 @@ vector OperatorBase::GetOutKeys() const { return it->second.second; } +template +vector OperatorBase::GetInputKeys() const { + auto it = op_input_output_key.find(type_); + if (it == op_input_output_key.end()) { + DLOG << type_ << " has no inputs"; + return {}; + } + return it->second.first; +} + template OperatorBase::OperatorBase(const std::string &type, const VariableNameMap &inputs, @@ -49,10 +59,27 @@ template void OperatorBase::Run() const { RunImpl(); #ifdef PADDLE_MOBILE_DEBUG - vector output_keys = GetOutKeys(); - for (const auto key : output_keys) { - Tensor *out_ = GetVarValue(key, outputs_, *scope_); - DLOG << type_ << " output- " << key << "=" << *out_; + DLOG << "-------------" << type_ << "----------------------------"; + vector input_keys = GetInputKeys(); + for (const auto key : input_keys) { + auto var_vec_in = inputs_.at(key); + for (int i = 0; i < var_vec_in.size(); ++i) { + auto vari = scope_->FindVar(var_vec_in[i]); + if (vari->IsInitialized()) { + Tensor *tensor = vari->template GetMutable(); + if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; + } + } + } + for (const auto key : GetOutKeys()) { + auto var_vec_out = outputs_.at(key); + for (int i = 0; i < var_vec_out.size(); ++i) { + auto vari = scope_->FindVar(var_vec_out[i]); + if (vari->IsInitialized()) { + Tensor *tensor = vari->template GetMutable(); + if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; + } + } } #endif } diff --git a/src/framework/operator.h b/src/framework/operator.h index 793551b0cd3eea290243c156c27616a34c37a3d2..5252ee65a2a80910500f4085bb92b80829f9e45b 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -61,6 +61,7 @@ class OperatorBase { virtual ~OperatorBase() {} void Run() const; std::vector GetOutKeys() const; + std::vector GetInputKeys() const; virtual void RunImpl() const = 0; virtual void Init() = 0; @@ -118,6 +119,10 @@ class OperatorWithKernel : public OperatorBase { virtual void InferShape() const = 0; void Init() { + // for (auto i : this->inputs_) { + // DLOG << i.first; + // DLOG << i.second; + // } PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), " %s kernel init failed", this->type_.c_str()); } @@ -146,7 +151,7 @@ class OpKernelBase { } #endif virtual void Compute(const P &para) const = 0; - virtual bool Init(P *para) { return true; }; + virtual bool Init(P *para) { return true; } virtual ~OpKernelBase() = default; private: @@ -178,6 +183,8 @@ class FusionOpMatcher { std::string BeginType() { return node_.Type(); } + virtual std::vector> NeedCheck() { return {}; } + // virtual bool Fusion(); protected: Node node_; diff --git a/src/framework/program/program-optimize/fusion_op_register.h b/src/framework/program/program-optimize/fusion_op_register.h index 4cc83f8c80ab86ee6dcc1e3c395f872419da2be7..a5890d34c600f6c4f4838ec94c202801b3044d3f 100644 --- a/src/framework/program/program-optimize/fusion_op_register.h +++ b/src/framework/program/program-optimize/fusion_op_register.h @@ -14,11 +14,13 @@ limitations under the License. 
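
The debug body of OperatorBase::Run() above walks op_input_output_key in both directions to dump every initialized input and output tensor. A hedged standalone sketch of that lookup; dump_keys and the two sample entries are illustrative only, the real table lives in src/common/types.cpp:

#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using KeyPair = std::pair<std::vector<std::string>, std::vector<std::string>>;

void dump_keys(const std::unordered_map<std::string, KeyPair> &table,
               const std::string &op_type) {
  auto it = table.find(op_type);
  if (it == table.end()) {
    std::cout << op_type << " has no registered keys\n";
    return;
  }
  for (const auto &in : it->second.first)
    std::cout << "input key: " << in << "\n";
  for (const auto &out : it->second.second)
    std::cout << "output key: " << out << "\n";
}

int main() {
  std::unordered_map<std::string, KeyPair> table = {
      {"conv2d", {{"Input"}, {"Output"}}},       // mirrors G_OP_TYPE_CONV
      {"fusion_fc", {{"X", "Y", "Z"}, {"Out"}}}  // mirrors G_OP_TYPE_FC
  };
  dump_keys(table, "conv2d");
  return 0;
}
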
*/ #pragma once +#include #include #include +#include #include "framework/operator.h" -#include "node.h" +#include "framework/program/program-optimize/node.h" namespace paddle_mobile { namespace framework { @@ -34,12 +36,25 @@ class FusionOpRegister { } void regist(FusionOpMatcher* matcher) { + if (matchers_.find(matcher->Type()) != matchers_.end()) { + return; + } + std::shared_ptr shared_matcher(matcher); matchers_[matcher->Type()] = shared_matcher; } - const std::map> Matchers() { - return matchers_; + const std::vector> Matchers() { + std::vector> matchers; + for (const auto& match : matchers_) { + matchers.push_back(match.second); + } + std::sort(matchers.begin(), matchers.end(), + [](std::shared_ptr first, + std::shared_ptr second) { + return first->BeginNode().Depth() > second->BeginNode().Depth(); + }); + return matchers; } private: diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp index e635e07eaf4484c3e390101c3b43fdaf24bbd2c6..c8b15e67663c2e74901b499c0b244a03113891ff 100644 --- a/src/framework/program/program-optimize/node.cpp +++ b/src/framework/program/program-optimize/node.cpp @@ -14,15 +14,33 @@ limitations under the License. */ #include "framework/program/program-optimize/node.h" #include +#include +#include #include "framework/operator.h" namespace paddle_mobile { namespace framework { +std::vector Node::operator[](int index) { + std::vector nodes; + GetNodesWithLocation(index, 0, &nodes); + return nodes; +} + +void Node::GetNodesWithLocation(int index, int now_index, + std::vector *nodes) { + if (index == now_index) { + nodes->push_back(this); + } + + for (int i = 0; i < this->outputs_.size(); ++i) { + this->outputs_[i]->GetNodesWithLocation(index, now_index + 1, nodes); + } +} + Node &Node::operator>(std::shared_ptr node) { outputs_.push_back(node); - std::shared_ptr this_node; node->inputs_.push_back(this); return *node; } @@ -31,7 +49,7 @@ bool Node::operator==(const Node &in) { if (in.type_ == this->type_) { if (this->outputs_.size() == in.outputs_.size()) { for (int i = 0; i < outputs_.size(); ++i) { - if (!(*outputs_[i] == *in.outputs_[i])) { + if (!(this->outputs_[i]->MedianEqual(*in.outputs_[i]))) { return false; } } @@ -44,20 +62,54 @@ bool Node::operator==(const Node &in) { return true; } -std::vector> Node::OpDescs(int size) { - std::vector> op_descs; - OpDescs(size - 1, &op_descs); - return op_descs; +bool Node::MedianEqual(const Node &in) { + if (in.type_ == this->type_) { + if (this->outputs_.size() == in.outputs_.size()) { + // if (this->inputs_.size() != in.inputs_.size()) { + // DLOG << " == - this input size: " << this->inputs_.size(); + // DLOG << " == - ptr of this " << this; + // DLOG << " == - in input size: " << in.inputs_.size(); + // DLOG << " == - input size not equal "; + // return false; + // } else { + // for (int i = 0; i < this->inputs_.size(); ++i) { + // if (this->inputs_[i]->type_ != in.inputs_[i]->type_) { + // DLOG << " == - input type not equal "; + // return false; + // } + // } + // } + + for (int i = 0; i < outputs_.size(); ++i) { + if (!((*outputs_[i]).MedianEqual(*in.outputs_[i]))) { + return false; + } + } + } else { + // DLOG << " == - output size not equal "; + return false; + } + } else { + // DLOG << " == - median type is not equal "; + return false; + } + return true; } -void Node::OpDescs(int index, - std::vector> *op_desc) { - if (index == 0) { - return; +std::map Node::Relationship() { + std::map map; + RelationshipPrivate(&map); + return map; +} + +void 
Node::RelationshipPrivate(std::map *map) { + for (auto output : op_desc_->outputs_) { + for (auto output_key : output.second) { + (*map)[output_key] = this; + } } - op_desc->push_back(this->op_desc_); - for (auto &output : outputs_) { - output->OpDescs(index, op_desc); + for (auto output : this->outputs_) { + output->RelationshipPrivate(map); } } @@ -68,7 +120,9 @@ std::shared_ptr Node::To(int size) { } void Node::To(int index, std::shared_ptr node) { + node->op_desc_ = this->op_desc_; node->type_ = this->type_; + node->inputs_ = this->inputs_; if (index != 0) { } else { return; @@ -117,8 +171,19 @@ void Node::Folder( if (change->find(this->type_) != change->end()) { auto change_pairs = (*change)[this->type_]; for (const auto &change_pair : change_pairs) { - op_desc->GetInputs()[change_pair.second] = - this->op_desc_->GetInputs()[change_pair.first]; + std::map f; + if (this->op_desc_->GetInputs().find(change_pair.first) != + this->op_desc_->GetInputs().end()) { + if (op_desc->GetInputs().find(change_pair.second) != + op_desc->GetInputs().end()) { + for (auto value : this->op_desc_->GetInputs()[change_pair.first]) { + op_desc->GetInputs()[change_pair.second].push_back(value); + } + } else { + op_desc->GetInputs()[change_pair.second] = + this->op_desc_->GetInputs()[change_pair.first]; + } + } } } @@ -127,7 +192,28 @@ void Node::Folder( } if (index > 0) { --index; + for (auto output : outputs_) { + if (change->find(this->type_) != change->end()) { + auto change_pairs = (*change)[this->type_]; + for (const auto &change_pair : change_pairs) { + std::map f; + if (this->op_desc_->GetOutputs().find(change_pair.first) != + this->op_desc_->GetOutputs().end()) { + if (op_desc->GetInputs().find(change_pair.second) != + op_desc->GetInputs().end()) { + for (auto value : + this->op_desc_->GetOutputs()[change_pair.first]) { + op_desc->GetInputs()[change_pair.second].push_back(value); + } + } else { + op_desc->GetInputs()[change_pair.second] = + this->op_desc_->GetOutputs()[change_pair.first]; + } + } + } + } + removed_nodes->push_back(output); output->Folder(op_desc, outputs, index, change, begin_node, removed_nodes); diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h index 88bf1e16ed2a5fb3a038eadd546d63ffb3916f68..b86fc96a67e290540c94487497fced55abf09041 100644 --- a/src/framework/program/program-optimize/node.h +++ b/src/framework/program/program-optimize/node.h @@ -35,6 +35,7 @@ class Node { : op_desc_(op_desc), type_(op_desc->Type()) {} Node &operator>(std::shared_ptr node); bool operator==(const Node &in); + bool MedianEqual(const Node &in); #ifdef PADDLE_MOBILE_DEBUG std::string ToString() const; @@ -47,13 +48,17 @@ class Node { std::map>> change, std::vector> *removed_nodes); - std::vector> OpDescs(int size); std::shared_ptr OpDescOfNode() { return op_desc_; } std::string Type() { return type_; } + std::vector operator[](int index); + + std::map Relationship(); + private: - void OpDescs(int size, - std::vector> *op_desc); + void RelationshipPrivate(std::map *map); + void GetNodesWithLocation(int index, int now_index, + std::vector *nodes); void To(int index, std::shared_ptr); void Folder( std::shared_ptr op_desc, diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp index 3619bc79f576651245aa322992df9d318c810cd4..ed523a985138e5cb7cbd0b30fad77ba08c29223f 100644 --- a/src/framework/program/program-optimize/program_optimize.cpp +++ 
b/src/framework/program/program-optimize/program_optimize.cpp @@ -29,9 +29,15 @@ std::shared_ptr ProgramOptimize::FusionOptimize( for (int i = 0; i < optimize_program->Blocks().size(); ++i) { std::unordered_map> output_nodes; - std::unordered_map>> + std::unordered_map< + std::string, + std::vector< + std::pair, + std::unordered_map>>>> type_map; + std::unordered_map output_has; + std::vector> nodes; std::shared_ptr begin_node; @@ -50,7 +56,7 @@ std::shared_ptr ProgramOptimize::FusionOptimize( nodes.push_back(node); // - type_map[op->Type()].push_back(node); + type_map[op->Type()].push_back({node, output_nodes}); if (j == 0) { begin_node = node; @@ -69,6 +75,7 @@ std::shared_ptr ProgramOptimize::FusionOptimize( } auto output_keys = op_input_output_key.at(op_type).second; + for (auto output_key : output_keys) { auto op_outputs = op->Output(output_key); for (int k = 0; k < op_outputs.size(); ++k) { @@ -78,17 +85,47 @@ std::shared_ptr ProgramOptimize::FusionOptimize( } for (auto ®isted : FusionOpRegister::Instance()->Matchers()) { - std::string fusion_type = registed.first; - std::shared_ptr matcher = registed.second; - // DLOG << " registed node \n " << matcher->BeginNode(); + std::string fusion_type = registed->Type(); + std::shared_ptr matcher = registed; auto match_vector = type_map[matcher->BeginType()]; - for (auto &match_node : match_vector) { + for (auto &match_node_pair : match_vector) { + auto match_node = match_node_pair.first; + + auto node_has = match_node_pair.second; + auto depth = matcher->BeginNode().Depth(); auto sub_node = match_node->To(depth); // DLOG << " sub node: " << *sub_node; if (*sub_node == matcher->BeginNode()) { + bool can_folder = true; + + auto relationship_map = sub_node->Relationship(); + + for (auto to_check : matcher->NeedCheck()) { + // if (node_has) + auto nodes = (*sub_node)[to_check.first]; + for (auto node : nodes) { + auto inputs_to_check = + node->OpDescOfNode()->Input(to_check.second); + + for (auto input_to_check : inputs_to_check) { + if (node_has.find(input_to_check) == node_has.end()) { + if (relationship_map.find(input_to_check) == + relationship_map.end()) { + can_folder = false; + } else { + } + } + } + } + } + + if (!can_folder) { + continue; + } + // DLOG << " match success " << " fusion node: \n" << // matcher->BeginNode() << "\nsub node: \n" << *sub_node; // DLOG << "match node\n"<< *match_node; @@ -96,11 +133,13 @@ std::shared_ptr ProgramOptimize::FusionOptimize( std::vector> removed_nodes; matcher->FolderNodes(match_node.get(), &removed_nodes); - for (int j = 0; j < removed_nodes.size(); ++j) { - auto removed_node = removed_nodes[j]; + for (int k = removed_nodes.size() - 1; k >= 0; --k) { + auto removed_node = removed_nodes[k]; auto removed_ite = std::find(nodes.begin(), nodes.end(), removed_node); - nodes.erase(removed_ite); + if (removed_ite != nodes.end()) { + nodes.erase(removed_ite); + } } } } diff --git a/src/framework/program/program.h b/src/framework/program/program.h index 5760efc826667d805695118b12e41efa0305553b..192328a567e6d3bfad7a8a3b35e3bc64131a2cd2 100644 --- a/src/framework/program/program.h +++ b/src/framework/program/program.h @@ -30,6 +30,9 @@ class Program { std::string model_path; std::string para_path; bool combined = false; + bool quantification = false; + size_t combined_params_len; + const uint8_t *combined_params_buf; private: }; diff --git a/src/framework/scope.h b/src/framework/scope.h index d714f61af3bd443c09fcef7aacee2416b90b5e02..054f141ff68895e0879fd31e15d90c76ea038135 100644 --- a/src/framework/scope.h 
+++ b/src/framework/scope.h @@ -23,7 +23,17 @@ namespace framework { class Scope { public: Scope() = default; - ~Scope() = default; + + ~Scope() { + for (auto &var : vars_) { + delete var.second; + } + vars_.clear(); + for (auto kid : kids_) { + delete kid; + } + kids_.clear(); + } Scope &NewScope() const; diff --git a/src/framework/tensor.h b/src/framework/tensor.h index 9bbd81aa30f6fa0188dacd0dce01813e17b9e339..ba8e3d3402f16966f08c370bff8cd6b0d1f2637b 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -16,14 +16,15 @@ limitations under the License. */ #include #include +#include #include +#include #include #include #include -#include "common/enforce.h" -#include #include "common/enforce.h" +#include "common/types.h" #include "framework/data_layout.h" #include "framework/ddim.h" #include "memory/t_malloc.h" @@ -63,7 +64,9 @@ struct SizeOfTypeFunctor { }; static inline size_t SizeOfType(std::type_index type) { - SizeOfTypeFunctor functor; + SizeOfTypeFunctor + functor; size_t size = functor(type); PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name()); @@ -113,8 +116,8 @@ class Tensor { PADDLE_MOBILE_ENFORCE( (std::is_same::value || holder_->type().hash_code() == typeid(T).hash_code()), - "Tensor holds the wrong type, it holds %s", - this->holder_->type().name()); + "Tensor holds the wrong type, it holds %s ,requested:%s", + this->holder_->type().name(), typeid(T).name()); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); @@ -152,7 +155,7 @@ class Tensor { if (holder_ != nullptr) { holder_->set_type(type); } - PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor'snumel must >=0.") + PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") int64_t size = numel() * SizeOfType(type); if (holder_ == nullptr || holder_->size() < size + offset_) { holder_.reset(new PlaceholderImpl(size, type)); @@ -287,7 +290,6 @@ class Tensor { virtual void set_type(std::type_index type) { type_ = type; } - /*! the pointer of memory block. */ std::unique_ptr> ptr_; /*! the size of memory block. */ @@ -317,6 +319,13 @@ class Tensor { * begins. */ size_t offset_; +#ifdef PADDLE_MOBILE_FPGA + public: + inline void reset_data_ptr(void *p) { + ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); + } + float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX +#endif }; #ifdef PADDLE_MOBILE_DEBUG @@ -324,9 +333,17 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) { printer << " dims: " << tensor.dims() << "\n"; int stride = tensor.numel() / 20; stride = stride > 0 ? stride : 1; +#ifndef PADDLE_MOBILE_FPGA for (int i = 0; i < tensor.numel(); i += stride) { - printer << tensor.data<float>()[i] << " "; + // The element type is not necessarily float + if (tensor.type() == typeid(float)) { + printer << tensor.data<float>()[i] << " "; + } else if (tensor.type() == typeid(int64_t)) { + printer << tensor.data<int64_t>()[i] << " "; + } } +#endif + + return printer; } diff --git a/src/io/api.cc b/src/io/api.cc new file mode 100644 index 0000000000000000000000000000000000000000..0e254aa15ac06083038773d89c23d40242847782 --- /dev/null +++ b/src/io/api.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
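
The FPGA-only Tensor members added above carry the quantization pair documented inline: scale[0] = MAX/127.0 for dequantizing and scale[1] = 127.0/MAX for quantizing. A worked round trip under that convention, showing the error stays within one quantization step:

#include <cassert>
#include <cmath>

int main() {
  float max = 2.0f;  // e.g. a filter_find_max result
  float scale[2] = {max / 127.0f, 127.0f / max};
  signed char q = (signed char)(1.5f * scale[1]);  // quantize: 1.5 -> 95
  float back = q * scale[0];                       // dequantize: ~1.496
  assert(std::fabs(back - 1.5f) < max / 127.0f);   // within one step
  return 0;
}
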
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "cstring" +#include "io/paddle_inference_api.h" + +namespace paddle_mobile { + +int PaddleDtypeSize(PaddleDType dtype) { + switch (dtype) { + case PaddleDType::FLOAT32: + return sizeof(float); + case PaddleDType::INT64: + return sizeof(int64_t); + default: + assert(false); + return -1; + } +} + +PaddleBuf::PaddleBuf(PaddleBuf&& other) + : data_(other.data_), + length_(other.length_), + memory_owned_(other.memory_owned_) { + other.memory_owned_ = false; + other.data_ = nullptr; + other.length_ = 0; +} + +PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } + +PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { + // only the buffer with external memory can be copied + if (!other.memory_owned_) { + data_ = other.data_; + length_ = other.length_; + memory_owned_ = other.memory_owned_; + } else { + Resize(other.length()); + memcpy(data_, other.data(), other.length()); + length_ = other.length(); + memory_owned_ = true; + } + return *this; +} + +void PaddleBuf::Resize(size_t length) { + // Only the owned memory can be reset, the external memory can't be changed. + if (length_ == length) return; + if (memory_owned_) { + Free(); + } + data_ = new char[length]; + length_ = length; + memory_owned_ = true; +} + +void PaddleBuf::Reset(void* data, size_t length) { + Free(); + memory_owned_ = false; + data_ = data; + length_ = length; +} + +void PaddleBuf::Free() { + if (memory_owned_ && data_) { + assert(length_ > 0); + delete[] static_cast(data_); + data_ = nullptr; + length_ = 0; + } +} + +} // namespace paddle_mobile diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc new file mode 100644 index 0000000000000000000000000000000000000000..4609438ec9fbdb5b5030b56a4bf18b9437bf7c2e --- /dev/null +++ b/src/io/api_paddle_mobile.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
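// The file below (api_paddle_mobile.cc) implements the PaddlePredictor
// interface declared in io/paddle_inference_api.h on top of PaddleMobile.
// A minimal end-to-end sketch of how a caller is expected to drive this API
// (illustrative only, not part of the diff: the model path, input shape, and
// values are hypothetical, and the factory's template arguments assume the
// <ConfigT, PaddleEngineKind> form used by the specialization below):

#include <vector>
#include "io/paddle_inference_api.h"

int RunMobilenetSketch() {
  using namespace paddle_mobile;

  PaddleMobileConfig config;
  config.precision = PaddleMobileConfig::FP32;
  config.device = PaddleMobileConfig::kCPU;
  config.model_dir = "./mobilenet";  // hypothetical separate-format model dir
  config.thread_num = 4;             // only effective in OpenMP builds

  auto predictor =
      CreatePaddlePredictor<PaddleMobileConfig,
                            PaddleEngineKind::kPaddleMobile>(config);
  if (!predictor) return -1;

  // Run() reads only inputs[0] and requires a 4-D (NCHW) shape.
  std::vector<float> pixels(1 * 3 * 224 * 224, 0.5f);
  PaddleTensor input;
  input.shape = {1, 3, 224, 224};
  input.dtype = PaddleDType::FLOAT32;
  input.data = PaddleBuf(pixels.data(), pixels.size() * sizeof(float));

  // Run() also requires a non-empty output vector; it grows outputs[0]'s
  // buffer via PaddleBuf::Resize() when it is too small.
  std::vector<PaddleTensor> outputs(1);
  if (!predictor->Run({input}, &outputs)) return -1;

  const float* scores = static_cast<const float*>(outputs[0].data.data());
  return scores == nullptr;  // 0 on success
}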
+ +#include "io/api_paddle_mobile.h" +#include +#include "framework/tensor.h" + +namespace paddle_mobile { + +template +PaddleMobilePredictor::PaddleMobilePredictor( + const PaddleMobileConfig &config) { + PADDLE_MOBILE_ENFORCE(Init(config) == true, + "paddle mobile predictor init failed!"); + config_ = config; +} + +template +bool PaddleMobilePredictor::Init(const PaddleMobileConfig &config) { + paddle_mobile_.reset(new PaddleMobile()); + if (!config.model_dir.empty()) { + paddle_mobile_->Load(config.model_dir, config.optimize, + config.quantification, config.batch_size); + } else if (!config.prog_file.empty() && !config.param_file.empty()) { + paddle_mobile_->Load(config.prog_file, config.param_file, config.optimize, + config.quantification, config.batch_size); + } else { + LOG(kLOG_ERROR) << "fail to load inference model!"; + return false; + } + // If the openmp is open, set the thread num + paddle_mobile_->SetThreadNum(config.thread_num); + return true; +} + +template +bool PaddleMobilePredictor::Run( + const std::vector &inputs, + std::vector *output_data, int batch_size) { + if (inputs.empty()) { + LOG(kLOG_ERROR) << "At least one output should be set with tensors' names."; + return false; + } + auto input = inputs[0]; + + if (input.shape.size() != 4) { + LOG(kLOG_ERROR) << "input shape not equal to 4!"; + return false; + } + std::vector dims; + for (auto d : input.shape) { + dims.push_back(static_cast(d)); + } + + // use tensor + framework::DDim ddim = + framework::make_ddim({dims[0], dims[1], dims[2], dims[3]}); + + framework::Tensor input_tensor; + input_tensor.Resize(ddim); + int input_length = framework::product(ddim); + typedef typename PrecisionTrait
<P>::ptype PType; + auto input_ptr = input_tensor.mutable_data<PType>(); + + memcpy(input_ptr, static_cast<PType *>(input.data.data()), + input_length * sizeof(PType)); + auto output_tensor = paddle_mobile_->Predict(input_tensor); + + if (output_data->empty()) { + LOG(kLOG_ERROR) << "At least one output should be set with tensors' names."; + return false; + } + + auto &output = (*output_data)[0]; + int output_length = output_tensor->numel(); + std::vector<int64_t> tensor_shape = + framework::vectorize(output_tensor->dims()); + + for (auto d : tensor_shape) { + output.shape.push_back(static_cast<int>(d)); + } + + if (output.data.length() < output_length * sizeof(PType)) { + output.data.Resize(output_length * sizeof(PType)); + } + + memcpy(output.data.data(), output_tensor->template data<PType>(), + output_length * sizeof(PType)); + + return true; +} + +// A factory to help create different predictors. +template <> +std::unique_ptr<PaddlePredictor> +CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>( + const PaddleMobileConfig &config) { + std::unique_ptr<PaddlePredictor> x; + if (config.precision == PaddleMobileConfig::FP32) { + if (config.device == PaddleMobileConfig::kCPU) { + x.reset(new PaddleMobilePredictor<CPU, Precision::FP32>(config)); + } else if (config.device == PaddleMobileConfig::kFPGA) { + x.reset(new PaddleMobilePredictor<FPGA, Precision::FP32>(config)); + } else if (config.device == PaddleMobileConfig::kGPU_MALI) { + x.reset(new PaddleMobilePredictor<GPU_MALI, Precision::FP32>(config)); + } else { + LOG(kLOG_ERROR) << "unsupported device type!"; + return nullptr; + } + } else { + LOG(kLOG_ERROR) << "unsupported precision type!"; + return nullptr; + } + return std::move(x); +} + +} // namespace paddle_mobile diff --git a/src/io/api_paddle_mobile.h b/src/io/api_paddle_mobile.h new file mode 100644 index 0000000000000000000000000000000000000000..66c6a4d5d9f8fc81b96642c6d5b62757dd581bc3 --- /dev/null +++ b/src/io/api_paddle_mobile.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains the implementation of the inference API with the + * paddle-mobile engine embedded; it can only run paddle-mobile models. + */ + +#pragma once + +#include +#include "io/paddle_inference_api.h" + +// from paddle_mobile +#include "common/enforce.h" +#include "common/types.h" +#include "io/paddle_mobile.h" + +namespace paddle_mobile { + +template <typename Dtype, Precision P> +class PaddleMobilePredictor : public PaddlePredictor { + public: + PaddleMobilePredictor() {} + + explicit PaddleMobilePredictor(const PaddleMobileConfig& config); + + bool Run(const std::vector<PaddleTensor>& inputs, + std::vector<PaddleTensor>* output_data, + int batch_size = -1) override; + + ~PaddleMobilePredictor() override {} + + private: + std::unique_ptr<PaddleMobile<Dtype, P>> paddle_mobile_; + bool Init(const PaddleMobileConfig& config); + + PaddleMobileConfig config_; +}; + +} // namespace paddle_mobile diff --git a/src/io/executor.cpp b/src/io/executor.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d733231ef03f74eba2f1f2e989a0bad1cf43f161 --- /dev/null +++ b/src/io/executor.cpp @@ -0,0 +1,732 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "io/executor.h" +#include +#include +#include +#include "common/enforce.h" +#include "common/log.h" +#include "framework/framework.pb-c.h" +#include "framework/lod_tensor.h" +#include "framework/operator.h" +#include "framework/program/program-optimize/program_optimize.h" +#include "framework/program/program_desc.h" +#include "framework/program/var_desc.h" +#include "framework/scope.h" +#include "framework/tensor.h" +#ifdef PADDLE_EXECUTOR_MULTITHREAD +#include +#include +#include "common/threadpool.h" +#endif + +namespace paddle_mobile { +using framework::Variable; + +char *Get_binary_data(std::string filename) { + FILE *file = fopen(filename.c_str(), "rb"); + PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", + filename.c_str()); + fseek(file, 0, SEEK_END); + int64_t size = ftell(file); + PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); + rewind(file); + char *data = new char[size]; + size_t bytes_read = fread(data, 1, size, file); + PADDLE_MOBILE_ENFORCE(bytes_read == size, + "read binary file bytes do not match with fseek"); + fclose(file); + return data; +} + +#pragma mark - executor +template +Executor::Executor(const framework::Program p, int batch_size, + bool use_optimize, bool loddable) + : program_(p), + batch_size_(batch_size), + use_optimize_(use_optimize), + loddable_(loddable) { + if (use_optimize_) { + to_predict_program_ = program_.optimizeProgram; + } else { + to_predict_program_ = program_.originProgram; + } + Variable *variable_ptr = program_.scope->Var("batch_size"); + variable_ptr[0].SetValue(batch_size); + PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr, + "to_predict_program_ == NULL!"); + const std::vector> blocks = + to_predict_program_->Blocks(); +#ifdef PADDLE_EXECUTOR_MULTITHREAD + depManager.resize(blocks.size()); +#endif + DLOG << "executer in loaddable mode: " << loddable_; + for (int i = 0; i < blocks.size(); ++i) { + std::shared_ptr block_desc = blocks[i]; + std::vector> ops = block_desc->Ops(); + for (int j = 0; j < ops.size(); ++j) { + std::shared_ptr op = ops[j]; + DLOG << "create op: " << j << " " << op->Type(); + auto op_base = framework::OpRegistry::CreateOp( + op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), + program_.scope); + // use pre_infershape to pre resize , but if u use an lod mode tensor u + // need to resize in runtime + if (!loddable_) { + op_base->InferShape(); + } + ops_of_block_[*block_desc.get()].push_back(op_base); +#ifdef PADDLE_EXECUTOR_MULTITHREAD + depManager[i].analysisDep(ops_of_block_[*block_desc.get()]); +#endif + } + DLOG << "Total " << ops.size() << " ops have been created "; + } + if (program_.combined) { + InitCombineMemory(); + } else { + InitMemory(); + } + std::shared_ptr to_predict_block = + to_predict_program_->Block(0); + auto &ops = ops_of_block_[*to_predict_block.get()]; + int i = 0; + for (const auto &op : ops) { + DLOG << "Init op: " << i++ << " " << op->Type(); + op->Init(); + } +} + +template +void Executor::LoadMemory(const 
framework::VarDesc var_desc, + framework::LoDTensor *tensor, char **data) { + // 1. version + uint32_t version = *reinterpret_cast(*data); + + (*data) += sizeof(uint32_t); + + // 2 Lod information + uint64_t *lod_level_ptr = new uint64_t(); + memcpy(lod_level_ptr, (*data), sizeof(uint64_t)); + uint64_t lod_level = *lod_level_ptr; + delete lod_level_ptr; + (*data) += sizeof(uint64_t); + + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size = *reinterpret_cast(*data); + (*data) += sizeof(uint64_t); + std::vector tmp(size / sizeof(size_t)); + + for (int k = 0; k < tmp.size(); ++k) { + tmp[k] = *reinterpret_cast(*data); + (*data) += sizeof(size_t); + } + + for (auto j : tmp) { + LOG(kLOG_DEBUG1) << " lod - " << j; + } + lod[i] = tmp; + } + + // 3. tensor version + uint32_t tensor_version = *reinterpret_cast(*data); + (*data) += sizeof(uint32_t); + + // 4. tensor desc + int32_t size = *reinterpret_cast(*data); + (*data) += sizeof(int32_t); + + std::unique_ptr buf(new char[size]); + for (int m = 0; m < size; ++m) { + buf.get()[m] = (*data)[m]; + } + (*data) += (sizeof(char) * size); + + const framework::TensorDesc &desc = var_desc.Tensor_desc(); + int memory_size = 1; + for (auto l : desc.Dims()) { + memory_size *= l; + } + + tensor->Resize(framework::make_ddim(desc.Dims())); + + void *memory = nullptr; + int type_size = 0; + switch (desc.DataType()) { + case framework::VARTYPE_TYPE_FP16: + type_size = 2; + break; + case framework::VARTYPE_TYPE_FP32: + type_size = 4; + memory = tensor->mutable_data(); + break; + case framework::VARTYPE_TYPE_FP64: + type_size = 8; + break; + case framework::VARTYPE_TYPE_INT32: + memory = tensor->mutable_data(); + type_size = 4; + break; + case framework::VARTYPE_TYPE_INT64: + type_size = 8; + break; + case framework::VARTYPE_TYPE_BOOL: + type_size = 1; + break; + default: + break; + } + if (program_.quantification) { + float min_value; + float max_value; + + memcpy(&min_value, *data, sizeof(float)); + memcpy(&max_value, *data + sizeof(float), sizeof(float)); + *data += 2 * sizeof(float); + const float factor = (max_value - min_value) / 255.0; + uint8_t *uint8_data = reinterpret_cast(*data); + for (int k = 0; k < memory_size; ++k) { + static_cast(memory)[k] = uint8_data[k] * factor + min_value; + } + *data += (memory_size * sizeof(uint8_t)); + } else { + for (int n = 0; n < memory_size; n++) { + float value; + memcpy(&value, *data + n * type_size, type_size); + if (value < 1e-30 && value > -1e-30) { + static_cast(memory)[n] = 0.0; + } else { + static_cast(memory)[n] = value; + } + } + (*data) += (sizeof(char) * memory_size * type_size); + } +} + +template +void Executor::InitMemory() { + for (const auto &block : to_predict_program_->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = program_.scope->Var(var_desc->Name()); + if (var_desc->Persistable()) { + auto tensor = var->template GetMutable(); + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { + continue; + } + + char *origin_data = + Get_binary_data(program_.model_path + "/" + var_desc->Name()); + char *data = origin_data; + LoadMemory(*var_desc, tensor, &data); + + // DLOG << "----- " << var_desc->Name(); + // DLOG << "----- " << tensor->dims(); + // float *pDouble = tensor->template data(); + // for (int i = 0; i < tensor->numel() && i < 30; ++i) { + // std::cout << pDouble[i] << std::endl; + // } + delete origin_data; + } else { + if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + bool 
is_mute_match; + framework::LoDTensor *tensor = nullptr; + + is_mute_match = varInputMemory(var_desc, var, tensor); + + PADDLE_MOBILE_ENFORCE( + is_mute_match, + "got unhandled var_desc->Tensor_desc().DataType(): %d", + var_desc->Tensor_desc().DataType()); + } + } + } + } +} + +template +void Executor::InitCombineMemory() { + char *origin_data; + if (program_.combined_params_buf && program_.combined_params_len) { + LOG(kLOG_INFO) << "use outter memory"; + origin_data = (char *)program_.combined_params_buf; + } else { + LOG(kLOG_INFO) << " begin init combine memory"; + origin_data = Get_binary_data(program_.para_path); + } + PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!"); + char *data = origin_data; + for (const auto &block : to_predict_program_->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = program_.scope->Var(var_desc->Name()); + if (var_desc->Persistable()) { + auto tensor = var->template GetMutable(); + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { + continue; + } + LoadMemory(*var_desc, tensor, &data); + } else { + if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + bool is_mute_match = false; + framework::LoDTensor *tensor; + + is_mute_match = varInputMemory(var_desc, var, tensor); + + PADDLE_MOBILE_ENFORCE( + is_mute_match, + "got unhandled var_desc->Tensor_desc().DataType(): %d", + var_desc->Tensor_desc().DataType()); + } + } + } + } + delete origin_data; + LOG(kLOG_INFO) << " end init combine memory "; +} +template +bool Executor::varInputMemory( + const std::shared_ptr &var_desc, Variable *var, + framework::LoDTensor *tensor) const { + bool is_mute_match = false; + switch (var_desc->Tensor_desc().DataType()) { + case framework::VARTYPE_TYPE_FP16: { + break; + } + + case framework::VARTYPE_TYPE_FP32: { + tensor = var->template GetMutable(); + tensor->template mutable_data(); + is_mute_match = true; + break; + } + + case framework::VARTYPE_TYPE_FP64: { + break; + } + + case framework::VARTYPE_TYPE_INT32: { + tensor = var->template GetMutable(); + tensor->template mutable_data(); + is_mute_match = true; + break; + } + + case framework::VARTYPE_TYPE_INT64: { + tensor = var->template GetMutable(); + tensor->template mutable_data(); + is_mute_match = true; + break; + } + case framework::VARTYPE_TYPE_BOOL: { + break; + } + + default: { break; } + } + + return is_mute_match; +} + +template +std::shared_ptr Executor::Predict( + const framework::Tensor &t) { + framework::Variable *g_feed_value = program_.scope->Var("feed"); + framework::Tensor *feed_tensor = + g_feed_value->GetMutable(); + feed_tensor->Resize(t.dims()); + feed_tensor->ShareDataWith(t); + std::shared_ptr to_predict_block = + to_predict_program_->Block(0); + auto &ops = ops_of_block_[*to_predict_block.get()]; + +#ifdef PADDLE_MOBILE_PROFILE + std::vector profile(ops.size()); +#endif +#ifdef PADDLE_EXECUTOR_MULTITHREAD + std::mutex m; + std::condition_variable cv; + std::queue next; + next.push(0); + int rsize = ops.size(); + std::vector status(rsize, 0); + auto &threadPool = ThreadPool::getThreadPool(); + auto &dep = depManager[0]; + auto finishF = [&ops, &m, &cv, &next, &status, &rsize, &dep](int opi) { + std::lock_guard lk(m); + rsize--; + status[opi] = 2; + for (int i : dep.getNext(opi)) { + bool ok = true; + for (int j : dep.getDeps(i)) { + if (status[j] != 2) { + ok = false; + break; + } + } + if (ok && (status[i] == 0)) { + next.push(i); + } + } + cv.notify_one(); + }; + for (;;) { + std::unique_lock lk(m); + cv.wait(lk, [&next, &rsize] { 
return rsize == 0 || !next.empty(); }); + if (rsize == 0) { + break; + } + while (next.size() > 0) { + int opi = next.front(); + next.pop(); + status[opi] = 1; + threadPool.enqueue([opi, &ops, &finishF, &profile] { + auto &op = ops[opi]; +#ifdef PADDLE_MOBILE_PROFILE + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[opi].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; + profile[opi].tid = ThreadPool::getThreadPoolThreadId(); +#endif + ops[opi]->Run(); +#ifdef PADDLE_MOBILE_PROFILE + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[opi].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + finishF(opi); + }); + } + } +#else + for (int i = 0; i < ops.size(); i++) { +#ifdef PADDLE_MOBILE_PROFILE + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + // to Run + ops[i]->Run(); +#ifdef PADDLE_MOBILE_PROFILE + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + } +#endif + auto last_op = ops.rbegin(); + auto output_map = (*last_op)->Outputs(); + std::vector out_keys = (*last_op)->GetOutKeys(); + PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output"); + framework::LoDTensor *output_tensor = + framework::GetVarValue(out_keys[0], output_map, + *(program_.scope)); +#ifdef PADDLE_MOBILE_PROFILE +#ifdef PADDLE_EXECUTOR_MULTITHREAD + // TODO(haipeng): expose profile info as an interface, user can get them to + // analysis + // the performance of their deepnet. + FILE *df = fopen("net.dot", "w"); + fprintf(df, "digraph {\n"); + for (int i = 0; i < ops.size(); i++) { + for (int j : dep.getNext(i)) { + fprintf(df, "op_%d -> op_%d\n", i, j); + } + } + for (int i = 0; i < ops.size(); i++) { + fprintf(df, "op_%d[label=\"%s (%d)\"]\n", i, ops[i]->Type().c_str(), i); + } + fprintf(df, "}\n"); + fclose(df); +#endif + // FILE *pf = fopen("profile.out", "w"); + std::unordered_map _tp; + for (int i = 0; i < profile.size(); i++) { + const auto &pInfo = profile[i]; + uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; + _tp[ops[i]->Type()] += timeCost; + // fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, + // ops[i]->Type().c_str(), + // pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost); + } + // fclose(pf); + printf("====================[ profile ]======================\n"); + using prof_t = std::pair; + std::vector _tv(_tp.begin(), _tp.end()); + uint64_t _ptotal = 0; + for (auto const &p : _tv) { + _ptotal += p.second; + } + auto compf = [](const prof_t &a, const prof_t &b) { + return a.second > b.second; + }; + std::sort(_tv.begin(), _tv.end(), compf); + _tv.push_back(std::make_pair("total", _ptotal)); + for (auto const &p : _tv) { + printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), + static_cast(p.second), + static_cast(p.second) / _ptotal * 100.0); + } + printf("====================[---------]======================\n"); +#endif + return std::make_shared(framework::Tensor(*output_tensor)); +} + +template +std::shared_ptr Executor::PredictLod( + const framework::LoDTensor &t) { + framework::Variable *g_feed_value = program_.scope->Var("feed"); + framework::LoDTensor *feed_tensor = + g_feed_value->GetMutable(); + feed_tensor->Resize(t.dims()); + feed_tensor->ShareDataWith(t); + feed_tensor->set_lod(t.lod()); + + std::shared_ptr to_predict_block = + to_predict_program_->Block(0); + + auto &ops = ops_of_block_[*to_predict_block.get()]; + +#ifdef PADDLE_MOBILE_PROFILE + std::vector profile(ops.size()); +#endif +#ifdef 
PADDLE_EXECUTOR_MULTITHREAD + std::mutex m; + std::condition_variable cv; + std::queue next; + next.push(0); + int rsize = ops.size(); + std::vector status(rsize, 0); + auto &threadPool = ThreadPool::getThreadPool(); + auto &dep = depManager[0]; + auto finishF = [&ops, &m, &cv, &next, &status, &rsize, &dep](int opi) { + std::lock_guard lk(m); + rsize--; + status[opi] = 2; + for (int i : dep.getNext(opi)) { + bool ok = true; + for (int j : dep.getDeps(i)) { + if (status[j] != 2) { + ok = false; + break; + } + } + if (ok && (status[i] == 0)) { + next.push(i); + } + } + cv.notify_one(); + }; + for (;;) { + std::unique_lock lk(m); + cv.wait(lk, [&next, &rsize] { return rsize == 0 || !next.empty(); }); + if (rsize == 0) { + break; + } + while (next.size() > 0) { + int opi = next.front(); + next.pop(); + status[opi] = 1; + threadPool.enqueue([opi, &ops, &finishF, &profile] { + auto &op = ops[opi]; +#ifdef PADDLE_MOBILE_PROFILE + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[opi].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; + profile[opi].tid = ThreadPool::getThreadPoolThreadId(); +#endif + ops[opi]->Run(); +#ifdef PADDLE_MOBILE_PROFILE + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[opi].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + finishF(opi); + }); + } + } +#else + for (int i = 0; i < ops.size(); i++) { +#ifdef PADDLE_MOBILE_PROFILE + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + if (loddable_) { + ops[i]->InferShape(); + } + // to Run + ops[i]->Run(); +#ifdef PADDLE_MOBILE_PROFILE + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + } +#endif + auto last_op = ops.rbegin(); + + auto output_map = (*last_op)->Outputs(); + std::vector out_keys = (*last_op)->GetOutKeys(); + PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output"); + framework::LoDTensor *output_tensor = + framework::GetVarValue(out_keys[0], output_map, + *(program_.scope)); +#ifdef PADDLE_MOBILE_PROFILE +#ifdef PADDLE_EXECUTOR_MULTITHREAD + // TODO(haipeng): expose profile info as an interface, user can get them to + // analysis + // the performance of their deepnet. 
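// The multithreaded path above is a dependency-driven worklist: status[i]
// tracks each op (0 = waiting, 1 = queued/running, 2 = finished), finishF()
// pushes an op onto `next` once every op in dep.getDeps(i) has reached state
// 2, and the main loop sleeps on the condition variable until new work
// arrives or all `rsize` ops are done. A minimal single-threaded rendition of
// the same scheduling idea (an illustrative sketch, not part of this diff):

#include <functional>
#include <queue>
#include <vector>

// deps[i] lists the ops that must finish before op i may run.
inline void RunInDependencyOrder(const std::vector<std::vector<int>>& deps,
                                 const std::function<void(int)>& run_op) {
  const int n = static_cast<int>(deps.size());
  std::vector<int> remaining(n);          // unmet dependencies per op
  std::vector<std::vector<int>> next(n);  // next[j]: ops unblocked by op j
  std::queue<int> ready;
  for (int i = 0; i < n; ++i) {
    remaining[i] = static_cast<int>(deps[i].size());
    for (int j : deps[i]) next[j].push_back(i);
    if (remaining[i] == 0) ready.push(i);
  }
  while (!ready.empty()) {  // Kahn-style topological execution
    const int op = ready.front();
    ready.pop();
    run_op(op);
    for (int succ : next[op]) {
      if (--remaining[succ] == 0) ready.push(succ);
    }
  }
}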
+ FILE *df = fopen("net.dot", "w"); + fprintf(df, "digraph {\n"); + for (int i = 0; i < ops.size(); i++) { + for (int j : dep.getNext(i)) { + fprintf(df, "op_%d -> op_%d\n", i, j); + } + } + for (int i = 0; i < ops.size(); i++) { + fprintf(df, "op_%d[label=\"%s (%d)\"]\n", i, ops[i]->Type().c_str(), i); + } + fprintf(df, "}\n"); + fclose(df); +#endif + // FILE *pf = fopen("profile.out", "w"); + std::unordered_map _tp; + for (int i = 0; i < profile.size(); i++) { + const auto &pInfo = profile[i]; + uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; + _tp[ops[i]->Type()] += timeCost; + // fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, + // ops[i]->Type().c_str(), + // pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost); + } + // fclose(pf); + printf("====================[ profile ]======================\n"); + using prof_t = std::pair; + std::vector _tv(_tp.begin(), _tp.end()); + uint64_t _ptotal = 0; + for (auto const &p : _tv) { + _ptotal += p.second; + } + auto compf = [](const prof_t &a, const prof_t &b) { + return a.second > b.second; + }; + std::sort(_tv.begin(), _tv.end(), compf); + _tv.push_back(std::make_pair("total", _ptotal)); + for (auto const &p : _tv) { + printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), + static_cast(p.second), + static_cast(p.second) / _ptotal * 100.0); + } + printf("====================[---------]======================\n"); +#endif + return std::make_shared( + framework::LoDTensor(*output_tensor)); +} + +template +std::shared_ptr Executor::Predict( + const framework::Tensor &t, int block_id) { + return Predict(t); +} + +template +std::vector::Ptype> Executor::Predict( + const std::vector &input, const std::vector &dims) { + framework::Tensor tensor(input, framework::make_ddim(dims)); + std::shared_ptr output_tensor = Predict(tensor, 0); + Executor::Ptype *output_ptr = + output_tensor->data::Ptype>(); + std::vector::Ptype> result_vector; + for (int j = 0; j < output_tensor->numel(); ++j) { + result_vector.push_back(output_ptr[j]); + } + return result_vector; +} + +#ifdef PADDLE_MOBILE_FPGA + +template +void Executor::InjectVariable(const framework::Tensor &t, + string var_name) { + framework::Variable *g_feed_value = program_.scope->Var(var_name); + framework::Tensor *feed_tensor = + g_feed_value->GetMutable(); + feed_tensor->Resize(t.dims()); + feed_tensor->ShareDataWith(t); +}; + +template +void Executor::FeedData(const framework::Tensor &t) { + InjectVariable(t, "feed"); +}; + +template +std::shared_ptr Executor::FetchResult(int id) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(0); + auto &ops = ops_of_block_[*to_predict_block.get()]; + + PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range"); + auto op = id < 0 ? ops[ops.size() - 1] : ops[id]; + auto output_map = op->Outputs(); + std::vector out_keys = op->GetOutKeys(); + PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output"); + auto *output_tensor = framework::GetVarValue( + out_keys[0], output_map, *(program_.scope)); + return std::make_shared(framework::Tensor(*output_tensor)); +}; + +template +void Executor::Predict_From_To(int start, int end) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(0); + auto &ops = ops_of_block_[*to_predict_block.get()]; + end = end < 0 ? 
(int)ops.size() : end; + PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(), + "start or end parameter is wrong"); + +#ifdef PADDLE_MOBILE_PROFILE + std::vector profile(ops.size()); +#endif + for (int i = start; i < end; i++) { +#ifdef PADDLE_MOBILE_PROFILE + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + DLOG << "Running op: " << i << " " << ops[i]->Type(); + ops[i]->Run(); + +#ifdef PADDLE_MOBILE_PROFILE + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + } +}; + +template +void Executor::Predict_From(int start) { + Predict_From_To(start); +}; + +template +void Executor::Predict_To(int end) { + Predict_From_To(0, end); +}; +#endif + +template class Executor; +template class Executor; +template class Executor; + +} // namespace paddle_mobile diff --git a/src/io/io.h b/src/io/executor.h similarity index 71% rename from src/io/io.h rename to src/io/executor.h index acae829339bdc049c5899b9c7f6a7a2c91693ae8..67d3f02ac37c4203950a2679d30d7aa9072c70ba 100644 --- a/src/io/io.h +++ b/src/io/executor.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include + #include "common/types.h" #include "framework/lod_tensor.h" #include "framework/operator.h" @@ -29,34 +30,10 @@ limitations under the License. */ #include #include "common/dep_core.h" #endif +using std::string; namespace paddle_mobile { -template -class Loader { - public: - /* - * @b load separate format fluid model - * @b 加载分开形式的 fluid 模型 - * */ - const framework::Program Load(const std::string &dirname, - bool optimize = false, - bool can_add_split = false); - - /* - * @b load combine format fluid mode - * @b 加载结合在一起格式的模型 - * */ - const framework::Program Load(const std::string &model_path, - const std::string ¶_path, - bool optimize = false); - - private: - const framework::Program LoadProgram(const std::string &model_path, - bool optimize = false, - bool can_add_split = false); -}; - template class Executor { public: @@ -67,13 +44,17 @@ class Executor { * @b 用 loader load 的 program 实例化 executor * */ Executor(const framework::Program p, int batch_size = 1, - bool use_optimize = true); + bool use_optimize = true, bool loddable = false); /* * @b to predict * */ std::shared_ptr Predict(const framework::Tensor &t); - + /* + * @b to predict + * */ + std::shared_ptr PredictLod( + const framework::LoDTensor &t); /* * @b to predict with vector and dim * @@ -86,7 +67,7 @@ class Executor { Executor() = default; void InitMemory(); void LoadMemory(const framework::VarDesc var_desc, - framework::LoDTensor *tensor, char *&data); + framework::LoDTensor *tensor, char **data); void InitCombineMemory(); framework::Program program_; int batch_size_ = 1; @@ -97,6 +78,7 @@ class Executor { std::vector>>> ops_of_block_; bool use_optimize_ = false; + bool loddable_ = false; #ifdef PADDLE_EXECUTOR_MULTITHREAD std::vector depManager; #endif @@ -107,6 +89,21 @@ class Executor { uint64_t runEnd = 0UL; }; #endif + + bool varInputMemory(const std::shared_ptr &var_desc, + framework::Variable *var, + framework::LoDTensor *tensor) const; + +#ifdef PADDLE_MOBILE_FPGA + + public: + void InjectVariable(const framework::Tensor &t, string var_name); + void FeedData(const framework::Tensor &t); + std::shared_ptr FetchResult(int id = -1); + void Predict_From_To(int start = 0, int end = -1); + void Predict_From(int start); + void Predict_To(int end); +#endif }; } // namespace paddle_mobile diff 
--git a/src/io/io.cpp b/src/io/io.cpp deleted file mode 100644 index 007ed8df2a8192c5b310d54dc9eb3ad852aeeee0..0000000000000000000000000000000000000000 --- a/src/io/io.cpp +++ /dev/null @@ -1,515 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "io/io.h" -#include -#include -#include "common/enforce.h" -#include "common/log.h" -#include "framework/framework.pb-c.h" -#include "framework/lod_tensor.h" -#include "framework/operator.h" -#include "framework/program/program-optimize/program_optimize.h" -#include "framework/program/program_desc.h" -#include "framework/program/var_desc.h" -#include "framework/scope.h" -#include "framework/tensor.h" -#ifdef PADDLE_EXECUTOR_MULTITHREAD -#include -#include -#include "common/threadpool.h" -#endif - -namespace paddle_mobile { -using framework::Variable; - -char *Get_binary_data(std::string filename) { - FILE *file = fopen(filename.c_str(), "rb"); - PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", - filename.c_str()); - fseek(file, 0, SEEK_END); - long size = ftell(file); - PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); - rewind(file); - char *data = new char[size]; - size_t bytes_read = fread(data, 1, size, file); - PADDLE_MOBILE_ENFORCE(bytes_read == size, - "read binary file bytes do not match with fseek"); - fclose(file); - return data; -} - -static size_t ReadBuffer(const char *file_name, uint8_t **out) { - printf("%s \n", file_name); - FILE *fp; - fp = fopen(file_name, "rb"); - PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name); - - fseek(fp, 0, SEEK_END); - size_t size = ftell(fp); - rewind(fp); - - DLOG << "model size: " << size; - - *out = reinterpret_cast(malloc(size)); - - size_t cur_len = 0; - size_t nread; - while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) { - cur_len += nread; - } - fclose(fp); - return cur_len; -} - -template -const framework::Program Loader::Load( - const std::string &dirname, bool optimize, bool can_add_split) { - auto program = - this->LoadProgram(dirname + "/__model__", optimize, can_add_split); - program.model_path = dirname; - return program; -} - -template -const framework::Program Loader::Load( - const std::string &model_path, const std::string ¶_path, - bool optimize) { - auto program = this->LoadProgram(model_path, optimize); - program.para_path = para_path; - program.combined = true; - return program; -} - -template -const framework::Program Loader::LoadProgram( - const std::string &model_path, bool optimize, bool can_add_split) { - std::string model_filename = model_path; - PaddleMobile__Framework__Proto__ProgramDesc *c_program; - uint8_t *buf = NULL; - size_t read_size = ReadBuffer(model_filename.c_str(), &buf); - - PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null"); - - c_program = paddle_mobile__framework__proto__program_desc__unpack( - NULL, read_size, buf); - // - PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null"); - // - DLOG << "n_ops: " << 
(*c_program->blocks)->n_ops; - // - auto originProgramDesc = std::make_shared(c_program); - - framework::Program program; - program.originProgram = originProgramDesc; - - auto scope = std::make_shared(); - program.scope = scope; - - for (const auto &block : originProgramDesc->Blocks()) { - for (auto var_desc : block->Vars()) { - auto var = scope->Var(var_desc->Name()); - - if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { - if (var_desc->Persistable() && - var_desc->Type() != framework::VARTYPE_TYPE_FEED_MINIBATCH && - var_desc->Type() != framework::VARTYPE_TYPE_FETCH_LIST) { - auto dim = var_desc->Tensor_desc().Dims(); - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); - } else { - auto dim = var_desc->Tensor_desc().Dims(); - PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); - dim[0] = 1; - auto tensor = var->GetMutable(); - tensor->Resize(framework::make_ddim(dim)); - } - } else { - // TODO(codeWorm): some. - } - } - } - - if (optimize) { - framework::ProgramOptimize program_optimize; - program.optimizeProgram = - program_optimize.FusionOptimize(originProgramDesc, can_add_split); - } - if (optimize) { - program.optimizeProgram->Description("optimize: "); - } else { - originProgramDesc->Description("program: "); - } - - paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); - return program; -} - -template class Loader; -template class Loader; -template class Loader; - -#pragma mark - executor -template -Executor::Executor(const framework::Program p, int batch_size, - bool use_optimize) - : program_(p), batch_size_(batch_size), use_optimize_(use_optimize) { - if (use_optimize_) { - to_predict_program_ = program_.optimizeProgram; - } else { - to_predict_program_ = program_.originProgram; - } - Variable *variable_ptr = program_.scope->Var("batch_size"); - variable_ptr[0].SetValue(batch_size); - const std::vector> blocks = - to_predict_program_->Blocks(); -#ifdef PADDLE_EXECUTOR_MULTITHREAD - depManager.resize(blocks.size()); -#endif - for (int i = 0; i < blocks.size(); ++i) { - std::shared_ptr block_desc = blocks[i]; - std::vector> ops = block_desc->Ops(); - for (int j = 0; j < ops.size(); ++j) { - std::shared_ptr op = ops[j]; - DLOG << "create op: " << op->Type(); - auto op_base = framework::OpRegistry::CreateOp( - op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), - program_.scope); - op_base->InferShape(); - ops_of_block_[*block_desc.get()].push_back(op_base); -#ifdef PADDLE_EXECUTOR_MULTITHREAD - depManager[i].analysisDep(ops_of_block_[*block_desc.get()]); -#endif - } - } - if (program_.combined) { - InitCombineMemory(); - } else { - InitMemory(); - } - - std::shared_ptr to_predict_block = - to_predict_program_->Block(0); - auto &ops = ops_of_block_[*to_predict_block.get()]; - for (const auto &op : ops) { - op->Init(); - } -} - -template -void Executor::LoadMemory(const framework::VarDesc var_desc, - framework::LoDTensor *tensor, char *&data) { - // 1. 
version - uint32_t version = *(uint32_t *)data; - data += sizeof(uint32_t); - - // 2 Lod information - uint64_t *lod_level_ptr = new uint64_t(); - memcpy(lod_level_ptr, data, sizeof(uint64_t)); - uint64_t lod_level = *lod_level_ptr; - delete lod_level_ptr; - data += sizeof(uint64_t); - - auto &lod = *tensor->mutable_lod(); - lod.resize(lod_level); - for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size = *(uint64_t *)data; - data += sizeof(uint64_t); - DLOG << "lod size: " << i << size; - std::vector tmp(size / sizeof(size_t)); - - for (int k = 0; k < tmp.size(); ++k) { - tmp[k] = *(size_t *)data; - DLOG << "tmp[k]: " << k << *(size_t *)data; - data += sizeof(size_t); - } - - for (auto j : tmp) { - LOG(kLOG_DEBUG1) << " lod - " << j; - } - lod[i] = tmp; - } - - // 3. tensor version - uint32_t tensor_version = *(uint32_t *)data; - data += sizeof(uint32_t); - - // 4. tensor desc - int32_t size = *(int32_t *)data; - data += sizeof(int32_t); - - std::unique_ptr buf(new char[size]); - for (int m = 0; m < size; ++m) { - buf.get()[m] = data[m]; - } - data += (sizeof(char) * size); - - const framework::TensorDesc &desc = var_desc.Tensor_desc(); - int memory_size = 1; - for (auto l : desc.Dims()) { - memory_size *= l; - } - - tensor->Resize(framework::make_ddim(desc.Dims())); - - void *memory = tensor; - int type_size = 0; - switch (desc.DataType()) { - case framework::VARTYPE_TYPE_FP16: - type_size = 2; - break; - case framework::VARTYPE_TYPE_FP32: - type_size = 4; - memory = tensor->mutable_data(); - break; - case framework::VARTYPE_TYPE_FP64: - type_size = 8; - break; - case framework::VARTYPE_TYPE_INT32: - type_size = 4; - break; - case framework::VARTYPE_TYPE_INT64: - type_size = 8; - break; - case framework::VARTYPE_TYPE_BOOL: - type_size = 1; - break; - default: - break; - } - - for (int n = 0; n < memory_size * type_size; ++n) { - static_cast(memory)[n] = data[n]; - } - data += (sizeof(char) * memory_size * type_size); -} - -template -void Executor::InitMemory() { - for (const auto &block : to_predict_program_->Blocks()) { - for (const auto &var_desc : block->Vars()) { - auto var = program_.scope->Var(var_desc->Name()); - if (var_desc->Persistable()) { - auto tensor = var->template GetMutable(); - if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { - continue; - } - - char *origin_data = - Get_binary_data(program_.model_path + "/" + var_desc->Name()); - char *data = origin_data; - LoadMemory(*var_desc, tensor, data); - delete origin_data; - } else { - if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { - auto tensor = var->template GetMutable(); - - tensor->template mutable_data(); - } - } - } - } -} - -template -void Executor::InitCombineMemory() { - LOG(kLOG_INFO) << " begin init combine memory"; - char *origin_data = Get_binary_data(program_.para_path); - char *data = origin_data; - for (const auto &block : to_predict_program_->Blocks()) { - for (const auto &var_desc : block->Vars()) { - auto var = program_.scope->Var(var_desc->Name()); - if (var_desc->Persistable()) { - auto tensor = var->template GetMutable(); - if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { - continue; - } - LoadMemory(*var_desc, tensor, data); - } else { - if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { - auto tensor = var->template GetMutable(); - tensor->template mutable_data(); - } - } - } - } - delete origin_data; - LOG(kLOG_INFO) << " end init combine memory "; -} - -template -std::shared_ptr Executor::Predict( - const framework::Tensor &t) { - 
framework::Variable *g_feed_value = program_.scope->Var("feed"); - framework::Tensor *feed_tensor = - g_feed_value->GetMutable(); - feed_tensor->Resize(t.dims()); - feed_tensor->ShareDataWith(t); - std::shared_ptr to_predict_block = - to_predict_program_->Block(0); - auto &ops = ops_of_block_[*to_predict_block.get()]; -#ifdef PADDLE_MOBILE_PROFILE - std::vector profile(ops.size()); -#endif -#ifdef PADDLE_EXECUTOR_MULTITHREAD - std::mutex m; - std::condition_variable cv; - std::queue next; - next.push(0); - int rsize = ops.size(); - std::vector status(rsize, 0); - auto &threadPool = ThreadPool::getThreadPool(); - auto &dep = depManager[0]; - auto finishF = [&ops, &m, &cv, &next, &status, &rsize, &dep](int opi) { - std::lock_guard lk(m); - rsize--; - status[opi] = 2; - for (int i : dep.getNext(opi)) { - bool ok = true; - for (int j : dep.getDeps(i)) { - if (status[j] != 2) { - ok = false; - break; - } - } - if (ok && (status[i] == 0)) { - next.push(i); - } - } - cv.notify_one(); - }; - for (;;) { - std::unique_lock lk(m); - cv.wait(lk, [&next, &rsize] { return rsize == 0 || !next.empty(); }); - if (rsize == 0) { - break; - } - while (next.size() > 0) { - int opi = next.front(); - next.pop(); - status[opi] = 1; - threadPool.enqueue([opi, &ops, &finishF, &profile] { - auto &op = ops[opi]; -#ifdef PADDLE_MOBILE_PROFILE - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[opi].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; - profile[opi].tid = ThreadPool::getThreadPoolThreadId(); -#endif - ops[opi]->Run(); -#ifdef PADDLE_MOBILE_PROFILE - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[opi].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; -#endif - finishF(opi); - }); - } - } -#else - for (int i = 0; i < ops.size(); i++) { -#ifdef PADDLE_MOBILE_PROFILE - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; -#endif - - // to Run - ops[i]->Run(); -#ifdef PADDLE_MOBILE_PROFILE - clock_gettime(CLOCK_MONOTONIC, &ts); - profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; -#endif - } -#endif - auto last_op = ops.rbegin(); - - auto output_map = (*last_op)->Outputs(); - std::vector out_keys = (*last_op)->GetOutKeys(); - PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output"); - framework::LoDTensor *output_tensor = - framework::GetVarValue(out_keys[0], output_map, - *(program_.scope)); -#ifdef PADDLE_MOBILE_PROFILE -#ifdef PADDLE_EXECUTOR_MULTITHREAD - // TODO expose profile info as an interface, user can get them to analysis - // the performance of their deepnet. 
- FILE *df = fopen("net.dot", "w"); - fprintf(df, "digraph {\n"); - for (int i = 0; i < ops.size(); i++) { - for (int j : dep.getNext(i)) { - fprintf(df, "op_%d -> op_%d\n", i, j); - } - } - for (int i = 0; i < ops.size(); i++) { - fprintf(df, "op_%d[label=\"%s (%d)\"]\n", i, ops[i]->Type().c_str(), i); - } - fprintf(df, "}\n"); - fclose(df); -#endif - FILE *pf = fopen("profile.out", "w"); - std::unordered_map _tp; - for (int i = 0; i < profile.size(); i++) { - const auto &pInfo = profile[i]; - uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; - _tp[ops[i]->Type()] += timeCost; - fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, ops[i]->Type().c_str(), - pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost); - } - fclose(pf); - printf("====================[ profile ]======================\n"); - using prof_t = std::pair; - std::vector _tv(_tp.begin(), _tp.end()); - uint64_t _ptotal = 0; - for (auto const &p : _tv) { - _ptotal += p.second; - } - auto compf = [](const prof_t &a, const prof_t &b) { - return a.second > b.second; - }; - std::sort(_tv.begin(), _tv.end(), compf); - _tv.push_back(std::make_pair("total", _ptotal)); - for (auto const &p : _tv) { - printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), (float)p.second, - (float)p.second / _ptotal * 100.0); - } - printf("====================[---------]======================\n"); -#endif - - return std::make_shared(framework::Tensor(*output_tensor)); -} -template -std::shared_ptr Executor::Predict( - const framework::Tensor &t, int block_id) { - return Predict(t); -} - -template -std::vector::Ptype> Executor::Predict( - const std::vector &input, const std::vector &dims) { - framework::Tensor tensor(input, framework::make_ddim(dims)); - std::shared_ptr output_tensor = Predict(tensor, 0); - Executor::Ptype *output_ptr = - output_tensor->data::Ptype>(); - std::vector::Ptype> result_vector; - for (int j = 0; j < output_tensor->numel(); ++j) { - result_vector.push_back(output_ptr[j]); - } - return result_vector; -} - -template class Executor; -template class Executor; -template class Executor; - -} // namespace paddle_mobile diff --git a/src/io/loader.cpp b/src/io/loader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..48a2b5cfdaa5f53cd9611dd0be1ce3df05988311 --- /dev/null +++ b/src/io/loader.cpp @@ -0,0 +1,197 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "io/loader.h" + +#include "framework/lod_tensor.h" +#include "framework/program/program-optimize/program_optimize.h" + +namespace paddle_mobile { +using framework::Variable; + +/** + * create and resize the tensors in scope according to originProgramDesc when + * loading params + * + * @param originProgramDesc + * @param scope + */ +void InitMemoryFromProgram( + std::shared_ptr<framework::ProgramDesc> &originProgramDesc, + std::shared_ptr<framework::Scope> &scope) { + for (const auto &block : originProgramDesc.get()->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = scope.get()->Var(var_desc->Name()); + if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + if (var_desc->Persistable()) { + auto dim = var_desc->Tensor_desc().Dims(); + auto tensor = var->GetMutable<framework::LoDTensor>(); + tensor->Resize(framework::make_ddim(dim)); + } else { + auto dim = var_desc->Tensor_desc().Dims(); + PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0"); + dim[0] = 1; + auto tensor = var->GetMutable<framework::LoDTensor>(); + tensor->Resize(framework::make_ddim(dim)); + } + } else { + // TODO(codeWorm): some. + } + } + } +} +/** + * perform fusion and print some infos + * @tparam Dtype + * @tparam P + * @param optimize + * @param can_add_split + * @param program + * @param originProgramDesc + */ +template <typename Dtype, Precision P> +void FusionAndPrintInfos( + bool &optimize, bool &can_add_split, framework::Program<Dtype, P> &program, + const std::shared_ptr<framework::ProgramDesc> &originProgramDesc) { + if (optimize) { + framework::ProgramOptimize program_optimize; + program.optimizeProgram = + program_optimize.FusionOptimize(originProgramDesc, can_add_split); + } + if (optimize) { + program.optimizeProgram->Description("optimize: "); + } else { + originProgramDesc->Description("program: "); + } +} +static size_t ReadBuffer(const char *file_name, uint8_t **out) { + FILE *fp; + fp = fopen(file_name, "rb"); + PADDLE_MOBILE_ENFORCE(fp != NULL, " %s open failed !", file_name); + + fseek(fp, 0, SEEK_END); + size_t size = ftell(fp); + rewind(fp); + + DLOG << "model size: " << size; + + *out = reinterpret_cast<uint8_t *>(malloc(size)); + + size_t cur_len = 0; + size_t nread; + while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) { + cur_len += nread; + } + fclose(fp); + return cur_len; +} + +template <typename Dtype, Precision P> +const framework::Program<Dtype, P> Loader<Dtype, P>::Load( + const std::string &dirname, bool optimize, bool quantification, + bool can_add_split) { + auto program = this->LoadProgram(dirname + "/__model__", optimize, + quantification, can_add_split); + program.model_path = dirname; + return program; +} + +template <typename Dtype, Precision P> +const framework::Program<Dtype, P> Loader<Dtype, P>::Load( + const std::string &model_path, const std::string &para_path, bool optimize, + bool quantification) { + auto program = this->LoadProgram(model_path, optimize, quantification); + + program.para_path = para_path; + program.combined = true; + program.quantification = quantification; + return program; +} + +template <typename Dtype, Precision P> +const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram( + const std::string &model_path, bool optimize, bool quantification, + bool can_add_split) { + std::string model_filename = model_path; + PaddleMobile__Framework__Proto__ProgramDesc *c_program; + uint8_t *buf = NULL; + size_t read_size = ReadBuffer(model_filename.c_str(), &buf); + + PADDLE_MOBILE_ENFORCE(buf != NULL, "read from __model__ is null"); + + c_program = paddle_mobile__framework__proto__program_desc__unpack( + NULL, read_size, buf); + // + PADDLE_MOBILE_ENFORCE(c_program != NULL, "program is null"); + // + DLOG << "n_ops: " << (*c_program->blocks)->n_ops; + // + auto originProgramDesc = std::make_shared<framework::ProgramDesc>(c_program); + + framework::Program<Dtype, P> program;
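// At this point LoadProgram has only built the graph; the fields set below
// fill in everything Executor later needs: the origin/optimized program, the
// scope holding all variables, and the quantification / combined-parameter
// flags that steer LoadMemory() and InitCombineMemory(). A sketch of the
// three loading modes this file exposes (illustrative only; the paths are
// hypothetical and the CPU/FP32 template arguments follow the instantiations
// at the end of this file):

#include <cstdint>
#include "io/loader.h"

void LoaderUsageSketch(const uint8_t* model_buf, size_t model_len,
                       const uint8_t* params_buf, size_t params_len) {
  paddle_mobile::Loader<paddle_mobile::CPU, paddle_mobile::Precision::FP32>
      loader;
  // 1. separate format: one parameter file per variable next to __model__.
  auto p1 = loader.Load("./mobilenet", /*optimize=*/true);
  // 2. combined format: a single model file plus a single parameter file.
  auto p2 = loader.Load("./model", "./params", /*optimize=*/true,
                        /*quantification=*/false);
  // 3. combined format already in memory, e.g. linked into the app binary.
  auto p3 = loader.LoadCombinedMemory(model_len, model_buf, params_len,
                                      params_buf, /*optimize=*/true,
                                      /*quantification=*/false);
}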
program.originProgram = originProgramDesc; + program.quantification = quantification; + program.combined_params_len = 0; + program.combined_params_buf = nullptr; + auto scope = std::make_shared(); + program.scope = scope; + + // use originProgramDesc and scope to init tensors + InitMemoryFromProgram(originProgramDesc, scope); + // perform fusion and print infos + FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc); + + paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); + return program; +} + +template +const framework::Program Loader::LoadCombinedMemory( + size_t read_size, const uint8_t *buf, size_t combined_params_len, + const uint8_t *combined_params_buf, bool optimize, bool quantification) { + bool can_add_split = false; + + PaddleMobile__Framework__Proto__ProgramDesc *c_program; + PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null"); + + c_program = paddle_mobile__framework__proto__program_desc__unpack( + nullptr, read_size, buf); + // + PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null"); + // + DLOG << "n_ops: " << (*c_program->blocks)->n_ops; + // + + auto originProgramDesc = std::make_shared(c_program); + + framework::Program program; + program.combined = true; + program.originProgram = originProgramDesc; + program.quantification = quantification; + program.combined_params_len = combined_params_len; + program.combined_params_buf = combined_params_buf; + + auto scope = std::make_shared(); + program.scope = scope; + InitMemoryFromProgram(originProgramDesc, scope); + FusionAndPrintInfos(optimize, can_add_split, program, originProgramDesc); + paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, + nullptr); + return program; +} + +template class Loader; +template class Loader; +template class Loader; + +} // namespace paddle_mobile diff --git a/src/io/loader.h b/src/io/loader.h new file mode 100644 index 0000000000000000000000000000000000000000..505366793da50413c52d8970cb47d062608d6484 --- /dev/null +++ b/src/io/loader.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "common/types.h" +#include "framework/program/program.h" + +namespace paddle_mobile { + +template <typename Dtype = CPU, Precision P = Precision::FP32> +class Loader { + public: + /* + * @b load separate format fluid model + * @b 加载分开形式的 fluid 模型 + * */ + const framework::Program<Dtype, P> Load(const std::string &dirname, + bool optimize = false, + bool quantification = false, + bool can_add_split = false); + + /* + * @b load combined format fluid model + * @b 加载结合在一起格式的模型 + * */ + const framework::Program<Dtype, P> Load(const std::string &model_path, + const std::string &para_path, + bool optimize = false, + bool quantification = false); + + const framework::Program<Dtype, P> LoadCombinedMemory( + size_t model_len, const uint8_t *model_buf, size_t combined_params_len, + const uint8_t *combined_params_buf, bool optimize = false, + bool quantification = false); + + private: + const framework::Program<Dtype, P> LoadProgram(const std::string &model_path, + bool optimize = false, + bool quantification = false, + bool can_add_split = false); +}; + +} // namespace paddle_mobile diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h new file mode 100644 index 0000000000000000000000000000000000000000..97564f4132d2e43cf736c2eb4a95d437584be24f --- /dev/null +++ b/src/io/paddle_inference_api.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains the definition of a simple Inference API for Paddle. + * + * ATTENTION: It requires some C++ features, for lower version C++ or C, we + * might release another API. + */ + +#pragma once + +#include +#include +#include +#include + +namespace paddle_mobile { + +enum PaddleDType { + FLOAT32, + INT64, +}; + +class PaddleBuf { + public: + PaddleBuf() = default; + PaddleBuf(PaddleBuf&& other); + // Copy only available when memory is managed externally. + explicit PaddleBuf(const PaddleBuf&); + PaddleBuf& operator=(const PaddleBuf&); + // Do not own the memory. + PaddleBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_{false} {} + // Own memory. + PaddleBuf(size_t length) + : data_(new char[length]), length_(length), memory_owned_(true) {} + // Resize to `length` bytes. + void Resize(size_t length); + // Reset to external memory. + void Reset(void* data, size_t length); + bool empty() const { return length_ == 0; } + void* data() const { return data_; } + size_t length() const { return length_; } + + ~PaddleBuf() { Free(); } + + private: + void Free(); + void* data_{nullptr}; // pointer to the data memory. + size_t length_{0}; // number of memory bytes. + bool memory_owned_{true}; +}; + +struct PaddleTensor { + PaddleTensor() = default; + std::string name; // variable name. + std::vector<int> shape; + // TODO(Superjomn) for LoD support, add a vector<vector<size_t>> field if needed. + PaddleBuf data; // blob of data. + PaddleDType dtype; +}; + +enum class PaddleEngineKind { + kPaddleMobile, + // TODO(Superjomn) support following engines later. + // kTensorRT, // Use TensorRT for inference.
+ // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. + // kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. +}; + +/* + * A simple Inference API for Paddle. Currently this API can be used by + * non-sequence scenarios. + */ +class PaddlePredictor { + public: + struct Config; + PaddlePredictor() = default; + PaddlePredictor(const PaddlePredictor&) = delete; + PaddlePredictor& operator=(const PaddlePredictor&) = delete; + + // Predict a record. + // The caller should be responsible for allocating and releasing the memory of + // `inputs`. `inputs` should be available until Run returns. Caller should be + // responsible for the output tensor's buffer, either allocated or passed from + // outside. + virtual bool Run(const std::vector<PaddleTensor>& inputs, + std::vector<PaddleTensor>* output_data, + int batch_size = -1) = 0; + + // Destroy the Predictor. + virtual ~PaddlePredictor() = default; + + // The common configs for all the predictors. + struct Config { + std::string model_dir; // path to the model directory. + }; +}; + +struct PaddleMobileConfig : public PaddlePredictor::Config { + enum Precision { FP32 = 0 }; + enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; + + enum Precision precision; + enum Device device; + + int batch_size = 1; + bool optimize = true; + bool quantification = false; + int thread_num = 1; + std::string prog_file; + std::string param_file; +}; + +// A factory to help create different predictors. +template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kPaddleMobile> +std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config); + +} // namespace paddle_mobile diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0b84f1ff45e519dbbc244863db481f2364907a89 --- /dev/null +++ b/src/io/paddle_mobile.cpp @@ -0,0 +1,167 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "io/paddle_mobile.h" + +namespace paddle_mobile { + +template +void PaddleMobile::SetThreadNum(int num) { +#ifdef _OPENMP + // omp_set_dynamic(0); + omp_set_num_threads(num); +#endif +}; + +template +bool PaddleMobile::Load(const std::string &dirname, bool optimize, + bool quantification, int batch_size, + bool loddable) { + if (loader_.get() == nullptr) { + loader_ = std::make_shared>(); + } else { + LOG(kLOG_INFO) << "loader inited"; + } + + if (executor_.get() == nullptr) { + executor_ = std::make_shared>( + loader_->Load(dirname, optimize, quantification), batch_size, optimize, + loddable); + } else { + LOG(kLOG_INFO) << "executor inited"; + } + + return true; +} + +template +bool PaddleMobile::Load(const std::string &model_path, + const std::string ¶_path, bool optimize, + bool quantification, int batch_size, + bool loddable) { + if (loader_.get() == nullptr) { + loader_ = std::make_shared>(); + } else { + LOG(kLOG_INFO) << "loader inited"; + } + + if (executor_.get() == nullptr) { + executor_ = std::make_shared>( + loader_->Load(model_path, para_path, optimize, quantification), + batch_size, optimize, loddable); + } else { + LOG(kLOG_INFO) << "executor inited"; + } + + return true; +} + +template +bool PaddleMobile::LoadCombinedMemory( + size_t model_len, const uint8_t *model_buf, size_t combined_params_len, + const uint8_t *combined_params_buf) { + int batch_size = 1; + bool optimise = true; + bool quantification = false; + + if (loader_.get() == nullptr) { + loader_ = std::make_shared>(); + } else { + LOG(kLOG_INFO) << "loader inited"; + } + + if (executor_.get() == nullptr) { + executor_ = std::make_shared>( + loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len, + combined_params_buf, optimise, + quantification), + batch_size, optimise); + } else { + LOG(kLOG_INFO) << "executor inited"; + } + + return true; +} +template +std::shared_ptr PaddleMobile::Predict( + const framework::Tensor &t) { + return executor_->Predict(t); +} + +template +std::shared_ptr PaddleMobile::PredictLod( + const framework::LoDTensor &t) { + return executor_->PredictLod(t); +} + +template +std::vector::Ptype> +PaddleMobile::Predict(const std::vector &input, + const std::vector &dims) { + return executor_->Predict(input, dims); +} + +template +void PaddleMobile::Clear() { + executor_ = nullptr; + loader_ = nullptr; +} + +template +PaddleMobile::~PaddleMobile() { + executor_ = nullptr; + loader_ = nullptr; +} + +#ifdef PADDLE_MOBILE_FPGA + +template +void PaddleMobile::InjectVariable(const framework::Tensor &t, + string var_name) { + executor_->InjectVariable(t, var_name); +} + +template +void PaddleMobile::FeedData(const framework::Tensor &t) { + executor_->FeedData(t); +}; + +template +std::shared_ptr PaddleMobile::FetchResult(int id) { + return executor_->FetchResult(id); +}; + +template +void PaddleMobile::Predict_From_To(int start, int end) { + executor_->Predict_From_To(start, end); +}; + +template +void PaddleMobile::Predict_From(int start) { + executor_->Predict_From(start); +}; + +template +void PaddleMobile::Predict_To(int end) { + executor_->Predict_To(end); +}; +#endif + +template class PaddleMobile; + +template class PaddleMobile; + +template class PaddleMobile; + +} // namespace paddle_mobile diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h new file mode 100644 index 0000000000000000000000000000000000000000..73c5553d91c1b4781718265aba8b7fa8dd5e2777 --- /dev/null +++ b/src/io/paddle_mobile.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle 
Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#ifdef _OPENMP
+#include <omp.h>
+#endif  // _OPENMP
+
+#include "common/types.h"
+#include "framework/tensor.h"
+#include "io/executor.h"
+#include "io/loader.h"
+
+namespace paddle_mobile {
+
+template <typename Dtype = CPU, Precision P = Precision::FP32>
+class PaddleMobile {
+  typedef typename PrecisionTrait<P>::ptype Ptype;
+
+ public:
+  PaddleMobile() {}
+  /*
+   * @b load separate format fluid model
+   * */
+  bool Load(const std::string &dirname, bool optimize = false,
+            bool quantification = false, int batch_size = 1,
+            bool loddable = false);
+
+  /*
+   * @b load combined format fluid model
+   * */
+  bool Load(const std::string &model_path, const std::string &para_path,
+            bool optimize = false, bool quantification = false,
+            int batch_size = 1, bool loddable = false);
+  /*
+   * @b set the thread count; takes effect when openmp is enabled in cmake
+   * */
+  void SetThreadNum(int num);
+
+  /*
+   * @b to predict
+   * */
+  std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
+
+  /*
+   * @b to predict
+   * */
+  std::shared_ptr<framework::LoDTensor> PredictLod(const framework::LoDTensor &t);
+
+  /*
+   * @b predict with the input data and its dimension info
+   * */
+  std::vector<Ptype> Predict(const std::vector<Ptype> &input,
+                             const std::vector<int64_t> &dims);
+
+  /**
+   * interface for loading the model and combined params from memory
+   *
+   * @param model_len size in bytes of the model buffer
+   * @param model_buf memory holding the model file
+   * @param combined_params_len size in bytes of the params buffer
+   * @param combined_params_buf memory holding the combined params file
+   * @return whether loading succeeded
+   */
+  bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
+                          size_t combined_params_len,
+                          const uint8_t *combined_params_buf);
+
+  void Clear();
+
+  ~PaddleMobile();
+
+ private:
+  std::shared_ptr<Loader<Dtype, P>> loader_;
+  std::shared_ptr<Executor<Dtype, P>> executor_;
+
+#ifdef PADDLE_MOBILE_FPGA
+ public:
+  void InjectVariable(const framework::Tensor &t, string var_name);
+  void FeedData(const framework::Tensor &t);
+  std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
+  void Predict_From_To(int start = 0, int end = -1);
+  void Predict_From(int start);
+  void Predict_To(int end);
+#endif
+};
+
+} // namespace paddle_mobile
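A minimal usage sketch of the PaddleMobile class above, assuming a CPU build and a hypothetical ./model_dir holding a separate-format fluid model:

#include <cstdint>
#include <vector>
#include "io/paddle_mobile.h"

void RunModel() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> pm;
  pm.SetThreadNum(2);  // a no-op unless built with USE_OPENMP
  if (pm.Load("./model_dir", /*optimize=*/true)) {  // hypothetical path
    std::vector<float> input(1 * 3 * 224 * 224, 0.f);
    std::vector<int64_t> dims = {1, 3, 224, 224};
    std::vector<float> output = pm.Predict(input, dims);
    // ... consume output ...
  }
  pm.Clear();
}

With the FP32 default, Ptype resolves to float, so the vector-based Predict overload consumes and returns plain float vectors.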
diff --git a/src/ios_io/PaddleMobileCPU.h b/src/ios_io/PaddleMobileCPU.h
new file mode 100644
index 0000000000000000000000000000000000000000..c68d81f328f4ce9a9bf16624f677b2996644c35c
--- /dev/null
+++ b/src/ios_io/PaddleMobileCPU.h
@@ -0,0 +1,85 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#import <CoreGraphics/CoreGraphics.h>
+#import <Foundation/Foundation.h>
+
+@interface PaddleMobileCPUResult: NSObject
+
+@property (assign, nonatomic, readonly) float *output;
+
+@property (assign, nonatomic, readonly) int outputSize;
+
+-(void)releaseOutput;
+
+@end
+
+@interface PaddleMobileCPU : NSObject
+
+/*
+ create the instance
+*/
+- (instancetype)init;
+
+/*
+ load the model and allocate memory; pass the model file path and the weights file path
+*/
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+
+/*
+ load a model stored as separate files; pass the model directory
+*/
+- (BOOL)load:(NSString *)modelAndWeightPath;
+
+/*
+ * load the model from memory
+ * */
+- (BOOL)LoadCombinedMemory:(size_t)modelLen
+               andModelBuf:(const uint8_t *)modelBuf
+         andModelParamsLen:(size_t)combinedParamsLen
+      andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf;
+
+/*
+ * preprocess the image; the caller must allocate, and later release, the output buffer
+ * */
+-(void)preprocess:(CGImageRef)image
+           output:(float *)output
+            means:(NSArray *)means
+            scale:(float)scale
+              dim:(NSArray *)dim;
+
+/*
+ * predict preprocessed data; when done with the result, call its releaseOutput method to free it
+ * */
+- (PaddleMobileCPUResult *)predictInput:(float *)input
+                                    dim:(NSArray *)dim;
+
+/*
+ predict; means and scale are the preprocessing parameters used when the model was trained. If no such preprocessing was done, use the plain predict:dim: variant instead
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale;
+
+/*
+ predict with default means of 0 and scale of 1.0
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim;
+
+/*
+ release memory
+*/
+- (void)clear;
+
+@end
diff --git a/src/ios_io/PaddleMobileCPU.mm b/src/ios_io/PaddleMobileCPU.mm
new file mode 100644
index 0000000000000000000000000000000000000000..5a21418ef5fa9cbf7b24436cb778fc8c6c164e16
--- /dev/null
+++ b/src/ios_io/PaddleMobileCPU.mm
@@ -0,0 +1,316 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
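The preprocessing contract documented above (RGBA bytes in, planar CHW floats out, each output value computed as (pixel - mean) * scale, with nearest-neighbour resampling to the target size) can be restated in portable C++. The function below is illustrative, not part of the API:

#include <cstdint>

// Illustrative restatement of the preprocessing contract (not project code):
// RGBA interleaved bytes in, planar CHW floats out.
void PackCHW(const uint8_t *rgba, float *out, int src_w, int src_h,
             int dst_w, int dst_h, const float mean[3], float scale) {
  for (int c = 0; c < 3; ++c) {
    for (int y = 0; y < dst_h; ++y) {
      for (int x = 0; x < dst_w; ++x) {
        const int sy = y * src_h / dst_h;  // nearest-neighbour sampling,
        const int sx = x * src_w / dst_w;  // as in the .mm implementation
        const uint8_t *px = rgba + (sy * src_w + sx) * 4;  // 4 = RGBA stride
        out[(c * dst_h + y) * dst_w + x] = (px[c] - mean[c]) * scale;
      }
    }
  }
}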
*/ + +#import "PaddleMobileCPU.h" + +#import "op_symbols.h" +#include "framework/tensor.h" +#import "io/paddle_mobile.h" + +#import +#import + + +@interface PaddleMobileCPUResult() + +-(void)toSetOutput:(float *)output; + +-(void)toSetOutputSize:(int)outputSize; + +@end + +@implementation PaddleMobileCPUResult + +-(void)releaseOutput { + delete [] _output; + _output = nil; + _outputSize = 0; +} + +-(void)toSetOutput:(float *)output { + _output = output; +} + +-(void)toSetOutputSize:(int)outputSize { + _outputSize = outputSize; +} + +@end + + +@interface PaddleMobileCPU() +{ + paddle_mobile::PaddleMobile *pam_; + BOOL loaded_; +} +@end + +@implementation PaddleMobileCPU + +static std::mutex shared_mutex; + +- (instancetype)init { + if (self = [super init]) { + pam_ = new paddle_mobile::PaddleMobile(); + } + return self; +} + +- (void)dealloc { + if (pam_) { + delete pam_; + } +} + ++ (instancetype)sharedInstance{ + static dispatch_once_t onceToken; + static id sharedManager = nil; + dispatch_once(&onceToken, ^{ + sharedManager = [[[self class] alloc] init]; + }); + return sharedManager; +} + +- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{ + std::string model_path_str = std::string([modelPath UTF8String]); + std::string weights_path_str = std::string([weighsPath UTF8String]); + pam_->SetThreadNum(2); + if (loaded_ = pam_->Load(model_path_str, weights_path_str, true)) { + return YES; + } else { + return NO; + } +} + +- (BOOL)LoadCombinedMemory:(size_t)modelLen + andModelBuf:(const uint8_t *)modelBuf + andModelParamsLen:(size_t)combinedParamsLen + andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf { + pam_->SetThreadNum(2); + return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen, combinedParamsBuf); +} + +- (BOOL)load:(NSString *)modelAndWeightPath{ + std::string model_path_str = std::string([modelAndWeightPath UTF8String]); + if (loaded_ = pam_->Load(model_path_str)) { + return YES; + } else { + return NO; + } +} + + +-(void)preprocess:(CGImageRef)image + output:(float *)output + means:(NSArray *)means + scale:(float)scale + dim:(NSArray *)dim { + std::lock_guard lock(shared_mutex); + + // dim to c++ vector, get numel + std::vector dim_vec; + int numel = 1; + for (int k = 0; k < dim.count; ++k) { + int d = dim[k].intValue; + numel *= d; + dim_vec.push_back(d); + } + + const int sourceRowBytes = CGImageGetBytesPerRow(image); + const int imageWidth = CGImageGetWidth(image); + const int imageHeight = CGImageGetHeight(image); + const int imageChannels = 4; + CGDataProviderRef provider = CGImageGetDataProvider(image); + CFDataRef cfData = CGDataProviderCopyData(provider); + const UInt8 *input = CFDataGetBytePtr(cfData); + + int wanted_input_width = dim_vec[3]; + int wanted_input_height = dim_vec[2]; + int wanted_input_channels = dim_vec[1]; + + for (int c = 0; c < wanted_input_channels; ++c) { + float *out_channel = output + c * wanted_input_height * wanted_input_width; + for (int y = 0; y < wanted_input_height; ++y) { + float *out_row = out_channel + y * wanted_input_width; + for (int x = 0; x < wanted_input_width; ++x) { + int in_row = (y * imageHeight) / wanted_input_height; + int in_col = (x * imageWidth) / wanted_input_width; + const UInt8 *in_pixel = input + (in_row * imageWidth * imageChannels) + (in_col * imageChannels); + float *out_pos = out_row + x; + if (c == 0) { + *out_pos = (in_pixel[c] - means[c].floatValue) * scale; + }else if (c == 1){ + *out_pos = (in_pixel[c] - means[c].floatValue) * scale; + }else if (c == 2){ + 
*out_pos = (in_pixel[c] - means[c].floatValue) * scale; + } + } + } + } + +} + +-(void)preprocess:(const UInt8 *)input output:(float *)output imageWidth:(int)imageWidth imageHeight:(int)imageHeight imageChannels:(int)imageChannels means:(NSArray *)means scale:(float)scale dim:(std::vector)dim{ + if (means == nil) { + means = @[@0, @0, @0]; + } + + int wanted_input_width = dim[3]; + int wanted_input_height = dim[2]; + int wanted_input_channels = dim[1]; + + for (int c = 0; c < wanted_input_channels; ++c) { + float *out_channel = output + c * wanted_input_height * wanted_input_width; + for (int y = 0; y < wanted_input_height; ++y) { + float *out_row = out_channel + y * wanted_input_width; + for (int x = 0; x < wanted_input_width; ++x) { + int in_row = (y * imageHeight) / wanted_input_height; + int in_col = (x * imageWidth) / wanted_input_width; + const UInt8 *in_pixel = input + (in_row * imageWidth * imageChannels) + (in_col * imageChannels); + float *out_pos = out_row + x; + if (c == 0) { + *out_pos = (in_pixel[c] - means[c].floatValue) * scale; + }else if (c == 1){ + *out_pos = (in_pixel[c] - means[c].floatValue) * scale; + }else if (c == 2){ + *out_pos = (in_pixel[c] - means[c].floatValue) * scale; + } + } + } + } +} + +- (PaddleMobileCPUResult *)predictInput:(float *)input + dim:(NSArray *)dim { + std::lock_guard lock(shared_mutex); + if (!loaded_) { + printf("PaddleMobile doesn't be loaded yet"); + return nil; + } + + if (dim.count != 4) { + printf("dim must have 4 elements"); + return nil; + } + + // dim to c++ vector, get numel + std::vector dim_vec; + int numel = 1; + for (int k = 0; k < dim.count; ++k) { + int d = dim[k].intValue; + numel *= d; + dim_vec.push_back(d); + } + + paddle_mobile::framework::Tensor input_tensor; + + paddle_mobile::framework::DDim dims = paddle_mobile::framework::make_ddim(dim_vec); + + float *input_ptr = input_tensor.mutable_data(dims); + + memcpy(input_ptr, input, + numel * sizeof(float)); + + std::shared_ptr output = pam_->Predict(input_tensor); + + float *output_pointer = new float[output->numel()]; + + memcpy(output_pointer, output->data(), + output->numel() * sizeof(float)); + + PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init]; + [cpuResult toSetOutput: output_pointer]; + [cpuResult toSetOutputSize: output->numel()]; + + return cpuResult; +} + +- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale{ +// printf(" predict one "); + std::lock_guard lock(shared_mutex); + if (!loaded_) { + printf("PaddleMobile doesn't be loaded yet"); + return nil; + } + + if (dim.count != 4) { + printf("dim must have 4 elements"); + return nil; + } + + // dim to c++ vector, get numel + std::vector dim_vec; + int numel = 1; + for (int k = 0; k < dim.count; ++k) { + int d = dim[k].intValue; + numel *= d; + dim_vec.push_back(d); + } + + const int sourceRowBytes = CGImageGetBytesPerRow(image); + const int image_width = CGImageGetWidth(image); + const int image_height = CGImageGetHeight(image); + const int image_channels = 4; + CGDataProviderRef provider = CGImageGetDataProvider(image); + CFDataRef cfData = CGDataProviderCopyData(provider); + const UInt8 *input = CFDataGetBytePtr(cfData); + + // sample image + float *output = (float *)malloc(numel*sizeof(float)); + [self preprocess:input output:output imageWidth:image_width imageHeight:image_height imageChannels:image_channels means:means scale:scale dim:dim_vec]; + float *dataPointer = nullptr; + if (nullptr != output) { + dataPointer = output; + } else { + 
return nil;
+  }
+
+  // input
+  std::vector<float> predict_input;
+  for (int j = 0; j < numel; ++j) {
+    predict_input.push_back(dataPointer[j]);
+  }
+
+  // predict
+  std::vector<float> cpp_result = pam_->Predict(predict_input, dim_vec);
+
+  // result
+  long count = 0;
+  count = cpp_result.size();
+  NSMutableArray *result = [[NSMutableArray alloc] init];
+  for (int i = 0; i < count; i++) {
+    [result addObject:[NSNumber numberWithFloat:cpp_result[i]]];
+  }
+
+  free(output);
+
+  // to be verified
+  // if ([UIDevice currentDevice].systemVersion.doubleValue < 11.0) {
+  CFRelease(cfData);
+  cfData = NULL;
+  // }
+
+  return result;
+}
+
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim {
+  return [self predict:image dim:dim means:nil scale:1];
+}
+
+- (void)clear{
+  pam_->Clear();
+}
+
+@end
diff --git a/src/ios_io/op_symbols.h b/src/ios_io/op_symbols.h
new file mode 100644
index 0000000000000000000000000000000000000000..af0401c15ab28b0baa0cdbffb16a46215a26953e
--- /dev/null
+++ b/src/ios_io/op_symbols.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include "operators/batchnorm_op.h"
+#include "operators/bilinear_interp_op.h"
+#include "operators/box_coder_op.h"
+#include "operators/concat_op.h"
+#include "operators/conv_op.h"
+#include "operators/conv_transpose_op.h"
+#include "operators/crf_op.h"
+#include "operators/depthwise_conv_op.h"
+#include "operators/dropout_op.h"
+#include "operators/elementwise_add_op.h"
+#include "operators/feed_op.h"
+#include "operators/fetch_op.h"
+#include "operators/flatten_op.h"
+#include "operators/fusion_conv_add.h"
+#include "operators/fusion_conv_add_add_prelu_op.h"
+#include "operators/fusion_conv_add_bn_op.h"
+#include "operators/fusion_conv_add_bn_relu_op.h"
+#include "operators/fusion_conv_add_prelu_op.h"
+#include "operators/fusion_conv_add_relu_op.h"
+#include "operators/fusion_conv_bn_add_relu_op.h"
+#include "operators/fusion_conv_bn_relu_op.h"
+#include "operators/fusion_dwconv_bn_relu_op.h"
+#include "operators/fusion_elementwise_add_relu_op.h"
+#include "operators/fusion_fc_op.h"
+#include "operators/fusion_fc_relu_op.h"
+#include "operators/gru_op.h"
+#include "operators/im2sequence_op.h"
+#include "operators/lookup_op.h"
+#include "operators/lrn_op.h"
+#include "operators/mul_op.h"
+#include "operators/multiclass_nms_op.h"
+#include "operators/pool_op.h"
+#include "operators/prelu_op.h"
+#include "operators/prior_box_op.h"
+#include "operators/relu_op.h"
+#include "operators/reshape_op.h"
+#include "operators/resize_op.h"
+#include "operators/scale_op.h"
+#include "operators/shape_op.h"
+#include "operators/sigmoid_op.h"
+#include "operators/slice_op.h"
+#include "operators/softmax_op.h"
+#include "operators/split_op.h"
+#include "operators/transpose_op.h"
diff --git a/src/jni/PML.java b/src/jni/PML.java
new file mode 100644
index 0000000000000000000000000000000000000000..9cbea253ff54ca82cb5059ea096d5a436018119a
--- /dev/null
+++ b/src/jni/PML.java
@@ -0,0 +1,71 @@
+package com.baidu.paddle;
+
+public class PML {
+    /**
+     * load separated model
+     *
+     * @param modelDir model dir
+     * @return whether the load succeeded
+     */
+    public static native boolean load(String modelDir);
+
+    /**
+     * load separated nlp model
+     *
+     * @param modelDir model dir
+     * @return whether the load succeeded
+     */
+    public static native boolean loadnlp(String modelDir);
+
+    /**
+     * load combined model
+     *
+     * @param modelPath model file path
+     * @param paramPath param file path
+     * @return whether the load succeeded
+     */
+    public static native boolean loadCombined(String modelPath, String paramPath);
+
+    /**
+     * load separated model with quantified params
+     *
+     * @param modelDir qualified model dir
+     * @return whether the load succeeded
+     */
+    public static native boolean loadQualified(String modelDir);
+
+    /**
+     * load combined model with quantified params
+     *
+     * @param modelPath model file path
+     * @param paramPath qualified param path
+     * @return whether the load succeeded
+     */
+    public static native boolean loadCombinedQualified(String modelPath, String paramPath);
+
+    /**
+     * predict image
+     *
+     * @param buf buffer holding the preprocessed image (laid out as your model expects)
+     * @param ddims dims of your input
+     * @return result
+     */
+    public static native float[] predictImage(float[] buf, int[] ddims);
+
+    public static native float[] predictYuv(byte[] buf, int imgWidth, int imgHeight, int[] ddims, float[] meanValues);
+
+    /**
+     * clear model data
+     */
+    public static native void clear();
+
+    /**
+     * set thread num; takes effect when openmp is enabled
+     *
+     * @param threadCount threadCount
+     */
+    public static native void setThread(int threadCount);
+
+}
diff --git a/src/jni/paddle_mobile_jni.cpp b/src/jni/paddle_mobile_jni.cpp
index f663b78fd490f2c9f0af525c7dabd2cc513c3a53..56d522b1560d2c4cbeab04eeceaa598543273806 100644
--- a/src/jni/paddle_mobile_jni.cpp
+++ b/src/jni/paddle_mobile_jni.cpp
@@ -15,6 +15,17 @@ limitations under the License. */
 #ifdef ANDROID
 #include "paddle_mobile_jni.h"
+#include <mutex>
+#include "common/log.h"
+#include "framework/tensor.h"
+#include "io/paddle_mobile.h"
+
+#ifdef ENABLE_EXCEPTION
+
+#include "common/enforce.h"
+
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -28,18 +39,10 @@
 using std::string;
 extern const char *ANDROID_LOG_TAG =
     "paddle_mobile LOG built on " __DATE__ " " __TIME__;
-static Executor<CPU> *shared_executor_instance = nullptr;
+paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+static std::mutex shared_mutex;
-
-// toDo mutex lock
-// static std::mutex shared_mutex;
-
-Executor<CPU> *getExecutorInstance(const Program<CPU> p, int batch_size,
-                                   bool use_optimize) {
-  if (nullptr == shared_executor_instance) {
-    shared_executor_instance = new Executor<CPU>(p, batch_size, use_optimize);
-  }
-  return shared_executor_instance;
-}
+paddle_mobile::PaddleMobile<paddle_mobile::CPU> *getPaddleMobileInstance() { return &paddle_mobile; }
 string jstring2cppstring(JNIEnv *env, jstring jstr) {
   const char *cstr = env->GetStringUTFChars(jstr, 0);
@@ -51,37 +54,397 @@ string jstring2cppstring(JNIEnv *env, jstring jstr) {
 JNIEXPORT jboolean JNICALL
 Java_com_baidu_paddle_PML_load(JNIEnv *env, jclass thiz, jstring modelPath) {
-  paddle_mobile::Loader<CPU> loader;
+  std::lock_guard<std::mutex> lock(shared_mutex);
+  ANDROIDLOGI("load invoked");
   bool optimize = true;
-  auto program = loader.Load(jstring2cppstring(env, modelPath), optimize);
-  shared_executor_instance = getExecutorInstance(program, 1, optimize);
-  return shared_executor_instance != nullptr ?
JNI_TRUE : JNI_FALSE; + bool isLoadOk = false; + +#ifdef ENABLE_EXCEPTION + try { + isLoadOk = getPaddleMobileInstance()->Load( + jstring2cppstring(env, modelPath), optimize); + } catch (paddle_mobile::PaddleMobileException &e) { + ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); + isLoadOk = false; + } +#else + isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + optimize); +#endif + return static_cast(isLoadOk); +} + +JNIEXPORT jboolean JNICALL +Java_com_baidu_paddle_PML_loadnlp(JNIEnv *env, jclass thiz, jstring modelPath) { + std::lock_guard lock(shared_mutex); + ANDROIDLOGI("load invoked"); + bool optimize = true; + bool isLoadOk = false; + +#ifdef ENABLE_EXCEPTION + try { + isLoadOk = getPaddleMobileInstance()->Load( + jstring2cppstring(env, modelPath), optimize, false, 1, true); + } catch (paddle_mobile::PaddleMobileException &e) { + ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); + isLoadOk = false; + } +#else + isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + optimize, false, 1, true); +#endif + return static_cast(isLoadOk); +} + +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified( + JNIEnv *env, jclass thiz, jstring modelPath) { + std::lock_guard lock(shared_mutex); + + ANDROIDLOGI("loadQualified invoked"); + bool optimize = true; + bool qualified = true; + bool isLoadOk = false; + +#ifdef ENABLE_EXCEPTION + try { + isLoadOk = getPaddleMobileInstance()->Load( + jstring2cppstring(env, modelPath), optimize, qualified); + } catch (paddle_mobile::PaddleMobileException &e) { + ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); + isLoadOk = false; + } +#else + isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + optimize, qualified); +#endif + + return static_cast(isLoadOk); +} + +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined( + JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) { + std::lock_guard lock(shared_mutex); + ANDROIDLOGI("loadCombined invoked"); + bool optimize = true; + bool isLoadOk = false; + +#ifdef ENABLE_EXCEPTION + try { + isLoadOk = getPaddleMobileInstance()->Load( + jstring2cppstring(env, modelPath), jstring2cppstring(env, paramPath), + optimize); + } catch (paddle_mobile::PaddleMobileException &e) { + ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); + isLoadOk = false; + } +#else + isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + jstring2cppstring(env, paramPath), + optimize); +#endif + return static_cast(isLoadOk); +} + +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified( + JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) { + std::lock_guard lock(shared_mutex); + ANDROIDLOGI("loadCombinedQualified invoked"); + bool optimize = true; + bool qualified = true; + bool isLoadOk = false; + +#ifdef ENABLE_EXCEPTION + try { + isLoadOk = getPaddleMobileInstance()->Load( + jstring2cppstring(env, modelPath), jstring2cppstring(env, paramPath), + optimize, qualified); + } catch (paddle_mobile::PaddleMobileException &e) { + ANDROIDLOGE("jni got an PaddleMobileException! 
", e.what()); + isLoadOk = false; + } +#else + isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + jstring2cppstring(env, paramPath), + optimize, qualified); +#endif + return static_cast(isLoadOk); } JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage( - JNIEnv *env, jclass thiz, jfloatArray buf) { + JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims) { + std::lock_guard lock(shared_mutex); + + ANDROIDLOGI("predictImage invoked"); jfloatArray result = NULL; + +#ifdef ENABLE_EXCEPTION + ANDROIDLOGE("ENABLE_EXCEPTION!"); + + try { + jsize ddim_size = env->GetArrayLength(ddims); + if (ddim_size != 4) { + ANDROIDLOGE("ddims size not equal to 4"); + } + jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL); + framework::DDim ddim = framework::make_ddim( + {ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]}); + int length = framework::product(ddim); + int count = 0; + float *dataPointer = nullptr; + if (nullptr != buf) { + dataPointer = env->GetFloatArrayElements(buf, NULL); + } + framework::Tensor input; + input.Resize(ddim); + auto input_ptr = input.mutable_data(); + for (int i = 0; i < length; i++) { + input_ptr[i] = dataPointer[i]; + } + auto output = getPaddleMobileInstance()->Predict(input); + count = output->numel(); + result = env->NewFloatArray(count); + env->SetFloatArrayRegion(result, 0, count, output->data()); + env->ReleaseIntArrayElements(ddims, ddim_ptr, 0); + env->DeleteLocalRef(ddims); + env->ReleaseFloatArrayElements(buf, dataPointer, 0); + env->DeleteLocalRef(buf); + + } catch (paddle_mobile::PaddleMobileException &e) { + ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); + } + +#else + jsize ddim_size = env->GetArrayLength(ddims); + if (ddim_size != 4) { + ANDROIDLOGE("ddims size not equal to 4"); + } + jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL); + framework::DDim ddim = framework::make_ddim( + {ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]}); + int length = framework::product(ddim); int count = 0; float *dataPointer = nullptr; if (nullptr != buf) { dataPointer = env->GetFloatArrayElements(buf, NULL); } framework::Tensor input; - framework::DDim ddim = framework::make_ddim({1, 3, 224, 224}); input.Resize(ddim); auto input_ptr = input.mutable_data(); - for (int i = 0; i < framework::product(ddim); i++) { + for (int i = 0; i < length; i++) { input_ptr[i] = dataPointer[i]; } - auto output = shared_executor_instance->Predict(input); + auto output = getPaddleMobileInstance()->Predict(input); + count = output->numel(); + result = env->NewFloatArray(count); + env->SetFloatArrayRegion(result, 0, count, output->data()); + env->ReleaseIntArrayElements(ddims, ddim_ptr, 0); + env->DeleteLocalRef(ddims); + env->ReleaseFloatArrayElements(buf, dataPointer, 0); + env->DeleteLocalRef(buf); +// env->DeleteLocalRef(dataPointer); +#endif + + ANDROIDLOGI("predictImage finished"); + + return result; +} + +inline int yuv_to_rgb(int y, int u, int v, float *r, float *g, float *b) { + int r1 = (int)(y + 1.370705 * (v - 128)); + int g1 = (int)(y - 0.698001 * (u - 128) - 0.703125 * (v - 128)); + int b1 = (int)(y + 1.732446 * (u - 128)); + + r1 = (int)fminf(255, fmaxf(0, r1)); + g1 = (int)fminf(255, fmaxf(0, g1)); + b1 = (int)fminf(255, fmaxf(0, b1)); + *r = r1; + *g = g1; + *b = b1; + + return 0; +} +void convert_nv21_to_matrix(uint8_t *nv21, float *matrix, int width, int height, + int targetWidth, int targetHeight, float *means) { + const uint8_t *yData = nv21; + const uint8_t *vuData = nv21 + width * height; + + const 
int yRowStride = width; + const int vuRowStride = width; + + float scale_x = width * 1.0 / targetWidth; + float scale_y = height * 1.0 / targetHeight; + + for (int j = 0; j < targetHeight; ++j) { + int y = j * scale_y; + const uint8_t *pY = yData + y * yRowStride; + const uint8_t *pVU = vuData + (y >> 1) * vuRowStride; + for (int i = 0; i < targetWidth; ++i) { + int x = i * scale_x; + const int offset = ((x >> 1) << 1); + float r = 0; + float g = 0; + float b = 0; + yuv_to_rgb(pY[x], pVU[offset + 1], pVU[offset], &r, &g, &b); + int r_index = j * targetWidth + i; + int g_index = r_index + targetWidth * targetHeight; + int b_index = g_index + targetWidth * targetHeight; + matrix[r_index] = r - means[0]; + matrix[g_index] = g - means[1]; + matrix[b_index] = b - means[2]; + } + } +} + +JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv( + JNIEnv *env, jclass thiz, jbyteArray yuv_, jint imgwidth, jint imgHeight, + jintArray ddims, jfloatArray meanValues) { + std::lock_guard lock(shared_mutex); + + ANDROIDLOGI("predictYuv invoked"); + jfloatArray result = NULL; + +#ifdef ENABLE_EXCEPTION + try { + jsize ddim_size = env->GetArrayLength(ddims); + if (ddim_size != 4) { + ANDROIDLOGE("ddims size not equal to 4"); + } + jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL); + framework::DDim ddim = framework::make_ddim( + {ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]}); + int length = framework::product(ddim); + float matrix[length]; + jbyte *yuv = env->GetByteArrayElements(yuv_, NULL); + float *meansPointer = nullptr; + if (nullptr != meanValues) { + meansPointer = env->GetFloatArrayElements(meanValues, NULL); + } + convert_nv21_to_matrix((uint8_t *)yuv, matrix, imgwidth, imgHeight, ddim[3], + ddim[2], meansPointer); + int count = 0; + framework::Tensor input; + input.Resize(ddim); + auto input_ptr = input.mutable_data(); + for (int i = 0; i < length; i++) { + input_ptr[i] = matrix[i]; + } + auto output = getPaddleMobileInstance()->Predict(input); + count = output->numel(); + result = env->NewFloatArray(count); + env->SetFloatArrayRegion(result, 0, count, output->data()); + env->ReleaseByteArrayElements(yuv_, yuv, 0); + env->ReleaseIntArrayElements(ddims, ddim_ptr, 0); + env->ReleaseFloatArrayElements(meanValues, meansPointer, 0); + ANDROIDLOGI("predictYuv finished"); + } catch (paddle_mobile::PaddleMobileException &e) { + ANDROIDLOGE("jni got an PaddleMobileException! 
", e.what()); + } +#else + jsize ddim_size = env->GetArrayLength(ddims); + if (ddim_size != 4) { + ANDROIDLOGE("ddims size not equal to 4"); + } + jint *ddim_ptr = env->GetIntArrayElements(ddims, NULL); + framework::DDim ddim = framework::make_ddim( + {ddim_ptr[0], ddim_ptr[1], ddim_ptr[2], ddim_ptr[3]}); + int length = framework::product(ddim); + float matrix[length]; + jbyte *yuv = env->GetByteArrayElements(yuv_, NULL); + float *meansPointer = nullptr; + if (nullptr != meanValues) { + meansPointer = env->GetFloatArrayElements(meanValues, NULL); + } + convert_nv21_to_matrix((uint8_t *)yuv, matrix, imgwidth, imgHeight, ddim[3], + ddim[2], meansPointer); + int count = 0; + framework::Tensor input; + input.Resize(ddim); + auto input_ptr = input.mutable_data(); + for (int i = 0; i < length; i++) { + input_ptr[i] = matrix[i]; + } + auto output = getPaddleMobileInstance()->Predict(input); count = output->numel(); result = env->NewFloatArray(count); env->SetFloatArrayRegion(result, 0, count, output->data()); + env->ReleaseByteArrayElements(yuv_, yuv, 0); + env->ReleaseIntArrayElements(ddims, ddim_ptr, 0); + env->ReleaseFloatArrayElements(meanValues, meansPointer, 0); + ANDROIDLOGI("predictYuv finished"); +#endif + + return result; +} +JNIEXPORT jlongArray JNICALL +Java_com_baidu_paddle_PML_predictLod(JNIEnv *env, jclass thiz, jlongArray buf) { + std::lock_guard lock(shared_mutex); + + jlong *ddim_ptr = env->GetLongArrayElements(buf, NULL); + jsize ddim_size = env->GetArrayLength(buf); + std::vector ids; + + for (int i = 0; i < ddim_size; ++i) { + jlong x = ddim_ptr[i]; + ids.push_back((int64_t)x); + } + + paddle_mobile::framework::LoDTensor words; + + auto size = static_cast(ids.size()); + + paddle_mobile::framework::LoD lod{{0, ids.size()}}; + DDim dims{size, 1}; + words.Resize(dims); + words.set_lod(lod); + auto *pdata = words.mutable_data(); + size_t n = words.numel() * sizeof(int64_t); + memcpy(pdata, ids.data(), n); + auto vec_result = paddle_mobile.PredictLod(words); + int count = vec_result->numel(); + jlongArray result = NULL; + ANDROIDLOGE("predict nlp size %d", count); + + result = env->NewLongArray(count); + + env->SetLongArrayRegion(result, 0, count, vec_result->data()); + return result; } +JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_setThread(JNIEnv *env, + jclass thiz, + jint threadCount) { + std::lock_guard lock(shared_mutex); + + ANDROIDLOGI("setThreadCount %d", threadCount); +#ifdef ENABLE_EXCEPTION + try { + getPaddleMobileInstance()->SetThreadNum((int)threadCount); + } catch (paddle_mobile::PaddleMobileException &e) { + ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); + } +#else + getPaddleMobileInstance()->SetThreadNum((int)threadCount); + +#endif +} + JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env, - jclass thiz) {} + jclass thiz) { + std::lock_guard lock(shared_mutex); + +#ifdef ENABLE_EXCEPTION + try { + getPaddleMobileInstance()->Clear(); + + } catch (paddle_mobile::PaddleMobileException &e) { + ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); + } +#else + getPaddleMobileInstance()->Clear(); + +#endif +} } // namespace jni } // namespace paddle_mobile diff --git a/src/jni/paddle_mobile_jni.h b/src/jni/paddle_mobile_jni.h index a262d4070c37013977e869fa816d52d78fbfa485..158d64d4517b69761b26fc18f2e0943798174014 100644 --- a/src/jni/paddle_mobile_jni.h +++ b/src/jni/paddle_mobile_jni.h @@ -15,9 +15,6 @@ limitations under the License. 
 */
 #pragma once
 #ifdef ANDROID
 #include <jni.h>
-#include "common/log.h"
-#include "framework/tensor.h"
-#include "io/io.h"
 #ifdef __cplusplus
 extern "C" {
@@ -25,23 +22,59 @@ extern "C" {
 namespace paddle_mobile {
 namespace jni {
 /**
- * load model & params of the net for android
+ * load separated model for android
 */
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
                                                           jclass thiz,
                                                           jstring modelPath);
+/**
+ * load separated qualified model for android
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
+    JNIEnv *env, jclass thiz, jstring modelPath);
+/**
+ * load combined model for android
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
+    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
+
+/**
+ * load combined qualified model for android
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
+    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
+
 /**
  * object detection for android
 */
 JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
-    JNIEnv *env, jclass thiz, jfloatArray buf);
+    JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims);
+
+/**
+ * object detection for android
+ */
+JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
+    JNIEnv *env, jclass thiz, jbyteArray yuv, jint imgwidth, jint imgHeight,
+    jintArray ddims, jfloatArray meanValues);
+/**
+ * object detection for android
+ */
+JNIEXPORT jfloatArray JNICALL
+Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf);
+
+/**
+ * set thread count for multithreading
+ */
+JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_setThread(JNIEnv *env,
+                                                           jclass thiz,
+                                                           jint threadCount);
 /**
  * clear data of the net when destroy for android
 */
-JNIEXPORT void JNICALL Java_com_baidu_paddle_PMLL_clear(JNIEnv *env,
-                                                        jclass thiz);
+JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env,
+                                                       jclass thiz);
 } // namespace jni
 } // namespace paddle_mobile
 #ifdef __cplusplus
diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp
index 0252f3c07c06487720586b0f650e2179d247234f..2bd4c0ac6ba3c7b066cc7ad2439ab6bebb7c3cd9 100644
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -16,10 +16,32 @@ limitations under the License.
*/ #include #include +#ifdef PADDLE_MOBILE_FPGA + +#include "fpga/api.h" + +#endif + namespace paddle_mobile { namespace memory { const int MALLOC_ALIGN = 64; +#ifdef PADDLE_MOBILE_FPGA +namespace fpga = paddle_mobile::fpga; + +void Copy(void *dst, const void *src, size_t num) { + std::memcpy(dst, src, num); +} + +void *Alloc(size_t size) { return fpga::fpga_malloc(size); } + +void Free(void *ptr) { + if (ptr) { + fpga::fpga_free(ptr); + } +} + +#else void Copy(void *dst, const void *src, size_t num) { std::memcpy(dst, src, num); } @@ -42,5 +64,7 @@ void Free(void *ptr) { } } +#endif + } // namespace memory } // namespace paddle_mobile diff --git a/src/operators/batchnorm_op.cpp b/src/operators/batchnorm_op.cpp index 5d94d54f88e33b168739b1bbdf9af0bea9fe1b4f..f820908404ea637d9680c32d5c4b5568e191dd7e 100644 --- a/src/operators/batchnorm_op.cpp +++ b/src/operators/batchnorm_op.cpp @@ -26,17 +26,15 @@ void BatchNormOp::InferShape() const { auto x_dims = this->param_.InputX()->dims(); this->param_.OutputY()->Resize(x_dims); } -template class BatchNormOp; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(batch_norm); REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(batch_norm); REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp); #endif #ifdef PADDLE_MOBILE_FPGA diff --git a/src/operators/batchnorm_op.h b/src/operators/batchnorm_op.h index 9ee0b2dcf6b6ec46fcb08cac88d3df275d33f7d6..52c423f1bb90428e867ea6fb992036ab83c683d7 100644 --- a/src/operators/batchnorm_op.h +++ b/src/operators/batchnorm_op.h @@ -26,14 +26,15 @@ namespace operators { using std::string; template class BatchNormOp - : public framework::OperatorWithKernel, BatchNormKernel> { public: BatchNormOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel, BatchNormKernel>( type, inputs, outputs, attrs, scope) {} @@ -45,4 +46,13 @@ class BatchNormOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(batch_norm); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(batch_norm); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/bilinear_interp_op.cpp b/src/operators/bilinear_interp_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b3388c38ec6050faff1cb7bbe49e8dd042291fc9 --- /dev/null +++ b/src/operators/bilinear_interp_op.cpp @@ -0,0 +1,56 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
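Context for the t_malloc.cpp hunk above: the untouched CPU branch of Alloc/Free honours MALLOC_ALIGN with the usual over-allocate-and-stash trick. A sketch of that idea follows; it is illustrative, not the verbatim project code:

#include <cstdlib>

// Sketch of 64-byte aligned allocation: over-allocate, round up to the
// alignment boundary, and stash the raw pointer one slot below the aligned
// block so the matching free can recover it.
static const int kMallocAlign = 64;

void *AlignedAlloc(size_t size) {
  size_t offset = sizeof(void *) + kMallocAlign - 1;
  char *p = static_cast<char *>(malloc(offset + size));
  if (p == nullptr) return nullptr;
  void *r = reinterpret_cast<void *>(
      reinterpret_cast<size_t>(p + offset) & ~(kMallocAlign - 1));
  static_cast<void **>(r)[-1] = p;  // stash the original pointer
  return r;
}

void AlignedFree(void *ptr) {
  if (ptr != nullptr) free(static_cast<void **>(ptr)[-1]);
}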
*/ + +#ifdef BILINEAR_INTERP_OP + +#include "operators/bilinear_interp_op.h" + +namespace paddle_mobile { +namespace operators { +template +void BilinearOp::InferShape() const { + PADDLE_MOBILE_ENFORCE(this->param_.InputX() != nullptr, + "Input(X) of BilinearInterOp should not be null."); + PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr, + "Output(Out) of BilinearInterOp should not be null."); + + auto dim_x = this->param_.InputX()->dims(); // NCHW format + int out_h = this->param_.OutH(); + int out_w = this->param_.OutW(); + PADDLE_MOBILE_ENFORCE(dim_x.size() == 4, "X's dimension must be 4"); + + if (this->param_.InputOutPutSize() != nullptr) { + auto out_size_dim = this->param_.InputOutPutSize()->dims(); + + PADDLE_MOBILE_ENFORCE(out_size_dim.size() == 1, + "OutSize's dimension size must be 1"); + PADDLE_MOBILE_ENFORCE(out_size_dim[0] == 2, "OutSize's dim[0] must be 2"); + } + std::vector dim_out({dim_x[0], dim_x[1], out_h, out_w}); + this->param_.Out()->Resize(framework::make_ddim(dim_out)); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(bilinear_interp, ops::BilinearOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/bilinear_interp_op.h b/src/operators/bilinear_interp_op.h new file mode 100644 index 0000000000000000000000000000000000000000..dbbf24eeac7a900d49f49242fddb8e568968dddc --- /dev/null +++ b/src/operators/bilinear_interp_op.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
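To make the bilinear_interp InferShape logic above concrete: it keeps N and C and swaps in the requested spatial size, so a 1x3x32x32 NCHW input with out_h = 64 and out_w = 64 yields a 1x3x64x64 output. An illustrative helper, not project code:

#include <vector>

// Illustrative only: output dims of bilinear_interp keep batch and channels.
std::vector<int64_t> BilinearOutShape(const std::vector<int64_t> &nchw,
                                      int64_t out_h, int64_t out_w) {
  return {nchw[0], nchw[1], out_h, out_w};
}
// BilinearOutShape({1, 3, 32, 32}, 64, 64) yields {1, 3, 64, 64}.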
*/ + +#ifdef BILINEAR_INTERP_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/bilinear_interp_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using paddle_mobile::framework::Tensor; + +template +class BilinearOp : public framework::OperatorWithKernel< + DeviceType, BilinearInterpParam, + operators::BilinearInterpKernel> { + public: + BilinearOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, BilinearInterpParam, + operators::BilinearInterpKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, BilinearInterpParam, + operators::BilinearInterpKernel>::OperatorWithKernel; + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(bilinear_interp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/box_coder_op.cpp b/src/operators/box_coder_op.cpp index 31891ed74266d599898dd7426eed5cd28f320ab6..9e57c9021dac1b6857752989727c1c86051e33f7 100644 --- a/src/operators/box_coder_op.cpp +++ b/src/operators/box_coder_op.cpp @@ -47,13 +47,12 @@ void BoxCoderOp::InferShape() const { this->param_.OutputBox()->Resize(framework::make_ddim( {input_targetbox_dims[0], input_priorbox_dims[0], 4})); } -template class BoxCoderOp; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(box_coder); REGISTER_OPERATOR_CPU(box_coder, ops::BoxCoderOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/box_coder_op.h b/src/operators/box_coder_op.h index 33ff2358bc8285a026c217ed11c2250769395567..5d475c98b6859a33b39e6b36419fa055cde7a1d3 100644 --- a/src/operators/box_coder_op.h +++ b/src/operators/box_coder_op.h @@ -28,20 +28,20 @@ namespace operators { using paddle_mobile::framework::Tensor; template -class BoxCoderOp - : public framework::OperatorWithKernel< - DeviceType, BoxCoderParam, operators::BoxCoderKernel> { +class BoxCoderOp : public framework::OperatorWithKernel< + DeviceType, BoxCoderParam, + operators::BoxCoderKernel> { public: BoxCoderOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel, operators::BoxCoderKernel>( type, inputs, outputs, attrs, scope) {} using framework::OperatorWithKernel< - DeviceType, BoxCoderParam, + DeviceType, BoxCoderParam, operators::BoxCoderKernel>::OperatorWithKernel; void InferShape() const override; @@ -51,4 +51,12 @@ class BoxCoderOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(box_coder); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/concat_op.cpp b/src/operators/concat_op.cpp index fe0507dc812a3ddafcc0433c2659c3b49ea87f6e..f767f3481c999a16da46e75e314e8ebcb54193fa 100644 --- a/src/operators/concat_op.cpp +++ b/src/operators/concat_op.cpp @@ -14,7 +14,9 @@ limitations under the License. 
*/ #ifdef CONCAT_OP -#include "concat_op.h" +#include + +#include "operators/concat_op.h" namespace paddle_mobile { namespace operators { @@ -56,21 +58,19 @@ void ConcatOp::InferShape() const { this->param_.Out()->Resize(out_dims); } -template class ConcatOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(concat); REGISTER_OPERATOR_CPU(concat, ops::ConcatOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(concat); REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp); #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp); #endif #endif diff --git a/src/operators/concat_op.h b/src/operators/concat_op.h index 93612c6b1b6d1f6aa992773ef5cccc0c93f1b6e8..a169c17dc468dd06ed344a0c7a6ef3cb2c977a27 100644 --- a/src/operators/concat_op.h +++ b/src/operators/concat_op.h @@ -24,19 +24,19 @@ namespace paddle_mobile { namespace operators { using std::string; template -class ConcatOp - : public framework::OperatorWithKernel< - DeviceType, ConcatParam, operators::ConcatKernel> { +class ConcatOp : public framework::OperatorWithKernel< + DeviceType, ConcatParam, + operators::ConcatKernel> { public: ConcatOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel, operators::ConcatKernel>( type, inputs, outputs, attrs, scope) {} using framework::OperatorWithKernel< - DeviceType, ConcatParam, + DeviceType, ConcatParam, operators::ConcatKernel>::OperatorWithKernel; void InferShape() const override; @@ -46,4 +46,14 @@ class ConcatOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(concat); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(concat); +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(concat); +#endif + #endif diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp index 01d284a06ed33142a8d16cdc32f304c3d1a75e28..c4601995219b32db75f22c7c2ed959e18af85f36 100644 --- a/src/operators/conv_op.cpp +++ b/src/operators/conv_op.cpp @@ -48,22 +48,17 @@ void ConvOp::InferShape() const { this->param_.Output()->Resize(ddim); } -template class ConvOp; - } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(conv2d); REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(conv2d); REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp); #endif #ifdef PADDLE_MOBILE_FPGA -USE_OP_FPGA(conv2d); REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp); #endif diff --git a/src/operators/conv_op.h b/src/operators/conv_op.h index f8e8952d47fd726c712c0f7817606d959095b65b..267abfeb614dc8e19a2cf0cf43e7c5f232a62072 100644 --- a/src/operators/conv_op.h +++ b/src/operators/conv_op.h @@ -24,19 +24,19 @@ namespace paddle_mobile { namespace operators { using std::string; template -class ConvOp - : public framework::OperatorWithKernel< - DeviceType, ConvParam, operators::ConvKernel> { +class ConvOp : public framework::OperatorWithKernel< + DeviceType, ConvParam, + operators::ConvKernel> { public: ConvOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel, operators::ConvKernel>( type, inputs, outputs, attrs, scope) {} using framework::OperatorWithKernel< - DeviceType, ConvParam, + DeviceType, ConvParam, 
operators::ConvKernel>::OperatorWithKernel; void InferShape() const override; @@ -46,4 +46,14 @@ class ConvOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(conv2d); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(conv2d); +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(conv2d); +#endif + #endif diff --git a/src/common/openmp-fix.cpp b/src/operators/conv_transpose_op.cpp similarity index 51% rename from src/common/openmp-fix.cpp rename to src/operators/conv_transpose_op.cpp index 8c31ef45c68227c612155e826e664367a7917501..34de4cbb10d3689f0be95f1277cfdd76b4c2c141 100644 --- a/src/common/openmp-fix.cpp +++ b/src/operators/conv_transpose_op.cpp @@ -12,16 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_MOBILE_USE_OPENMP -/** - * android-ndk-r17 has a problem when linking with openmp. - * if paddle-mobile enables -fopenmp, but didn't use those omp_* functions, - * after linking another binary with libpaddle-mobile.so, the omp_get_thread_num - * will not work. see test/common/test_openmp.cc the detailed reason is still - * unclear, but this trick will work. a better solution is hacking the linker, - * try some flags to make it link omp_* functions, but I didn't find out how to - * make it work. - */ -#include -static int _ = omp_get_num_procs(); +#ifdef CONV_TRANSPOSE + +#include "operators/conv_transpose_op.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(conv2d_transpose, ops::ConvOpTranspose); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/conv_transpose_op.h b/src/operators/conv_transpose_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c9b5e86bef0674b176ba901212a9add2ee2def83 --- /dev/null +++ b/src/operators/conv_transpose_op.h @@ -0,0 +1,101 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
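The openmp-fix.cpp file renamed away above existed purely as a linker workaround: with android-ndk-r17, a libpaddle-mobile.so built with -fopenmp that never calls any omp_* function directly could leave omp_get_thread_num broken for binaries linked against it. The removed trick forces a reference to the OpenMP runtime at static-initialization time:

#include <omp.h>

// Referencing any omp_* symbol at static-init time keeps the OpenMP runtime
// linked and functional for downstream binaries (the removed workaround).
static int _ = omp_get_num_procs();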
*/ + +#ifdef CONV_TRANSPOSE + +#pragma once + +#include +#include + +#include "framework/operator.h" +#include "operators/kernel/conv_transpose_kernel.h" + +namespace paddle_mobile { +namespace operators { +template +class ConvOpTranspose : public framework::OperatorWithKernel< + DeviceType, ConvTransposeParam, + operators::ConvTransposeKernel> { + public: + ConvOpTranspose(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, ConvTransposeParam, + operators::ConvTransposeKernel>( + type, inputs, outputs, attrs, scope) {} + + void InferShape() const { + auto input = this->param_.Input(); + auto in_dims = input->dims(); + + auto filter = this->param_.Filter(); + auto filter_dims = filter->dims(); + + std::vector strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + std::vector dilations = this->param_.Dilations(); + + int groups = this->param_.Groups(); + + PADDLE_MOBILE_ENFORCE( + in_dims.size() == 4 || in_dims.size() == 5, + "ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() == filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be consistent."); + PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims[1] == filter_dims[0], + "In ConvTransposeOp, The number of input channels should " + "be equal to the number of filter's channels."); + + std::vector output_shape({in_dims[0], filter_dims[1] * groups}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - + 2 * paddings[i] + filter_extent); + } + this->param_.Output()->Resize(framework::make_ddim(output_shape)); + } + + private: +}; + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(conv2d_transpose); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(conv2d_transpose); +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(conv2d_transpose); +#endif + +#endif diff --git a/src/operators/crf_op.cpp b/src/operators/crf_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..61f9a54352e236a7fcb7b2765ab11055fbec95ab --- /dev/null +++ b/src/operators/crf_op.cpp @@ -0,0 +1,56 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
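A worked instance of the output-shape arithmetic in the conv-transpose InferShape above; per spatial axis the formula is (in - 1) * stride - 2 * pad + dilation * (k - 1) + 1. The helper is illustrative, not project code:

// One spatial axis of the transposed-convolution output shape.
inline int ConvTransposeExtent(int in, int stride, int pad, int dilation, int k) {
  const int filter_extent = dilation * (k - 1) + 1;
  return (in - 1) * stride - 2 * pad + filter_extent;
}
// e.g. in = 16, stride = 2, pad = 1, dilation = 1, k = 3:
// filter_extent = 3, so the output extent is 15 * 2 - 2 + 3 = 31.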
*/
+
+#ifdef CRF_OP
+
+#include <vector>
+
+#include "common/enforce.h"
+#include "operators/crf_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void CrfOp<Dtype, T>::InferShape() const {
+  PADDLE_MOBILE_ENFORCE(this->param_.InputEmission(),
+                        "Input(Emission) should not be null.");
+  PADDLE_MOBILE_ENFORCE(this->param_.InputTransition(),
+                        "Input(Transition) should not be null.");
+  PADDLE_MOBILE_ENFORCE(this->param_.outputVBP(),
+                        "Output(ViterbiPath) should not be null.");
+
+  auto emission_dims = this->param_.InputEmission()->dims();
+  PADDLE_MOBILE_ENFORCE(emission_dims.size() == 2U,
+                        "The Input(Emission) should be a 2-D tensor.");
+  PADDLE_MOBILE_ENFORCE(emission_dims[0],
+                        "An empty mini-batch is not allowed.");
+
+  this->param_.outputVBP()->Resize(
+      {this->param_.InputEmission()->dims()[0], 1});
+}
+
+} // namespace operators
+} // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(crf_decoding, ops::CrfOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/crf_op.h b/src/operators/crf_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c966c9077273282bbcb4f25674e8df401956967
--- /dev/null
+++ b/src/operators/crf_op.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#ifdef CRF_OP + +#pragma once + +#include <string> +#include "framework/operator.h" +#include "operators/kernel/crf_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using paddle_mobile::framework::Tensor; + +template <typename DeviceType, typename T> +class CrfOp : public framework::OperatorWithKernel< + DeviceType, CrfParam<DeviceType>, + operators::CrfKernel<DeviceType, T>> { + public: + CrfOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, + std::shared_ptr<framework::Scope> scope) + : framework::OperatorWithKernel<DeviceType, CrfParam<DeviceType>, + operators::CrfKernel<DeviceType, T>>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, CrfParam<DeviceType>, + operators::CrfKernel<DeviceType, T>>::OperatorWithKernel; + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(crf_decoding); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/depthwise_conv_op.cpp b/src/operators/depthwise_conv_op.cpp index 46f2db30ba2fbff5839d6a737dda12fa6cd10b43..8d6b6a143c37537be6de1e60cc095f1052136e26 100644 --- a/src/operators/depthwise_conv_op.cpp +++ b/src/operators/depthwise_conv_op.cpp @@ -49,14 +49,11 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const { this->param_.Output()->Resize(ddim); } -template class DepthwiseConvOp<CPU, float>; - } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(depthwise_conv2d); REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/depthwise_conv_op.h b/src/operators/depthwise_conv_op.h index 75bcf44cb8790365e7f33719c481354c1a57c80a..40e87a9b1bf9d2b5102a56ff59821b9d122563c5 100644 --- a/src/operators/depthwise_conv_op.h +++ b/src/operators/depthwise_conv_op.h @@ -25,7 +25,7 @@ namespace operators { template <typename DeviceType, typename T> class DepthwiseConvOp : public framework::OperatorWithKernel< - DeviceType, ConvParam, + DeviceType, ConvParam<DeviceType>, operators::DepthwiseConvKernel<DeviceType, T>> { public: DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs, @@ -33,12 +33,12 @@ class DepthwiseConvOp : public framework::OperatorWithKernel< const framework::AttributeMap &attrs, std::shared_ptr<framework::Scope> scope) : framework::OperatorWithKernel< - DeviceType, ConvParam, + DeviceType, ConvParam<DeviceType>, operators::DepthwiseConvKernel<DeviceType, T>>( type, inputs, outputs, attrs, scope) {} using framework::OperatorWithKernel< - DeviceType, ConvParam, + DeviceType, ConvParam<DeviceType>, operators::DepthwiseConvKernel<DeviceType, T>>::OperatorWithKernel; void InferShape() const override; @@ -48,4 +48,12 @@ class DepthwiseConvOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(depthwise_conv2d); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/dropout_op.cpp b/src/operators/dropout_op.cpp index 709d83a1f57b7faa0ecce1f1f8590c86c1eba1a8..a913ff017bfe776a2c2dfea5696e4c0f23683c46 100644 --- a/src/operators/dropout_op.cpp +++ b/src/operators/dropout_op.cpp @@ -22,18 +22,18 @@ void DropoutOp<Dtype, T>::InferShape() const { auto input_dims = this->param_.InputX()->dims(); this->param_.Out()->Resize(input_dims); } -template class DropoutOp<CPU, float>; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(dropout); REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp); #endif
#ifdef PADDLE_MOBILE_MALI_GPU #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(dropout, ops::DropoutOp); #endif #endif diff --git a/src/operators/dropout_op.h b/src/operators/dropout_op.h index bc2986b791e9069e8782d778a0f16cb639ac4396..7523fd5fa8f21dfce20bce963be4b3bc323948e9 100644 --- a/src/operators/dropout_op.h +++ b/src/operators/dropout_op.h @@ -28,18 +28,18 @@ namespace operators { using paddle_mobile::framework::Tensor; template <typename DeviceType, typename T> -class DropoutOp - : public framework::OperatorWithKernel< - DeviceType, DropoutParam, operators::DropoutKernel<DeviceType, T>> { +class DropoutOp : public framework::OperatorWithKernel< + DeviceType, DropoutParam<DeviceType>, + operators::DropoutKernel<DeviceType, T>> { public: DropoutOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap attrs, std::shared_ptr<framework::Scope> scope) - : framework::OperatorWithKernel<DeviceType, DropoutParam, + : framework::OperatorWithKernel<DeviceType, DropoutParam<DeviceType>, operators::DropoutKernel<DeviceType, T>>( type, inputs, outputs, attrs, scope) {} - // using framework::OperatorWithKernel<DeviceType, DropoutParam, - // operators::DropoutKernel<DeviceType, T>>; void InferShape() const override; @@ -50,4 +50,13 @@ class DropoutOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(dropout); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(dropout); +#endif + #endif diff --git a/src/operators/elementwise_add_op.cpp b/src/operators/elementwise_add_op.cpp index 12c59da6452992e3dd73b985db685a651df02250..49885f783417d61c6348fc4563e7306036994f17 100644 --- a/src/operators/elementwise_add_op.cpp +++ b/src/operators/elementwise_add_op.cpp @@ -24,17 +24,15 @@ void ElementwiseAddOp<Dtype, T>::InferShape() const { auto x_dim = this->param_.InputX()->dims(); this->param_.Out()->Resize(x_dim); } -template class ElementwiseAddOp<CPU, float>; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(elementwise_add); REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(elementwise_add); REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); #endif #ifdef PADDLE_MOBILE_FPGA diff --git a/src/operators/elementwise_add_op.h b/src/operators/elementwise_add_op.h index 6cb80d06d0a4d66935c77a3c23a6264d0be53ecc..14bcd5264d136007e2eb2ffe917697570b32e40b 100644 --- a/src/operators/elementwise_add_op.h +++ b/src/operators/elementwise_add_op.h @@ -26,7 +26,7 @@ namespace operators { using std::string; template <typename DeviceType, typename T> class ElementwiseAddOp : public framework::OperatorWithKernel< - DeviceType, ElementwiseAddParam, + DeviceType, ElementwiseAddParam<DeviceType>, operators::ElementwiseAddKernel<DeviceType, T>> { public: ElementwiseAddOp(const string &type, const VariableNameMap &inputs, @@ -34,12 +34,12 @@ class ElementwiseAddOp : public framework::OperatorWithKernel< const framework::AttributeMap &attrs, std::shared_ptr<framework::Scope> scope) : framework::OperatorWithKernel< - DeviceType, ElementwiseAddParam, + DeviceType, ElementwiseAddParam<DeviceType>, operators::ElementwiseAddKernel<DeviceType, T>>( type, inputs, outputs, attrs, scope) {} using framework::OperatorWithKernel< - DeviceType, ElementwiseAddParam, + DeviceType, ElementwiseAddParam<DeviceType>, operators::ElementwiseAddKernel<DeviceType, T>>::OperatorWithKernel; void InferShape() const override; @@ -48,4 +48,13 @@ class ElementwiseAddOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(elementwise_add); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(elementwise_add); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif
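Reviewer note: every InferShape() added or touched in this patch derives the spatial output dims from the same two formulas -- math::ConvOutputSize for the forward convolutions, and the inverted expression in ConvOpTranspose::InferShape(). A minimal standalone sketch of both (the helper names conv_out_size and conv_transpose_out_size are illustrative, not part of the patch):

#include <cassert>

// Forward convolution: output = (input + 2*pad - dilated_kernel) / stride + 1,
// where dilated_kernel = dilation * (filter - 1) + 1; this mirrors what
// math::ConvOutputSize computes for each spatial axis.
int conv_out_size(int input, int filter, int dilation, int padding, int stride) {
  int dkernel = dilation * (filter - 1) + 1;
  return (input + 2 * padding - dkernel) / stride + 1;
}

// Transposed convolution inverts the mapping, as in ConvOpTranspose::InferShape():
// output = (input - 1) * stride - 2*pad + dilation * (filter - 1) + 1.
int conv_transpose_out_size(int input, int filter, int dilation, int padding,
                            int stride) {
  int filter_extent = dilation * (filter - 1) + 1;
  return (input - 1) * stride - 2 * padding + filter_extent;
}

int main() {
  // A 224x224 input with a 3x3 filter, stride 2, pad 1 -> 112x112 feature map.
  assert(conv_out_size(224, 3, 1, 1, 2) == 112);
  // Pushing 112 back through the transpose formula yields 223, not 224: a
  // strided convolution is not exactly invertible without an output_padding term.
  assert(conv_transpose_out_size(112, 3, 1, 1, 2) == 223);
  return 0;
}

This is also why each InferShape() enforces that strides, paddings, and dilations have matching sizes before the loop: axis i of the output applies the formula with filter_dims[i + 2].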
diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp index a40eac098c7bef442befa1758b21904269cc22d5..77acb5db31e66d78bccd8dbef51832bda1a1bb60 100644 --- a/src/operators/feed_op.cpp +++ b/src/operators/feed_op.cpp @@ -14,8 +14,16 @@ limitations under the License. */ #include "feed_op.h" namespace paddle_mobile { -namespace operators { - -template class FeedOp<CPU, float>; -} +namespace operators {} } // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(feed, ops::FeedOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(feed, ops::FeedOp); +#endif diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 8753bfa9375f50930f9ec57e1b48b26c127edbc6..1b36461932798153af60d936dbac91817a4100df 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -29,32 +29,70 @@ class FeedOp : public framework::OperatorBase<DeviceType> { std::shared_ptr<framework::Scope> scope) : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, *scope) {} - void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } - - void Init() {} + param_(inputs, outputs, attrs, scope.get()) {} void InferShape() const { auto out_dims = param_.Out()->dims(); out_dims[0] = param_.BatchSize(); param_.Out()->Resize(out_dims); + + // note: on mobile, InferShape is called when the executor is created, + // so do not pass the lod here; it is still empty at that point. + } + +#ifdef PADDLE_MOBILE_FPGA + + void Init() { + Tensor *output = param_.Out(); + fpga::format_fp16_ofm(output); + } + + void RunImpl() const { + auto input = (Tensor *)const_cast<Tensor *>(param_.InputX()); + fpga::format_image(input); + auto input_ptr = input->data<float>(); + Tensor *output = param_.Out(); + auto output_ptr = output->data<half>(); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; + + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = (void *)input_ptr; + args.image.channels = (uint32_t)input->dims()[1]; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = output_ptr; + args.output.scale_address = output->scale; + fpga::PerformBypass(args); } +#else + void Init() {} + void RunImpl() const { + param_.Out()->ShareDataWith(*param_.InputX()); + param_.Out()->set_lod(param_.InputX()->lod()); + } +#endif + protected: - FeedParam param_; + FeedParam<DeviceType> param_; }; -namespace ops = paddle_mobile::operators; +} // namespace operators +} // namespace paddle_mobile + #ifdef PADDLE_MOBILE_CPU USE_OP_CPU(feed); -REGISTER_OPERATOR_CPU(feed, ops::FeedOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU USE_OP_MALI_GPU(feed); -REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp); #endif #ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(feed); #endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/src/operators/fetch_op.cpp b/src/operators/fetch_op.cpp index 45d6afc07b597156a746b7cd6657c3b58f1b9950..30cddceaa45da91be5ea91d70f78503c404552c3 100644 --- a/src/operators/fetch_op.cpp +++ b/src/operators/fetch_op.cpp @@ -14,8 +14,16 @@ limitations under the License.
*/ #include "fetch_op.h" namespace paddle_mobile { -namespace operators { - -template class FetchOp; -} +namespace operators {} } // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fetch, ops::FetchOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp); +#endif diff --git a/src/operators/fetch_op.h b/src/operators/fetch_op.h index b46093e18e1d92ed9dacbdb456bb591d0c546456..1efe0832b1fc4b2ce240ed838e2f4554c29dccd9 100644 --- a/src/operators/fetch_op.h +++ b/src/operators/fetch_op.h @@ -41,20 +41,18 @@ class FetchOp : public framework::OperatorBase { } protected: - FetchParam param_; + FetchParam param_; }; -namespace ops = paddle_mobile::operators; +} // namespace operators +} // namespace paddle_mobile + #ifdef PADDLE_MOBILE_CPU USE_OP_CPU(fetch); -REGISTER_OPERATOR_CPU(fetch, ops::FetchOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU USE_OP_MALI_GPU(fetch); -REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp); #endif #ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fetch); #endif - -} // namespace operators -} // namespace paddle_mobile diff --git a/src/operators/flatten_op.cpp b/src/operators/flatten_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0282414ca6ed0be743849e9d295a354144fccdb9 --- /dev/null +++ b/src/operators/flatten_op.cpp @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FLATTEN_OP + +#include "operators/flatten_op.h" + +namespace paddle_mobile { +namespace operators { + +template +void FlattenOp::InferShape() const { + PADDLE_MOBILE_ENFORCE(this->param_.InputX() != nullptr, + "Input (X) of Flatten op should not be null."); + PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr, + "Output (Output) of Flatten op should not be null."); + + auto &axis = this->param_.Axis(); + PADDLE_MOBILE_ENFORCE(axis >= 0, + "The axis should be greater than or equal to 0."); + + auto &in_dims = this->param_.InputX()->dims(); + // const auto &in_dims = ctx->GetInputDim("X"); + PADDLE_MOBILE_ENFORCE( + axis <= in_dims.size(), + "The axis should be less than or equal to input tensor's rank."); + + const auto &out_dims = GetOutputShape(axis, in_dims); + this->param_.Out()->Resize(in_dims); + // todo supprot lodtensor + // if (in_dims[0] == out_dims[0]) { + // // Only pass LoD when the first dimension of output and Input(X) + // // are the same. 
+ // ctx->ShareLoD("X", "Out"); + // } +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(flatten, ops::FlattenOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/flatten_op.h b/src/operators/flatten_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4c1f6ff8a0f2b3212750f3be4d1a6aa2bad790ee --- /dev/null +++ b/src/operators/flatten_op.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FLATTEN_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/flatten_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +inline std::vector GetOutputShape(const int axis, + const framework::DDim &in_dims) { + int64_t outer = 1, inner = 1; + for (int i = 0; i < in_dims.size(); ++i) { + if (i < axis) { + outer *= in_dims[i]; + } else { + inner *= in_dims[i]; + } + } + std::vector out_shape(2); + out_shape[0] = static_cast(outer); + out_shape[1] = static_cast(inner); + return out_shape; +} +using paddle_mobile::framework::Tensor; + +template +class FlattenOp : public framework::OperatorWithKernel< + DeviceType, FlattenParam, + operators::FlattenKernel> { + public: + FlattenOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel, + operators::FlattenKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FlattenParam, + operators::FlattenKernel>::OperatorWithKernel; + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(flatten); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/fusion_conv_add.cpp b/src/operators/fusion_conv_add.cpp index 731bb66cb058dd8562b5fc9257bd8e9ed5f9c0af..cdd6a6db2bb11ebf8dce2aca85630aa8805adf3e 100644 --- a/src/operators/fusion_conv_add.cpp +++ b/src/operators/fusion_conv_add.cpp @@ -44,18 +44,16 @@ void FusionConvAddOp::InferShape() const { framework::DDim ddim = framework::make_ddim(output_shape); this->param_.Output()->Resize(ddim); } -template class FusionConvAddOp; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_add); REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(conv_add); -REGISTER_OPERATOR_MALI_GPU(conv_add, ops::FusionConvAddOp); +REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp); #endif #ifdef PADDLE_MOBILE_FPGA #endif diff --git a/src/operators/fusion_conv_add.h b/src/operators/fusion_conv_add.h index 
8b843f55266300b9fbb758b2b5ce43b908d1dc82..ba1ca997662ce67fdcd8f39d2a12e2f535c5b1a7 100644 --- a/src/operators/fusion_conv_add.h +++ b/src/operators/fusion_conv_add.h @@ -36,8 +36,6 @@ class FusionConvAddMatcher : public framework::FusionOpMatcher { void FolderNodes( framework::Node *node, std::vector<std::shared_ptr<framework::Node>> *removed_nodes) { - vector<std::shared_ptr<framework::OpDesc>> origin_descs = - node->OpDescs(node_.Depth()); node->Folder(node_.Depth(), Type(), {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes); } @@ -47,19 +45,20 @@ class FusionConvAddMatcher : public framework::FusionOpMatcher { template <typename DeviceType, typename T> class FusionConvAddOp : public framework::OperatorWithKernel< - DeviceType, FusionConvAddParam, + DeviceType, FusionConvAddParam<DeviceType>, operators::ConvAddKernel<DeviceType, T>> { public: FusionConvAddOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr<framework::Scope> scope) - : framework::OperatorWithKernel<DeviceType, FusionConvAddParam, + : framework::OperatorWithKernel<DeviceType, FusionConvAddParam<DeviceType>, operators::ConvAddKernel<DeviceType, T>>( type, inputs, outputs, attrs, scope) {} using framework::OperatorWithKernel< - DeviceType, FusionConvAddParam, + DeviceType, FusionConvAddParam<DeviceType>, operators::ConvAddKernel<DeviceType, T>>::OperatorWithKernel; void InferShape() const override; @@ -82,6 +81,7 @@ static framework::FusionOpRegistrar convadd_registrar( static framework::FusionOpRegistrar convadd_registrar( new FusionConvAddMatcher()); #define CONV_ADD_REGISTER + #endif #endif @@ -92,4 +92,13 @@ static framework::FusionOpRegistrar convadd_registrar( } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(fusion_conv_add); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/fusion_conv_add_add_prelu.cpp b/src/operators/fusion_conv_add_add_prelu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5104c989415eee46e66bdbf419fc6ecf7a2baa34 --- /dev/null +++ b/src/operators/fusion_conv_add_add_prelu.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FUSION_CONVADDADDPRELU_OP + +#include "fusion_conv_add_add_prelu_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template <typename Dtype, typename T> +void FusionConvAddAddPReluOp<Dtype, T>::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector<int> &strides = this->param_.Strides(); + std::vector<int> paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector<int> dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_add_prelu_op.h b/src/operators/fusion_conv_add_add_prelu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d91b4d28d728efb4ecf817294f37e67ac19cfe72 --- /dev/null +++ b/src/operators/fusion_conv_add_add_prelu_op.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FUSION_CONVADDADDPRELU_OP + +#pragma once + +#include <string> +#include <vector> +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/conv_add_add_prelu_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +class FusionConvAddAddPReluOpMatcher : public framework::FusionOpMatcher { + public: + FusionConvAddAddPReluOpMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared<framework::Node>(G_OP_TYPE_PRELU); + } + + void FolderNodes( + framework::Node *node, + std::vector<std::shared_ptr<framework::Node>> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, + {{"Y", "Y"}, {"Out", "addOut"}, {"X", "addX"}}}, + {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}}, + + removed_nodes); + } + std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; } + + std::vector<std::pair<int, std::string>> NeedCheck() { + DLOG << " conv add add prelu check add X "; + return {{2, "Y"}, {2, "X"}}; + } +}; + +template <typename DeviceType, typename T> +class FusionConvAddAddPReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionConvAddAddPReluParam<DeviceType>, + operators::ConvAddAddPReluKernel<DeviceType, T>> { + public: + FusionConvAddAddPReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr<framework::Scope> scope) + : framework::OperatorWithKernel< + DeviceType, FusionConvAddAddPReluParam<DeviceType>, + operators::ConvAddAddPReluKernel<DeviceType, T>>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionConvAddAddPReluParam<DeviceType>, + operators::ConvAddAddPReluKernel<DeviceType, T>>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +#ifndef CONV_ADD_ADD_PRELU_REGISTER +#define CONV_ADD_ADD_PRELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_add_prelu_registrar( + new FusionConvAddAddPReluOpMatcher()); +#endif + +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA + +#ifndef CONV_ADD_ADD_PRELU_REGISTER +#define CONV_ADD_ADD_PRELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_add_prelu_registrar( + new FusionConvAddAddPReluOpMatcher()); +#endif + +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_add_prelu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_add_add_prelu); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_bn_op.cpp b/src/operators/fusion_conv_add_bn_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5b61bf5d390cc2904a3f40f5400a5a3eec9a2dd5 --- /dev/null +++ b/src/operators/fusion_conv_add_bn_op.cpp @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FUSION_CONVADDBN_OP + +#include "operators/fusion_conv_add_bn_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template <typename Dtype, typename T> +void FusionConvAddBNOp<Dtype, T>::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector<int> &strides = this->param_.Strides(); + std::vector<int> paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector<int> dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_conv_add_bn, ops::FusionConvAddBNOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_add_bn, ops::FusionConvAddBNOp); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_bn_op.h b/src/operators/fusion_conv_add_bn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ec10787697deb006fe03a35192efb0d80bd00a3c --- /dev/null +++ b/src/operators/fusion_conv_add_bn_op.h @@ -0,0 +1,115 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FUSION_CONVADDBN_OP + +#pragma once + +#include <string> +#include <vector> +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "op_param.h" +#include "operators/kernel/conv_add_bn_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionConvAddBNMatcher : public framework::FusionOpMatcher { + public: + FusionConvAddBNMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM); + } + + void FolderNodes( + framework::Node *node, + std::vector<std::shared_ptr<framework::Node>> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, + {G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_BN; } +}; + +template <typename DeviceType, typename T> +class FusionConvAddBNOp : public framework::OperatorWithKernel< + DeviceType, FusionConvAddBNParam<DeviceType>, + operators::ConvAddBNKernel<DeviceType, T>> { + public: + FusionConvAddBNOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr<framework::Scope> scope) + : framework::OperatorWithKernel< + DeviceType, FusionConvAddBNParam<DeviceType>, + operators::ConvAddBNKernel<DeviceType, T>>(type, inputs, outputs, + attrs, scope) {} + + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +#ifndef FUSION_CONV_ADD_BN_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_registrar( + new FusionConvAddBNMatcher()); +#define FUSION_CONV_ADD_BN_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU + +#ifndef FUSION_CONV_ADD_BN_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_registrar( + new FusionConvAddBNMatcher()); +#define FUSION_CONV_ADD_BN_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_FPGA + +#ifndef FUSION_CONV_ADD_BN_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_registrar( + new FusionConvAddBNMatcher()); +#define FUSION_CONV_ADD_BN_REGISTER +#endif + +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_bn); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_add_bn); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_bn_relu_op.cpp b/src/operators/fusion_conv_add_bn_relu_op.cpp index 63d0b23444ae6bf625e5e8640d3dc2ad314917d2..793634eec392fabe6c7399127ec9cb3e187697bc 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.cpp +++ b/src/operators/fusion_conv_add_bn_relu_op.cpp @@ -44,18 +44,18 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const { framework::DDim ddim = framework::make_ddim(output_shape); this->param_.Output()->Resize(ddim); } -template class FusionConvAddBNReluOp<CPU, float>; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_add_bn_relu); REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); #endif #endif diff --git a/src/operators/fusion_conv_add_bn_relu_op.h b/src/operators/fusion_conv_add_bn_relu_op.h index 494e49280dbdc3fe778cd7bdf5f5d30a82f2d9ff..4dee4416622e8dee8ca495026843c7506d084617 100644 ---
a/src/operators/fusion_conv_add_bn_relu_op.h +++ b/src/operators/fusion_conv_add_bn_relu_op.h @@ -39,8 +39,6 @@ class FusionConvAddBNReluMatcher : public framework::FusionOpMatcher { void FolderNodes( framework::Node *node, std::vector<std::shared_ptr<framework::Node>> *removed_nodes) { - vector<std::shared_ptr<framework::OpDesc>> origin_descs = - node->OpDescs(node_.Depth()); node->Folder(node_.Depth(), Type(), {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, {G_OP_TYPE_BATCHNORM, @@ -57,7 +55,7 @@ class FusionConvAddBNReluMatcher : public framework::FusionOpMatcher { template <typename DeviceType, typename T> class FusionConvAddBNReluOp : public framework::OperatorWithKernel< - DeviceType, FusionConvAddBNReluParam, + DeviceType, FusionConvAddBNReluParam<DeviceType>, operators::ConvAddBNReluKernel<DeviceType, T>> { public: FusionConvAddBNReluOp(const string &type, const VariableNameMap &inputs, @@ -65,12 +63,12 @@ class FusionConvAddBNReluOp const framework::AttributeMap &attrs, std::shared_ptr<framework::Scope> scope) : framework::OperatorWithKernel< - DeviceType, FusionConvAddBNReluParam, + DeviceType, FusionConvAddBNReluParam<DeviceType>, operators::ConvAddBNReluKernel<DeviceType, T>>( type, inputs, outputs, attrs, scope) {} using framework::OperatorWithKernel< - DeviceType, FusionConvAddBNReluParam, + DeviceType, FusionConvAddBNReluParam<DeviceType>, operators::ConvAddBNReluKernel<DeviceType, T>>::OperatorWithKernel; void InferShape() const override; @@ -79,11 +77,11 @@ class FusionConvAddBNReluOp #ifdef PADDLE_MOBILE_CPU -//#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER -// static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( -// new FusionConvAddBNReluMatcher()); -//#define FUSION_CONV_ADD_BN_RELU_REGISTER -//#endif +#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( + new FusionConvAddBNReluMatcher()); +#define FUSION_CONV_ADD_BN_RELU_REGISTER +#endif #endif @@ -98,9 +96,25 @@ static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( #endif #ifdef PADDLE_MOBILE_FPGA + +#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar( + new FusionConvAddBNReluMatcher()); +#define FUSION_CONV_ADD_BN_RELU_REGISTER +#endif + #endif } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_bn_relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_add_bn_relu); +#endif + #endif diff --git a/src/operators/fusion_conv_add_prelu_op.cpp b/src/operators/fusion_conv_add_prelu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0cd30ae6888cd2372b0968717de14f9ca3c72e18 --- /dev/null +++ b/src/operators/fusion_conv_add_prelu_op.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FUSION_CONVADDPRELU_OP + +#include "fusion_conv_add_prelu_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template <typename Dtype, typename T> +void FusionConvAddPReluOp<Dtype, T>::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector<int> &strides = this->param_.Strides(); + std::vector<int> paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector<int> dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_prelu_op.h b/src/operators/fusion_conv_add_prelu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4c968be68230fe6252e72655f47b2a347f720526 --- /dev/null +++ b/src/operators/fusion_conv_add_prelu_op.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FUSION_CONVADDPRELU_OP + +#pragma once + +#include <string> +#include <vector> +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/conv_add_prelu_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher { + public: + FusionConvAddPReluOpMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared<framework::Node>(G_OP_TYPE_PRELU); + } + + void FolderNodes( + framework::Node *node, + std::vector<std::shared_ptr<framework::Node>> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, + {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}} + + }, + + removed_nodes); + } + std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_PRELU; } +}; + +template <typename DeviceType, typename T> +class FusionConvAddPReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionConvAddPReluParam<DeviceType>, + operators::ConvAddPReluKernel<DeviceType, T>> { + public: + FusionConvAddPReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr<framework::Scope> scope) + : framework::OperatorWithKernel< + DeviceType, FusionConvAddPReluParam<DeviceType>, + operators::ConvAddPReluKernel<DeviceType, T>>(type, inputs, outputs, + attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionConvAddPReluParam<DeviceType>, + operators::ConvAddPReluKernel<DeviceType, T>>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +#ifndef CONV_ADD_PRELU_REGISTER +#define CONV_ADD_PRELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_prelu_registrar( + new FusionConvAddPReluOpMatcher()); +#endif + +#endif
#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA + +#ifndef CONV_ADD_PRELU_REGISTER +#define CONV_ADD_PRELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_prelu_registrar( + new FusionConvAddPReluOpMatcher()); +#endif + +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_prelu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_add_prelu); +#endif + +#endif diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp index 694e46af1f8dec3513c5a6d2ff26e3676e9204e4..99b770a6c5e3bc89024e467631e129b914f0bcec 100644 --- a/src/operators/fusion_conv_add_relu_op.cpp +++ b/src/operators/fusion_conv_add_relu_op.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#ifdef CONVADDRELU_OP +#ifdef FUSION_CONVADDRELU_OP #include "fusion_conv_add_relu_op.h" #include "operators/math/conv_func.h" @@ -49,12 +49,12 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const { namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_conv_add_relu); REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp); #endif #endif diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h index bcacb3da3e2ec5371021f3552ffd2c9f53947874..926f309403d37fa8ec1f15f7cb955c1c13842405 100644 --- a/src/operators/fusion_conv_add_relu_op.h +++ b/src/operators/fusion_conv_add_relu_op.h @@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef CONVADDRELU_OP +#ifdef FUSION_CONVADDRELU_OP #pragma once +#include <string> +#include <vector> #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" #include "operators/kernel/conv_add_relu_kernel.h" @@ -43,7 +45,7 @@ class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher { template <typename DeviceType, typename T> class FusionConvAddReluOp : public framework::OperatorWithKernel< - DeviceType, FusionConvAddReluParam, + DeviceType, FusionConvAddReluParam<DeviceType>, operators::ConvAddReluKernel<DeviceType, T>> { public: FusionConvAddReluOp(const string &type, const VariableNameMap &inputs, @@ -51,12 +53,12 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< const framework::AttributeMap &attrs, std::shared_ptr<framework::Scope> scope) : framework::OperatorWithKernel< - DeviceType, FusionConvAddReluParam, + DeviceType, FusionConvAddReluParam<DeviceType>, operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs, attrs, scope) {} using framework::OperatorWithKernel< - DeviceType, FusionConvAddReluParam, + DeviceType, FusionConvAddReluParam<DeviceType>, operators::ConvAddReluKernel<DeviceType, T>>::OperatorWithKernel; void InferShape() const override; @@ -65,19 +67,35 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< #ifdef PADDLE_MOBILE_CPU -//#ifndef CONV_ADD_RELU_REGISTER -//#define CONV_ADD_RELU_REGISTER -// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new -// FusionConvAddReluOpMatcher()); -//#endif +#ifndef CONV_ADD_RELU_REGISTER +#define CONV_ADD_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_relu_registrar( + new FusionConvAddReluOpMatcher()); +#endif #endif #ifdef PADDLE_MOBILE_MALI_GPU #endif #ifdef PADDLE_MOBILE_FPGA + +#ifndef CONV_ADD_RELU_REGISTER +#define CONV_ADD_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_add_relu_registrar( + new FusionConvAddReluOpMatcher()); +#endif + #endif } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_add_relu); +#endif + #endif diff --git a/src/operators/fusion_conv_bn_add_relu_op.cpp b/src/operators/fusion_conv_bn_add_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9823a3111e54f5aec90d5518073ca52255706c1a --- /dev/null +++ b/src/operators/fusion_conv_bn_add_relu_op.cpp @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVBNADDRELU_OP + +#include "operators/fusion_conv_bn_add_relu_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template <typename Dtype, typename T> +void FusionConvBNAddReluOp<Dtype, T>::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector<int> &strides = this->param_.Strides(); + std::vector<int> paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector<int> dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); +#endif + +#endif diff --git a/src/operators/fusion_conv_bn_add_relu_op.h b/src/operators/fusion_conv_bn_add_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..62f3ccf37dfbff9720f39fb96b099f6d7eb5ddcc --- /dev/null +++ b/src/operators/fusion_conv_bn_add_relu_op.h @@ -0,0 +1,125 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FUSION_CONVBNADDRELU_OP + +#pragma once + +#include <string> +#include <vector> +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "op_param.h" +#include "operators/kernel/conv_bn_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionConvBNAddReluMatcher : public framework::FusionOpMatcher { + public: + FusionConvBNAddReluMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) > + std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared<framework::Node>(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector<std::shared_ptr<framework::Node>> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}}, + {G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}, + {"Y", "BNY"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_CONV_BN_ADD_RELU; } + std::vector<std::pair<int, std::string>> NeedCheck() { + DLOG << " conv bn add relu check add X "; + return {{2, "Y"}, {2, "X"}}; + } +}; + +template <typename DeviceType, typename T> +class FusionConvBNAddReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionConvBNAddReluParam<DeviceType>, + operators::ConvBNAddReluKernel<DeviceType, T>> { + public: + FusionConvBNAddReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr<framework::Scope> scope) + : framework::OperatorWithKernel< + DeviceType, FusionConvBNAddReluParam<DeviceType>, + operators::ConvBNAddReluKernel<DeviceType, T>>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionConvBNAddReluParam<DeviceType>, + operators::ConvBNAddReluKernel<DeviceType, T>>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar( + new FusionConvBNAddReluMatcher()); +#define FUSION_CONV_BN_ADD_RELU_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU + +#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar( + new FusionConvBNAddReluMatcher()); +#define FUSION_CONV_BN_ADD_RELU_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_FPGA + +#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar( + new FusionConvBNAddReluMatcher()); +#define FUSION_CONV_BN_ADD_RELU_REGISTER +#endif + +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_bn_add_relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_bn_add_relu); +#endif + +#endif diff --git a/src/operators/fusion_conv_bn_op.cpp b/src/operators/fusion_conv_bn_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..470678bfe57a41e66d6f11f3bfd469d97369d939 --- /dev/null +++ b/src/operators/fusion_conv_bn_op.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVBN_OP + +#include "operators/fusion_conv_bn_op.h" + +namespace paddle_mobile { +namespace operators { + +template <typename Dtype, typename T> +void FusionConvBNOp<Dtype, T>::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector<int> &strides = this->param_.Strides(); + std::vector<int> paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector<int> dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_conv_bn, ops::FusionConvBNOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_bn, ops::FusionConvBNOp); +#endif + +#endif diff --git a/src/operators/fusion_conv_bn_op.h b/src/operators/fusion_conv_bn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f43e62c9fa5c4b40c07fcb9cbdab4d06ab2c482f --- /dev/null +++ b/src/operators/fusion_conv_bn_op.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FUSION_CONVBN_OP + +#pragma once + +#include <string> +#include <vector> +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/conv_bn_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionConvBNMatcher : public framework::FusionOpMatcher { + public: + FusionConvBNMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM); + } + + void FolderNodes( + framework::Node *node, + std::vector<std::shared_ptr<framework::Node>> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_CONV_BN; } +}; + +template <typename DeviceType, typename T> +class FusionConvBNOp : public framework::OperatorWithKernel< + DeviceType, FusionConvBNParam<DeviceType>, + operators::ConvBNKernel<DeviceType, T>> { + public: + FusionConvBNOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr<framework::Scope> scope) + : framework::OperatorWithKernel<DeviceType, FusionConvBNParam<DeviceType>, + operators::ConvBNKernel<DeviceType, T>>( + type, inputs, outputs, attrs, scope) {} + + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +#ifndef FUSION_CONV_BN_REGISTER +static framework::FusionOpRegistrar fusion_conv_bn_registrar( + new FusionConvBNMatcher()); +#define FUSION_CONV_BN_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU + +#endif + +#ifdef PADDLE_MOBILE_FPGA + +#ifndef FUSION_CONV_BN_REGISTER +static framework::FusionOpRegistrar fusion_conv_bn_registrar( + new FusionConvBNMatcher()); +#define FUSION_CONV_BN_REGISTER +#endif +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_bn); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_bn); +#endif + +#endif diff --git a/src/operators/fusion_conv_bn_relu_op.cpp b/src/operators/fusion_conv_bn_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bfc9b99ea796bfdcc1a4ae1a23b2e39e8a513393 --- /dev/null +++ b/src/operators/fusion_conv_bn_relu_op.cpp @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FUSION_CONVBNRELU_OP + +#include "operators/fusion_conv_bn_relu_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template <typename Dtype, typename T> +void FusionConvBNReluOp<Dtype, T>::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector<int> &strides = this->param_.Strides(); + std::vector<int> paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector<int> dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_conv_bn_relu, ops::FusionConvBNReluOp); +#endif + +#endif diff --git a/src/operators/fusion_conv_bn_relu_op.h b/src/operators/fusion_conv_bn_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2b5ff4ea9d3e77ad9449b3968667ecc4558c2147 --- /dev/null +++ b/src/operators/fusion_conv_bn_relu_op.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FUSION_CONVBNRELU_OP + +#pragma once + +#include <string> +#include <vector> +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/conv_bn_relu_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionConvBNReluMatcher : public framework::FusionOpMatcher { + public: + FusionConvBNReluMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) > + std::make_shared<framework::Node>(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector<std::shared_ptr<framework::Node>> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_CONV_BN_RELU; } +}; + +template <typename DeviceType, typename T> +class FusionConvBNReluOp : public framework::OperatorWithKernel< + DeviceType, FusionConvBNReluParam<DeviceType>, + operators::ConvBNReluKernel<DeviceType, T>> { + public: + FusionConvBNReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr<framework::Scope> scope) + : framework::OperatorWithKernel< + DeviceType, FusionConvBNReluParam<DeviceType>, + operators::ConvBNReluKernel<DeviceType, T>>(type, inputs, outputs, + attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionConvBNReluParam<DeviceType>, + operators::ConvBNReluKernel<DeviceType, T>>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +#ifndef FUSION_CONV_BN_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar( + new FusionConvBNReluMatcher()); +#define FUSION_CONV_BN_RELU_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU + +#endif + +#ifdef PADDLE_MOBILE_FPGA + +#ifndef FUSION_CONV_BN_RELU_REGISTER +static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar( + new FusionConvBNReluMatcher()); +#define FUSION_CONV_BN_RELU_REGISTER +#endif +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_bn_relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_conv_bn_relu); +#endif + +#endif diff --git a/src/operators/fusion_dwconv_bn_relu_op.cpp b/src/operators/fusion_dwconv_bn_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e55295830e19b5b39a5ae2501e30170ffb1a7854 --- /dev/null +++ b/src/operators/fusion_dwconv_bn_relu_op.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FUSION_DWCONVBNRELU_OP + +#include "operators/fusion_dwconv_bn_relu_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template <typename Dtype, typename T> +void FusionDWConvBNReluOp<Dtype, T>::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector<int> &strides = this->param_.Strides(); + std::vector<int> paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector<int> dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/fusion_dwconv_bn_relu_op.h b/src/operators/fusion_dwconv_bn_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..dd1f85688f576106a46cd3070ab2034ec8f55881 --- /dev/null +++ b/src/operators/fusion_dwconv_bn_relu_op.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FUSION_DWCONVBNRELU_OP + +#pragma once + +#include <string> +#include <vector> +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "op_param.h" +#include "operators/kernel/dwconv_bn_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionDWConvBNReluMatcher : public framework::FusionOpMatcher { + public: + FusionDWConvBNReluMatcher() { + node_ = framework::Node(G_OP_TYPE_DEPTHWISE_CONV); + node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) > + std::make_shared<framework::Node>(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector<std::shared_ptr<framework::Node>> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_DWCONV_BN_RELU; } +}; + +template <typename DeviceType, typename T> +class FusionDWConvBNReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionDWConvBNReluParam<DeviceType>, + operators::DWConvBNReluKernel<DeviceType, T>> { + public: + FusionDWConvBNReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr<framework::Scope> scope) + : framework::OperatorWithKernel< + DeviceType, FusionDWConvBNReluParam<DeviceType>, + operators::DWConvBNReluKernel<DeviceType, T>>(type, inputs, outputs, + attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionDWConvBNReluParam<DeviceType>, + operators::DWConvBNReluKernel<DeviceType, T>>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU + +#ifndef FUSION_DWCONV_BN_RELU_REGISTER +static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar( + new FusionDWConvBNReluMatcher()); +#define FUSION_DWCONV_BN_RELU_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU + +#ifndef FUSION_DWCONV_BN_RELU_REGISTER +static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar( + new FusionDWConvBNReluMatcher()); +#define FUSION_DWCONV_BN_RELU_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_FPGA +#endif + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_dwconv_bn_relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/fusion_elementwise_add_relu_op.cpp b/src/operators/fusion_elementwise_add_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fa2739ab4283c1fbb35e541ed2d40ea7a1904580 --- /dev/null +++ b/src/operators/fusion_elementwise_add_relu_op.cpp @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
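Each header above registers its matcher through a `static framework::FusionOpRegistrar` wrapped in an `#ifndef FUSION_*_REGISTER` guard, so that only one registrar is defined even when several device sections (CPU, Mali GPU, FPGA) are compiled in. A self-contained sketch of that self-registration idiom; `Registry` and `Registrar` are invented stand-ins, not the framework's types:

```cpp
// Static-object self-registration with an include-guard-style macro.
#include <iostream>
#include <string>
#include <vector>

std::vector<std::string> &Registry() {
  static std::vector<std::string> r;  // function-local static: safe init order
  return r;
}

struct Registrar {
  explicit Registrar(const std::string &name) { Registry().push_back(name); }
};

// The #ifndef/#define pair keeps a second device block in the same
// translation unit from defining the registrar twice.
#ifndef TOY_DWCONV_BN_RELU_REGISTER
static Registrar dwconv_bn_relu_registrar("fusion_dwconv_bn_relu");
#define TOY_DWCONV_BN_RELU_REGISTER
#endif

int main() {
  for (const auto &n : Registry()) std::cout << n << " registered\n";
}
```

The registrar's constructor runs at program load, before `main`, which is what lets the fusion pass discover matchers without an explicit init call.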
*/ + +#ifdef FUSION_ELEMENTWISEADDRELU_OP + +#include "fusion_elementwise_add_relu_op.h" + +namespace paddle_mobile { +namespace operators { + +template <typename Dtype, typename T> +void FusionElementwiseAddReluOp<Dtype, T>::InferShape() const { + auto x_dim = this->param_.InputX()->dims(); + this->param_.Out()->Resize(x_dim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +// REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu, +// ops::FusionElementwiseAddReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +// REGISTER_OPERATOR_MALI_GPU(fusion_elementwise_add_relu, +// ops::FusionElementwiseAddReluOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_elementwise_add_relu, + ops::FusionElementwiseAddReluOp); +#endif + +#endif diff --git a/src/operators/fusion_elementwise_add_relu_op.h b/src/operators/fusion_elementwise_add_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2a92f1e2471cb9e14d84ef03e4bfb872fc738d68 --- /dev/null +++ b/src/operators/fusion_elementwise_add_relu_op.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_ELEMENTWISEADDRELU_OP + +#pragma once + +#include <string> +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/elementwise_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusioneElementwiseAddReluMatcher : public framework::FusionOpMatcher { + public: + FusioneElementwiseAddReluMatcher() { + node_ = framework::Node(G_OP_TYPE_ELEMENTWISE_ADD); + node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector<std::shared_ptr<framework::Node>> *removed_nodes) { + node->Folder(node_.Depth(), Type(), {}, removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU; } +}; + +template <typename DeviceType, typename T> +class FusionElementwiseAddReluOp + : public framework::OperatorWithKernel< + DeviceType, ElementwiseAddReluParam<DeviceType>, + operators::ElementwiseAddReluKernel<DeviceType, T>> { + public: + FusionElementwiseAddReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr<framework::Scope> scope) + : framework::OperatorWithKernel< + DeviceType, ElementwiseAddReluParam<DeviceType>, + operators::ElementwiseAddReluKernel<DeviceType, T>>( + type, inputs, outputs, attrs, scope) {} + + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU +#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER +static framework::FusionOpRegistrar fusion_elementwise_relu_registrar( + new FusioneElementwiseAddReluMatcher()); +#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER +#endif +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU +#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER +static framework::FusionOpRegistrar fusion_elementwise_relu_registrar( + new FusioneElementwiseAddReluMatcher()); +#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER +#endif +#endif + +#ifdef
PADDLE_MOBILE_FPGA +#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER +static framework::FusionOpRegistrar fusion_elementwise_relu_registrar( + new FusioneElementwiseAddReluMatcher()); +#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER +#endif +#endif +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_elementwise_add_relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_elementwise_add_relu); +#endif + +#endif diff --git a/src/operators/fusion_fc_op.cpp b/src/operators/fusion_fc_op.cpp index 2e591b678cf7987eba5fdc74643cd7e15c35271f..9fa80fbf12d0fe300921418705b6900108c68faf 100644 --- a/src/operators/fusion_fc_op.cpp +++ b/src/operators/fusion_fc_op.cpp @@ -49,20 +49,19 @@ void FusionFcOp::InferShape() const { framework::DDim ddim = framework::make_ddim(output_dims); this->param_.Out()->Resize(ddim); } -template class FusionFcOp; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(fusion_fc); REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(fc); -REGISTER_OPERATOR_MALI_GPU(fc, ops::FusionFcOp); +REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp); #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_fc, ops::FusionFcOp); #endif #endif diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h index ea1f42f0adfb532982f50c2da41fc58f63b54834..076a95d745e8d44a417dd95fb75844a67b11e653 100644 --- a/src/operators/fusion_fc_op.h +++ b/src/operators/fusion_fc_op.h @@ -45,20 +45,20 @@ class FusionFcMatcher : public framework::FusionOpMatcher { }; template -class FusionFcOp - : public framework::OperatorWithKernel< - DeviceType, FusionFcParam, operators::FusionFcKernel> { +class FusionFcOp : public framework::OperatorWithKernel< + DeviceType, FusionFcParam, + operators::FusionFcKernel> { public: FusionFcOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel, operators::FusionFcKernel>( type, inputs, outputs, attrs, scope) {} using framework::OperatorWithKernel< - DeviceType, FusionFcParam, + DeviceType, FusionFcParam, operators::FusionFcKernel>::OperatorWithKernel; void InferShape() const override; @@ -66,27 +66,36 @@ class FusionFcOp }; #ifdef PADDLE_MOBILE_CPU - -#ifndef CONV_CPU_REGISTER -#define CONV_CPU_REGISTER +#ifndef FUSION_FC_REGISTER static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); +#define FUSION_FC_REGISTER #endif - #endif #ifdef PADDLE_MOBILE_MALI_GPU - -#ifndef CONV_CPU_REGISTER -#define CONV_CPU_REGISTER +#ifndef FUSION_FC_REGISTER static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); +#define FUSION_FC_REGISTER #endif - #endif #ifdef PADDLE_MOBILE_FPGA +#ifndef FUSION_FC_REGISTER +static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); +#define FUSION_FC_REGISTER +#endif #endif - } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_fc); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(fusion_fc); +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_fc); +#endif + #endif diff --git a/src/operators/fusion_fc_relu_op.cpp b/src/operators/fusion_fc_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..97568323a3c204da06546ffc6b4d9a2483e95848 --- /dev/null +++ 
b/src/operators/fusion_fc_relu_op.cpp @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_FC_RELU_OP + +#include "operators/fusion_fc_relu_op.h" +namespace paddle_mobile { +namespace operators { + +template <typename Dtype, typename T> +void FusionFcReluOp<Dtype, T>::InferShape() const { + auto x_dims = this->param_.InputX()->dims(); + auto y_dims = this->param_.InputY()->dims(); + int x_num_col_dims = this->param_.XNumColDims(); + int y_num_col_dims = this->param_.YNumColDims(); + + assert(x_dims.size() > x_num_col_dims); + assert(y_dims.size() > y_num_col_dims); + + /// (1,2,3,4) , x_num_col_dims = 2 -> (2,12) + auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); + auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims); + + assert(x_mat_dims[1] == y_mat_dims[0]); + + std::vector<int64_t> output_dims; + output_dims.reserve( + static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims)); + + for (int i = 0; i < x_num_col_dims; ++i) { + output_dims.push_back(x_dims[i]); + } + + for (int i = y_num_col_dims; i < y_dims.size(); ++i) { + output_dims.push_back(y_dims[i]); + } + + framework::DDim ddim = framework::make_ddim(output_dims); + this->param_.Out()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_fc_relu, ops::FusionFcReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(fusion_fc_relu, ops::FusionFcReluOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_fc_relu, ops::FusionFcReluOp); +#endif + +#endif diff --git a/src/operators/fusion_fc_relu_op.h b/src/operators/fusion_fc_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fa7d4045fc10d6e240d93e129aa736be793f7bbf --- /dev/null +++ b/src/operators/fusion_fc_relu_op.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
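The InferShape above relies on `framework::flatten_to_2d`: dimensions before `num_col_dims` collapse into rows and the rest into columns, exactly as the inline comment's `(1,2,3,4), x_num_col_dims = 2 -> (2,12)` example says. A hedged, self-contained sketch of that collapse (`FlattenTo2d` is an illustrative stand-in, not the framework function):

```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Collapse dims[0..num_col_dims) into rows and the remainder into columns.
std::pair<int64_t, int64_t> FlattenTo2d(const std::vector<int64_t> &dims,
                                        int num_col_dims) {
  int64_t rows = 1, cols = 1;
  for (int i = 0; i < num_col_dims; ++i) rows *= dims[i];
  for (int i = num_col_dims; i < static_cast<int>(dims.size()); ++i)
    cols *= dims[i];
  return {rows, cols};
}

int main() {
  auto mat = FlattenTo2d({1, 2, 3, 4}, 2);
  assert(mat.first == 2 && mat.second == 12);  // (1*2, 3*4)
}
```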
*/ +#ifdef FUSION_FCRELU_OP +#pragma once +#include +#include + +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/fc_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionFcReluMatcher : public framework::FusionOpMatcher { + public: + FusionFcReluMatcher() { + node_ = framework::Node(G_OP_TYPE_MUL); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Z"}}}}, removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_FC_RELU; } +}; + +template +class FusionFcReluOp : public framework::OperatorWithKernel< + DeviceType, FusionFcReluParam, + operators::FusionFcReluKernel> { + public: + FusionFcReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionFcReluParam, + operators::FusionFcReluKernel>(type, inputs, outputs, + attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionFcReluParam, + operators::FusionFcReluKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU +#ifndef FUSION_FC_RELU_REGISTER +static framework::FusionOpRegistrar fc_relu_registrar( + new FusionFcReluMatcher()); +#define FUSION_FC_RELU_REGISTER +#endif +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU +#ifndef FUSION_FC_RELU_REGISTER +static framework::FusionOpRegistrar fc_relu_registrar( + new FusionFcReluMatcher()); +#define FUSION_FC_RELU_REGISTER +#endif +#endif + +#ifdef PADDLE_MOBILE_FPGA +#ifndef FUSION_FC_RELU_REGISTER +static framework::FusionOpRegistrar fc_relu_registrar( + new FusionFcReluMatcher()); +#define FUSION_FC_RELU_REGISTER +#endif +#endif +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_fc_relu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(fusion_fc_relu); +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(fusion_fc_relu); +#endif +#endif // FUSION_FC_RELU_OP diff --git a/src/operators/gru_op.cpp b/src/operators/gru_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c141cbc06531fabcf5e29546e832480cff850b8c --- /dev/null +++ b/src/operators/gru_op.cpp @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef GRU_OP + +#include + +#include "common/enforce.h" +#include "operators/gru_op.h" + +namespace paddle_mobile { +namespace operators { + +template +void GruOp::InferShape() const { + auto lod_size = this->param_.InputInput()->lod().size(); + PADDLE_MOBILE_ENFORCE((lod_size == 1), + "Current LoD only supports one dimension."); + auto input_dims = this->param_.InputInput()->dims(); + auto weight_dims = this->param_.InputWeight()->dims(); + int input_size = input_dims[1]; + int frame_size = weight_dims[0]; + PADDLE_MOBILE_ENFORCE( + (input_size == frame_size * 3), + "The input_size must be 3 times of frame_size in GRUOp."); + PADDLE_MOBILE_ENFORCE( + (weight_dims[1] == frame_size * 3), + "The shape of Weight matrix must be [frame_size, frame_size * 3]."); + if (this->param_.InputH0()) { + auto h0_dims = this->param_.InputH0()->dims(); + PADDLE_MOBILE_ENFORCE((h0_dims[1] == frame_size), + "The width of H0 must be equal to frame_size."); + } + if (this->param_.InputBias()) { + auto bias_dims = this->param_.InputBias()->dims(); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + PADDLE_MOBILE_ENFORCE((bias_height == 1), + "The shape of Bias must be [1, frame_size * 3]."); + PADDLE_MOBILE_ENFORCE((bias_width == frame_size * 3), + "The shape of Bias must be [1, frame_size * 3]."); + } + this->param_.OutBatchGate()->Resize(input_dims); + this->param_.OutBatchResetHiddenPrev()->Resize({input_dims[0], frame_size}); + this->param_.OutBatchHidden()->Resize({input_dims[0], frame_size}); + this->param_.OutHidden()->Resize({input_dims[0], frame_size}); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(gru, ops::GruOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/gru_op.h b/src/operators/gru_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d348b6c52431f93673f1b772f8c8a9462878cfd5 --- /dev/null +++ b/src/operators/gru_op.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
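GruOp::InferShape above enforces that the packed input, the weight, and the bias all carry the three GRU gates (update, reset, candidate) side by side. A small sketch of that shape contract, with invented helper names, to make the `3 * frame_size` arithmetic concrete:

```cpp
#include <cassert>

// True when the GRU shape contract holds: input width and weight columns
// must both be 3 * frame_size, where frame_size is the hidden width
// (weight is [frame_size, frame_size * 3]).
bool GruShapesOk(int input_width, int weight_rows, int weight_cols) {
  const int frame_size = weight_rows;
  return input_width == 3 * frame_size && weight_cols == 3 * frame_size;
}

int main() {
  assert(GruShapesOk(/*input_width=*/384, /*weight_rows=*/128,
                     /*weight_cols=*/384));
  assert(!GruShapesOk(384, 100, 384));  // mismatched frame_size is rejected
}
```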
*/ + +#ifdef GRU_OP + +#pragma once + +#include +#include "framework/operator.h" +#include "operators/kernel/gru_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using paddle_mobile::framework::Tensor; + +template +class GruOp : public framework::OperatorWithKernel< + DeviceType, GruParam, + operators::GruKernel> { + public: + GruOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel, + operators::GruKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, GruParam, + operators::GruKernel>::OperatorWithKernel; + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(gru); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/im2sequence_op.cpp b/src/operators/im2sequence_op.cpp index 2cb2d6398f85f461bd6214e2631dd0a8f951fb2d..3c929af9cf0a8a1550f197ffdb42ee590cd43235 100644 --- a/src/operators/im2sequence_op.cpp +++ b/src/operators/im2sequence_op.cpp @@ -47,14 +47,11 @@ void Im2SequenceOp::InferShape() const { this->param_.Output()->Resize(ddim); } -template class Im2SequenceOp; - } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(im2sequence); REGISTER_OPERATOR_CPU(im2sequence, ops::Im2SequenceOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/im2sequence_op.h b/src/operators/im2sequence_op.h index 4c4dee617277bcee874d1fcd840c7ddd8fd68cbd..edb87d0012e5514cb5541f94a965965f3dc02825 100644 --- a/src/operators/im2sequence_op.h +++ b/src/operators/im2sequence_op.h @@ -27,7 +27,7 @@ using namespace framework; template class Im2SequenceOp : public framework::OperatorWithKernel< - DeviceType, Im2SequenceParam, + DeviceType, Im2SequenceParam, operators::Im2SequenceKernel> { public: Im2SequenceOp(const std::string &type, const VariableNameMap &inputs, @@ -35,12 +35,12 @@ class Im2SequenceOp : public framework::OperatorWithKernel< const framework::AttributeMap &attrs, std::shared_ptr scope) : framework::OperatorWithKernel< - DeviceType, Im2SequenceParam, + DeviceType, Im2SequenceParam, operators::Im2SequenceKernel>(type, inputs, outputs, attrs, scope) {} // using framework::OperatorWithKernel< - // DeviceType, Im2SequenceParam, + // DeviceType, Im2SequenceParam, // operators::Im2SequenceKernel>::OperatorWithKernel; void InferShape() const override; @@ -50,4 +50,12 @@ class Im2SequenceOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(im2sequence); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/kernel/arm/batchnorm_kernel.cpp b/src/operators/kernel/arm/batchnorm_kernel.cpp index f78d1fdc95ac9e10619dbf32fdc84d01a370f315..c420727f425092240994ee834117225c72abeec2 100644 --- a/src/operators/kernel/arm/batchnorm_kernel.cpp +++ b/src/operators/kernel/arm/batchnorm_kernel.cpp @@ -21,12 +21,13 @@ namespace paddle_mobile { namespace operators { template <> -bool BatchNormKernel::Init(BatchNormParam *param) { +bool BatchNormKernel::Init(BatchNormParam *param) { return true; } template <> -void BatchNormKernel::Compute(const BatchNormParam ¶m) const { +void BatchNormKernel::Compute( + const 
BatchNormParam ¶m) const { BatchnormCompute(param); } diff --git a/src/operators/kernel/arm/bilinear_interp_kernel.cpp b/src/operators/kernel/arm/bilinear_interp_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ea5ff627d7ea2e0fa5434f9f7fc9f5ec44ce60a7 --- /dev/null +++ b/src/operators/kernel/arm/bilinear_interp_kernel.cpp @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef BILINEAR_INTERP_OP + +#include "operators/kernel/bilinear_interp_kernel.h" +#include "operators/kernel/central-arm-func/bilinear_interp_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool BilinearInterpKernel::Init(BilinearInterpParam *param) { + return true; +} + +template <> +void BilinearInterpKernel::Compute( + const BilinearInterpParam ¶m) const { + BilinearInterpCompute(param); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/box_coder_kernel.cpp b/src/operators/kernel/arm/box_coder_kernel.cpp index d2a479391fbbb416eea7d19ae64125cac4637ef1..b769d4fbbaa7570ee741476f960d9e5b60c61917 100644 --- a/src/operators/kernel/arm/box_coder_kernel.cpp +++ b/src/operators/kernel/arm/box_coder_kernel.cpp @@ -21,12 +21,13 @@ namespace paddle_mobile { namespace operators { template <> -bool BoxCoderKernel::Init(BoxCoderParam *param) { +bool BoxCoderKernel::Init(BoxCoderParam *param) { return true; } template <> -void BoxCoderKernel::Compute(const BoxCoderParam ¶m) const { +void BoxCoderKernel::Compute( + const BoxCoderParam ¶m) const { BoxCoderCompute(param); } diff --git a/src/operators/kernel/arm/concat_kernel.cpp b/src/operators/kernel/arm/concat_kernel.cpp index b6810bf76946bfb8151f3001b76fcbaa5e99e5fc..04c590e6b432fbf88cd136eac942485adf9a9003 100644 --- a/src/operators/kernel/arm/concat_kernel.cpp +++ b/src/operators/kernel/arm/concat_kernel.cpp @@ -21,13 +21,14 @@ namespace paddle_mobile { namespace operators { template <> -bool ConcatKernel::Init(ConcatParam *param) { +bool ConcatKernel::Init(ConcatParam *param) { return true; } template <> -void ConcatKernel::Compute(const ConcatParam ¶m) const { +void ConcatKernel::Compute(const ConcatParam ¶m) const { ConcatCompute(param); + param.Out()->set_lod(param.Inputs()[0]->lod()); } } // namespace operators diff --git a/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp b/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..74b88f5d4f5e24b1401803c8c48d99319f412d1b --- /dev/null +++ b/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDADDPRELU_OP + +#include "operators/kernel/conv_add_add_prelu_kernel.h" +#include "operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddAddPReluKernel::Init( + FusionConvAddAddPReluParam *param) { + return true; +} + +template <> +void ConvAddAddPReluKernel::Compute( + const FusionConvAddAddPReluParam ¶m) const { + ConvAddAddPReluCompute(param); +} +template class ConvAddAddPReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp index e95bd8e76c5034f3897eff81e0ba67119d04a95b..ca53ebea8e4577fdc52fad066691d4351eaf12f9 100644 --- a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp @@ -15,18 +15,19 @@ limitations under the License. */ #ifdef FUSION_CONVADDBNRELU_OP #include "operators/kernel/conv_add_bn_relu_kernel.h" -#include "operators/kernel/central-arm-func/conv_add_bn_relu_func.h" +#include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h" namespace paddle_mobile { namespace operators { template <> -bool ConvAddBNReluKernel::Init(FusionConvAddBNReluParam *param) { - const Tensor *mean = (*param).InputMean(); - const Tensor *variance = (*param).InputVariance(); - const Tensor *scale = (*param).InputScale(); - const Tensor *bias = (*param).InputBias(); - const float epsilon = (*param).Epsilon(); +bool ConvAddBNReluKernel::Init( + FusionConvAddBNReluParam *param) { + const Tensor *mean = param->InputMean(); + const Tensor *variance = param->InputVariance(); + const Tensor *scale = param->InputScale(); + const Tensor *bias = param->InputBias(); + const float epsilon = param->Epsilon(); auto mean_ptr = mean->data(); auto variance_ptr = variance->data(); @@ -47,14 +48,14 @@ bool ConvAddBNReluKernel::Init(FusionConvAddBNReluParam *param) { new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; } - (*param).SetNewScale(new_scale); - (*param).SetNewBias(new_bias); + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); return true; } template <> void ConvAddBNReluKernel::Compute( - const FusionConvAddBNReluParam ¶m) const { + const FusionConvAddBNReluParam ¶m) const { ConvAddBNReluCompute(param); } template class ConvAddBNReluKernel; diff --git a/src/operators/kernel/arm/conv_add_kernel.cpp b/src/operators/kernel/arm/conv_add_kernel.cpp index 64d6dfa64dc3feae5b73a17ae5b148053df34a0b..1af1c3db1159cd4fed007ebf153ba15b804eee75 100644 --- a/src/operators/kernel/arm/conv_add_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_kernel.cpp @@ -14,120 +14,20 @@ limitations under the License. 
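The Init method above precomputes `new_scale` and `new_bias` so the kernel can apply batch norm as a per-channel multiply-add after the convolution. A worked sketch of why that folding is exact at inference time, where mean and variance are fixed running statistics: with `inv_std = 1/sqrt(variance + eps)`, the textbook form `y = scale*(x - mean)*inv_std + bias` rearranges to `y = new_scale*x + new_bias`.

```cpp
#include <cassert>
#include <cmath>

int main() {
  const float x = 2.0f, mean = 0.5f, variance = 4.0f, eps = 1e-5f;
  const float scale = 3.0f, bias = 1.0f;

  const float inv_std = 1.0f / std::sqrt(variance + eps);
  const float new_scale = scale * inv_std;            // folded multiplier
  const float new_bias = bias - mean * new_scale;     // folded offset

  const float bn = scale * (x - mean) * inv_std + bias;  // textbook form
  const float folded = new_scale * x + new_bias;         // folded form
  assert(std::fabs(bn - folded) < 1e-5f);  // identical up to rounding
}
```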
*/ #ifdef FUSION_CONVADD_OP #include "operators/kernel/conv_add_kernel.h" +#include "../central-arm-func/conv_add_arm_func.h" namespace paddle_mobile { namespace operators { template <> -bool ConvAddKernel::Init(FusionConvAddParam *param) { +bool ConvAddKernel::Init(FusionConvAddParam *param) { return true; } -void ConvAddBasic(const FusionConvAddParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor bias = *param.Bias(); - int axis = param.Axis(); - Tensor *output = param.Output(); - math::expand_bias(bias, axis, output->dims()); - output->ShareDataWith(bias); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(1)); - } - } -} - template <> -void ConvAddKernel::Compute(const FusionConvAddParam ¶m) const { - if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { - 
math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), - param.Bias(), true); - } else if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3) { - math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), - param.Filter(), param.Bias(), param.Output(), true); - } else { - ConvAddBasic(param); - } +void ConvAddKernel::Compute( + const FusionConvAddParam ¶m) const { + ConvAddCompute(param); } template class ConvAddKernel; diff --git a/src/operators/kernel/arm/conv_add_prelu_kernel.cpp b/src/operators/kernel/arm/conv_add_prelu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5930cfdcfc0f983c9f07754113dc37d5122d19f0 --- /dev/null +++ b/src/operators/kernel/arm/conv_add_prelu_kernel.cpp @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDPRELU_OP + +#include "operators/kernel/conv_add_prelu_kernel.h" +#include "operators/kernel/central-arm-func/conv_add_prelu_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddPReluKernel::Init(FusionConvAddPReluParam *param) { + return true; +} + +template <> +void ConvAddPReluKernel::Compute( + const FusionConvAddPReluParam ¶m) const { + ConvAddPReluCompute(param); +} +template class ConvAddPReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/conv_add_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_relu_kernel.cpp index 356dd191e761afc5d5b6bfacd250f90ae31017b2..f50e1e3900bb5fce35a29100d6c2cb6004b4af74 100644 --- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/arm/conv_add_relu_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef FUSION_CONVADD_RELU_OP +#ifdef FUSION_CONVADDRELU_OP #include "operators/kernel/conv_add_relu_kernel.h" #include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h" @@ -21,13 +21,13 @@ namespace paddle_mobile { namespace operators { template <> -bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { +bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { return true; } template <> void ConvAddReluKernel::Compute( - const FusionConvAddReluParam ¶m) const { + const FusionConvAddReluParam ¶m) const { ConvAddReluCompute(param); } template class ConvAddReluKernel; diff --git a/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp b/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..785b13dde2ec1196792d17b253bb0d904da799f5 --- /dev/null +++ b/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVBNADDRELU_OP + +#include "operators/kernel/conv_bn_add_relu_kernel.h" +#include "operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvBNAddReluKernel::Init( + FusionConvBNAddReluParam *param) { + const Tensor *mean = param->InputMean(); + const Tensor *variance = param->InputVariance(); + const Tensor *scale = param->InputScale(); + const Tensor *bias = param->InputBias(); + const float epsilon = param->Epsilon(); + + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + const int C = mean->numel(); + float inv_std_ptr[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + Tensor *new_scale = new Tensor(); + Tensor *new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({C}); + auto new_bias_ptr = new_bias->mutable_data({C}); + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + return true; +} + +template <> +void ConvBNAddReluKernel::Compute( + const FusionConvBNAddReluParam ¶m) const { + ConvBNAddReluCompute(param); +} +template class ConvBNAddReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6b9ea0428fa496980a234c7c895ef9cbf1245b51 --- /dev/null +++ b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
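One portability note on the Init bodies above: `float inv_std_ptr[C]` with a runtime `C` is a variable-length array, a GCC/Clang extension rather than standard C++. A portable variant of the same per-channel precomputation (a sketch with an invented helper name, not the patch's code):

```cpp
#include <cmath>
#include <iostream>
#include <vector>

// Heap-backed replacement for the VLA: one 1/sqrt(var + eps) per channel.
std::vector<float> InvStd(const std::vector<float> &variance, float epsilon) {
  std::vector<float> inv_std(variance.size());
  for (size_t i = 0; i < variance.size(); ++i)
    inv_std[i] = 1.0f / std::sqrt(variance[i] + epsilon);
  return inv_std;
}

int main() {
  for (float v : InvStd({4.0f, 0.25f}, 1e-5f)) std::cout << v << "\n";
}
```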
*/ + +#ifdef FUSION_CONVBNRELU_OP + +#include "operators/kernel/conv_bn_relu_kernel.h" +#include "operators/kernel/central-arm-func/conv_bn_relu_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { + const Tensor *mean = param->InputMean(); + const Tensor *variance = param->InputVariance(); + const Tensor *scale = param->InputScale(); + const Tensor *bias = param->InputBias(); + const float epsilon = param->Epsilon(); + + // DLOG << "variance: " << *variance; + + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + const int C = mean->numel(); + float inv_std_ptr[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + Tensor *new_scale = new Tensor(); + Tensor *new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({C}); + auto new_bias_ptr = new_bias->mutable_data({C}); + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + } + + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + return true; +} + +template <> +void ConvBNReluKernel::Compute( + const FusionConvBNReluParam ¶m) const { + ConvBNReluCompute(param); +} +template class ConvBNReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index ca8aeff0dd3db5fe7b625bdeb947b2927eb619ce..4e9d3a34f231485685bc8f7b087382cb99a3b036 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -21,12 +21,12 @@ namespace paddle_mobile { namespace operators { template <> -bool ConvKernel::Init(ConvParam *param) { +bool ConvKernel::Init(ConvParam *param) { return true; } template <> -void ConvKernel::Compute(const ConvParam ¶m) const { +void ConvKernel::Compute(const ConvParam ¶m) const { ConvCompute(param); } diff --git a/src/operators/kernel/arm/conv_transpose_kernel.cpp b/src/operators/kernel/arm/conv_transpose_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d695e6144b40d945857d547f7c208f1192481e8f --- /dev/null +++ b/src/operators/kernel/arm/conv_transpose_kernel.cpp @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_TRANSPOSE + +#include "operators/kernel/conv_transpose_kernel.h" +#include "operators/kernel/central-arm-func/conv_transpose_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvTransposeKernel::Init(ConvTransposeParam *param) { + return true; +} + +template <> +void ConvTransposeKernel::Compute( + const ConvTransposeParam ¶m) const { + ConvTransposeCompute(param); +} + +template class ConvTransposeKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/crf_kernel.cpp b/src/operators/kernel/arm/crf_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..89769c50a6fc05b28192ebf584ba3cb12f19ac2c --- /dev/null +++ b/src/operators/kernel/arm/crf_kernel.cpp @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CRF_OP + +#include "operators/kernel/crf_kernel.h" +#include "common/types.h" +#include "operators/kernel/central-arm-func/crf_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool CrfKernel::Init(CrfParam *param) { + return true; +} + +template <> +void CrfKernel::Compute(const CrfParam ¶m) const { + CrfCompute(param); +} + +template class CrfKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp index 6ede0e2bef2383df8aa0593a07297f2f6233acaf..fd5e068afb6f7f2a069a7d8fccc459d4c2a6828d 100644 --- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp @@ -21,12 +21,13 @@ namespace paddle_mobile { namespace operators { template <> -bool DepthwiseConvKernel::Init(ConvParam *param) { +bool DepthwiseConvKernel::Init(ConvParam *param) { return true; } template <> -void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { +void DepthwiseConvKernel::Compute( + const ConvParam ¶m) const { DepthwiseConvCompute(param); } diff --git a/src/operators/kernel/arm/dropout_kernel.cpp b/src/operators/kernel/arm/dropout_kernel.cpp index af16048a1b4eba2ff36f842b6cf968031989576e..4578ac6607d87c316853f6201f02f8204bc41de1 100644 --- a/src/operators/kernel/arm/dropout_kernel.cpp +++ b/src/operators/kernel/arm/dropout_kernel.cpp @@ -14,8 +14,6 @@ limitations under the License. 
*/ #ifdef DROPOUT_OP -#pragma once - #include "operators/kernel/dropout_kernel.h" #include <operators/math/transform.h> @@ -23,23 +21,27 @@ namespace paddle_mobile { namespace operators { template <> -bool DropoutKernel<CPU, float>::Init(DropoutParam *para) { +bool DropoutKernel<CPU, float>::Init(DropoutParam<CPU> *para) { return true; } template <typename T> struct DropoutFunctor { - inline T operator()(T in) const { return in; } + DropoutFunctor(T drop_pro) : dropout_pro_(drop_pro) {} + inline T operator()(T in) const { return (1 - dropout_pro_) * in; } + + private: + T dropout_pro_; }; template <> -void DropoutKernel<CPU, float>::Compute(const DropoutParam &param) const { +void DropoutKernel<CPU, float>::Compute(const DropoutParam<CPU> &param) const { const auto *input_x = param.InputX(); auto *input_x_ptr = input_x->data<float>(); auto *out = param.Out(); auto *out_ptr = out->mutable_data<float>(); - - DropoutFunctor<float> func_; + const float dropoutProb = param.DropoutProb(); + DropoutFunctor<float> func_(dropoutProb); math::Transform trans; trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_); } diff --git a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b85701bb936b2ccc0323e4d534424abb726a69be --- /dev/null +++ b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
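The dropout fix above replaces an identity functor with one that scales by the keep probability `(1 - dropout_prob)`, so inference outputs match the training-time expectation. A minimal sketch of that behavior on plain arrays:

```cpp
#include <cassert>
#include <vector>

int main() {
  const float dropout_prob = 0.25f;
  std::vector<float> in = {4.0f, 8.0f}, out(in.size());
  // Deterministic inference-time dropout: scale by the keep probability,
  // mirroring what the corrected DropoutFunctor computes per element.
  for (size_t i = 0; i < in.size(); ++i)
    out[i] = (1.0f - dropout_prob) * in[i];
  assert(out[0] == 3.0f && out[1] == 6.0f);
}
```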
*/ + +#ifdef FUSION_DWCONVBNRELU_OP + +#include "operators/kernel/dwconv_bn_relu_kernel.h" +#include "operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DWConvBNReluKernel::Init(FusionDWConvBNReluParam *param) { + const Tensor *mean = param->InputMean(); + const Tensor *variance = param->InputVariance(); + const Tensor *scale = param->InputScale(); + const Tensor *bias = param->InputBias(); + const float epsilon = param->Epsilon(); + + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + const int C = mean->numel(); + float inv_std_ptr[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + Tensor *new_scale = new Tensor(); + Tensor *new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({C}); + auto new_bias_ptr = new_bias->mutable_data({C}); + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + return true; +} + +template <> +void DWConvBNReluKernel::Compute( + const FusionDWConvBNReluParam ¶m) const { + DWConvBNReluCompute(param); +} +template class DWConvBNReluKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/elementwise_add_kernel.cpp b/src/operators/kernel/arm/elementwise_add_kernel.cpp index fdab1c60a310480d8e59f3f84802001ea592433a..9c6f4a3316385b803a8fdb833490f1fe9e7f41ac 100644 --- a/src/operators/kernel/arm/elementwise_add_kernel.cpp +++ b/src/operators/kernel/arm/elementwise_add_kernel.cpp @@ -21,14 +21,15 @@ namespace paddle_mobile { namespace operators { template <> -bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { +bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { return true; } template <> void ElementwiseAddKernel::Compute( - const ElementwiseAddParam ¶m) const { + const ElementwiseAddParam ¶m) const { ElementwiseAddCompute(param); + param.Out()->set_lod(param.InputX()->lod()); } } // namespace operators diff --git a/src/operators/kernel/arm/flatten_kernel.cpp b/src/operators/kernel/arm/flatten_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6866b740aa945852050e7fca4991489f48435150 --- /dev/null +++ b/src/operators/kernel/arm/flatten_kernel.cpp @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#ifdef FLATTEN_OP + +#include "operators/kernel/flatten_kernel.h" +#include "operators/kernel/central-arm-func/flatten_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FlattenKernel::Init(FlattenParam *param) { + return true; +} + +template <> +void FlattenKernel::Compute(const FlattenParam ¶m) const { + FlattenCompute(param); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/fusion_fc_kernel.cpp b/src/operators/kernel/arm/fusion_fc_kernel.cpp index c72960e67f19c601e6f27a3bedf7123c80875e0c..d9d112e7a762705efe041c74eea9ddb7d5162918 100644 --- a/src/operators/kernel/arm/fusion_fc_kernel.cpp +++ b/src/operators/kernel/arm/fusion_fc_kernel.cpp @@ -21,13 +21,15 @@ namespace paddle_mobile { namespace operators { template <> -bool FusionFcKernel::Init(FusionFcParam *param) { +bool FusionFcKernel::Init(FusionFcParam *param) { return true; } template <> -void FusionFcKernel::Compute(const FusionFcParam ¶m) const { +void FusionFcKernel::Compute( + const FusionFcParam ¶m) const { FusionFcCompute(param); + param.Out()->set_lod(param.InputX()->lod()); } } // namespace operators diff --git a/src/operators/kernel/arm/gru_kernel.cpp b/src/operators/kernel/arm/gru_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..168471185e07a9c1814c708238996a82c1ee0891 --- /dev/null +++ b/src/operators/kernel/arm/gru_kernel.cpp @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
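Several kernels in this patch (concat, elementwise add, fusion_fc, mul, lookup, gru) now copy the input's LoD onto the output, e.g. `param.Out()->set_lod(param.InputX()->lod())`. LoD records sequence boundaries within a batch; ops that preserve row alignment must forward it or downstream sequence ops lose the segmentation. A toy sketch of the idea (`ToyTensor` is an invented stand-in, not the framework's LoDTensor):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

using Lod = std::vector<std::vector<size_t>>;  // one vector per LoD level

struct ToyTensor {
  Lod lod;
  std::vector<float> data;
};

int main() {
  ToyTensor in, out;
  in.lod = {{0, 2, 5}};  // two sequences: rows [0,2) and [2,5)
  out.data.resize(5);
  out.lod = in.lod;      // mirrors param.Out()->set_lod(param.InputX()->lod())
  assert(out.lod[0].back() == 5);  // boundaries survive the op
}
```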
*/ + +#ifdef GRU_OP + +#include "operators/kernel/gru_kernel.h" +#include "operators/kernel/central-arm-func/gru_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool GruKernel::Init(GruParam *param) { + return true; +} + +template <> +void GruKernel::Compute(const GruParam ¶m) const { + GruCompute(param); + param.OutHidden()->set_lod(param.InputInput()->lod()); + // DLOG << "________________" << param.OutHidden()->dims(); + // DLOG << "________________" << param.OutHidden()->numel(); + // auto *hiden_data = param.OutHidden()->data(); + // for (int64_t i = 0; i < 10; i++) { + // DLOG << "****************" << hiden_data[i]; + // } +} + +template class GruKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/im2sequence_kernel.cpp b/src/operators/kernel/arm/im2sequence_kernel.cpp index 709fa30a23d4efba3531d9bc567c99f53875bc12..8295fd94a31db2ad1c10d32a8c639b067e422f45 100644 --- a/src/operators/kernel/arm/im2sequence_kernel.cpp +++ b/src/operators/kernel/arm/im2sequence_kernel.cpp @@ -20,7 +20,7 @@ namespace paddle_mobile { namespace operators { template <> -bool Im2SequenceKernel::Init(Im2SequenceParam *para) { +bool Im2SequenceKernel::Init(Im2SequenceParam *para) { return true; } @@ -33,7 +33,7 @@ inline int Im2SeqOutputSize(int input_size, int filter_size, int padding_0, template <> void Im2SequenceKernel::Compute( - const Im2SequenceParam ¶m) const { + const Im2SequenceParam ¶m) const { const Tensor *in_x = param.Input(); Tensor *out = param.Output(); out->mutable_data(); diff --git a/src/operators/kernel/arm/lookup_kernel.cpp b/src/operators/kernel/arm/lookup_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..584c497c701bd0598e0a151774fe60b7c7fee718 --- /dev/null +++ b/src/operators/kernel/arm/lookup_kernel.cpp @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
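gru_kernel.cpp shows the shape every new ARM kernel in this patch takes: a full specialization of the kernel class for the CPU device, an Init() for one-time preparation (often just `return true`), and a Compute() that forwards to a shared implementation under central-arm-func/. Spelled out with the device and data-type arguments made explicit (a sketch reconstructed from the surrounding declarations, not a verbatim quote):

    template <>
    bool GruKernel<CPU, float>::Init(GruParam<CPU> *param) {
      return true;  // nothing to precompute for GRU
    }

    template <>
    void GruKernel<CPU, float>::Compute(const GruParam<CPU> &param) const {
      GruCompute<float>(param);                               // shared ARM implementation
      param.OutHidden()->set_lod(param.InputInput()->lod());  // keep sequence offsets
    }

    template class GruKernel<CPU, float>;  // explicit instantiation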
*/ +#ifdef LOOKUP_OP + +#include "operators/kernel/lookup_kernel.h" +#include "operators/kernel/central-arm-func/lookup_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool LookupKernel::Init(LookupParam *param) { + return true; +} + +template <> +void LookupKernel::Compute(const LookupParam ¶m) const { + LookupCompute(param); + param.Out()->set_lod(param.InputIds()->lod()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/lrn_kernel.cpp b/src/operators/kernel/arm/lrn_kernel.cpp index 0c20c5167adee5165067cc5ab4935df255751755..3ec1bdd9a0e2ebbce555eef944fe56750505430f 100644 --- a/src/operators/kernel/arm/lrn_kernel.cpp +++ b/src/operators/kernel/arm/lrn_kernel.cpp @@ -21,12 +21,12 @@ namespace paddle_mobile { namespace operators { template <> -bool LrnKernel::Init(LrnParam *param) { +bool LrnKernel::Init(LrnParam *param) { return true; } template <> -void LrnKernel::Compute(const LrnParam ¶m) const { +void LrnKernel::Compute(const LrnParam ¶m) const { LrnCompute(param); } diff --git a/src/operators/kernel/arm/mul_kernel.cpp b/src/operators/kernel/arm/mul_kernel.cpp index ac5010ce5492ae1d99e59bfa761e22bb3aa5d1c9..aa3ee7077eb7db440c8493eae5b95f03a42196a4 100644 --- a/src/operators/kernel/arm/mul_kernel.cpp +++ b/src/operators/kernel/arm/mul_kernel.cpp @@ -21,13 +21,14 @@ namespace paddle_mobile { namespace operators { template <> -bool MulKernel::Init(MulParam *param) { +bool MulKernel::Init(MulParam *param) { return true; } template <> -void MulKernel::Compute(const MulParam ¶m) const { +void MulKernel::Compute(const MulParam ¶m) const { MulCompute(param); + param.Out()->set_lod(param.InputX()->lod()); } } // namespace operators diff --git a/src/operators/kernel/arm/multiclass_nms_kernel.cpp b/src/operators/kernel/arm/multiclass_nms_kernel.cpp index 9ed8f1731afe2bab723c66ea1e2e8c5042f6ce28..938f81cf485eb64f408c0fb274eeec673349e306 100644 --- a/src/operators/kernel/arm/multiclass_nms_kernel.cpp +++ b/src/operators/kernel/arm/multiclass_nms_kernel.cpp @@ -21,13 +21,13 @@ namespace paddle_mobile { namespace operators { template <> -bool MultiClassNMSKernel::Init(MultiClassNMSParam *param) { +bool MultiClassNMSKernel::Init(MultiClassNMSParam *param) { return true; } template <> void MultiClassNMSKernel::Compute( - const MultiClassNMSParam ¶m) const { + const MultiClassNMSParam ¶m) const { MultiClassNMSCompute(param); } diff --git a/src/operators/kernel/arm/pool_kernel.cpp b/src/operators/kernel/arm/pool_kernel.cpp index 5c92d5be014faf4007c0853bde08e450ebc4f79a..60d6f1401876b957649d08889218b88cf1fe5eef 100644 --- a/src/operators/kernel/arm/pool_kernel.cpp +++ b/src/operators/kernel/arm/pool_kernel.cpp @@ -14,70 +14,19 @@ limitations under the License. 
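lookup_kernel.cpp is the embedding lookup: LookupCompute gathers rows of a weight table by integer ids, and the ids' LoD is forwarded because exactly one output row is produced per id. In effect (a reference sketch, not the central-arm-func code; out-of-range handling omitted):

    #include <algorithm>
    #include <cstdint>

    // W: [V, D] row-major table, ids: [N], out: [N, D]
    void LookupSketch(const float *W, const int64_t *ids, int64_t N, int64_t D,
                      float *out) {
      for (int64_t i = 0; i < N; ++i) {
        const float *row = W + ids[i] * D;  // assumes 0 <= ids[i] < V
        std::copy(row, row + D, out + i * D);
      }
    }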
*/ #ifdef POOL_OP -#include -#include "common/log.h" - +#include "operators/kernel/pool_kernel.h" +#include "../central-arm-func/pool_arm_func.h" namespace paddle_mobile { namespace operators { -inline void PoolBasic(std::string pooling_type, std::vector ksize, - std::vector strides, std::vector paddings, - const Tensor *in_x, Tensor *out) { - if (pooling_type == "max") { - math::PoolFunctor, float> pool2d_forward; - math::MaxPool pool_process; - pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out); - - } else if (pooling_type == "avg") { - math::PoolFunctor, float> pool2d_forward; - math::AvgPool pool_process; - pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out); - } -} - template <> -bool PoolKernel::Init(PoolParam *param) { +bool PoolKernel::Init(PoolParam *param) { return true; } template <> -void PoolKernel::Compute(const PoolParam ¶m) const { - const Tensor *in_x = param.Input(); - Tensor *out = param.Output(); - std::string pooling_type = param.PoolingType(); - - std::vector ksize = param.Ksize(); - - std::vector strides = param.Strides(); - - std::vector paddings = param.Paddings(); - if (ksize.size() != 2) { - LOG(paddle_mobile::LogLevel::kLOG_ERROR) - << "Pool op only supports 2D and 3D input."; - } - - if (param.isGlobalPooling()) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x->dims()[i + 2]); - } - } else if (ksize[0] == 3 && ksize[0] == ksize[1]) { - if (pooling_type == "max") { - math::Pool3x3Max(strides, paddings, in_x, out); - } else if (pooling_type == "avg") { - math::Pool3x3Avg(strides, paddings, in_x, out); - } - - } else if (ksize[0] == 2 && ksize[0] == ksize[1]) { - if (pooling_type == "max") { - math::Pool2x2Max(strides, paddings, in_x, out); - } else if (pooling_type == "avg") { - math::Pool2x2Avg(strides, paddings, in_x, out); - } - - } else { - PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); - } +void PoolKernel::Compute(const PoolParam ¶m) const { + PoolCompute(param); } } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/prelu_kernel.cpp b/src/operators/kernel/arm/prelu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e1ec927fb13d1f4a2e600d46f65f2806448059d9 --- /dev/null +++ b/src/operators/kernel/arm/prelu_kernel.cpp @@ -0,0 +1,122 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PRELU_OP + +#include "operators/kernel/prelu_kernel.h" +#include +#if __ARM_NEON +#include +#endif + +namespace paddle_mobile { +namespace operators { + +template +struct PReluFunctor { + explicit PReluFunctor(float slope) { this->slope_ = slope; } + inline T operator()(T in) const { return in > 0 ? 
in : in * slope_; }
+
+  float slope_ = 0.0f;
+};
+
+/*
+ * @b Platform-specific implementation; `param` is passed in from the op layer.
+ * */
+template <>
+void PReluKernel<CPU, float>::Compute(const PReluParam<CPU> &param) const {
+  auto *x = param.InputX();
+  auto *alpha = param.InputAlpha();
+  auto *out = param.Out();
+  std::string mode = param.Mode();
+  auto *x_ptr = x->data<float>();
+  auto *o_ptr = out->mutable_data<float>();
+  auto *alpha_ptr = alpha->data<float>();
+  int numel = x->numel();
+  auto dim = x->dims();
+  int k = dim[0] * dim[1];
+  int n = dim[2] * dim[3];
+  int index = 0;
+  int i = 0;
+  int temp = 0;
+#if __ARM_NEON
+#pragma omp parallel for
+  for (int i = 0; i < k; i++) {
+    float32x4_t zero = vdupq_n_f32(0.0);
+    float32x4_t cv;
+    float32x4_t cv1;
+    float32x4_t cv2;
+    float32x4_t pv;
+    for (int j = 0; (j + 3) < n; j += 4) {
+      const float *in = x_ptr + i * n + j;
+      float *out = o_ptr + i * n + j;
+      cv = vld1q_f32(in);
+      cv1 = vmaxq_f32(cv, zero);  // positive part: max(x, 0)
+      cv2 = vminq_f32(cv, zero);  // negative part: min(x, 0)
+      if (mode == "channel") {
+        cv2 = vmulq_n_f32(cv2, alpha_ptr[i]);
+      } else if (mode == "element") {
+        pv = vld1q_f32(alpha_ptr + i * n + j);
+        cv2 = vmulq_f32(cv2, pv);
+      } else {
+        cv2 = vmulq_n_f32(cv2, alpha_ptr[0]);
+      }
+      cv = vaddq_f32(cv1, cv2);
+      vst1q_f32(out, cv);
+    }
+    // scalar tail: j is the first index the 4-wide loop above did not cover
+    int j = n & ~3;
+
+    for (int m = j; m < n; m++) {
+      if (mode == "channel") {
+        o_ptr[i * n + m] = x_ptr[i * n + m] > 0
+                               ? x_ptr[i * n + m]
+                               : alpha_ptr[i] * x_ptr[i * n + m];
+      } else if (mode == "element") {
+        o_ptr[i * n + m] = x_ptr[i * n + m] > 0
+                               ? x_ptr[i * n + m]
+                               : alpha_ptr[i * n + m] * x_ptr[i * n + m];
+      } else {
+        o_ptr[i * n + m] = x_ptr[i * n + m] > 0
+                               ? x_ptr[i * n + m]
+                               : alpha_ptr[0] * x_ptr[i * n + m];
+      }
+    }
+  }
+
+#else
+  if (mode == "channel") {
+    temp = numel / (dim[0] * dim[1]);
+#pragma omp parallel for
+    for (i = 0; i < numel; i++) {
+      const int index = (i / temp) % dim[1];
+      o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
+    }
+  } else if (mode == "element") {
+#pragma omp parallel for
+    for (i = 0; i < numel; i++) {
+      o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[i] * x_ptr[i];
+    }
+  } else {
+#pragma omp parallel for
+    for (i = 0; i < numel; i++) {
+      o_ptr[i] = x_ptr[i] > 0 ?
x_ptr[i] : alpha_ptr[0] * x_ptr[i]; + } + } +#endif +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/prior_box_kernel.cpp b/src/operators/kernel/arm/prior_box_kernel.cpp index 217d4b83cb1156a0e942c5ced5917546250e8bb1..71011fa2112f36d573b5bdc55f1b5bf92318c448 100644 --- a/src/operators/kernel/arm/prior_box_kernel.cpp +++ b/src/operators/kernel/arm/prior_box_kernel.cpp @@ -21,12 +21,13 @@ namespace paddle_mobile { namespace operators { template <> -bool PriorBoxKernel::Init(PriorBoxParam *param) { +bool PriorBoxKernel::Init(PriorBoxParam *param) { return true; } template <> -void PriorBoxKernel::Compute(const PriorBoxParam ¶m) const { +void PriorBoxKernel::Compute( + const PriorBoxParam ¶m) const { PriorBoxCompute(param); } diff --git a/src/operators/kernel/arm/relu_kernel.cpp b/src/operators/kernel/arm/relu_kernel.cpp index 63259a0c303f5e186f9eb90b98f2a8685f8ba5ca..6e04e6013aa8dd5c50dcc22a720b470b08ecd648 100644 --- a/src/operators/kernel/arm/relu_kernel.cpp +++ b/src/operators/kernel/arm/relu_kernel.cpp @@ -21,12 +21,12 @@ namespace paddle_mobile { namespace operators { template <> -bool ReluKernel::Init(ReluParam *param) { +bool ReluKernel::Init(ReluParam *param) { return true; } template <> -void ReluKernel::Compute(const ReluParam ¶m) const { +void ReluKernel::Compute(const ReluParam ¶m) const { ReluCompute(param); } diff --git a/src/operators/kernel/arm/reshape_kernel.cpp b/src/operators/kernel/arm/reshape_kernel.cpp index 5ae8e5e3f945d115215652ded58dc8571868fcd7..235288ae13e2c557e6f7310727f5d8e6e83cedf6 100644 --- a/src/operators/kernel/arm/reshape_kernel.cpp +++ b/src/operators/kernel/arm/reshape_kernel.cpp @@ -21,12 +21,12 @@ namespace paddle_mobile { namespace operators { template <> -bool ReshapeKernel::Init(ReshapeParam *param) { +bool ReshapeKernel::Init(ReshapeParam *param) { return true; } template <> -void ReshapeKernel::Compute(const ReshapeParam ¶m) const { +void ReshapeKernel::Compute(const ReshapeParam ¶m) const { ReshapeCompute(param); } diff --git a/src/operators/kernel/arm/resize_kernel.cpp b/src/operators/kernel/arm/resize_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5c0c186554a31454447b1df47a1b7573fd948fb9 --- /dev/null +++ b/src/operators/kernel/arm/resize_kernel.cpp @@ -0,0 +1,124 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
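The NEON loop in prelu_kernel.cpp leans on one identity: splitting x into its positive and negative parts lets the slope multiply only the negative lanes, with no per-lane branching. A scalar reference of what each lane computes (mode picks where alpha comes from: one shared value, one per channel, or one per element):

    // prelu(x) = max(x, 0) + alpha * min(x, 0)
    float PRelu(float x, float alpha) {
      float pos = x > 0.f ? x : 0.f;  // vmaxq_f32(cv, zero) in the NEON path
      float neg = x < 0.f ? x : 0.f;  // vminq_f32(cv, zero)
      return pos + alpha * neg;       // vaddq_f32 after scaling neg
    }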
*/ + +#ifdef RESIZE_OP + +#include "operators/kernel/resize_kernel.h" +#include + +namespace paddle_mobile { +namespace operators { +void BiLinearResizeTensor(const float* src, const int src_height, + const int src_width, float* dst, const int dst_height, + const int dst_width) { + const float scale_w = src_width / (float)dst_width; + const float scale_h = src_height / (float)dst_height; + float* dst_data = dst; + const float* src_data = src; + + for (int dst_h = 0; dst_h < dst_height; ++dst_h) { + float fh = dst_h * scale_h; + + int src_h = std::floor(fh); + + fh -= src_h; + const float w_h0 = std::abs((float)1.0 - fh); + const float w_h1 = std::abs(fh); + + const int dst_offset_1 = dst_h * dst_width; + const int src_offset_1 = src_h * src_width; + + float* dst_data_ptr = dst_data + dst_offset_1; + + for (int dst_w = 0; dst_w < dst_width; ++dst_w) { + float fw = dst_w * scale_w; + int src_w = std::floor(fw); + fw -= src_w; + const float w_w0 = std::abs((float)1.0 - fw); + const float w_w1 = std::abs(fw); + + float dst_value = 0; + + const int src_idx = src_offset_1 + src_w; + dst_value += (w_h0 * w_w0 * src_data[src_idx]); + int flag = 0; + if (src_w + 1 < src_width) { + dst_value += (w_h0 * w_w1 * src_data[src_idx + 1]); + ++flag; + } + if (src_h + 1 < src_height) { + dst_value += (w_h1 * w_w0 * src_data[src_idx + src_width]); + ++flag; + } + + if (flag > 1) { + dst_value += (w_h1 * w_w1 * src_data[src_idx + src_width + 1]); + // ++flag; + } + *(dst_data_ptr++) = dst_value; + } + } +} + +void ResizeTensor(const Tensor* src, const int src_n, const int src_c, + Tensor* dst, const int dst_n, const int dst_c) { + framework::DDim in_dims = src->dims(); + const int src_chans = in_dims[1]; + const int src_height = in_dims[2]; + const int src_width = in_dims[3]; + const int src_offset = (src_n * src_chans + src_c) * src_height * src_width; + + framework::DDim out_dims = dst->dims(); + const int dst_chans = out_dims[1]; + const int dst_height = out_dims[2]; + const int dst_width = out_dims[3]; + const int dst_offset = (dst_n * dst_chans + dst_c) * dst_height * dst_width; + + const auto* src_ptr = src->data(); + auto* dst_ptr = dst->data(); + const auto* src_data = &(src_ptr[src_offset]); + auto* dst_data = &(dst_ptr[dst_offset]); + BiLinearResizeTensor(src_data, src_height, src_width, dst_data, dst_height, + dst_width); +} + +void ResizeTensor(const Tensor* src, Tensor* dst) { + framework::DDim in_dims = src->dims(); + framework::DDim out_dims = dst->dims(); + PADDLE_MOBILE_ENFORCE(in_dims[0] == out_dims[0], + "src tensor batch num not equal to dst tensor"); + PADDLE_MOBILE_ENFORCE(in_dims[1] == out_dims[1], + "src tensor channel num not equal to dst tensor"); + for (int n = 0, batch_num = in_dims[0]; n < batch_num; ++n) { + for (int c = 0, chan_num = in_dims[1]; c < chan_num; ++c) { + ResizeTensor(src, n, c, dst, n, c); + } + } +} + +template <> +void ResizeKernel::Compute(const ResizeParam& param) const { + const auto* input_x = param.InputX(); + const auto& input_x_dims = input_x->dims(); + auto* out = param.Out(); + framework::DDim out_dims = CalOutputShape(param); + + out->Resize(out_dims); + ResizeTensor(input_x, out); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/scale_kernel.cpp b/src/operators/kernel/arm/scale_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..299132ea00f40838249022c45d994e7d88547eaa --- /dev/null +++ b/src/operators/kernel/arm/scale_kernel.cpp @@ -0,0 +1,146 @@ +/* Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SCALE_OP
+
+#include "operators/kernel/scale_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+/*
+ * @b Platform-specific implementation; `param` is passed in from the op layer.
+ * */
+template <>
+void ScaleKernel<CPU, float>::Compute(const ScaleParam<CPU> &param) const {
+  const auto *input_x = param.InputX();
+  auto *input_x_ptr = input_x->data<float>();
+  auto *out = param.Out();
+  auto *out_ptr = out->mutable_data<float>();
+
+  const vector<float> scales = param.Scales();
+  bool has_bias = param.HasBias();
+
+  const int dim_size = input_x->dims().size();
+  switch (dim_size) {
+    case 1: {
+      const int input_width = input_x->dims()[0];
+      if (has_bias) {
+        const vector<float> biases = param.Biases();
+#pragma omp parallel for
+        for (int w = 0; w < input_width; w++) {
+          out_ptr[w] = input_x_ptr[w] * scales[w] + biases[w];
+        }
+      } else {
+#pragma omp parallel for
+        for (int w = 0; w < input_width; w++) {
+          out_ptr[w] = input_x_ptr[w] * scales[w];
+        }
+      }
+    } break;
+    case 2: {
+      const int input_height = input_x->dims()[0];
+      const int input_width = input_x->dims()[1];
+
+      if (has_bias) {
+        const vector<float> biases = param.Biases();
+#pragma omp parallel for
+        for (int h = 0; h < input_height; ++h) {
+          const float *iptr = input_x_ptr + h * input_width;
+          float *optr = out_ptr + h * input_width;
+          for (int w = 0; w < input_width; ++w) {
+            optr[w] = iptr[w] * scales[w] + biases[w];
+          }
+        }
+      } else {
+#pragma omp parallel for
+        for (int h = 0; h < input_height; ++h) {
+          const float *iptr = input_x_ptr + h * input_width;
+          float *optr = out_ptr + h * input_width;
+          for (int w = 0; w < input_width; ++w) {
+            optr[w] = iptr[w] * scales[w];
+          }
+        }
+      }
+    } break;
+    case 3: {
+      const int chan_size = input_x->dims()[0];
+      const int input_height = input_x->dims()[1];
+      const int input_width = input_x->dims()[2];
+      int size = input_width * input_height;
+
+      if (has_bias) {
+        const vector<float> biases = param.Biases();
+
+#pragma omp parallel for
+        for (int c = 0; c < chan_size; ++c) {
+          const float *iptr = input_x_ptr + c * size;
+          float *optr = out_ptr + c * size;
+          for (int i = 0; i < size; ++i) {
+            optr[i] = iptr[i] * scales[c] + biases[c];
+          }
+        }
+      } else {
+#pragma omp parallel for
+        for (int c = 0; c < chan_size; ++c) {
+          const float *iptr = input_x_ptr + c * size;
+          float *optr = out_ptr + c * size;
+          for (int i = 0; i < size; ++i) {
+            optr[i] = iptr[i] * scales[c];
+          }
+        }
+      }
+    } break;
+
+    case 4: {
+      const int batch_size = input_x->dims()[0];
+      const int chan_size = input_x->dims()[1];
+      const int input_height = input_x->dims()[2];
+      const int input_width = input_x->dims()[3];
+      int size = input_width * input_height;
+
+      if (has_bias) {
+        const vector<float> biases = param.Biases();
+
+#pragma omp parallel for
+        for (int b = 0; b < batch_size; ++b) {
+          for (int c = 0; c < chan_size; ++c) {
+            const float *iptr = input_x_ptr + (b * chan_size + c) * size;
+            float *optr = out_ptr + (b * chan_size + c) * size;
+            for (int i = 0; i < size; ++i) {
+              optr[i] = iptr[i] * scales[c] + biases[c];
+            }
+          }
+        }
+      } else {
+#pragma omp parallel for
+        for (int b = 0; b < batch_size; ++b) {
+          for (int c = 0; c < chan_size; ++c) {
+            const float *iptr = input_x_ptr + (b * chan_size + c) * size;
+            float *optr = out_ptr + (b * chan_size + c) * size;
+            for (int i = 0; i < size; ++i) {
+              optr[i] = iptr[i] * scales[c];
+            }
+          }
+        }
+      }
+    } break;
+    default:
+      break;
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/arm/shape_kernel.cpp b/src/operators/kernel/arm/shape_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..69fd4021fe3110a7cea02a67443939048c1dddab
--- /dev/null
+++ b/src/operators/kernel/arm/shape_kernel.cpp
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef SHAPE_OP
+
+#include "operators/kernel/shape_kernel.h"
+#include "operators/kernel/central-arm-func/shape_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ShapeKernel<CPU, float>::Init(ShapeParam<CPU> *param) {
+  return true;
+}
+
+template <>
+void ShapeKernel<CPU, float>::Compute(const ShapeParam<CPU> &param) const {
+  ShapeCompute<float>(param);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/arm/sigmoid_kernel.cpp b/src/operators/kernel/arm/sigmoid_kernel.cpp
index 5eb65cd6cebf453e46dc16c4982f81cb679bbc72..7912fd8762b693cd40c632d6b152406ed4b0c568 100644
--- a/src/operators/kernel/arm/sigmoid_kernel.cpp
+++ b/src/operators/kernel/arm/sigmoid_kernel.cpp
@@ -15,7 +15,8 @@ limitations under the License.
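For reference, this is the broadcast the scale kernel's 4-D case performs over an NCHW tensor, with one scale (and optional bias) per channel. In index form (a sketch assuming contiguous row-major layout; note the channel-plane offset is (n * C + c) * H * W):

    // out[n][c][h][w] = in[n][c][h][w] * scale[c] + bias[c]
    const int hw = H * W;
    for (int n = 0; n < N; ++n) {
      for (int c = 0; c < C; ++c) {
        const float *ip = in + (n * C + c) * hw;  // start of channel plane
        float *op = out + (n * C + c) * hw;
        for (int i = 0; i < hw; ++i) {
          op[i] = ip[i] * scale[c] + bias[c];
        }
      }
    }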
*/ #ifdef SIGMOID_OP #include "../sigmoid_kernel.h" -#if __ARM_NEON +#include "../central-arm-func/sigmoid_arm_func.h" +#ifdef __ARM_NEON #include "../../math/math_func_neon.h" #endif #include @@ -25,64 +26,14 @@ namespace operators { using framework::DDim; using framework::Tensor; -void sigmoid(const Tensor *X, Tensor *Y) { -#if __ARM_NEON - const float *input = X->data(); - float *output = Y->mutable_data(); - const DDim &dDim = X->dims(); - int axis_index = 1; - if (dDim.size() < 4) { - axis_index = 0; - } - DDim outer_ddim = - paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1); - DDim inner_ddim = - paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size()); - int out_size = paddle_mobile::framework::product(outer_ddim); - int inner_size = paddle_mobile::framework::product(inner_ddim); - - DLOG << "outsize=" << out_size; - DLOG << "innersize=" << inner_size; - #pragma omp parallel for - for (int i = 0; i < out_size; ++i) { - const float *input_outer_ptr = input + i * inner_size; - float *output_outer_ptr = output + i * inner_size; - int nn = inner_size >> 2; - int remain = inner_size - (nn << 2); - float32x4_t _one = vdupq_n_f32(1.f); - for (; nn > 0; nn--) { - float32x4_t data = vld1q_f32(input_outer_ptr); - data = vnegq_f32(data); - data = exp_ps(data); - data = vaddq_f32(data, _one); - float32x4_t out_data = vrecpeq_f32(data); - out_data = vmulq_f32(vrecpsq_f32(data, out_data), out_data); - vst1q_f32(output_outer_ptr, out_data); - - input_outer_ptr += 4; - output_outer_ptr += 4; - } - for (; remain > 0; remain--) { - *output_outer_ptr = 1.f / (1.f + exp(-*input_outer_ptr)); - output_outer_ptr++; - input_outer_ptr++; - } - } -#endif -} - template <> -bool SigmoidKernel::Init(SigmoidParam *param) { +bool SigmoidKernel::Init(SigmoidParam *param) { return true; } template <> -void SigmoidKernel::Compute(const SigmoidParam ¶m) const { - const Tensor *in_x = param.InputX(); - Tensor *out = param.Out(); - auto x_dims = in_x->dims(); - out->Resize(x_dims); - sigmoid(in_x, out); +void SigmoidKernel::Compute(const SigmoidParam ¶m) const { + SigmoidCompute(param); } template class SigmoidKernel; diff --git a/src/operators/kernel/arm/slice_kernel.cpp b/src/operators/kernel/arm/slice_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..62efec9d2fb01568a108df8f3516085d81865bf7 --- /dev/null +++ b/src/operators/kernel/arm/slice_kernel.cpp @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SLICE_OP + +#include "operators/kernel/slice_kernel.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/arm/softmax_kernel.cpp b/src/operators/kernel/arm/softmax_kernel.cpp index 29006d48dc00b650a725cd0a9cc3c37568e829a9..f86a10601aa3a67300736f2f4c751c05bf41a781 100644 --- a/src/operators/kernel/arm/softmax_kernel.cpp +++ b/src/operators/kernel/arm/softmax_kernel.cpp @@ -15,22 +15,19 @@ limitations under the License. 
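The block deleted from sigmoid_kernel.cpp (it moves to central-arm-func/sigmoid_arm_func.h) is worth a note, since NEON has no vector divide: it computes 1/(1 + e^-x) with exp_ps plus a reciprocal estimate refined by one Newton-Raphson step. The core of that trick (requires arm_neon.h; a sketch of the relocated code, not a new API):

    float32x4_t Reciprocal(float32x4_t d) {
      float32x4_t r = vrecpeq_f32(d);       // ~8-bit estimate of 1/d
      r = vmulq_f32(vrecpsq_f32(d, r), r);  // r *= (2 - d*r): one Newton-Raphson step
      return r;
    }
    // sigmoid(x) = Reciprocal(vaddq_f32(exp_ps(vnegq_f32(x)), vdupq_n_f32(1.f)))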
*/ #ifdef SOFTMAX_OP #include "../softmax_kernel.h" -#include "../../math/softmax.h" +#include "../central-arm-func/softmax_arm_func.h" +#include "operators/math/softmax.h" namespace paddle_mobile { namespace operators { template <> -bool SoftmaxKernel::Init(SoftmaxParam *param) { +bool SoftmaxKernel::Init(SoftmaxParam *param) { return true; } template <> -void SoftmaxKernel::Compute(const SoftmaxParam ¶m) const { - const Tensor *in_x = param.InputX(); - Tensor *out = param.Out(); - auto x_dims = in_x->dims(); - out->Resize(x_dims); - math::SoftmaxFuntor()(in_x, out); +void SoftmaxKernel::Compute(const SoftmaxParam ¶m) const { + SoftmaxCompute(param); } template class SoftmaxKernel; diff --git a/src/operators/kernel/arm/split_kernel.cpp b/src/operators/kernel/arm/split_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..292b5bda99a524615df4a8552e5617fd4470d8a0 --- /dev/null +++ b/src/operators/kernel/arm/split_kernel.cpp @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef SPLIT_OP + +#include "operators/kernel/split_kernel.h" +#include "operators/kernel/central-arm-func/split_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool SplitKernel::Init(SplitParam *param) { + return true; +} + +template <> +void SplitKernel::Compute(const SplitParam ¶m) const { + SplitCompute(param); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/transpose_kernel.cpp b/src/operators/kernel/arm/transpose_kernel.cpp index c358edd76e93cee3f8be6086a70c34671c87d383..bb7a881bdc1d2706a25a77833ca38695ede2fec7 100644 --- a/src/operators/kernel/arm/transpose_kernel.cpp +++ b/src/operators/kernel/arm/transpose_kernel.cpp @@ -20,12 +20,13 @@ namespace paddle_mobile { namespace operators { template <> -bool TransposeKernel::Init(TransposeParam *param) { +bool TransposeKernel::Init(TransposeParam *param) { return true; } template <> -void TransposeKernel::Compute(const TransposeParam ¶m) const { +void TransposeKernel::Compute( + const TransposeParam ¶m) const { TransposeCompute(param); } diff --git a/src/operators/kernel/batchnorm_kernel.h b/src/operators/kernel/batchnorm_kernel.h index 367dd0996c0df5fba7c3570285cf5e2cfd3fac99..beac7399583d074956fa4564fdd9312b2d7985f0 100644 --- a/src/operators/kernel/batchnorm_kernel.h +++ b/src/operators/kernel/batchnorm_kernel.h @@ -26,10 +26,10 @@ using namespace framework; template class BatchNormKernel - : public framework::OpKernelBase { + : public framework::OpKernelBase> { public: - void Compute(const BatchNormParam ¶m) const; - bool Init(BatchNormParam *param); + void Compute(const BatchNormParam ¶m) const; + bool Init(BatchNormParam *param); }; } // namespace operators diff --git a/src/operators/kernel/bilinear_interp_kernel.h b/src/operators/kernel/bilinear_interp_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ac3dfcb16190315f72dc60da54c4f944874e4458 
--- /dev/null +++ b/src/operators/kernel/bilinear_interp_kernel.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef BILINEAR_INTERP_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class BilinearInterpKernel + : public framework::OpKernelBase> { + public: + void Compute(const BilinearInterpParam& param) const; + bool Init(BilinearInterpParam* param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/box_coder_kernel.h b/src/operators/kernel/box_coder_kernel.h index 2ad63ecd90a07d955c3e239277ac1bd60f3510bb..58144a87349ed3a6504e0074903594be3aa6fe8f 100644 --- a/src/operators/kernel/box_coder_kernel.h +++ b/src/operators/kernel/box_coder_kernel.h @@ -27,10 +27,10 @@ namespace operators { template class BoxCoderKernel - : public framework::OpKernelBase { + : public framework::OpKernelBase> { public: - void Compute(const BoxCoderParam& param) const; - bool Init(BoxCoderParam* param); + void Compute(const BoxCoderParam& param) const; + bool Init(BoxCoderParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/central-arm-func/batchnorm_arm_func.h b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h index 2086265ec6d9d36c5b3a334fad7a5cf66fc01f2b..1723835a6ae54b0c0f0a1e6153bf278efb7d8a12 100644 --- a/src/operators/kernel/central-arm-func/batchnorm_arm_func.h +++ b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h @@ -23,7 +23,7 @@ namespace paddle_mobile { namespace operators { template -void BatchnormCompute(const BatchNormParam ¶m) { +void BatchnormCompute(const BatchNormParam ¶m) { const Tensor *input_x = param.InputX(); auto input_x_ptr = input_x->data(); const auto &x_dims = input_x->dims(); @@ -53,6 +53,41 @@ void BatchnormCompute(const BatchNormParam ¶m) { "C must equal to variance.numel()"); int HXW = H * W; + +#if __ARM_NEON +#if __aarch64__ + float *inv_std_ptr = new float[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + + Tensor new_scale; + auto new_scale_ptr = new_scale.mutable_data(framework::make_ddim({C})); + Tensor new_bias; + auto new_bias_ptr = new_bias.mutable_data(framework::make_ddim({C})); + + /// ((x - est_mean) * (inv_var) * scale + bias equal to + /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + { + for (int n = 0; n < N; n++) { + for (int h = 0; h < H; h++) { + int tmp_index = n * stride0 + i * stride1 + h * stride2; + for (int w = 0; w < W; w++) { + int index = tmp_index + w; + out_ptr[index] = + input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i]; + } + } + } + } + } + delete[] inv_std_ptr; +#else + if (HXW > 32) { int 
NXC = N * C; float *inv_std_ptr = new float[NXC * 4]; @@ -227,6 +262,39 @@ void BatchnormCompute(const BatchNormParam ¶m) { delete[] inv_std_ptr; } +#endif +#else + float *inv_std_ptr = new float[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + + Tensor new_scale; + auto new_scale_ptr = new_scale.mutable_data(framework::make_ddim({C})); + Tensor new_bias; + auto new_bias_ptr = new_bias.mutable_data(framework::make_ddim({C})); + + /// ((x - est_mean) * (inv_var) * scale + bias equal to + /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + { + for (int n = 0; n < N; n++) { + for (int h = 0; h < H; h++) { + int tmp_index = n * stride0 + i * stride1 + h * stride2; + for (int w = 0; w < W; w++) { + int index = tmp_index + w; + out_ptr[index] = + input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i]; + } + } + } + } + } + delete[] inv_std_ptr; +#endif } } // namespace operators diff --git a/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h b/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..3840985ab8a963eae7d9a4cf96d9a55acf38f68c --- /dev/null +++ b/src/operators/kernel/central-arm-func/bilinear_interp_arm_func.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef BILINEAR_INTERP_OP +#pragma once + +#include +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void BilinearInterpCompute(const BilinearInterpParam& param) { + auto out_dims = param.Out()->dims(); + auto* input = param.InputX()->data(); + auto out_size_t = param.InputOutPutSize(); + + int out_h = param.OutH(); + int out_w = param.OutW(); + if (out_size_t != nullptr) { + auto out_size_data = out_size_t->data(); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + auto* output = param.Out()->mutable_data( + {out_dims[0], out_dims[1], out_h, out_w}); + auto batch_size = param.InputX()->dims()[0]; + auto channels = param.InputX()->dims()[1]; + auto in_h = param.InputX()->dims()[2]; + auto in_w = param.InputX()->dims()[3]; + + auto in_hw = in_h * in_w; + auto out_hw = out_h * out_w; + auto in_chw = channels * in_hw; + auto out_chw = channels * out_hw; + + float ratio_h = + (out_h > 1) ? static_cast(in_h - 1) / (out_h - 1) : 0.f; + float ratio_w = + (out_w > 1) ? static_cast(in_w - 1) / (out_w - 1) : 0.f; + + if (in_h == out_h && in_w == out_w) { + memcpy(output, input, param.InputX()->numel() * sizeof(float)); + } else { + for (int k = 0; k < batch_size; ++k) { // loop for batches + for (int i = 0; i < out_h; ++i) { // loop for images + int h = ratio_h * i; + int hid = (h < in_h - 1) ? 
1 : 0; + float h1lambda = ratio_h * i - h; + float h2lambda = 1.f - h1lambda; + + for (int j = 0; j < out_w; ++j) { + int w = ratio_w * j; + int wid = (w < in_w - 1) ? 1 : 0; + float w1lambda = ratio_w * j - w; + float w2lambda = 1.f - w1lambda; + // calculate four position for bilinear interpolation + const float* in_pos = &input[k * in_chw + h * in_w + w]; + float* out_pos = &output[k * out_chw + i * out_w + j]; + + for (int c = 0; c < channels; ++c) { // loop for channels + // bilinear interpolation + out_pos[0] = static_cast( + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) + + h1lambda * (w2lambda * in_pos[hid * in_w] + + w1lambda * in_pos[hid * in_w + wid])); + in_pos += in_hw; + out_pos += out_hw; + } + } + } + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/box_coder_arm_func.h b/src/operators/kernel/central-arm-func/box_coder_arm_func.h index eeb05f31b744c9e55e78375a495c5a5debf095c2..9cdc22cff0bc52d8ae1ff24d619735accd6dca3e 100644 --- a/src/operators/kernel/central-arm-func/box_coder_arm_func.h +++ b/src/operators/kernel/central-arm-func/box_coder_arm_func.h @@ -16,6 +16,8 @@ limitations under the License. */ #pragma once #include +#include "framework/tensor.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -111,7 +113,7 @@ void DecodeCenterSize(const framework::Tensor& target_box, } template -void BoxCoderCompute(const BoxCoderParam& param) { +void BoxCoderCompute(const BoxCoderParam& param) { const auto* input_priorbox = param.InputPriorBox(); const auto* input_priorboxvar = param.InputPriorBoxVar(); const auto* input_targetbox = param.InputTargetBox(); @@ -122,7 +124,7 @@ void BoxCoderCompute(const BoxCoderParam& param) { auto col = input_priorbox->dims()[0]; auto len = input_priorbox->dims()[1]; - Tensor* output_box = param.OutputBox(); + framework::Tensor* output_box = param.OutputBox(); auto* output_box_dataptr = output_box->mutable_data({row, col, len}); if (code_type == "encode_center_size") { diff --git a/src/operators/kernel/central-arm-func/concat_arm_func.h b/src/operators/kernel/central-arm-func/concat_arm_func.h index e9926505b33b32ee83a16f882cc0f775797f154a..57a22aafa5e0bc75c1041c379c2229deaa310ffe 100644 --- a/src/operators/kernel/central-arm-func/concat_arm_func.h +++ b/src/operators/kernel/central-arm-func/concat_arm_func.h @@ -54,7 +54,7 @@ class ConcatFunctor { }; template -void ConcatCompute(const ConcatParam ¶m) { +void ConcatCompute(const ConcatParam ¶m) { auto inputs = param.Inputs(); auto *out = param.Out(); int64_t axis = param.Axis(); diff --git a/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..a19c67e68366fc57a305e0dbb955229a763737d9 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
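The inner loop of BilinearInterpCompute blends each output pixel from its four source neighbours using the fractional offsets h1lambda and w1lambda; the hid/wid guards clamp to 0 on the last row and column, so edge pixels fall back to 1-D interpolation instead of reading out of bounds. The weight formula as a scalar reference:

    // u = h1lambda, v = w1lambda; p00..p11 are the four neighbours
    float Bilerp(float p00, float p01, float p10, float p11, float u, float v) {
      return (1 - u) * ((1 - v) * p00 + v * p01) +  // top row, weight h2lambda
             u * ((1 - v) * p10 + v * p11);         // bottom row, weight h1lambda
    }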
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDADDPRELU_OP + +#pragma once +#include +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + Tensor bias1 = *param.Bias1(); + int axis = param.Axis(); + Tensor *output = param.Output(); + + float *biase_data = bias.data(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + Tensor aa = *param.InputAlpha(); + float *p = aa.data(); + + std::string mode = param.Mode(); + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor bias1_batch = bias1.Slice(i, i + 1).Resize(output_matrix_shape); + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + Tensor bias1_slice = bias1_batch.Slice(g * out_step, (g + 1) * out_step); + float *biase_data1 = bias1_slice.data(); + // int n = 
bias1_slice.dims()[0]; + // int m = bias1_slice.dims()[1]; + // for(int i=0;i(filter_slice, false, col_matrix, + // false, + // static_cast(1), + // &out_slice, + // static_cast(1), true, + // biase_data); + math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice, + p, mode, biase_data, biase_data1); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..d71bc235977236fbd0dd332df556ea4bd41eacf4 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADD_OP +#pragma once + +#include +#include "operators/math/conv_func.h" +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +void ConvAddBasic(const FusionConvAddParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + int axis = param.Axis(); + Tensor *output = param.Output(); + float *biase_data = bias.data(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + 
math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(1), false, biase_data); + } + } +} + +template +void ConvAddCompute(const FusionConvAddParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + param.Bias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConv3x3(param.Input(), param.Strides(), + // param.Paddings(), + // param.Filter(), param.Bias(), + // param.Output(), false); + if (param.Paddings()[0] == 0) { + math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(), + *param.Bias(), true); + } else { + math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), *param.Bias(), true); + } + } else { + ConvAddBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..a7d14fbad1e4b72a8571d13898e55a6cad8bf9a8 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h @@ -0,0 +1,143 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
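ConvAddCompute's branching deserves a summary: 3x3 depthwise convolutions (groups equal to both the input and output channel counts) get hand-written NEON kernels selected by stride and padding, while everything else falls back to the generic im2col/vol2col + GEMM path in ConvAddBasic. Factored out as a predicate (a sketch; the shipped code tests the param fields inline):

    bool IsDepthwise3x3(int groups, int in_c, int out_c, int kh, int kw) {
      return groups == in_c && in_c == out_c && kh == kw && kh == 3;
    }
    // stride 1              -> DepthwiseConv3x3s1p1
    // stride 2, padding 0   -> DepthwiseConv3x3s2p0
    // stride 2, padding > 0 -> DepthwiseConv3x3s2p1v2
    // otherwise             -> im2col/vol2col + GEMM (ConvAddBasic)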
*/ + +#ifdef FUSION_CONVADDBNRELU_OP + +#pragma once + +#include +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +void ConvAddBNReluBasic(const FusionConvAddBNReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + Tensor *output = param.Output(); + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + math::matmulWithBn( + filter_slice, false, col_matrix, false, static_cast(1), + &out_slice, static_cast(0), true, &new_scale, &new_bias, g); + } + } +} +template +void ConvAddBNReluCompute(const FusionConvAddBNReluParam ¶m) { + Tensor Bias; + Bias.mutable_data({param.Groups()}); + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] 
== 3 && param.Strides()[0] == 1) { + math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + // param.Output(), param.NewScale(), + // param.NewBias(), 1); + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else { + ConvAddBNReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h b/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h deleted file mode 100644 index bf96a2d46fd96516743127b71db57496e35b8a77..0000000000000000000000000000000000000000 --- a/src/operators/kernel/central-arm-func/conv_add_bn_relu_func.h +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef FUSION_CONVADDBNRELU_OP - -#pragma once -#include "operators/math/depthwise_conv_3x3.h" -#include "operators/op_param.h" -namespace paddle_mobile { -namespace operators { - -template -void ConvAddBNReluCompute(const FusionConvAddBNReluParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor bias = *param.Bias(); - Tensor new_bias = *param.NewBias(); - Tensor new_scale = *param.NewScale(); - auto new_bias_ptr = new_bias.data(); - auto new_scale_ptr = new_scale.data(); - int axis = param.Axis(); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - Tensor *output = param.Output(); - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - if (filter_shape_vec[2] == 3 && strides[0] == 1 && groups > 1) { - math::DepthwiseConvAddBNRelu3x3s1p1(input, filter, output, &bias, 1, - &new_scale, &new_bias, 1, 1); - } else { - const int batch_size = static_cast(input->dims()[0]); - - math::expand_bias(bias, axis, output->dims()); - output->ShareDataWith(bias); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - 
col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(1), false); - } - } - - auto output_ptr = output->data(); - for (int c = 0; c < output_matrix_shape[0]; c++) { - int start = c * output_matrix_shape[1]; - for (int j = 0; j < output_matrix_shape[1]; j++) { - output_ptr[start + j] = - output_ptr[start + j] * new_scale_ptr[c] + new_bias_ptr[c]; - output_ptr[start + j] = - output_ptr[start + j] < 0 ? 0 : output_ptr[start + j]; - } - } - } -} -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..df63379d967606e15106937534bb82496ee83b4e --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
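
The deleted ConvAddBNReluCompute above ends with a scalar epilogue over the GEMM result: each output element is scaled and shifted by per-channel batch-norm factors, then clamped at zero. A minimal standalone sketch of that epilogue; the formulas folding the raw BN statistics into new_scale/new_bias are the standard BN algebra and are an assumption here, not shown in this patch:

// Assumed folding (not part of this patch):
//   new_scale[c] = gamma[c] / sqrt(var[c] + eps)
//   new_bias[c]  = beta[c] - mean[c] * new_scale[c]
// Epilogue applied per channel c over the spatial elements, with fused ReLU,
// mirroring the output_ptr loop in the removed kernel.
void BnReluEpilogue(float *out, int channels, int spatial,
                    const float *new_scale, const float *new_bias) {
  for (int c = 0; c < channels; ++c) {
    for (int j = 0; j < spatial; ++j) {
      float v = out[c * spatial + j] * new_scale[c] + new_bias[c];
      out[c * spatial + j] = v < 0.f ? 0.f : v;
    }
  }
}
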
*/ + +#ifdef FUSION_CONVADDPRELU_OP + +#pragma once +#include +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void ConvAddPReluCompute(const FusionConvAddPReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + // DLOG<<"yangfei"; + // DLOG<(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + Tensor aa = *param.InputAlpha(); + float *p = aa.data(); + std::string mode = param.Mode(); + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + // math::matmul(filter_slice, false, col_matrix, + // false, + // static_cast(1), + // &out_slice, + // static_cast(1), true, + // biase_data); + math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice, + p, mode, biase_data, nullptr); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h 
b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h index 6aadbab95c591d4286fdbb3c3f01a291cdd90429..36886b9e2ccfaaa3f557eb7941e294a42b5edb94 100644 --- a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h @@ -12,24 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef FUSION_CONVADD_RELU_OP +#ifdef FUSION_CONVADDRELU_OP #pragma once #include +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { template -void ConvAddReluCompute(const FusionConvAddReluParam ¶m) { +void ConvAddReluCompute(const FusionConvAddReluParam ¶m) { const Tensor *input = param.Input(); Tensor filter = *param.Filter(); Tensor bias = *param.Bias(); int axis = param.Axis(); Tensor *output = param.Output(); - math::expand_bias(bias, axis, output->dims()); - output->ShareDataWith(bias); + float *biase_data = bias.data(); + int groups = param.Groups(); std::vector strides = param.Strides(); std::vector paddings = param.Paddings(); @@ -106,7 +110,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam ¶m) { Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); math::matmul(filter_slice, false, col_matrix, false, static_cast(1), &out_slice, - static_cast(1), true); + static_cast(1), true, biase_data); } } } diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h index d08eebe5493bd9026073c3349631a42024579b95..a3e21e4b4b702630f7942f2a5171a3401f29a431 100644 --- a/src/operators/kernel/central-arm-func/conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_arm_func.h @@ -17,13 +17,15 @@ limitations under the License. 
*/ #pragma once #include #include "operators/math/conv_func.h" +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { - -template -void ConvCompute(const ConvParam ¶m) { +inline void ConvBasic(const ConvParam ¶m) { const Tensor *input = param.Input(); Tensor filter = *param.Filter(); Tensor *output = param.Output(); @@ -109,6 +111,25 @@ void ConvCompute(const ConvParam ¶m) { } } +template +void ConvCompute(const ConvParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + nullptr, false); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3) { + math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), + param.Filter(), nullptr, param.Output(), false); + } else { + ConvBasic(param); + } +} + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..7c31eed19693d20084e25daa485a0553d5d795f2 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
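
ConvCompute above falls through to ConvBasic unless the convolution is truly depthwise with a square 3x3 kernel; stride 1 picks the s1p1 kernel and anything else the generic 3x3 one. The guard, restated as a hypothetical standalone predicate with plain ints in place of the DDim lookups:

// groups == C_in == C_out means one filter per input channel (depthwise);
// the remaining checks pin the filter to square 3x3, matching the conditions
// repeated by every fused dispatcher in this patch.
bool IsDepthwise3x3(int groups, int in_channels, int out_channels,
                    int filter_h, int filter_w) {
  return groups == in_channels && in_channels == out_channels &&
         filter_h == filter_w && filter_h == 3;
}
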
*/ + +#ifdef FUSION_CONVBNADDRELU_OP + +#pragma once + +#include +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +void ConvBNAddReluBasic(const FusionConvBNAddReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + Tensor *output = param.Output(); + Tensor *bias1 = param.Bias(); + int groups = param.Groups(); + DLOG << "yangfei2"; + DLOG << bias1->dims(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor bias_batch = bias1->Slice(i, i + 1).Resize(output_matrix_shape); + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step); + math::matmulWithBn(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(1), true, &new_scale, + &new_bias, g, bias_data.data()); + } + } +} +template +void ConvBNAddReluCompute(const FusionConvBNAddReluParam ¶m) { + Tensor 
Bias; + Bias.mutable_data({param.Groups()}); + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + // param.Output(), param.NewScale(), + // param.NewBias(), 1); + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else { + ConvBNAddReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..c6300f96e1b999c45538417c7b513068697ad4dd --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
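
Every *Basic fallback in this patch unfolds one group of the input with im2col and multiplies it against a reshaped filter slice, one GEMM per group. The shape bookkeeping, as a runnable sketch; note col_shape_vec holds 1 + 2 * data_dim entries, i.e. 5 for 2-D data and 7 for 3-D (which is what the "5 或者 7" ["5 or 7"] comment in conv_transpose_arm_func.h below refers to):

#include <vector>

// For input (N, C, H, W), filter (M, C/g, kh, kw), output (N, M, oh, ow):
// im2col yields a (C/g * kh * kw) x (oh * ow) matrix per group, the filter
// slice is (M/g) x (C/g * kh * kw), and their product is one group's output.
std::vector<int> ColMatrixShape(int c, int groups, int kh, int kw,
                                int oh, int ow) {
  return {c / groups * kh * kw, oh * ow};
}
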
*/ + +#ifdef FUSION_CONVBNRELU_OP + +#pragma once +#include +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" +namespace paddle_mobile { +namespace operators { +void ConvBNReluBasic(const FusionConvBNReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + + Tensor *output = param.Output(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + math::matmulWithBn( + filter_slice, false, col_matrix, false, static_cast(1), + &out_slice, static_cast(0), true, &new_scale, &new_bias, g); + } + } +} + +template +void ConvBNReluCompute(const FusionConvBNReluParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + 
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + // param.Output(), param.NewScale(), + // param.NewBias(), 1); + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else { + ConvBNReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h b/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..343e5f147644cc5bb86c2929d4bd35b44301c4cf --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CONV_TRANSPOSE + +#include + +#include "framework/ddim.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +#pragma once + +namespace paddle_mobile { +namespace operators { + +template +void ConvTransposeCompute(const ConvTransposeParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor *output = param.Output(); + + auto strides = param.Strides(); + auto paddings = param.Paddings(); + auto dilations = param.Dilations(); + auto groups = param.Groups(); + + const int batch_size = input->dims()[0]; + + std::vector input_shape_vec = framework::vectorize(input->dims()); + std::vector filter_shape_vec = framework::vectorize(filter.dims()); + + size_t data_dim = filter_shape_vec.size() - 2; + + // 5 或者 7 + std::vector col_shape_vec(1 + 2 * data_dim); + + // output c / groups + col_shape_vec[0] = output->dims()[1] / groups; + for (size_t i = 0; i < data_dim; ++i) { + // filter shape filter h filter w + col_shape_vec[i + 1] = filter_shape_vec[i + 2]; + // input shape input h input w + col_shape_vec[i + 1 + data_dim] = input_shape_vec[i + 2]; + } + + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + Tensor col; + col.mutable_data
<P>
(col_shape); + + Tensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + framework::DDim output_shape = + framework::slice_ddim(output->dims(), 1, output->dims().size()); + + framework::DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; + + // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w) + framework::DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]}; + filter.Resize(filter_matrix_shape); + + output->mutable_data
<P>
(); + + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; + + for (int i = 0; i < batch_size; ++i) { + Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); + Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); + + for (int g = 0; g < groups; ++g) { + Tensor in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step); + Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); + Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step); + + math::matmul(filter_slice, true, in_slice, false, static_cast
<P>
(1.0), + &col_matrix, static_cast
<P>
(0.0)); + if (data_dim == 2U) { + col2im(col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &out_slice); + + } else if (data_dim == 3U) { + col2vol(col, dilations, strides, paddings, &out_slice); + } + } + } +} + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/central-arm-func/crf_arm_func.h b/src/operators/kernel/central-arm-func/crf_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..2cf95081e9678325046d49f86ebf072a14a76795 --- /dev/null +++ b/src/operators/kernel/central-arm-func/crf_arm_func.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CRF_OP +#pragma once + +#include +#include +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +template +void Decode(const Tensor& emission_weights, const Tensor& transition_weights, + Tensor* decoded_path) { + auto emission_dims = emission_weights.dims(); + const size_t seq_len = emission_dims[0]; + const size_t tag_num = emission_dims[1]; + + const size_t state_trans_base_idx = 2; + + const P* x = emission_weights.data
<P>
(); + const P* w = transition_weights.data
<P>
(); + int64_t* path = decoded_path->data(); + + // alpha is a memo table. An element alpha(k, v) records the score of the + // best sequence of tags from position 1 to position k with v being the end + // tag. + Tensor alpha; + P* alpha_value = alpha.mutable_data
<P>
(emission_dims); + Tensor track; + int* track_value = track.mutable_data(emission_dims); + for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; + + for (size_t k = 1; k < seq_len; ++k) { + for (size_t i = 0; i < tag_num; ++i) { + P max_score = -std::numeric_limits
<P>
::max(); + int max_j = 0; + for (size_t j = 0; j < tag_num; ++j) { + P score = alpha_value[(k - 1) * tag_num + j] + + w[(j + state_trans_base_idx) * tag_num + i]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + + alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i]; + track_value[k * tag_num + i] = max_j; + } + } + P max_score = -std::numeric_limits
<P>
::max(); + int max_i = 0; + for (size_t i = 0; i < tag_num; ++i) { + P score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i]; + if (score > max_score) { + max_score = score; + max_i = i; + } + } + path[seq_len - 1] = max_i; + for (int k = seq_len - 1; k >= 1; --k) { + path[k - 1] = max_i = track_value[k * tag_num + max_i]; + } +} +template +void CrfCompute(const CrfParam& param) { + auto* emission = param.InputEmission(); + auto* transition = param.InputTransition(); + auto* label = param.InputLabel(); + auto* decoded_path = param.outputVBP(); + // DLOG<<*emission; + // DLOG<<*transition; + // DLOG<<*label; + + PADDLE_MOBILE_ENFORCE(emission->NumLevels() == 1U, + "The Input(Emission) should be a sequence."); + auto lod = emission->lod(); + PADDLE_MOBILE_ENFORCE(lod.size(), + "The Input(Emission) should be a sequence."); + const size_t level = 0; + const size_t seq_num = lod[level].size() - 1; + int64_t* path = decoded_path->mutable_data(); + int numel = decoded_path->numel(); + memset(static_cast(path), 0, sizeof(int64_t) * numel); + for (size_t i = 0; i < seq_num; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos); + Decode
<P>
(emission->Slice(start_pos, end_pos), *transition, + &decoded_path_one_seq); + } + if (label) { + PADDLE_MOBILE_ENFORCE(label->NumLevels() == 1U, + "The Input(Label) should be a sequence."); + const int64_t* label_value = label->data(); + size_t batch_size = emission->dims()[0]; + for (size_t i = 0; i < batch_size; ++i) { + path[i] = label_value[i] == path[i] ? 1 : 0; + } + } +} +} // namespace operators + +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h index e43e3664cb005bab4d3c5ec8b5b35bd6925c982d..2a1afb3cf6fdbdc0a80cec5558c2b42fec6699f3 100644 --- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h @@ -15,98 +15,36 @@ limitations under the License. */ #ifdef DEPTHWISECONV_OP #pragma once +#include #include -#include "operators/math/conv_func.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" + #include "operators/op_param.h" namespace paddle_mobile { namespace operators { template -void DepthwiseConvCompute(const ConvParam ¶m) { - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor *output = param.Output(); - output->mutable_data(); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - // DLOG << " compute end get Attrs " << strides[0]; - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = - math::IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - 
std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0)); - } +void DepthwiseConvCompute(const ConvParam ¶m) { + Tensor Bias; + Bias.mutable_data({param.Groups()}); + if (param.Groups() == param.Input()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + &Bias, false); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConv3x3(param.Input(), param.Strides(), + // param.Paddings(), + // param.Filter(), &Bias, param.Output(), false); + math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(), + Bias, false); + + } else { + ConvBasic(param); } } diff --git a/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..b60bf9b4d6df9d85cc2fbe378a3904c2d13e5e60 --- /dev/null +++ b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
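
Decode in crf_arm_func.h above is a Viterbi pass: alpha(k, i) = x(k, i) + max_j [ alpha(k-1, j) + w(j -> i) ], with the argmax kept in track and the best tag sequence recovered by walking backwards. The same recurrence over plain arrays, as a sketch:

#include <limits>
#include <vector>

// x: seq_len x tag_num emission scores; w: (tag_num + 2) x tag_num, where
// row 0 holds start scores, row 1 end scores, and rows 2.. the tag-to-tag
// transitions (the state_trans_base_idx = 2 offset used by Decode).
std::vector<int> Viterbi(const std::vector<float> &x,
                         const std::vector<float> &w,
                         int seq_len, int tag_num) {
  std::vector<float> alpha(static_cast<size_t>(seq_len) * tag_num);
  std::vector<int> track(alpha.size(), 0);
  for (int i = 0; i < tag_num; ++i) alpha[i] = w[i] + x[i];
  for (int k = 1; k < seq_len; ++k) {
    for (int i = 0; i < tag_num; ++i) {
      float best = -std::numeric_limits<float>::max();
      int best_j = 0;
      for (int j = 0; j < tag_num; ++j) {
        float s = alpha[(k - 1) * tag_num + j] + w[(j + 2) * tag_num + i];
        if (s > best) { best = s; best_j = j; }
      }
      alpha[k * tag_num + i] = best + x[k * tag_num + i];
      track[k * tag_num + i] = best_j;
    }
  }
  // Pick the best final tag (end scores live in row 1 of w), then backtrack.
  float best = -std::numeric_limits<float>::max();
  int best_i = 0;
  for (int i = 0; i < tag_num; ++i) {
    float s = alpha[(seq_len - 1) * tag_num + i] + w[tag_num + i];
    if (s > best) { best = s; best_i = i; }
  }
  std::vector<int> path(seq_len);
  path[seq_len - 1] = best_i;
  for (int k = seq_len - 1; k >= 1; --k)
    path[k - 1] = best_i = track[k * tag_num + best_i];
  return path;
}
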
*/ + +#ifdef FUSION_DWCONVBNRELU_OP + +#pragma once +#include +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" +namespace paddle_mobile { +namespace operators { +void DWConvBNReluBasic(const FusionDWConvBNReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + + Tensor *output = param.Output(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmulWithBn( + filter_slice, false, col_matrix, false, static_cast(1), + &out_slice, static_cast(0), true, &new_scale, &new_bias, g); + } + } +} +template +void DWConvBNReluCompute(const FusionDWConvBNReluParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + 
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + // param.Output(), param.NewScale(), + // param.NewBias(), 1); + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else { + DWConvBNReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h index 8b3f5d0a8083b63334319b2054f9bf463efa66c7..ace72b6faddb04ee3547f1b2bc01461d8c9f2e98 100644 --- a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h +++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h @@ -15,6 +15,8 @@ limitations under the License. */ #ifdef ELEMENTWISEADD_OP #pragma once +#include "operators/math/elementwise_op_function.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -25,7 +27,7 @@ struct AddFunctor { }; template -void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { +void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); Tensor *Out = param.Out(); diff --git a/src/operators/kernel/central-arm-func/flatten_arm_func.h b/src/operators/kernel/central-arm-func/flatten_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..396658013310a84c763f90f7cec515fba4fd7e4e --- /dev/null +++ b/src/operators/kernel/central-arm-func/flatten_arm_func.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
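
ElementwiseAddCompute above delegates to ElementwiseComputeEx with AddFunctor, which broadcasts Y over X according to axis. After the axis bookkeeping, the inner work reduces to the flattened case sketched here (assumed reduction; names illustrative):

#include <vector>

// x viewed as (n, d), y of length d broadcast over every row of x.
void BroadcastAdd(const std::vector<float> &x, const std::vector<float> &y,
                  std::vector<float> *out, int n, int d) {
  out->resize(x.size());
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < d; ++j)
      (*out)[i * d + j] = x[i * d + j] + y[j];
}
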
*/ + +#ifdef FLATTEN_OP + +#ifndef RESHAPE_OP +#define RESHAPE_OP +#endif + +#pragma once + +#include +#include +#include "operators/flatten_op.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void FlattenCompute(const FlattenParam ¶m) { + const auto *input_x = param.InputX(); + const auto axis = param.Axis(); + const auto &input_x_dims = input_x->dims(); + auto *out = param.Out(); + + const auto &out_shape_v = GetOutputShape(axis, input_x_dims); + const framework::DDim &out_dim = ValidateShape(out_shape_v, input_x_dims); + + out->Resize(out_dim); + out->mutable_data(); + framework::TensorCopy(*input_x, out); + out->Resize(out_dim); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h index 8a01f554140712c6a941b40372cbcfe35a951ce7..42c01d2825e052a52e7021a1b2a97997fb9c915b 100644 --- a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h +++ b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h @@ -15,12 +15,14 @@ limitations under the License. */ #ifdef FUSION_FC_OP #pragma once +#include "operators/math/math_function.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { template -void FusionFcCompute(const FusionFcParam ¶m) { +void FusionFcCompute(const FusionFcParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); const Tensor *input_z = param.InputZ(); @@ -28,6 +30,9 @@ void FusionFcCompute(const FusionFcParam ¶m) { int axis = param.Axis(); Tensor *out = param.Out(); auto *out_data = out->mutable_data(); + // int m = out->dims()[0]; + // int n = out->dims()[1]; + const Tensor x_matrix = input_x->dims().size() > 2 ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) @@ -45,18 +50,19 @@ void FusionFcCompute(const FusionFcParam ¶m) { PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0], " out_dim.size must be 2."); axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis); - PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ") + PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. "); int64_t classes = input_z->numel(); for (int i = 0; i < out_dim[0]; i++) { memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes); } - for (int i = 0; i < out->numel(); i++) { - DLOG << out_data[i]; - } + // for (int i = 0; i < out->numel(); i++) { + // DLOG << out_data[i]; + // } + // bias_data的维度和out的维度一致 math::matmul(x_matrix, false, y_matrix, false, static_cast(1), - out, static_cast(1)); + out, static_cast(1), false); PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); // if (out_dim.size() != 2) { // out->Resize(out_dim); diff --git a/src/operators/kernel/central-arm-func/gru_arm_func.h b/src/operators/kernel/central-arm-func/gru_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..2e00e839ff10da0d40612c9f63d5d0f7e059a0fe --- /dev/null +++ b/src/operators/kernel/central-arm-func/gru_arm_func.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
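
FusionFcCompute above copies the bias vector Z into every row of out with memory::Copy and then calls math::matmul with beta = 1, so the GEMM accumulates X * Y onto the broadcast bias in one pass (the Chinese comment notes that bias_data has the same dimensions as out). The order of operations, sketched with a naive triple loop standing in for the tuned GEMM:

#include <cstring>

// out = X(m x k) * Y(k x n) + broadcast(Z(n)): prefill the rows with Z, then
// accumulate the matrix product on top (alpha = 1, beta = 1).
void FcBiasThenGemm(const float *x, const float *y, const float *z,
                    float *out, int m, int k, int n) {
  for (int i = 0; i < m; ++i)
    std::memcpy(out + i * n, z, sizeof(float) * n);
  for (int i = 0; i < m; ++i)
    for (int p = 0; p < k; ++p)
      for (int j = 0; j < n; ++j)
        out[i * n + j] += x[i * k + p] * y[p * n + j];
}
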
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef GRU_OP +#pragma once + +#include +#include +#include "common/types.h" +#include "operators/math/gru_compute.h" +#include "operators/math/math_function.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +inline void ReorderInitState(const framework::Tensor& src, + std::vector index_lod, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims()); + row_shuffle(src, index_lod, dst, indexed_src); +} +template +void GruCompute(const GruParam& param) { + auto* input = param.InputInput(); + auto* h0 = param.InputH0(); + auto* weight = param.InputWeight(); + const auto* weight_data = weight->data(); + auto* bias = param.InputBias(); + auto* batch_gate = param.OutBatchGate(); + batch_gate->mutable_data(); + auto* batch_reset_hidden_prev = param.OutBatchResetHiddenPrev(); + batch_reset_hidden_prev->mutable_data(); + auto* batch_hidden = param.OutBatchHidden(); + batch_hidden->mutable_data(); + auto* hidden = param.OutHidden(); + hidden->mutable_data(); + + auto hidden_dims = hidden->dims(); + + bool is_reverse = param.IsReverse(); + math::LoDTensor2BatchFunctor to_batch; + to_batch(*input, batch_gate, true, is_reverse); + // math::ClearTensor clearTensor; + // clearTensor(batch_gate); + if (bias) { + math::RowwiseAdd add_bias; + add_bias(*batch_gate, *bias, batch_gate); + } + int frame_size = hidden_dims[1]; + math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = + const_cast(weight_data + 2 * frame_size * frame_size); + Tensor ordered_h0; + std::vector order(batch_gate->lod()[2]); + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. 
+ ReorderInitState(*h0, order, &ordered_h0, true); + gru_value.prev_out_value = ordered_h0.data(); + } else { + gru_value.prev_out_value = nullptr; + } + auto batch_starts = batch_gate->lod()[0]; + size_t seq_len = batch_starts.size() - 1; + auto active_node = math::GetActivationType(param.Activation()); + auto active_gate = math::GetActivationType(param.GateActivation()); + for (size_t n = 0; n < seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + Tensor gate_t = batch_gate->Slice(bstart, bend); // BUG + Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); + + math::GRUUnitFunctor::compute( + gru_value, frame_size, cur_batch_size, active_node, active_gate); + + gru_value.prev_out_value = gru_value.output_value; + } + math::Batch2LoDTensorFunctor to_seq; + batch_hidden->set_lod(batch_gate->lod()); + to_seq(*batch_hidden, hidden); +} + +} // namespace operators + +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/lookup_arm_func.h b/src/operators/kernel/central-arm-func/lookup_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..917973822f90b5015ea6b49aef0b7437ce8988e1 --- /dev/null +++ b/src/operators/kernel/central-arm-func/lookup_arm_func.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
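
GruCompute above batches the sequence, adds the bias into the gate buffer, and then runs GRUUnitFunctor once per time step, threading prev_out_value through. One output step in the common GRU formulation, as a sketch only; the exact gate layout and the weighting between previous state and candidate vary between conventions, and the reset gate is applied inside the functor:

#include <cmath>
#include <vector>

// h = u * h_prev + (1 - u) * tanh(candidate), with u = sigmoid(gate_u).
void GruStep(const std::vector<float> &gate_u,
             const std::vector<float> &candidate,
             const std::vector<float> &h_prev, std::vector<float> *h) {
  h->resize(h_prev.size());
  for (size_t i = 0; i < h_prev.size(); ++i) {
    float u = 1.f / (1.f + std::exp(-gate_u[i]));
    (*h)[i] = u * h_prev[i] + (1.f - u) * std::tanh(candidate[i]);
  }
}
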
*/
+
+#ifdef LOOKUP_OP
+#pragma once
+
+#include
+#include "framework/ddim.h"
+#include "operators/op_param.h"
+
+constexpr int64_t kNoPadding = -1;
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename P>
+void LookupCompute(const LookupParam<CPU> &param) {
+  auto *ids_t = param.InputIds();
+  auto *table_t = param.InputW();
+  auto *output_t = param.Out();
+  int64_t padding_idx = param.PaddingIdx();
+  const framework::DDim &table_dim = table_t->dims();
+  int64_t ids_numel;
+  const auto *ids = ids_t->data<int64_t>();
+  ids_numel = ids_t->numel();
+  int64_t row_number = table_t->dims()[0];
+  int64_t row_width = table_t->dims()[1];
+  auto *table = table_t->data<float>();
+  auto *output = output_t->mutable_data<float>();
+  for (int64_t i = 0; i < ids_numel; ++i) {
+    if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+      memset(output + i * row_width, 0, row_width * sizeof(float));
+    } else {
+      PADDLE_MOBILE_ENFORCE(ids[i] < row_number,
+                            "look uptable ids[i] <= row_number check failed");
+      PADDLE_MOBILE_ENFORCE(ids[i] >= 0,
+                            "lookuptable ids[i] >= 0 check failed");
+      memcpy(output + i * row_width, table + ids[i] * row_width,
+             row_width * sizeof(float));
+    }
+  }
+}
+} // namespace operators
+} // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/lrn_arm_func.h b/src/operators/kernel/central-arm-func/lrn_arm_func.h
index 52bb1b67dee83c28f513649a8763034a8d538d73..165ad8dd8af1f8a09f16b1737fbaff363cfaf5bd 100644
--- a/src/operators/kernel/central-arm-func/lrn_arm_func.h
+++ b/src/operators/kernel/central-arm-func/lrn_arm_func.h
@@ -15,12 +15,12 @@ limitations under the License. */
 #ifdef LRN_OP
 #pragma once
-
+#include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
 template <typename P>
-void LrnCompute(const LrnParam &param) {
+void LrnCompute(const LrnParam<CPU> &param) {
   const Tensor *input_x = param.InputX();
   auto x_dims = input_x->dims();
   Tensor *out = param.Out();
diff --git a/src/operators/kernel/central-arm-func/mul_arm_func.h b/src/operators/kernel/central-arm-func/mul_arm_func.h
index 9dfb1f48a574156f1b026fc6af3a03d77b81263f..dd6df54da5a81c2c4d1030103b6bb9811a54246a 100644
--- a/src/operators/kernel/central-arm-func/mul_arm_func.h
+++ b/src/operators/kernel/central-arm-func/mul_arm_func.h
@@ -19,8 +19,42 @@ limitations under the License.
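
LookupCompute above is a plain row gather over the (row_number x row_width) embedding table, with padding ids producing a zero row. A standalone sketch of the same memset/memcpy pattern:

#include <cstdint>
#include <cstring>
#include <vector>

void EmbeddingLookup(const std::vector<float> &table,
                     const std::vector<int64_t> &ids, int64_t row_width,
                     int64_t padding_idx, std::vector<float> *out) {
  out->assign(ids.size() * row_width, 0.f);  // padding rows stay zero
  for (size_t i = 0; i < ids.size(); ++i) {
    if (ids[i] == padding_idx) continue;
    std::memcpy(out->data() + i * row_width,
                table.data() + ids[i] * row_width,
                sizeof(float) * row_width);
  }
}
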
*/
 namespace paddle_mobile {
 namespace operators {
+// 1. If x and y are both 2-D,
+//    x = [[1,2],   y = [[5,6],
+//         [3,4]]        [7,8]]
+//    the result is an ordinary matrix product:
+//    out = [[1*5+2*7, 1*6+2*8], [3*5+4*7, 3*6+4*8]]
+//
+// 2. If x or y has more than two dimensions, say x with shape (2,3,4) and y
+//    with shape (4,1,2):
+//    x = [[[1,2,3,4],
+//          [2,3,4,5],
+//          [3,4,5,6]],
+//         [[1,2,3,4],
+//          [2,3,4,5],
+//          [3,4,5,6]]]
+//    y = [[[1,2]],
+//         [[3,4]],
+//         [[5,6]],
+//         [[7,8]]]
+//    x_num_col_dims and y_num_col_dims (read from the model) are used to
+//    flatten x and y to 2-D; here x_num_col_dims = 2 and y_num_col_dims = 1,
+//    with half-open index ranges:
+//    (1) for x = (2,3,4), multiply the dims in [0, x_num_col_dims), 2*3 = 6,
+//        and the dims in [x_num_col_dims, xdim.size()), giving 4;
+//        the dims of Tensor x are rewritten as (6,4).
+//    (2) for y = (4,1,2), multiply the dims in [0, y_num_col_dims), giving 4,
+//        and the dims in [y_num_col_dims, ydim.size()), 1*2 = 2;
+//        the dims of Tensor y are rewritten as (4,2).
+//    The layout of x and y in memory is not affected.
+//    x = [[1,2,3,4],                     y = [[1,2],
+//         [2,3,4,5],                          [3,4],
+//         [3,4,5,6],   matrix multiply        [5,6],
+//         [1,2,3,4],                          [7,8]]
+//         [2,3,4,5],
+//         [3,4,5,6]]
+//    x (6 rows, 4 cols) times y (4 rows, 2 cols), multiplied as in case 1,
+//    gives out (6 rows, 2 cols).
+
 template <typename P>
-void MulCompute(const MulParam &param) {
+void MulCompute(const MulParam<CPU> &param) {
   const Tensor *input_x = param.InputX();
   const Tensor *input_y = param.InputY();
   Tensor *out = param.Out();
diff --git a/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h b/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h
index 8833f012d97390e758ac6fc394ef237cb86632b1..9de57910540b4c9f7ab807053add9c5af9947ae7 100644
--- a/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h
+++ b/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h
@@ -19,6 +19,8 @@ limitations under the License. */
 #include
 #include
 #include
+#include "framework/tensor.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
@@ -89,7 +91,8 @@ static inline T JaccardOverlap(const T* box1, const T* box2,
 }
 template <typename T>
-static inline void NMSFast(const Tensor& bbox, const Tensor& scores,
+static inline void NMSFast(const framework::Tensor& bbox,
+                           const framework::Tensor& scores,
                            const T score_threshold, const T nms_threshold,
                            const T eta, const int64_t top_k,
                            std::vector<int>* selected_indices) {
@@ -131,7 +134,8 @@ static inline void NMSFast(const Tensor& bbox, const Tensor& scores,
 }
 template <typename T>
-void MultiClassNMS(const Tensor& scores, const Tensor& bboxes,
+void MultiClassNMS(const framework::Tensor& scores,
+                   const framework::Tensor& bboxes,
                    std::map<int, std::vector<int>>* indices, int* num_nmsed_out,
                    const int& background_label, const int& nms_top_k,
                    const int& keep_top_k, const T& nms_threshold,
@@ -141,7 +145,7 @@ void MultiClassNMS(const Tensor& scores, const Tensor& bboxes,
   int num_det = 0;
   for (int64_t c = 0; c < class_num; ++c) {
     if (c == background_label) continue;
-    Tensor score = scores.Slice(c, c + 1);
+    framework::Tensor score = scores.Slice(c, c + 1);
     /// [c] is key
     NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
             &((*indices)[c]));
@@ -181,9 +185,10 @@ void MultiClassNMS(const Tensor& scores, const Tensor& bboxes,
 }
 template <typename T>
-void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
+void MultiClassOutput(const framework::Tensor& scores,
+                      const framework::Tensor& bboxes,
                       const std::map<int, std::vector<int>>& selected_indices,
-                      Tensor* outs) {
+                      framework::Tensor* outs) {
   int predict_dim = scores.dims()[1];
   auto* scores_data = scores.data<T>();
   auto* bboxes_data = bboxes.data<T>();
@@ -208,7 +213,7 @@ void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
 }
 template <typename P>
-void MultiClassNMSCompute(const MultiClassNMSParam& param) {
+void MultiClassNMSCompute(const MultiClassNMSParam<CPU>& param) {
   const auto* input_bboxes =
param.InputBBoxes(); const auto& input_bboxes_dims = input_bboxes->dims(); @@ -231,10 +236,10 @@ void MultiClassNMSCompute(const MultiClassNMSParam& param) { std::vector>> all_indices; std::vector batch_starts = {0}; for (int64_t i = 0; i < batch_size; ++i) { - Tensor ins_score = input_scores->Slice(i, i + 1); + framework::Tensor ins_score = input_scores->Slice(i, i + 1); ins_score.Resize({class_num, predict_dim}); - Tensor ins_boxes = input_bboxes->Slice(i, i + 1); + framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1); ins_boxes.Resize({predict_dim, box_dim}); std::map> indices; @@ -253,16 +258,16 @@ void MultiClassNMSCompute(const MultiClassNMSParam& param) { } else { outs->mutable_data({num_kept, kOutputDim}); for (int64_t i = 0; i < batch_size; ++i) { - Tensor ins_score = input_scores->Slice(i, i + 1); + framework::Tensor ins_score = input_scores->Slice(i, i + 1); ins_score.Resize({class_num, predict_dim}); - Tensor ins_boxes = input_bboxes->Slice(i, i + 1); + framework::Tensor ins_boxes = input_bboxes->Slice(i, i + 1); ins_boxes.Resize({predict_dim, box_dim}); int64_t s = batch_starts[i]; int64_t e = batch_starts[i + 1]; if (e > s) { - Tensor out = outs->Slice(s, e); + framework::Tensor out = outs->Slice(s, e); MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out); } } diff --git a/src/operators/kernel/central-arm-func/pool_arm_func.h b/src/operators/kernel/central-arm-func/pool_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..37479c22efe95b6506054cf3ded5855aa766c34c --- /dev/null +++ b/src/operators/kernel/central-arm-func/pool_arm_func.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
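
NMSFast and MultiClassNMS above implement greedy per-class non-maximum suppression: candidates are visited in descending score order and kept only if their Jaccard (IoU) overlap with every already-kept box stays at or below the threshold. A standalone sketch over corner-form boxes (the adaptive-threshold eta path is omitted):

#include <algorithm>
#include <vector>

struct Box { float x1, y1, x2, y2, score; };

static float Iou(const Box &a, const Box &b) {
  float ix1 = std::max(a.x1, b.x1), iy1 = std::max(a.y1, b.y1);
  float ix2 = std::min(a.x2, b.x2), iy2 = std::min(a.y2, b.y2);
  float inter = std::max(0.f, ix2 - ix1) * std::max(0.f, iy2 - iy1);
  float area_a = (a.x2 - a.x1) * (a.y2 - a.y1);
  float area_b = (b.x2 - b.x1) * (b.y2 - b.y1);
  return inter <= 0.f ? 0.f : inter / (area_a + area_b - inter);
}

std::vector<int> GreedyNms(const std::vector<Box> &boxes, float nms_threshold) {
  std::vector<int> order(boxes.size());
  for (size_t i = 0; i < order.size(); ++i) order[i] = static_cast<int>(i);
  std::sort(order.begin(), order.end(),
            [&](int a, int b) { return boxes[a].score > boxes[b].score; });
  std::vector<int> keep;
  for (int idx : order) {
    bool suppressed = false;
    for (int k : keep) {
      if (Iou(boxes[idx], boxes[k]) > nms_threshold) { suppressed = true; break; }
    }
    if (!suppressed) keep.push_back(idx);
  }
  return keep;
}
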
*/ + +#ifdef POOL_OP +#pragma once + +#include +#include +#include "operators/math/pooling.h" + +namespace paddle_mobile { +namespace operators { +using framework::Tensor; + +inline void PoolBasic(std::string pooling_type, std::vector ksize, + std::vector strides, std::vector paddings, + const Tensor *in_x, Tensor *out) { + if (pooling_type == "max") { + math::PoolFunctor, float> pool2d_forward; + math::MaxPool pool_process; + pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out); + + } else if (pooling_type == "avg") { + math::PoolFunctor, float> pool2d_forward; + math::AvgPool pool_process; + pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out); + } +} +template +void PoolCompute(const PoolParam ¶m) { + const Tensor *in_x = param.Input(); + Tensor *out = param.Output(); + std::string pooling_type = param.PoolingType(); + + std::vector ksize = param.Ksize(); + + std::vector strides = param.Strides(); + + std::vector paddings = param.Paddings(); + if (ksize.size() != 2) { + LOG(paddle_mobile::LogLevel::kLOG_ERROR) + << "Pool op only supports 2D and 3D input."; + } + + if (param.isGlobalPooling()) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + if (ksize[0] == 3 && ksize[0] == ksize[1]) { + if (pooling_type == "max") { + if (strides[0] == strides[1] && strides[0] == 1 && + paddings[0] == paddings[1] && paddings[1] == 1) { + math::Pool3x3Maxs1p1(in_x, out); + } else { + math::Pool3x3Max(strides, paddings, in_x, out); + } + } else if (pooling_type == "avg") { + if (strides[0] == strides[1] && strides[0] == 1 && + paddings[0] == paddings[1] && paddings[1] == 1) { + math::Pool3x3Avgs1p1(in_x, out); + } else { + math::Pool3x3Avg(strides, paddings, in_x, out); + } + } + + } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 && + strides[0] == strides[1] && paddings[0] == paddings[1] && + paddings[1] == 0) { +#if __ARM_NEON +#if __aarch64__ + PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); +#else + if (pooling_type == "max") { + math::Pool2x2Maxs2p0(strides, paddings, in_x, out); + } else if (pooling_type == "avg") { + math::Pool2x2Avgs2p0(strides, paddings, in_x, out); + } +#endif +#else + PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); +#endif // __ARM_NEON + + } else { + PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); + } +} + +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/central-arm-func/prior_box_arm_func.h b/src/operators/kernel/central-arm-func/prior_box_arm_func.h index 892dceb9254ac423d3591a0fc9e9347bc375831b..e783c52f8184d6e09b04cd5c8210f5b89276541e 100644 --- a/src/operators/kernel/central-arm-func/prior_box_arm_func.h +++ b/src/operators/kernel/central-arm-func/prior_box_arm_func.h @@ -16,6 +16,7 @@ limitations under the License. 
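
PoolCompute above first rewrites ksize and paddings for global pooling (the kernel spans the whole H x W plane and padding is forced to zero) and only then dispatches to the 3x3/2x2 NEON fast paths or the generic PoolBasic. The rewrite, as a sketch:

#include <vector>

// Mirrors the isGlobalPooling() branch: for each spatial dim, zero the
// padding and set the kernel to the input extent (dims [2..] of NCHW).
void MakeGlobalPooling(const std::vector<int> &input_spatial_dims,
                       std::vector<int> *ksize, std::vector<int> *paddings) {
  for (size_t i = 0; i < ksize->size(); ++i) {
    (*paddings)[i] = 0;
    (*ksize)[i] = input_spatial_dims[i];
  }
}
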
diff --git a/src/operators/kernel/central-arm-func/prior_box_arm_func.h b/src/operators/kernel/central-arm-func/prior_box_arm_func.h
index 892dceb9254ac423d3591a0fc9e9347bc375831b..e783c52f8184d6e09b04cd5c8210f5b89276541e 100644
--- a/src/operators/kernel/central-arm-func/prior_box_arm_func.h
+++ b/src/operators/kernel/central-arm-func/prior_box_arm_func.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #pragma once
 #include <cmath>
+#include <vector>
 #include <algorithm>
 
 namespace paddle_mobile {
@@ -29,7 +30,7 @@ struct ClipFunctor {
 };
 
 template <typename P>
-void PriorBoxCompute(const PriorBoxParam &param) {
+void PriorBoxCompute(const PriorBoxParam<P> &param) {
   const auto *input_ = param.Input();
   const auto &input_dims = input_->dims();
@@ -89,26 +90,8 @@ void PriorBoxCompute(const PriorBoxParam &param) {
       int idx = 0;
       for (size_t s = 0; s < min_sizes.size(); ++s) {
         auto min_size = min_sizes[s];
-        // priors with different aspect ratios
-        for (float ar : aspect_ratios) {
-          box_width = min_size * sqrt(ar) / 2.;
-          box_height = min_size / sqrt(ar) / 2.;
-          /// Halving and dividing by the image size normalizes the prior's
-          /// feature-map position to the original image.
-          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] =
-              (center_x - box_width) / img_width;
-          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] =
-              (center_y - box_height) / img_height;
-          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 2] =
-              (center_x + box_width) / img_width;
-          output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] =
-              (center_y + box_height) / img_height;
-          idx++;
-        }
-        if (!max_sizes.empty()) {
-          auto max_size = max_sizes[s];
-          // square prior with size sqrt(minSize * maxSize)
-          box_width = box_height = sqrt(min_size * max_size) / 2.;
+        if (param.MinMaxAspectRatiosOrder()) {
+          box_width = box_height = min_size / 2.;
           output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 0] =
               (center_x - box_width) / img_width;
           output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 1] =
@@ -118,6 +101,73 @@ void PriorBoxCompute(const PriorBoxParam &param) {
           output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 + 3] =
               (center_y + box_height) / img_height;
           idx++;
+
+          if (max_sizes.size() > 0) {
+            auto max_size = max_sizes[s];
+            // square prior with size sqrt(minSize * maxSize)
+            box_width = box_height = sqrt(min_size * max_size) / 2.;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 0] = (center_x - box_width) / img_width;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 1] = (center_y - box_height) / img_height;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 2] = (center_x + box_width) / img_width;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 3] = (center_y + box_height) / img_height;
+            idx++;
+          }
+
+          // priors with different aspect ratios
+          for (float ar : aspect_ratios) {
+            if (fabs(ar - 1.) < 1e-6) {
+              continue;
+            }
+            box_width = min_size * sqrt(ar) / 2.;
+            box_height = min_size / sqrt(ar) / 2.;
+            /// Halving and dividing by the image size normalizes the prior's
+            /// feature-map position to the original image.
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 0] = (center_x - box_width) / img_width;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 1] = (center_y - box_height) / img_height;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 2] = (center_x + box_width) / img_width;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 3] = (center_y + box_height) / img_height;
+            idx++;
+          }
+
+        } else {
+          // priors with different aspect ratios
+          for (float ar : aspect_ratios) {
+            box_width = min_size * sqrt(ar) / 2.;
+            box_height = min_size / sqrt(ar) / 2.;
+            /// Halving and dividing by the image size normalizes the prior's
+            /// feature-map position to the original image.
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 0] = (center_x - box_width) / img_width;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 1] = (center_y - box_height) / img_height;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 2] = (center_x + box_width) / img_width;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 3] = (center_y + box_height) / img_height;
+            idx++;
+          }
+          if (!max_sizes.empty()) {
+            auto max_size = max_sizes[s];
+            // square prior with size sqrt(minSize * maxSize)
+            box_width = box_height = sqrt(min_size * max_size) / 2.;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 0] = (center_x - box_width) / img_width;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 1] = (center_y - box_height) / img_height;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 2] = (center_x + box_width) / img_width;
+            output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
+                                 3] = (center_y + box_height) / img_height;
+            idx++;
+          }
+        }
       }
     }
   }
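Both branches of the prior-box hunk above share one piece of arithmetic: a prior centered at (center_x, center_y) with half-extents box_width and box_height is stored as corner coordinates divided by the image size, so every output value lands in [0, 1]. MinMaxAspectRatiosOrder() only changes the order in which the min-size square, the sqrt(min*max) square, and the ar != 1 priors are emitted, not the arithmetic. A scalar worked example (all numbers illustrative):

#include <cmath>
#include <cstdio>

int main() {
  // Illustrative values: one feature-map cell of a 64x64 input image.
  const float img_width = 64.f, img_height = 64.f;
  const float center_x = 12.f, center_y = 12.f;  // cell center in pixels
  const float min_size = 16.f, ar = 2.f;         // one aspect ratio

  // Same arithmetic as the kernel: half extents, then normalize.
  float box_width = min_size * std::sqrt(ar) / 2.f;
  float box_height = min_size / std::sqrt(ar) / 2.f;
  std::printf("xmin=%.4f ymin=%.4f xmax=%.4f ymax=%.4f\n",
              (center_x - box_width) / img_width,
              (center_y - box_height) / img_height,
              (center_x + box_width) / img_width,
              (center_y + box_height) / img_height);
  return 0;
}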
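diff --git a/src/operators/kernel/central-arm-func/relu_arm_func.h b/src/operators/kernel/central-arm-func/relu_arm_func.h
index 19ccb3e862a29cab79453572b24ed0c5a2a8301d..d68569c0a5c0730d96a89cd534b2a89c0d3a9bff 100644
--- a/src/operators/kernel/central-arm-func/relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/relu_arm_func.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #pragma once
 #include <vector>
+#include "operators/op_param.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -29,7 +30,7 @@ struct ReluFunctor {
  * @b Implementation specialized to the concrete platform; param is passed
  *    in from the op layer.
  * */
 template <typename P>
-void ReluCompute(const ReluParam &param) {
+void ReluCompute(const ReluParam<P> &param) {
   const auto *input_x = param.InputX();
   auto *input_x_ptr = input_x->data<float>();
   auto *out = param.Out();

ReluFunctor's body falls outside the hunk's context window; on ARM the real kernel vectorizes it with NEON, but the scalar semantics are simply max(x, 0). A minimal sketch:

#include <algorithm>
#include <cassert>

// Scalar semantics of the functor: identity for positives, zero otherwise.
inline float Relu(float x) { return std::max(x, 0.f); }

int main() {
  assert(Relu(3.5f) == 3.5f);
  assert(Relu(-2.f) == 0.f);
  return 0;
}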
diff --git a/src/operators/kernel/central-arm-func/reshape_arm_func.h b/src/operators/kernel/central-arm-func/reshape_arm_func.h
index a2fb836257418923f41e94ceaf499e38033c6b4c..6e1a29dee6003ec26d58fd61e7445d74eca85edb 100644
--- a/src/operators/kernel/central-arm-func/reshape_arm_func.h
+++ b/src/operators/kernel/central-arm-func/reshape_arm_func.h
@@ -16,12 +16,14 @@ limitations under the License. */
 #pragma once
 #include <vector>
+#include "operators/kernel/reshape_kernel.h"
+#include "operators/op_param.h"
 
 namespace paddle_mobile {
 namespace operators {
 
 template <typename P>
-void ReshapeCompute(const ReshapeParam &param) {
+void ReshapeCompute(const ReshapeParam<P> &param) {
   const auto *input_x = param.InputX();
   const auto &input_x_dims = input_x->dims();
   auto *out = param.Out();
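ReshapeCompute delegates the shape arithmetic to helpers in the newly included reshape_kernel.h. Assuming Paddle's reshape convention (a 0 in the target shape keeps the corresponding input dimension; a single -1 absorbs the remaining element count), the inference step looks roughly like the sketch below; ValidateShape in reshape_kernel.h is the authoritative version:

#include <cassert>
#include <cstdint>
#include <vector>

// Hedged reimplementation of reshape's dim inference, for illustration.
std::vector<int64_t> InferShape(const std::vector<int64_t> &in,
                                std::vector<int64_t> shape) {
  int64_t in_numel = 1;
  for (int64_t d : in) in_numel *= d;
  int64_t known = 1;
  int64_t neg_index = -1;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == 0) shape[i] = in[i];  // 0 means: keep input dim i
    if (shape[i] == -1) {
      neg_index = static_cast<int64_t>(i);  // -1 means: infer this dim
    } else {
      known *= shape[i];
    }
  }
  if (neg_index >= 0) shape[neg_index] = in_numel / known;
  return shape;
}

int main() {
  // [2, 3, 4] reshaped with [0, -1] -> [2, 12]
  std::vector<int64_t> out = InferShape({2, 3, 4}, {0, -1});
  assert(out[0] == 2 && out[1] == 12);
  return 0;
}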
diff --git a/src/operators/kernel/central-arm-func/shape_arm_func.h b/src/operators/kernel/central-arm-func/shape_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa9154211fe24ff8e1cc4966f9684f1fbf5a3111
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/shape_arm_func.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SHAPE_OP
+#pragma once
+
+#include <vector>
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename P>
+void ShapeCompute(const ShapeParam<P>& param) {
+  auto* in_t = param.Input();
+  auto* out_t = param.Out();
+  auto out_data = out_t->mutable_data<int32_t>();
+  auto in_dims = in_t->dims();
+  for (int i = 0; i < in_dims.size(); ++i) {
+    out_data[i] = static_cast<int32_t>(in_dims[i]);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/sigmoid_arm_func.h b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..1076fa49d555d14da76ff08a67c0943fb9ab115a
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
@@ -0,0 +1,85 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SIGMOID_OP
+#pragma once
+
+#include <cmath>
+
+#include "operators/op_param.h"
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#include "operators/math/math_func_neon.h"
+#endif
+
+namespace paddle_mobile {
+namespace operators {
+using framework::DDim;
+void sigmoid(const Tensor *X, Tensor *Y) {
+#ifdef __ARM_NEON
+  const float *input = X->data<float>();
+  float *output = Y->mutable_data<float>();
+  const DDim &dDim = X->dims();
+  int axis_index = 1;
+  if (dDim.size() < 4) {
+    axis_index = 0;
+  }
+  DDim outer_ddim =
+      paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1);
+  DDim inner_ddim =
+      paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size());
+  int out_size = paddle_mobile::framework::product(outer_ddim);
+  int inner_size = paddle_mobile::framework::product(inner_ddim);
+
+  DLOG << "outsize=" << out_size;
+  DLOG << "innersize=" << inner_size;
+#pragma omp parallel for
+  for (int i = 0; i < out_size; ++i) {
+    const float *input_outer_ptr = input + i * inner_size;
+    float *output_outer_ptr = output + i * inner_size;
+    int nn = inner_size >> 2;
+    int remain = inner_size - (nn << 2);
+    float32x4_t _one = vdupq_n_f32(1.f);
+    for (; nn > 0; nn--) {
+      float32x4_t data = vld1q_f32(input_outer_ptr);
+      data = vnegq_f32(data);
+      data = exp_ps(data);
+      data = vaddq_f32(data, _one);
+      float32x4_t out_data = vrecpeq_f32(data);
+      out_data = vmulq_f32(vrecpsq_f32(data, out_data), out_data);
+      vst1q_f32(output_outer_ptr, out_data);
+
+      input_outer_ptr += 4;
+      output_outer_ptr += 4;
+    }
+    for (; remain > 0; remain--) {
+      *output_outer_ptr = 1.f / (1.f + exp(-*input_outer_ptr));
+      output_outer_ptr++;
+      input_outer_ptr++;
+    }
+  }
+#else
+#endif
+}
+
+template <typename P>
+void SigmoidCompute(const SigmoidParam<P> &param) {
+  const Tensor *in_x = param.InputX();
+  Tensor *out = param.Out();
+  auto x_dims = in_x->dims();
+  out->Resize(x_dims);
+  sigmoid(in_x, out);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
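The NEON loop above evaluates 1/(1 + e^-x) without a divide: vrecpeq_f32 produces a rough reciprocal estimate and vrecpsq_f32 returns the Newton-Raphson correction factor (2 - d*r), so a single multiply refines the estimate. A scalar demonstration of the same refinement step:

#include <cmath>
#include <cstdio>

int main() {
  float x = 1.5f;
  float d = 1.f + std::exp(-x);  // denominator, as in the NEON path

  // Rough reciprocal estimate. vrecpeq_f32 is accurate to roughly 8-9
  // bits; a deliberately perturbed value stands in for it here.
  float r = 1.f / d * 1.01f;

  // One Newton-Raphson step: r' = r * (2 - d * r), which is exactly
  // vmulq_f32(vrecpsq_f32(data, out_data), out_data).
  r = r * (2.f - d * r);

  std::printf("refined: %.7f  exact: %.7f\n", r, 1.f / d);
  return 0;
}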
diff --git a/src/operators/kernel/central-arm-func/softmax_arm_func.h b/src/operators/kernel/central-arm-func/softmax_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..d311d97984a7207df9075befe71a9806092966e1
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/softmax_arm_func.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SOFTMAX_OP
+#pragma once
+#include "../../math/softmax.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename P>
+void SoftmaxCompute(const SoftmaxParam<P> &param) {
+  const Tensor *in_x = param.InputX();
+  Tensor *out = param.Out();
+  auto x_dims = in_x->dims();
+  out->Resize(x_dims);
+  math::SoftmaxFuntor<CPU, float>()(in_x, out);
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
diff --git a/src/operators/kernel/central-arm-func/split_arm_func.h b/src/operators/kernel/central-arm-func/split_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..24ab2f83a4f3be8b29cb9e33347d639c52f9eea1
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/split_arm_func.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SPLIT_OP
+#pragma once
+
+#include <vector>
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Strided numel memory copy from src to dst by the specified axis
+//
+// For example, for a tensor dims [4, 20, 100], the strided numel is
+// [8000, 2000, 100]
+//
+// NOTE: The src and dst tensor should have the same elements
+// except the specified axis.
+template <typename T>
+inline void StridedNumelCopyWithAxis(int64_t axis, T* dst,
+                                     const framework::DDim& dst_stride_numel,
+                                     const T* src,
+                                     const framework::DDim& src_stride_numel,
+                                     int64_t size) {
+  int64_t before = dst_stride_numel[0] / dst_stride_numel[axis];
+  int64_t src_after = src_stride_numel[axis];
+  int64_t dst_after = dst_stride_numel[axis];
+
+  PADDLE_MOBILE_ENFORCE(src_stride_numel.size() == dst_stride_numel.size(),
+                        "src and dst tensor should have the same dims size.");
+
+  for (int64_t i = 0; i < dst_stride_numel.size(); ++i) {
+    if (i < axis) {
+      PADDLE_MOBILE_ENFORCE(src_stride_numel[i] / src_stride_numel[axis] ==
+                                dst_stride_numel[i] / dst_stride_numel[axis],
+                            "src and dst should have the same elements "
+                            "except the specified axis.");
+    } else if (i == axis) {
+      continue;
+    } else {
+      PADDLE_MOBILE_ENFORCE(src_stride_numel[i] == dst_stride_numel[i],
+                            "src and dst should have the same elements "
+                            "except the specified axis.");
+    }
+  }
+
+  for (int64_t i = 0; i < before; ++i) {
+    memory::Copy(dst + i * dst_after, src + i * src_after, sizeof(T) * size);
+  }
+}
+
+template <typename P>
+void SplitCompute(const SplitParam<P>& param) {
+  auto* in = param.InputX();
+  auto outs = param.Outs();
+  auto in_stride = framework::stride_numel(in->dims());
+  int64_t axis = param.Axis();
+
+  size_t input_offset = 0;
+  for (auto& out : outs) {
+    out->mutable_data<float>();
+    auto out_stride = framework::stride_numel(out->dims());
+
+    StridedNumelCopyWithAxis<float>(axis, out->data<float>(), out_stride,
+                                    in->data<float>() + input_offset,
+                                    in_stride, out_stride[axis]);
+    input_offset += out_stride[axis];
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/transpose_arm_func.h b/src/operators/kernel/central-arm-func/transpose_arm_func.h
index
1cbebc4525113374061541518775a94c6a64401f..1bd2e11a3405abc99c5a33be4ec9b61855f77b08 100644 --- a/src/operators/kernel/central-arm-func/transpose_arm_func.h +++ b/src/operators/kernel/central-arm-func/transpose_arm_func.h @@ -16,6 +16,7 @@ limitations under the License. */ #pragma once #include +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -38,7 +39,7 @@ namespace operators { // } template -void TransposeCompute(const TransposeParam& param) { +void TransposeCompute(const TransposeParam& param) { const auto* input_x = param.InputX(); const auto input_x_dims = input_x->dims(); auto* out = param.Out(); diff --git a/src/operators/kernel/concat_kernel.h b/src/operators/kernel/concat_kernel.h index adba64391e3e79569030c95e2d2681a31187f03a..61100bf5f0e9de43bfb6295a0719f1be0954d128 100644 --- a/src/operators/kernel/concat_kernel.h +++ b/src/operators/kernel/concat_kernel.h @@ -24,10 +24,11 @@ namespace operators { using namespace framework; template -class ConcatKernel : public framework::OpKernelBase { +class ConcatKernel + : public framework::OpKernelBase> { public: - void Compute(const ConcatParam ¶m) const; - bool Init(ConcatParam *param); + void Compute(const ConcatParam ¶m) const; + bool Init(ConcatParam *param); }; } // namespace operators diff --git a/src/operators/kernel/conv_add_add_prelu_kernel.h b/src/operators/kernel/conv_add_add_prelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5715cd46d5a6c7e80ab5ff77ba83c7973e1db811 --- /dev/null +++ b/src/operators/kernel/conv_add_add_prelu_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef FUSION_CONVADDADDPRELU_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvAddAddPReluKernel + : public OpKernelBase> { + public: + void Compute(const FusionConvAddAddPReluParam ¶m) const; + bool Init(FusionConvAddAddPReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_add_bn_kernel.h b/src/operators/kernel/conv_add_bn_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ee73215c4688c3e604de69cda55b05e63844c0b8 --- /dev/null +++ b/src/operators/kernel/conv_add_bn_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef FUSION_CONVADDBN_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvAddBNKernel + : public OpKernelBase> { + public: + void Compute(const FusionConvAddBNParam ¶m) const; + bool Init(FusionConvAddBNParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_add_bn_relu_kernel.h b/src/operators/kernel/conv_add_bn_relu_kernel.h index 73aaf4c900393b9cbee4682fc67147d9ef0853fc..9faaaedcf8d6f825f818ebf5121dc7685185d5d8 100644 --- a/src/operators/kernel/conv_add_bn_relu_kernel.h +++ b/src/operators/kernel/conv_add_bn_relu_kernel.h @@ -33,10 +33,10 @@ using framework::OpKernelBase; template class ConvAddBNReluKernel - : public OpKernelBase { + : public OpKernelBase> { public: - void Compute(const FusionConvAddBNReluParam ¶m) const; - bool Init(FusionConvAddBNReluParam *param); + void Compute(const FusionConvAddBNReluParam ¶m) const; + bool Init(FusionConvAddBNReluParam *param); }; } // namespace operators diff --git a/src/operators/kernel/conv_add_kernel.h b/src/operators/kernel/conv_add_kernel.h index 465d8bdd8cfd71d678eb2816cae10ea6a06cec35..360cbb6775168885e9c1a25db1f9ffb9e552324b 100644 --- a/src/operators/kernel/conv_add_kernel.h +++ b/src/operators/kernel/conv_add_kernel.h @@ -17,7 +17,7 @@ limitations under the License. */ #pragma once #include -#if __ARM_NEON +#ifdef __ARM_NEON #include #endif #include "common/common.h" @@ -37,10 +37,11 @@ using framework::DDim; using framework::OpKernelBase; template -class ConvAddKernel : public OpKernelBase { +class ConvAddKernel + : public OpKernelBase> { public: - void Compute(const FusionConvAddParam ¶m) const; - bool Init(FusionConvAddParam *param); + void Compute(const FusionConvAddParam ¶m) const; + bool Init(FusionConvAddParam *param); }; } // namespace operators diff --git a/src/operators/kernel/conv_add_prelu_kernel.h b/src/operators/kernel/conv_add_prelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..a109f84cf09b4d0e2469a1885b902c0f70acc6c8 --- /dev/null +++ b/src/operators/kernel/conv_add_prelu_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef FUSION_CONVADDPRELU_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvAddPReluKernel + : public OpKernelBase> { + public: + void Compute(const FusionConvAddPReluParam ¶m) const; + bool Init(FusionConvAddPReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_add_relu_kernel.h b/src/operators/kernel/conv_add_relu_kernel.h index 3f36d80c4781aebea756b04e340d056a79cfd7d7..f33b1dc312e1d94be0c23cff55e9e6789a556bc7 100644 --- a/src/operators/kernel/conv_add_relu_kernel.h +++ b/src/operators/kernel/conv_add_relu_kernel.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef FUSION_CONVADD_RELU_OP +#ifdef FUSION_CONVADDRELU_OP #include #include "framework/ddim.h" @@ -33,10 +33,10 @@ using framework::OpKernelBase; template class ConvAddReluKernel - : public OpKernelBase { + : public OpKernelBase> { public: - void Compute(const FusionConvAddReluParam ¶m) const; - bool Init(FusionConvAddReluParam *param); + void Compute(const FusionConvAddReluParam ¶m) const; + bool Init(FusionConvAddReluParam *param); }; } // namespace operators diff --git a/src/operators/kernel/conv_bn_add_relu_kernel.h b/src/operators/kernel/conv_bn_add_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..820e5f8bcbf58676e8374e575044b10fe4676efa --- /dev/null +++ b/src/operators/kernel/conv_bn_add_relu_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef FUSION_CONVBNADDRELU_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvBNAddReluKernel + : public OpKernelBase> { + public: + void Compute(const FusionConvBNAddReluParam ¶m) const; + bool Init(FusionConvBNAddReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_bn_kernel.h b/src/operators/kernel/conv_bn_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f740ca836481c1331ea2e889865b3078d48644a6 --- /dev/null +++ b/src/operators/kernel/conv_bn_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef FUSION_CONVBN_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvBNKernel + : public OpKernelBase> { + public: + void Compute(const FusionConvBNParam ¶m) const; + bool Init(FusionConvBNParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_bn_relu_kernel.h b/src/operators/kernel/conv_bn_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..225976aa5db31096ef691ecefa8b63d4ae3dc277 --- /dev/null +++ b/src/operators/kernel/conv_bn_relu_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef FUSION_CONVBNRELU_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvBNReluKernel + : public OpKernelBase> { + public: + void Compute(const FusionConvBNReluParam ¶m) const; + bool Init(FusionConvBNReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_kernel.h b/src/operators/kernel/conv_kernel.h index fedbee32a006f263fd3de25064496dad1a23177b..93474adaa97743d1850b53df114ae08f144aebca 100644 --- a/src/operators/kernel/conv_kernel.h +++ b/src/operators/kernel/conv_kernel.h @@ -29,10 +29,10 @@ namespace operators { using framework::OpKernelBase; template -class ConvKernel : public OpKernelBase { +class ConvKernel : public OpKernelBase> { public: - void Compute(const ConvParam ¶m) const; - bool Init(ConvParam *param); + void Compute(const ConvParam ¶m) const; + bool Init(ConvParam *param); }; } // namespace operators diff --git a/src/operators/kernel/conv_transpose_kernel.h b/src/operators/kernel/conv_transpose_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..9cbd7c8c3bafde8b4f4939e86ceabdd94dbd3bc8 --- /dev/null +++ b/src/operators/kernel/conv_transpose_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CONV_TRANSPOSE + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::OpKernelBase; + +template +class ConvTransposeKernel + : public OpKernelBase> { + public: + void Compute(const ConvTransposeParam ¶m) const; + + bool Init(ConvTransposeParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif // PADDLE_MOBILE_DE_CONV_KERNEL_H diff --git a/src/operators/kernel/crf_kernel.h b/src/operators/kernel/crf_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..71c07cf0384d482522de3a6652c6d24a22af656a --- /dev/null +++ b/src/operators/kernel/crf_kernel.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CRF_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class CrfKernel + : public framework::OpKernelBase> { + public: + void Compute(const CrfParam& param) const; + bool Init(CrfParam* param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/depthwise_conv_kernel.h b/src/operators/kernel/depthwise_conv_kernel.h index b74a58a649bd9fa27e941e2cd5ea50b30c0218cb..605b81cd6ed4ccd54b1803cf7a603b8f4576982d 100644 --- a/src/operators/kernel/depthwise_conv_kernel.h +++ b/src/operators/kernel/depthwise_conv_kernel.h @@ -28,10 +28,11 @@ namespace operators { using framework::OpKernelBase; template -class DepthwiseConvKernel : public OpKernelBase { +class DepthwiseConvKernel + : public OpKernelBase> { public: - void Compute(const ConvParam ¶m) const; - bool Init(ConvParam *param); + void Compute(const ConvParam ¶m) const; + bool Init(ConvParam *param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/dropout_kernel.h b/src/operators/kernel/dropout_kernel.h index 5a3783971959db8fba9ca6b701fb6eb6340fcb3f..b7535095d4fef11ee628aea96a074abcc3562f7f 100644 --- a/src/operators/kernel/dropout_kernel.h +++ b/src/operators/kernel/dropout_kernel.h @@ -17,16 +17,17 @@ limitations under the License. */ #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; +#pragma once namespace paddle_mobile { namespace operators { template -class DropoutKernel : public framework::OpKernelBase { +class DropoutKernel + : public framework::OpKernelBase> { public: - void Compute(const DropoutParam& param) const; - bool Init(DropoutParam* para); + void Compute(const DropoutParam& param) const; + bool Init(DropoutParam* para); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/dwconv_bn_relu_kernel.h b/src/operators/kernel/dwconv_bn_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..594c594cb00f8f4ddd8a511f3c992c4efbfcdfc6 --- /dev/null +++ b/src/operators/kernel/dwconv_bn_relu_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef FUSION_DWCONVBNRELU_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class DWConvBNReluKernel + : public OpKernelBase> { + public: + void Compute(const FusionDWConvBNReluParam ¶m) const; + bool Init(FusionDWConvBNReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/elementwise_add_kernel.h b/src/operators/kernel/elementwise_add_kernel.h index 70334c1d3f788f60e974da74133823f82ab05765..67182af2e20e23c40effab6b87eefde1e0ab629d 100644 --- a/src/operators/kernel/elementwise_add_kernel.h +++ b/src/operators/kernel/elementwise_add_kernel.h @@ -27,10 +27,11 @@ using namespace framework; template class ElementwiseAddKernel - : public framework::OpKernelBase { + : public framework::OpKernelBase> { public: - void Compute(const ElementwiseAddParam ¶m) const; - bool Init(ElementwiseAddParam *param); + void Compute(const ElementwiseAddParam ¶m) const; + bool Init(ElementwiseAddParam *param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/elementwise_add_relu_kernel.h b/src/operators/kernel/elementwise_add_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5eda5a0c56c228ad54c888b6faa82ce9417f2dc1 --- /dev/null +++ b/src/operators/kernel/elementwise_add_relu_kernel.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_ELEMENTWISEADDRELU_OP + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using namespace framework; + +template +class ElementwiseAddReluKernel + : public framework::OpKernelBase> { + public: + void Compute(const ElementwiseAddReluParam ¶m) const; + bool Init(ElementwiseAddReluParam *param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fc_relu_kernel.h b/src/operators/kernel/fc_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6e9446da37df4ba83db85d416aa87f216816c4a5 --- /dev/null +++ b/src/operators/kernel/fc_relu_kernel.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_FCRELU_OP + +#pragma once + +#include "framework/operator.h" +#include "operators/math/math_function.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class FusionFcReluKernel + : public framework::OpKernelBase> { + public: + void Compute(const FusionFcReluParam& param) const; + bool Init(FusionFcReluParam* param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/flatten_kernel.h b/src/operators/kernel/flatten_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..80d66ccf87c21532c8b4590d992f5bccbe4f00dc --- /dev/null +++ b/src/operators/kernel/flatten_kernel.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FLATTEN_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class FlattenKernel + : public framework::OpKernelBase> { + public: + void Compute(const FlattenParam& param) const; + bool Init(FlattenParam* param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1c48ef021945e6a7b8b53ee946a33b862766deeb --- /dev/null +++ b/src/operators/kernel/fpga/concat_kernel.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONCAT_OP + +#include "operators/kernel/concat_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConcatKernel::Init(ConcatParam *param) { + auto inputs = param->Inputs(); + auto out = param->Out(); + auto image_num = inputs.size(); + auto images_in = (half **)fpga::fpga_malloc(image_num * sizeof(int *)); + auto scales_in = (float **)fpga::fpga_malloc(image_num * sizeof(float *)); + auto channel_num = + (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); + + auto height = inputs[0]->dims()[2]; + auto width = inputs[0]->dims()[3]; + for (int i = 0; i < image_num; i++) { + auto input = inputs[i]; + PADDLE_MOBILE_ENFORCE( + input->dims()[2] == height && input->dims()[3] == width, + "Image height & width should be unified"); + images_in[i] = (half *)input->data(); + channel_num[i] = (uint32_t)inputs[i]->dims()[1]; + scales_in[i] = input->scale; + } + fpga::format_concat_output(out, (int)height, (int)width, (int)image_num, + channel_num); + + fpga::ConcatArgs concatArgs = {0}; + concatArgs.image_num = (uint32_t)image_num; + concatArgs.images_in = images_in; + concatArgs.scales_in = scales_in; + concatArgs.image_out = (half *)out->data(); + concatArgs.scale_out = out->scale; + concatArgs.channel_num = channel_num; + concatArgs.height = (uint32_t)height; + concatArgs.width = (uint32_t)width; + param->SetFpgaArgs(concatArgs); + return true; +} + +template <> +void ConcatKernel::Compute(const ConcatParam ¶m) const { + ComputeFPGAConcat(param.FpgaArgs()); +} +template class ConcatKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..671df76967b4537d111695cdbe091b9c7de2c5a2 --- /dev/null +++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#ifdef FUSION_CONVADDBN_OP
+
+#include "operators/kernel/conv_add_bn_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
+  bool relu_enabled = false;
+  auto input = const_cast<Tensor *>(param->Input());
+
+  auto bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<Tensor *>(param->Filter());
+
+  auto out = param->Output();
+
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
+                            bias->dims()[0] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+
+  const int channel = out->dims()[1];
+  auto bs_ptr = reinterpret_cast<float *>(
+      fpga::fpga_malloc(2 * channel * sizeof(float)));
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias->mutable_data<float>({channel});
+
+  for (int i = 0; i < channel; i++) {
+    new_scale_ptr[i] = bn_scale_ptr[i] /
+                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
+    new_bias_ptr[i] =
+        bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
+    bs_ptr[i + channel] = new_scale_ptr[i];
+    bs_ptr[i] = new_bias_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+
+  float max_value = fpga::filter_find_max(filter);
+  fpga::format_filter(filter, max_value, param->Groups());
+
+  int element_num_per_div =
+      fpga::get_filter_num_per_div(filter, param->Groups());
+  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
+  fpga::format_fp16_ofm(out);
+
+  fpga::WrapperConvArgs conv_arg = {0};
+  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
+                      param->Groups(), param->Strides()[0], param->Strides()[1],
+                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+
+  return true;
+}
+
+template <>
+void ConvAddBNKernel<FPGA, float>::Compute(
+    const FusionConvAddBNParam<FPGA> &param) const {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
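The Init above folds batch norm and the elementwise bias into one per-channel scale/bias pair: new_scale = gamma / sqrt(var + epsilon) and new_bias = beta + (bias - mean) * new_scale, so new_scale * conv(x) + new_bias equals BN(conv(x) + bias) exactly, and the FPGA conv only needs the folded pair. A scalar check of that identity (values illustrative):

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  // One output channel with illustrative statistics.
  const float conv_out = 2.0f, bias = 0.5f;
  const float mean = 1.2f, var = 0.8f, gamma = 1.1f, beta = -0.3f;
  const float epsilon = 1e-5f;

  // Batch norm applied directly to (conv output + bias).
  float bn =
      gamma * ((conv_out + bias) - mean) / std::sqrt(var + epsilon) + beta;

  // Folded form, exactly as the kernel precomputes it.
  float new_scale = gamma / std::sqrt(var + epsilon);
  float new_bias = beta + (bias - mean) * new_scale;
  float folded = new_scale * conv_out + new_bias;

  assert(std::fabs(bn - folded) < 1e-5f);
  std::printf("bn=%.6f folded=%.6f\n", bn, folded);
  return 0;
}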
diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d435692db6b40568afc599733c2adb6b05b00ffa
--- /dev/null
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
@@ -0,0 +1,84 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADDBNRELU_OP
+
+#include "operators/kernel/conv_add_bn_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvAddBNReluKernel<FPGA, float>::Init(
+    FusionConvAddBNReluParam<FPGA> *param) {
+  bool relu_enabled = true;
+  auto input = const_cast<Tensor *>(param->Input());
+  const Tensor *bias = param->Bias();
+  auto bias_ptr = bias->data<float>();
+  auto filter = const_cast<Tensor *>(param->Filter());
+  auto out = param->Output();
+  auto bn_mean_ptr = param->InputMean()->data<float>();
+  auto bn_var_ptr = param->InputVariance()->data<float>();
+  auto bn_scale_ptr = param->InputScale()->data<float>();
+  auto bn_bias_ptr = param->InputBias()->data<float>();
+  const float epsilon = param->Epsilon();
+  PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0] &&
+                            bias->dims()[0] == param->InputBias()->dims()[0],
+                        "Output channel should be equal to bias number");
+
+  const int channel = out->dims()[1];
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  auto new_scale = new Tensor();
+  auto new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({channel});
+  auto new_bias_ptr = new_bias->mutable_data<float>({channel});
+
+  for (int i = 0; i < channel; i++) {
+    new_scale_ptr[i] = bn_scale_ptr[i] /
+                       static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
+    new_bias_ptr[i] =
+        bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
+    bs_ptr[i + channel] = new_scale_ptr[i];
+    bs_ptr[i] = new_bias_ptr[i];
+  }
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+
+  float max_value = fpga::filter_find_max(filter);
+  fpga::format_filter(filter, max_value, param->Groups());
+
+  int element_num_per_div =
+      fpga::get_filter_num_per_div(filter, param->Groups());
+  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
+
+  fpga::format_fp16_ofm(out);
+
+  fpga::WrapperConvArgs conv_arg = {0};
+  fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
+                      param->Groups(), param->Strides()[0], param->Strides()[1],
+                      param->Paddings()[0], param->Paddings()[1], bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+
+template <>
+void ConvAddBNReluKernel<FPGA, float>::Compute(
+    const FusionConvAddBNReluParam<FPGA> &param) const {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..32d90b36e4c14a60219a3779da03100651aa2f13
--- /dev/null
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
@@ -0,0 +1,66 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#ifdef FUSION_CONVADDRELU_OP + +#include "operators/kernel/conv_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { + bool relu_enabled = true; + auto input = const_cast(param->Input()); + const Tensor *bias = param->Bias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + for (int i = 0; i < channel; i++) { + bs_ptr[i + channel] = 1; + bs_ptr[i] = bias_ptr[i]; + } + + float max_value = fpga::filter_find_max(filter); + fpga::format_filter(filter, max_value, param->Groups()); + + int element_num_per_div = + fpga::get_filter_num_per_div(filter, param->Groups()); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + + fpga::format_fp16_ofm(out); + + fpga::WrapperConvArgs conv_arg = {0}; + fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void ConvAddReluKernel::Compute( + const FusionConvAddReluParam ¶m) const { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4263c9c40491366813d3c9a5bf7dbc8ae976d39e --- /dev/null +++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp @@ -0,0 +1,78 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVBN_OP + +#include "operators/kernel/conv_bn_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvBNKernel::Init(FusionConvBNParam *param) { + bool relu_enabled = false; + auto input = const_cast(param->Input()); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], + "Output channel should be equal to bias number"); + const int channel = out->dims()[1]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); + new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + float max_value = fpga::filter_find_max(filter); + fpga::format_filter(filter, max_value, param->Groups()); + + int element_num_per_div = + fpga::get_filter_num_per_div(filter, param->Groups()); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + + fpga::format_fp16_ofm(out); + + fpga::WrapperConvArgs conv_arg = {0}; + fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void ConvBNKernel::Compute( + const FusionConvBNParam ¶m) const { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3d6e0faa5fe3d4ef3514bbe1679298b11d96727c --- /dev/null +++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp @@ -0,0 +1,78 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVBNRELU_OP + +#include "operators/kernel/conv_bn_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { + bool relu_enabled = true; + auto input = const_cast(param->Input()); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], + "Output channel should be equal to bias number"); + const int channel = out->dims()[1]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); + new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; + } + param->SetNewScale(new_scale); + param->SetNewBias(new_bias); + + float max_value = fpga::filter_find_max(filter); + fpga::format_filter(filter, max_value, param->Groups()); + + int element_num_per_div = + fpga::get_filter_num_per_div(filter, param->Groups()); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + + fpga::format_fp16_ofm(out); + + fpga::WrapperConvArgs conv_arg = {0}; + fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void ConvBNReluKernel::Compute( + const FusionConvBNReluParam ¶m) const { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/conv_kernel.cpp b/src/operators/kernel/fpga/dropout_kernel.cpp similarity index 73% rename from src/operators/kernel/fpga/conv_kernel.cpp rename to src/operators/kernel/fpga/dropout_kernel.cpp index dc537362a216983974bea325433c456136356fc8..b0981c4254060996a16f4ae5beabb7c22edd6d34 100644 --- a/src/operators/kernel/fpga/conv_kernel.cpp +++ b/src/operators/kernel/fpga/dropout_kernel.cpp @@ -12,21 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef CONV_OP +#ifdef DROPOUT_OP -#include "operators/kernel/conv_kernel.h" +#include "operators/kernel/dropout_kernel.h" namespace paddle_mobile { namespace operators { template <> -bool ConvKernel::Init(ConvParam *param) { +bool DropoutKernel::Init(DropoutParam *param) { + param->Out()->ShareDataWith(*param->InputX()); return true; } template <> -void ConvKernel::Compute(const ConvParam ¶m) const {} -template class ConvKernel; +void DropoutKernel::Compute( + const DropoutParam ¶m) const {} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f0d8533641941fe43a6d06b49266ac06646a7b4d --- /dev/null +++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef FUSION_ELEMENTWISEADDRELU_OP + +#include "operators/kernel/elementwise_add_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseAddReluKernel::Init( + ElementwiseAddReluParam *param) { + bool relu_enabled = true; + auto *input_x = const_cast(param->InputX()); + auto *input_y = const_cast(param->InputY()); + auto *out = param->Out(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + fpga::format_fp16_ofm(out); + auto out_ptr = out->mutable_data(); + + fpga::EWAddArgs ewaddArgs = {0}; + ewaddArgs.relu_enabled = relu_enabled; + ewaddArgs.const0 = 1; + ewaddArgs.const1 = 1; + ewaddArgs.image0.address = input_x_ptr; + ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; + ewaddArgs.image0.scale_address = input_x->scale; + ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; + ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; + ewaddArgs.image0.pad_height = 0; + ewaddArgs.image0.pad_width = 0; + ewaddArgs.image1.address = input_y_ptr; + ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; + ewaddArgs.image1.scale_address = input_y->scale; + ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; + ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; + ewaddArgs.image1.pad_height = 0; + ewaddArgs.image1.pad_width = 0; + ewaddArgs.output.scale_address = out->scale; + ewaddArgs.output.address = out_ptr; + param->SetFpgaArgs(ewaddArgs); + return true; +} + +template <> +void ElementwiseAddReluKernel::Compute( + const ElementwiseAddReluParam ¶m) const { + fpga::ComputeFpgaEWAdd(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..38b39f982ce41c7d5a88b82f21e446b05c859a2c --- /dev/null +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_FCRELU_OP
+#include "operators/kernel/fc_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
+  bool relu_enabled = true;
+  auto input_x = const_cast<Tensor *>(param->InputX());
+  auto filter = const_cast<Tensor *>(param->InputY());
+  auto input_z = param->InputZ();
+  auto input_z_ptr = input_z->data<float>();
+  auto out = param->Out();
+  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
+                        "Image channel should be equal to weight number");
+  int channel = (uint32_t)out->dims()[1];
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i + channel] = 1;
+    bs_ptr[i] = input_z_ptr[i];
+  }
+
+  int num = (uint32_t)filter->dims()[1];
+  int chw = (uint32_t)filter->dims()[0];
+  PADDLE_MOBILE_ENFORCE(
+      chw == input_x->numel(),
+      "Filter element num should be equal to IFM element num");
+  int height = (uint32_t)input_x->dims()[2];
+  int width = (uint32_t)input_x->dims()[3];
+  int filter_channel = chw / height / width;
+
+  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
+  float max_value = fpga::filter_find_max(filter);
+  fpga::format_filter(filter, max_value, 1);
+
+  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
+  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
+  fpga::format_fp16_ofm(out);
+
+  fpga::WrapperConvArgs conv_arg = {0};
+  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
+                      0, bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+template <>
+void FusionFcReluKernel<FPGA, float>::Compute(
+    const FusionFcReluParam<FPGA> &param) const {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6dee8ea6a7e1b26bec4ffd3ed324db4a4ac3be2d
--- /dev/null
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
@@ -0,0 +1,70 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
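Aside: fc_relu_kernel.cpp above runs a fully connected layer on the FPGA convolution engine. The weight matrix of shape {chw, num} is reshaped into num filters of shape {filter_channel, height, width} that exactly cover the input feature map, so a convolution with groups = 1, stride 1 and zero padding (the trailing 1, 1, 1, 0, 0 arguments to fill_conv_arg) reproduces the matrix multiply. The shape arithmetic in isolation (hypothetical helper):

// W: {chw, num}  ->  filters: {num, c, h, w}, where c is derived from the
// input feature map's spatial extent; chw must factor exactly as c * h * w.
int FcFilterChannel(int chw, int height, int width) {
  return chw / height / width;
}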
+#ifdef FUSION_FC_OP
+
+#include "operators/kernel/fusion_fc_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
+  bool relu_enabled = false;
+  auto input_x = const_cast<Tensor *>(param->InputX());
+  auto filter = const_cast<Tensor *>(param->InputY());
+  const Tensor *input_z = param->InputZ();
+  auto input_z_ptr = input_z->data<float>();
+  auto out = param->Out();
+
+  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
+                        "Image channel should be equal to weight number");
+  int channel = (uint32_t)out->dims()[1];
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  for (int i = 0; i < channel; i++) {
+    bs_ptr[i + channel] = 1;
+    bs_ptr[i] = input_z_ptr[i];
+  }
+  int num = (uint32_t)filter->dims()[1];
+  int chw = (uint32_t)filter->dims()[0];
+  PADDLE_MOBILE_ENFORCE(
+      chw == input_x->numel(),
+      "Filter element num should be equal to IFM element num");
+  int height = (uint32_t)input_x->dims()[2];
+  int width = (uint32_t)input_x->dims()[3];
+  int filter_channel = chw / height / width;
+
+  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
+  float max_value = fpga::filter_find_max(filter);
+  fpga::format_filter(filter, max_value, 1);
+
+  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
+  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
+  fpga::format_fp16_ofm(out);
+
+  fpga::WrapperConvArgs conv_arg = {0};
+  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
+                      0, bs_ptr);
+  param->SetFpgaArgs(conv_arg);
+  return true;
+}
+
+template <>
+void FusionFcKernel<FPGA, float>::Compute(
+    const FusionFcParam<FPGA> &param) const {
+  fpga::ComputeFpgaConv(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/pool_kernel.cpp b/src/operators/kernel/fpga/pool_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4dad2f789baeb6e381c66ed861b8a8360fa2996e
--- /dev/null
+++ b/src/operators/kernel/fpga/pool_kernel.cpp
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POOL_OP
+
+#include "operators/kernel/pool_kernel.h"
+
+class PoolingArgs;
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
+  auto *input = const_cast<Tensor *>(param->Input());
+  auto input_ptr = input->data<float>();
+  Tensor *output = param->Output();
+  fpga::format_fp16_ofm(output);
+  auto output_ptr = output->mutable_data<float>();
+  vector<int> ksize = param->Ksize();
+  vector<int> strides = param->Strides();
+  vector<int> paddings = param->Paddings();
+
+  fpga::PoolingArgs poolArgs = {0};
+  poolArgs.image.address = input_ptr;
+  poolArgs.image.channels = (uint32_t)input->dims()[1];
+  poolArgs.image.height = (uint32_t)input->dims()[2];
+  poolArgs.image.width = (uint32_t)input->dims()[3];
+  poolArgs.image.pad_height = (uint32_t)paddings[0];
+  poolArgs.image.pad_width = (uint32_t)paddings[1];
+  poolArgs.image.scale_address = input->scale;
+  poolArgs.output.address = output_ptr;
+  poolArgs.output.scale_address = output->scale;
+  poolArgs.kernel.height = (uint32_t)ksize[0];
+  poolArgs.kernel.width = (uint32_t)ksize[1];
+  poolArgs.kernel.stride_h = (uint32_t)strides[0];
+  poolArgs.kernel.stride_w = (uint32_t)strides[1];
+  param->SetFpgaArgs(poolArgs);
+  return true;
+}
+
+template <>
+void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> &param) const {
+  fpga::ComputeFpgaPool(param.FpgaArgs());
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef68cc3c33fdc4c0a8537cbb1dd3a49583c6c8b1
--- /dev/null
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SOFTMAX_OP
+
+#include "../softmax_kernel.h"
+#include "../central-arm-func/softmax_arm_func.h"
+#include "common/types.h"
+#include "fpga/api.h"
+#include "operators/math/softmax.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
+  auto input = const_cast<Tensor *>(param->InputX());
+  auto input_ptr = input->data<float>();
+  auto float_input = new Tensor;
+  float_input->mutable_data<float>(input->dims());
+  fpga::format_fp32_ofm(float_input);
+
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
+  args.input_layout_type = fpga::LAYOUT_HWC;
+  args.output_layout_type = fpga::LAYOUT_CHW;
+  args.input_data_type = fpga::DATA_TYPE_FP16;
+  args.output_data_type = fpga::DATA_TYPE_FP32;
+  args.image.address = input_ptr;
+  args.image.height = 1;
+  args.image.width = 1;
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.output.address = float_input->data<float>();
+  args.output.scale_address = float_input->scale;
+  param->SetFloatInput(float_input);
+  param->SetFpgaArgs(args);
+  return true;
+}
+
+template <>
+void SoftmaxKernel<FPGA, float>::Compute(
+    const SoftmaxParam<FPGA> &param) const {
+  Tensor *in_x = param.FloatInput();
+  Tensor *out = param.Out();
+
+  fpga::PerformBypass(param.FpgaArgs());
+  fpga::fpga_invalidate(
+      (void *)in_x->data<float>(),
+      (size_t)fpga::get_align_image_cw((int)in_x->dims()[1]) * sizeof(float));
+
+  math::SoftmaxFuntor<CPU, float>()(in_x, out);
+  fpga::fpga_flush(out->data<float>(), out->memory_size());
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fusion_fc_kernel.h b/src/operators/kernel/fusion_fc_kernel.h
index 0e31134ba5a18405a5855db1e85b3885608c4071..39cfd898a203e742168a775ec892e562bd19f5db 100644
--- a/src/operators/kernel/fusion_fc_kernel.h
+++ b/src/operators/kernel/fusion_fc_kernel.h
@@ -25,10 +25,10 @@ namespace operators {
 
 template <typename DeviceType, typename T>
 class FusionFcKernel
-    : public framework::OpKernelBase<DeviceType, FusionFcParam> {
+    : public framework::OpKernelBase<DeviceType, FusionFcParam<DeviceType>> {
  public:
-  void Compute(const FusionFcParam& param) const;
-  bool Init(FusionFcParam* param);
+  void Compute(const FusionFcParam<DeviceType>& param) const;
+  bool Init(FusionFcParam<DeviceType>* param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/gru_kernel.h b/src/operators/kernel/gru_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b02663bd0e2982bdb2480c54632d2a8da9f67fc
--- /dev/null
+++ b/src/operators/kernel/gru_kernel.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
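Aside: the FPGA softmax above is a hybrid kernel: PerformBypass converts the fp16 HWC feature map into an fp32 CHW buffer, fpga_invalidate makes the device writes visible to the CPU cache, the softmax itself runs on the CPU, and fpga_flush writes the result back. What the CPU functor computes per row, as a reference sketch (assuming a {batch, channels} layout; not the library's code):

#include <algorithm>
#include <cmath>

// Numerically stable softmax over one row of n values.
void SoftmaxRow(const float *in, float *out, int n) {
  const float max_v = *std::max_element(in, in + n);  // shift to avoid overflow
  float sum = 0.f;
  for (int i = 0; i < n; ++i) {
    out[i] = std::exp(in[i] - max_v);
    sum += out[i];
  }
  for (int i = 0; i < n; ++i) out[i] /= sum;
}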
+
+#ifdef GRU_OP
+
+#pragma once
+
+#include
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class GruKernel
+    : public framework::OpKernelBase<DeviceType, GruParam<DeviceType>> {
+ public:
+  void Compute(const GruParam<DeviceType>& param) const;
+  bool Init(GruParam<DeviceType>* param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/im2sequence_kernel.h b/src/operators/kernel/im2sequence_kernel.h
index cb592613f73d90dae5a7d6e515f8bc091981776e..df93ea5abacda1a5291caa53dc5dae7ea2b5d710 100644
--- a/src/operators/kernel/im2sequence_kernel.h
+++ b/src/operators/kernel/im2sequence_kernel.h
@@ -20,19 +20,17 @@ limitations under the License. */
 #include "operators/math/vol2col.h"
 #include "operators/op_param.h"
 
-#pragma once;
+#pragma once
 
 namespace paddle_mobile {
 namespace operators {
 
-using namespace framework;
-
 template <typename DeviceType, typename T>
 class Im2SequenceKernel
-    : public framework::OpKernelBase<DeviceType, Im2SequenceParam> {
+    : public framework::OpKernelBase<DeviceType, Im2SequenceParam<DeviceType>> {
  public:
-  void Compute(const Im2SequenceParam& param) const;
-  bool Init(Im2SequenceParam* para);
+  void Compute(const Im2SequenceParam<DeviceType>& param) const;
+  bool Init(Im2SequenceParam<DeviceType>* para);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/lookup_kernel.h b/src/operators/kernel/lookup_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..73f6cfcced078382b40526eae1f6560d7d168b97
--- /dev/null
+++ b/src/operators/kernel/lookup_kernel.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef LOOKUP_OP
+
+#pragma once
+
+#include
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class LookupKernel
+    : public framework::OpKernelBase<DeviceType, LookupParam<DeviceType>> {
+ public:
+  void Compute(const LookupParam<DeviceType>& param) const;
+  bool Init(LookupParam<DeviceType>* param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/lrn_kernel.h b/src/operators/kernel/lrn_kernel.h
index 7327451a0aa21b7bcf9ae111f63c19f2b6bb2d3a..164178f1dcc0ee2523fc9c5fdc4736c14a3e55ce 100644
--- a/src/operators/kernel/lrn_kernel.h
+++ b/src/operators/kernel/lrn_kernel.h
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #ifdef LRN_OP
-
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 #include "framework/operator.h"
 #include "operators/op_param.h"
@@ -47,6 +49,7 @@ struct LRNFunctor {
     std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0);
 
     for (int a = 0; a < N; a++) {
+#pragma omp parallel for
       for (int b = 0; b < C; b++) {
         for (int index = start; index < end; index++) {
           int channel = b + index;
@@ -167,10 +170,11 @@ struct LRNFunctor {
 };
 
 template <typename DeviceType, typename T>
-class LrnKernel : public framework::OpKernelBase<DeviceType, LrnParam> {
+class LrnKernel
+    : public framework::OpKernelBase<DeviceType, LrnParam<DeviceType>> {
  public:
-  void Compute(const LrnParam &param) const;
-  bool Init(LrnParam *param);
+  void Compute(const LrnParam<DeviceType> &param) const;
+  bool Init(LrnParam<DeviceType> *param);
 };
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/mali/acl_operator.cc b/src/operators/kernel/mali/acl_operator.cc
old mode 100644
new mode 100755
diff --git a/src/operators/kernel/mali/acl_operator.h b/src/operators/kernel/mali/acl_operator.h
old mode 100644
new mode 100755
index c2e13283b1c679d6dfc8972af5ace5e579d568e6..bf8200d486f91998c79540177ab1b26596a3e9dc
--- a/src/operators/kernel/mali/acl_operator.h
+++ b/src/operators/kernel/mali/acl_operator.h
@@ -225,6 +225,7 @@ class AclParameters {
 
   bool is_global_pool;
   bool is_channel_concat;
+  bool is_bypass;
 
   std::vector in_tensor;
 };
diff --git a/src/operators/kernel/mali/acl_tensor.cc b/src/operators/kernel/mali/acl_tensor.cc
old mode 100644
new mode 100755
diff --git a/src/operators/kernel/mali/acl_tensor.h b/src/operators/kernel/mali/acl_tensor.h
old mode 100644
new mode 100755
diff --git a/src/operators/kernel/mali/batchnorm_kernel.cpp b/src/operators/kernel/mali/batchnorm_kernel.cpp
old mode 100644
new mode 100755
index 22ce472c464bc9ed89ee721244e9873c01601ebd..50f6ef5f566347c089869c30b8f7534a4f8b6779
--- a/src/operators/kernel/mali/batchnorm_kernel.cpp
+++ b/src/operators/kernel/mali/batchnorm_kernel.cpp
@@ -36,7 +36,7 @@ class AclBatchNormOp : public acl::ACLOperator {
   AclBatchNormOp& operator=(AclBatchNormOp&&) = delete;
 
   acl::AclParameters& getargs() { return args; }
-  void InitAclLayer(const BatchNormParam& param) {
+  void InitAclLayer(const BatchNormParam<GPU_MALI>& param) {
     setTargetHint(acl::TargetHint::OPENCL);
     arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
                                          args.in_depth, args.batch);
@@ -68,9 +68,10 @@ class AclBatchNormOp : public acl::ACLOperator {
   void RunAcl(void* input, void* output) {
     acl::ACLOperator::acl_run(input, output);
   }
-  bool Bypass_acl(const BatchNormParam& param) {
+  bool Bypass_acl(const BatchNormParam<GPU_MALI>& param) {
     bool bypass_acl = false;
     AclParametersByContext(param);
+    InitAclLayer(param);
     // for performance, more groups impact GPU performance
     if (this->force_bypass_acl_path_) {
       bypass_acl = true;
@@ -80,7 +81,7 @@ class AclBatchNormOp : public acl::ACLOperator {
   }
 
  private:
-  void AclParametersByContext(const BatchNormParam& param) {
+  void AclParametersByContext(const BatchNormParam<GPU_MALI>& param) {
     const Tensor* in_x = param.InputX();
     Tensor* out = param.OutputY();
     const Tensor* scale = param.InputScale();
@@ -128,34 +129,31 @@ class AclBatchNormOp : public acl::ACLOperator {
 };
 
 template <>
-bool BatchNormKernel<GPU_MALI, float>::Init(BatchNormParam* param) {
+bool BatchNormKernel<GPU_MALI, float>::Init(BatchNormParam<GPU_MALI>* param) {
   AclBatchNormOp<GPU_MALI, float>* acl_op =
       reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
   if (acl_op == nullptr) {
     acl_op = new AclBatchNormOp<GPU_MALI, float>();
     this->SetAclOp((void*)acl_op, (void*)this);
   }
+  if (acl_op->Bypass_acl(*param)) {
+    std::cout << "init acl failed" << std::endl;
+    return false;
+  }
   return true;
 }
 
 template <>
 void
BatchNormKernel::Compute( - const BatchNormParam& param) const { + const BatchNormParam& param) const { std::cout << "init acl" << std::endl; AclBatchNormOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { return; } - if (acl_op->Bypass_acl(param)) { - std::cout << "init acl failed" << std::endl; - return; - } acl::AclParameters& args = acl_op->getargs(); - const float* input_data = (const float*)args.input_data; - const float* output_data = (const float*)args.output_data; - acl_op->InitAclLayer(param); - acl_op->RunAcl((void*)input_data, (void*)output_data); + acl_op->RunAcl(args.input_data, args.output_data); } template class BatchNormKernel; diff --git a/src/operators/kernel/mali/concat_kernel.cpp b/src/operators/kernel/mali/concat_kernel.cpp index 08ee58d41577dfb5fd3a99755d66b5677b7b7ed2..267c0101a8f66de3d508dbe5795c87ee5027a288 100644 --- a/src/operators/kernel/mali/concat_kernel.cpp +++ b/src/operators/kernel/mali/concat_kernel.cpp @@ -38,7 +38,7 @@ class AclConcatOp : public acl::ACLOperator { acl::AclParameters& getargs() { return args; } - void InitAclLayer(const ConcatParam& param) { + void InitAclLayer(const ConcatParam& param) { setTargetHint(acl::TargetHint::OPENCL); const std::vector* input_data = &args.in_tensor; arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, @@ -50,8 +50,6 @@ class AclConcatOp : public acl::ACLOperator { T type; for (int i = 0; i < input_data->size(); i++) { - const T* idata = (*input_data)[i]->data(); - const T* pdata = (*input_data)[i]->data(); int in_batch = (*input_data)[i]->dims()[0]; int in_channels = (*input_data)[i]->dims()[1]; int in_width = (*input_data)[i]->dims()[2]; @@ -72,9 +70,10 @@ class AclConcatOp : public acl::ACLOperator { T type; acl::acl_run(this, input, output, type); } - bool Bypass_acl(const ConcatParam& param) { + bool Bypass_acl(const ConcatParam& param) { bool bypass_acl = false; AclParametersByContext(param); + InitAclLayer(param); // for performance, more groups impact GPU performance if (this->force_bypass_acl_path_ || !args.is_channel_concat) { bypass_acl = true; @@ -83,7 +82,7 @@ class AclConcatOp : public acl::ACLOperator { } private: - void AclParametersByContext(const ConcatParam& param) { + void AclParametersByContext(const ConcatParam& param) { auto inputs = param.Inputs(); auto* output = param.Out(); int64_t axis = param.Axis(); @@ -103,33 +102,31 @@ class AclConcatOp : public acl::ACLOperator { }; template <> -bool ConcatKernel::Init(const ConcatParam& param) const { +bool ConcatKernel::Init(ConcatParam* param) { AclConcatOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { acl_op = new AclConcatOp(); this->SetAclOp((void*)acl_op, (void*)this); } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } return true; } template <> -void ConcatKernel::Compute(const ConcatParam& param) const { +void ConcatKernel::Compute( + const ConcatParam& param) const { std::cout << "init acl" << std::endl; AclConcatOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { return; } - if (acl_op->Bypass_acl(param)) { - std::cout << "init acl failed" << std::endl; - return; - } acl::AclParameters& args = acl_op->getargs(); - std::vector temp_data = args.in_tensor; - const float* output_data = (const float*)args.output_data; - acl_op->InitAclLayer(param); - acl_op->RunAcl(temp_data, (void*)output_data); + acl_op->RunAcl(args.in_tensor, args.output_data); } template class ConcatKernel; diff --git 
a/src/operators/kernel/mali/conv_add_kernel.cpp b/src/operators/kernel/mali/conv_add_kernel.cpp index 1d34910231c086673c58d8dba2c1e44992b5d593..74cace00dd2dead7a5d9ddfc76e2d48c67cccf89 100644 --- a/src/operators/kernel/mali/conv_add_kernel.cpp +++ b/src/operators/kernel/mali/conv_add_kernel.cpp @@ -37,7 +37,7 @@ class AclConvAddOp : public acl::ACLOperator { AclConvAddOp& operator=(AclConvAddOp&&) = delete; acl::AclParameters& getargs() { return args; } - void InitAclLayer(const FusionConvAddParam& param) { + void InitAclLayer(const FusionConvAddParam& param) { setTargetHint(acl::TargetHint::OPENCL); arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, args.in_depth, args.batch); @@ -55,7 +55,8 @@ class AclConvAddOp : public acl::ACLOperator { set_operator_init_done(); this->force_bypass_acl_path_ = false; - check_direct_conv(); + // check_direct_conv(); + group() = args.num_group; //[kernel_x, kernel_y, IFM, OFM] new_tensor(weights(), weights_shape, args.weight_data); //[OFM] @@ -63,8 +64,6 @@ class AclConvAddOp : public acl::ACLOperator { new_tensor(biases(), biases_shape, args.biases_data); } - group() = args.num_group; - //[width, height, IFM] new_tensor(input(), input_shape, args.input_data); //[width, height, OFM] @@ -76,9 +75,10 @@ class AclConvAddOp : public acl::ACLOperator { void RunAcl(void* input, void* output) { acl::ACLOperator::acl_run(input, output); } - bool Bypass_acl(const FusionConvAddParam& param) { + bool Bypass_acl(const FusionConvAddParam& param) { bool bypass_acl = false; AclParametersByContext(param); + InitAclLayer(param); // for performance, more groups impact GPU performance if (this->force_bypass_acl_path_ || args.num_group >= 5) { bypass_acl = true; @@ -119,7 +119,7 @@ class AclConvAddOp : public acl::ACLOperator { } } - void AclParametersByContext(const FusionConvAddParam& param) { + void AclParametersByContext(const FusionConvAddParam& param) { const Tensor* input = param.Input(); Tensor filter = *param.Filter(); Tensor* output = param.Output(); @@ -196,35 +196,32 @@ class AclConvAddOp : public acl::ACLOperator { }; template <> -bool ConvAddKernel::Init( - const FusionConvAddParam& param) const { +bool ConvAddKernel::Init(FusionConvAddParam* param) { AclConvAddOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { acl_op = new AclConvAddOp(); this->SetAclOp((void*)acl_op, (void*)this); } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } return true; } template <> void ConvAddKernel::Compute( - const FusionConvAddParam& param) const { + const FusionConvAddParam& param) const { std::cout << "init acl" << std::endl; AclConvAddOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { return; } - if (acl_op->Bypass_acl(param)) { - std::cout << "init acl failed" << std::endl; - return; - } acl::AclParameters& args = acl_op->getargs(); - const float* input_data = (const float*)args.input_data; - const float* output_data = (const float*)args.output_data; - acl_op->InitAclLayer(param); - acl_op->RunAcl((void*)input_data, (void*)output_data); + + acl_op->RunAcl(args.input_data, args.output_data); } template class ConvAddKernel; diff --git a/src/operators/kernel/mali/conv_kernel.cpp b/src/operators/kernel/mali/conv_kernel.cpp index 36f438605317dd016d2f44cf9c5efc0ab33c5923..7852e64990e5a2cd6f3d7e803e71c23c55aa7a27 100644 --- a/src/operators/kernel/mali/conv_kernel.cpp +++ b/src/operators/kernel/mali/conv_kernel.cpp @@ -37,7 +37,7 @@ class AclConvOp : public 
acl::ACLOperator { AclConvOp& operator=(AclConvOp&&) = delete; acl::AclParameters& getargs() { return args; } - void InitAclLayer(const ConvParam& param) { + void InitAclLayer(const ConvParam& param) { setTargetHint(acl::TargetHint::OPENCL); arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, args.in_depth, args.batch); @@ -76,9 +76,10 @@ class AclConvOp : public acl::ACLOperator { void RunAcl(void* input, void* output) { acl::ACLOperator::acl_run(input, output); } - bool Bypass_acl(const ConvParam& param) { + bool Bypass_acl(const ConvParam& param) { bool bypass_acl = false; AclParametersByContext(param); + InitAclLayer(param); // for performance, more groups impact GPU performance if (this->force_bypass_acl_path_ || args.num_group >= 5) { bypass_acl = true; @@ -119,7 +120,7 @@ class AclConvOp : public acl::ACLOperator { } } - void AclParametersByContext(const ConvParam& param) { + void AclParametersByContext(const ConvParam& param) { const Tensor* input = param.Input(); Tensor filter = *param.Filter(); Tensor* output = param.Output(); @@ -195,33 +196,31 @@ class AclConvOp : public acl::ACLOperator { }; template <> -bool ConvKernel::Init(ConvParam* param) { +bool ConvKernel::Init(ConvParam* param) { AclConvOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { acl_op = new AclConvOp(); this->SetAclOp((void*)acl_op, (void*)this); } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } return true; } template <> -void ConvKernel::Compute(const ConvParam& param) const { +void ConvKernel::Compute( + const ConvParam& param) const { std::cout << "init acl" << std::endl; AclConvOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { return; } - if (acl_op->Bypass_acl(param)) { - std::cout << "init acl failed" << std::endl; - return; - } acl::AclParameters& args = acl_op->getargs(); - const float* input_data = (const float*)args.input_data; - const float* output_data = (const float*)args.output_data; - acl_op->InitAclLayer(param); - acl_op->RunAcl((void*)input_data, (void*)output_data); + acl_op->RunAcl(args.input_data, args.output_data); } template class ConvKernel; diff --git a/src/operators/kernel/mali/elementwise_add_kernel.cpp b/src/operators/kernel/mali/elementwise_add_kernel.cpp index 9748bbbb5454f10ad9ea83e37d599fb1c6cdb53e..5596476e1bb33ecc2b3122bf237090b099307156 100644 --- a/src/operators/kernel/mali/elementwise_add_kernel.cpp +++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp @@ -27,13 +27,14 @@ struct AddFunctor { }; template <> -bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { +bool ElementwiseAddKernel::Init( + ElementwiseAddParam *param) { return true; } template <> void ElementwiseAddKernel::Compute( - const ElementwiseAddParam ¶m) const { + const ElementwiseAddParam ¶m) const { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); Tensor *Out = param.Out(); diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp old mode 100644 new mode 100755 index a76c3c46012a758a05cf8f846a15376ad1b9f33c..c3197f38c6c6ee1a4f4f684c824a9a9e43d69d4f --- a/src/operators/kernel/mali/fushion_fc_kernel.cpp +++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp @@ -14,21 +14,19 @@ limitations under the License. 
*/ #ifdef FUSION_FC_OP -#pragma once - #include "operators/kernel/fusion_fc_kernel.h" namespace paddle_mobile { namespace operators { template <> -bool FusionFcKernel::Init(FusionFcParam *param) { +bool FusionFcKernel::Init(FusionFcParam *param) { return true; } template <> void FusionFcKernel::Compute( - const FusionFcParam ¶m) const { + const FusionFcParam ¶m) const { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); const Tensor *input_z = param.InputZ(); diff --git a/src/operators/kernel/mali/lrn_kernel.cpp b/src/operators/kernel/mali/lrn_kernel.cpp index c063ec8783382ccef79086368df8a97320010c23..fc088f735c538bedc4d5c79593aa31c48acc4fc6 100644 --- a/src/operators/kernel/mali/lrn_kernel.cpp +++ b/src/operators/kernel/mali/lrn_kernel.cpp @@ -20,6 +20,7 @@ limitations under the License. */ #ifdef PADDLE_MOBILE_MALI_GPU #include "acl_operator.h" #include "framework/operator.h" +#include "operators/kernel/central-arm-func/lrn_arm_func.h" #include "operators/op_param.h" namespace paddle_mobile { @@ -39,7 +40,7 @@ class AclLrnOp : public acl::ACLOperator { AclLrnOp& operator=(AclLrnOp&&) = delete; acl::AclParameters& getargs() { return args; } - void InitAclLayer(const LrnParam& param) { + void InitAclLayer(const LrnParam& param) { setTargetHint(acl::TargetHint::OPENCL); arm_compute::TensorShape shape(args.in_cols, args.in_rows, args.in_depth); @@ -59,12 +60,15 @@ class AclLrnOp : public acl::ACLOperator { acl_configure(lrn, this, norm_info); } + void Set_bypass(bool bypass) { args.is_bypass = bypass; } + void RunAcl(void* input, void* output) { acl::ACLOperator::acl_run(input, output); } - bool Bypass_acl(const LrnParam& param) { + bool Bypass_acl(const LrnParam& param) { bool bypass_acl = false; AclParametersByContext(param); + InitAclLayer(param); // for performance, more groups impact GPU performance if (this->force_bypass_acl_path_) { bypass_acl = true; @@ -74,7 +78,7 @@ class AclLrnOp : public acl::ACLOperator { } private: - void AclParametersByContext(const LrnParam& param) { + void AclParametersByContext(const LrnParam& param) { const Tensor* in_x = param.InputX(); Tensor* out = param.Out(); @@ -107,32 +111,38 @@ class AclLrnOp : public acl::ACLOperator { }; template <> -bool LrnKernel::Init(const LrnParam& param) const { +bool LrnKernel::Init(LrnParam* param) { AclLrnOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { acl_op = new AclLrnOp(); this->SetAclOp((void*)acl_op, (void*)this); } + if (acl_op->Bypass_acl(*param)) { + acl_op->Set_bypass(true); + std::cout << "init acl failed" << std::endl; + return true; + } return true; } template <> -void LrnKernel::Compute(const LrnParam& param) const { +void LrnKernel::Compute( + const LrnParam& param) const { std::cout << "init acl" << std::endl; AclLrnOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { return; } - if (acl_op->Bypass_acl(param)) { - std::cout << "init acl failed" << std::endl; + acl::AclParameters& args = acl_op->getargs(); + if (args.is_bypass) { + std::cout << "bypass op" << std::endl; + LrnCompute(param); return; } - acl::AclParameters& args = acl_op->getargs(); const float* input_data = (const float*)args.input_data; const float* output_data = (const float*)args.output_data; - acl_op->InitAclLayer(param); for (int n = 0; n < args.batch; ++n) { acl_op->RunAcl((void*)input_data, (void*)output_data); input_data += args.in_depth * args.in_cols * args.in_rows; diff --git a/src/operators/kernel/mali/mul_kernel.cpp 
b/src/operators/kernel/mali/mul_kernel.cpp index 3a9ec4ebb319d9e521240ad987a49549c22c1ff2..a9e54dad2b51c595be4f68df3916a4803047617e 100644 --- a/src/operators/kernel/mali/mul_kernel.cpp +++ b/src/operators/kernel/mali/mul_kernel.cpp @@ -22,12 +22,13 @@ namespace paddle_mobile { namespace operators { template <> -bool MulKernel::Init(MulParam *param) { +bool MulKernel::Init(MulParam *param) { return true; } template <> -void MulKernel::Compute(const MulParam ¶m) const { +void MulKernel::Compute( + const MulParam ¶m) const { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); Tensor *out = param.Out(); diff --git a/src/operators/kernel/mali/pool_kernel.cpp b/src/operators/kernel/mali/pool_kernel.cpp index 9de90deebca05ef50cf94fa958f37bbcf1a08c4b..33b3bd7017739144a519bfb1be247b4751883779 100644 --- a/src/operators/kernel/mali/pool_kernel.cpp +++ b/src/operators/kernel/mali/pool_kernel.cpp @@ -39,7 +39,7 @@ class AclPoolOp : public acl::ACLOperator { AclPoolOp& operator=(AclPoolOp&&) = delete; acl::AclParameters& getargs() { return args; } - void InitAclLayer(const PoolParam& param) { + void InitAclLayer(const PoolParam& param) { setTargetHint(acl::TargetHint::OPENCL); arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, args.in_depth); @@ -79,9 +79,10 @@ class AclPoolOp : public acl::ACLOperator { void RunAcl(void* input, void* output) { acl::ACLOperator::acl_run(input, output); } - bool Bypass_acl(const PoolParam& param) { + bool Bypass_acl(const PoolParam& param) { bool bypass_acl = false; AclParametersByContext(param); + InitAclLayer(param); // for performance, more groups impact GPU performance if (this->force_bypass_acl_path_) { bypass_acl = true; @@ -99,7 +100,7 @@ class AclPoolOp : public acl::ACLOperator { } private: - void AclParametersByContext(const PoolParam& param) { + void AclParametersByContext(const PoolParam& param) { const Tensor* in_x = param.Input(); Tensor* out = param.Output(); std::string pooling_type = param.PoolingType(); @@ -179,32 +180,32 @@ class AclPoolOp : public acl::ACLOperator { }; template <> -bool PoolKernel::Init(const PoolParam& param) const { +bool PoolKernel::Init(PoolParam* param) { AclPoolOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { acl_op = new AclPoolOp(); this->SetAclOp((void*)acl_op, (void*)this); } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } return true; } template <> -void PoolKernel::Compute(const PoolParam& param) const { +void PoolKernel::Compute( + const PoolParam& param) const { std::cout << "init acl" << std::endl; AclPoolOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { return; } - if (acl_op->Bypass_acl(param)) { - std::cout << "init acl failed" << std::endl; - return; - } acl::AclParameters& args = acl_op->getargs(); const float* input_data = (const float*)args.input_data; const float* output_data = (const float*)args.output_data; - acl_op->InitAclLayer(param); for (int n = 0; n < args.batch; ++n) { acl_op->RunAcl((void*)input_data, (void*)output_data); input_data += args.in_depth * args.in_cols * args.in_rows; diff --git a/src/operators/kernel/mali/relu_kernel.cpp b/src/operators/kernel/mali/relu_kernel.cpp index 3deebc9d2f1a9f652813362f4947f744f0541482..10b270800dee1a0ad8176da1f788100d29b60173 100644 --- a/src/operators/kernel/mali/relu_kernel.cpp +++ b/src/operators/kernel/mali/relu_kernel.cpp @@ -39,12 +39,12 @@ class AclReluOp : public acl::ACLOperator { AclReluOp& 
operator=(AclReluOp&&) = delete; acl::AclParameters& getargs() { return args; } - void InitAclLayer(const ReluParam& param) { + void InitAclLayer(const ReluParam& param) { setTargetHint(acl::TargetHint::OPENCL); - arm_compute::TensorShape input_shape(args.in_cols * args.in_rows * - args.in_depth * args.batch); - arm_compute::TensorShape output_shape(args.in_cols * args.in_rows * - args.in_depth * args.out_num); + arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, + args.in_depth, args.batch); + arm_compute::TensorShape output_shape(args.in_cols, args.in_rows, + args.in_depth, args.out_num); // arm_compute::TensorShape weights_shape( // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth); // arm_compute::TensorShape biases_shape(args.out_depth); @@ -68,9 +68,10 @@ class AclReluOp : public acl::ACLOperator { void RunAcl(void* input, void* output) { acl::ACLOperator::acl_run(input, output); } - bool Bypass_acl(const ReluParam& param) { + bool Bypass_acl(const ReluParam& param) { bool bypass_acl = false; AclParametersByContext(param); + InitAclLayer(param); // for performance, more groups impact GPU performance if (this->force_bypass_acl_path_) { bypass_acl = true; @@ -79,7 +80,7 @@ class AclReluOp : public acl::ACLOperator { } private: - void AclParametersByContext(const ReluParam& param) { + void AclParametersByContext(const ReluParam& param) { const auto* input_x = param.InputX(); auto* out = param.Out(); @@ -99,33 +100,31 @@ class AclReluOp : public acl::ACLOperator { }; template <> -bool ReluKernel::Init(const ReluParam& param) const { +bool ReluKernel::Init(ReluParam* param) { AclReluOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { acl_op = new AclReluOp(); this->SetAclOp((void*)acl_op, (void*)this); } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } return true; } template <> -void ReluKernel::Compute(const ReluParam& param) const { +void ReluKernel::Compute( + const ReluParam& param) const { std::cout << "init acl" << std::endl; AclReluOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { return; } - if (acl_op->Bypass_acl(param)) { - std::cout << "init acl failed" << std::endl; - return; - } acl::AclParameters& args = acl_op->getargs(); - const float* input_data = (const float*)args.input_data; - const float* output_data = (const float*)args.output_data; - acl_op->InitAclLayer(param); - acl_op->RunAcl((void*)input_data, (void*)output_data); + acl_op->RunAcl(args.input_data, args.output_data); } template class ReluKernel; diff --git a/src/operators/kernel/mali/reshape_kernel.cpp b/src/operators/kernel/mali/reshape_kernel.cpp index 57837a677033590e92a307bd69a77c076c5ba805..69c077e252162017cb477a000b5f17f5a968fc10 100644 --- a/src/operators/kernel/mali/reshape_kernel.cpp +++ b/src/operators/kernel/mali/reshape_kernel.cpp @@ -22,12 +22,13 @@ namespace paddle_mobile { namespace operators { template <> -bool ReshapeKernel::Init(ReshapeParam *param) { +bool ReshapeKernel::Init(ReshapeParam *param) { return true; } template <> -void ReshapeKernel::Compute(const ReshapeParam ¶m) const { +void ReshapeKernel::Compute( + const ReshapeParam ¶m) const { const auto *input_x = param.InputX(); const auto &input_x_dims = input_x->dims(); auto *out = param.Out(); diff --git a/src/operators/kernel/mali/softmax_kernel.cpp b/src/operators/kernel/mali/softmax_kernel.cpp index 36edb3724600ada43606c23b1989615183ff21e8..d4f25c96cc47d7baa394645d4e0c84e0e3f7ad29 100644 --- 
a/src/operators/kernel/mali/softmax_kernel.cpp +++ b/src/operators/kernel/mali/softmax_kernel.cpp @@ -39,7 +39,7 @@ class AclSoftmaxOp : public acl::ACLOperator { AclSoftmaxOp& operator=(AclSoftmaxOp&&) = delete; acl::AclParameters& getargs() { return args; } - void InitAclLayer(const SoftmaxParam& param) { + void InitAclLayer(const SoftmaxParam& param) { setTargetHint(acl::TargetHint::OPENCL); arm_compute::TensorShape shape(args.in_depth, args.batch); @@ -58,9 +58,10 @@ class AclSoftmaxOp : public acl::ACLOperator { void RunAcl(void* input, void* output) { acl::ACLOperator::acl_run(input, output); } - bool Bypass_acl(const SoftmaxParam& param) { + bool Bypass_acl(const SoftmaxParam& param) { bool bypass_acl = false; AclParametersByContext(param); + InitAclLayer(param); // for performance, more groups impact GPU performance if (this->force_bypass_acl_path_) { bypass_acl = true; @@ -70,7 +71,7 @@ class AclSoftmaxOp : public acl::ACLOperator { } private: - void AclParametersByContext(const SoftmaxParam& param) { + void AclParametersByContext(const SoftmaxParam& param) { const framework::Tensor* in_x = param.InputX(); framework::Tensor* out = param.Out(); auto x_dims = in_x->dims(); @@ -96,32 +97,33 @@ class AclSoftmaxOp : public acl::ACLOperator { }; template <> -bool SoftmaxKernel::Init(const SoftmaxParam& param) const { +bool SoftmaxKernel::Init(SoftmaxParam* param) { AclSoftmaxOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { acl_op = new AclSoftmaxOp(); this->SetAclOp((void*)acl_op, (void*)this); } + if (acl_op->Bypass_acl(*param)) { + std::cout << "init acl failed" << std::endl; + return false; + } return true; } template <> -void SoftmaxKernel::Compute(const SoftmaxParam& param) const { +void SoftmaxKernel::Compute( + const SoftmaxParam& param) const { std::cout << "init acl" << std::endl; AclSoftmaxOp* acl_op = reinterpret_cast*>(this->GetAclOp()); if (acl_op == nullptr) { return; } - if (acl_op->Bypass_acl(param)) { - std::cout << "init acl failed" << std::endl; - return; - } acl::AclParameters& args = acl_op->getargs(); const float* input_data = (const float*)args.input_data; const float* output_data = (const float*)args.output_data; - acl_op->InitAclLayer(param); + for (int n = 0; n < args.out_num; ++n) { acl_op->RunAcl((void*)input_data, (void*)output_data); input_data += args.in_depth; diff --git a/src/operators/kernel/mul_kernel.h b/src/operators/kernel/mul_kernel.h index f7dcb738b38448fe38eb60dcbbd4a2abda7a858a..e441de4d4495b736aec248c0ef85191b32bfcbf9 100644 --- a/src/operators/kernel/mul_kernel.h +++ b/src/operators/kernel/mul_kernel.h @@ -26,10 +26,11 @@ namespace operators { using namespace framework; template -class MulKernel : public framework::OpKernelBase { +class MulKernel + : public framework::OpKernelBase> { public: - void Compute(const MulParam ¶m) const; - bool Init(MulParam *param); + void Compute(const MulParam ¶m) const; + bool Init(MulParam *param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/multiclass_nms_kernel.h b/src/operators/kernel/multiclass_nms_kernel.h index 9bd00b874a1140373decca582f793febf0e941ec..b1b20ddd81b395ea94ae62b1abf2fe861d9257db 100644 --- a/src/operators/kernel/multiclass_nms_kernel.h +++ b/src/operators/kernel/multiclass_nms_kernel.h @@ -25,10 +25,11 @@ namespace operators { template class MultiClassNMSKernel - : public framework::OpKernelBase { + : public framework::OpKernelBase> { public: - void Compute(const MultiClassNMSParam& param) const; - bool 
Init(MultiClassNMSParam* param); + void Compute(const MultiClassNMSParam& param) const; + bool Init(MultiClassNMSParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/pool_kernel.h b/src/operators/kernel/pool_kernel.h index d666910b73e7a3cef2cc59d4ba32b826ae6d0876..2be254444cc410fb95a94125cccb224ca9505545 100644 --- a/src/operators/kernel/pool_kernel.h +++ b/src/operators/kernel/pool_kernel.h @@ -17,7 +17,6 @@ limitations under the License. */ #pragma once #include "framework/operator.h" -#include "operators/math/pooling.h" #include "operators/op_param.h" namespace paddle_mobile { @@ -25,10 +24,10 @@ namespace operators { using framework::OpKernelBase; template -class PoolKernel : public OpKernelBase { +class PoolKernel : public OpKernelBase> { public: - void Compute(const PoolParam ¶m) const override; - bool Init(PoolParam *param); + void Compute(const PoolParam ¶m) const override; + bool Init(PoolParam *param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/prelu_kernel.h b/src/operators/kernel/prelu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f6c7c3ac7f139cf7eafe8843ef48e53c90292082 --- /dev/null +++ b/src/operators/kernel/prelu_kernel.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "framework/operator.h" +#include "operators/op_param.h" + +#pragma once + +namespace paddle_mobile { +namespace operators { + +template +class PReluKernel + : public framework::OpKernelBase> { + public: + void Compute(const PReluParam& param) const; +}; +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/prior_box_kernel.h b/src/operators/kernel/prior_box_kernel.h index d169a01d7f45f7dbdcc02be0e1e71690b8550af8..5640375483d42d52965986dab6795254bbf4b908 100644 --- a/src/operators/kernel/prior_box_kernel.h +++ b/src/operators/kernel/prior_box_kernel.h @@ -52,10 +52,10 @@ inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, template class PriorBoxKernel - : public framework::OpKernelBase { + : public framework::OpKernelBase> { public: - void Compute(const PriorBoxParam& param) const; - bool Init(PriorBoxParam* param); + void Compute(const PriorBoxParam& param) const; + bool Init(PriorBoxParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/relu_kernel.h b/src/operators/kernel/relu_kernel.h index 64016656b20b0fdb08f1342f7853e2e727a6bb81..b0c32791d626f14b0840ce1c8f3f12f02b403d97 100644 --- a/src/operators/kernel/relu_kernel.h +++ b/src/operators/kernel/relu_kernel.h @@ -24,10 +24,11 @@ namespace paddle_mobile { namespace operators { template -class ReluKernel : public framework::OpKernelBase { +class ReluKernel + : public framework::OpKernelBase> { public: - void Compute(const ReluParam& param) const; - bool Init(ReluParam* param); + void Compute(const ReluParam& param) const; + bool Init(ReluParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/reshape_kernel.h b/src/operators/kernel/reshape_kernel.h index 47eba531b9f36d83d44588d9cdfb162519c24180..73eb63f797f34ec4eb2baec8c4ab79fafb06f0e2 100644 --- a/src/operators/kernel/reshape_kernel.h +++ b/src/operators/kernel/reshape_kernel.h @@ -68,10 +68,11 @@ inline framework::DDim ValidateShape(const std::vector shape, } template -class ReshapeKernel : public framework::OpKernelBase { +class ReshapeKernel + : public framework::OpKernelBase> { public: - void Compute(const ReshapeParam& param) const; - bool Init(ReshapeParam* param); + void Compute(const ReshapeParam& param) const; + bool Init(ReshapeParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/resize_kernel.h b/src/operators/kernel/resize_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4c06429858b9575ffc061c000e4a9343fa7eee26 --- /dev/null +++ b/src/operators/kernel/resize_kernel.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#ifdef RESIZE_OP
+
+#pragma once
+
+#include <vector>
+#include "framework/operator.h"
+
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+template <typename P>
+inline framework::DDim CalOutputShape(const ResizeParam<P> &param) {
+  const auto *input_x = param.InputX();
+  const auto &input_x_dims = input_x->dims();
+  auto *out = param.Out();
+  framework::DDim out_dims = out->dims();
+  const auto *input_shape = param.InputShape();
+
+  if (input_shape) {
+    auto *shape_data = input_shape->template data<int>();
+    framework::Tensor cpu_shape_tensor;
+    auto shape =
+        std::vector<int>(shape_data, shape_data + input_shape->numel());
+    const int in_batch_size = input_x->dims()[0];
+    const int in_chan_size = input_x->dims()[1];
+    const int in_height = input_x->dims()[2];
+    const int in_width = input_x->dims()[3];
+
+    int out_height = 0;
+    int out_width = 0;
+    bool is_pyramid_test = param.IsPyramidTest();
+    if (is_pyramid_test == false) {
+      out_height = param.Height();
+      out_width = param.Width();
+      PADDLE_MOBILE_ENFORCE(out_height > 0, "output height is required");
+      PADDLE_MOBILE_ENFORCE(out_width > 0, "output width is required");
+
+    } else {
+      float out_height_scale = param.OutHeightScale();
+      float out_width_scale = param.OutWidthScale();
+      PADDLE_MOBILE_ENFORCE(out_height_scale > 0,
+                            "output height scale is required");
+      PADDLE_MOBILE_ENFORCE(out_width_scale > 0,
+                            "output width scale is required");
+
+      out_height = int(out_height_scale * in_height);
+      out_width = int(out_width_scale * in_width);
+    }
+
+    // use the computed output extent, not the input extent
+    out_dims = framework::make_ddim(
+        {in_batch_size, in_chan_size, out_height, out_width});
+  }
+  return out_dims;
+}
+
+template <typename DeviceType, typename T>
+class ResizeKernel
+    : public framework::OpKernelBase<DeviceType, ResizeParam<DeviceType>> {
+ public:
+  void Compute(const ResizeParam<DeviceType> &param) const;
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/scale_kernel.h b/src/operators/kernel/scale_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..2da92d8d3c8b0d7867e7e6e628a04a853dd69464
--- /dev/null
+++ b/src/operators/kernel/scale_kernel.h
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+#pragma once
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class ScaleKernel
+    : public framework::OpKernelBase<DeviceType, ScaleParam<DeviceType>> {
+ public:
+  void Compute(const ScaleParam<DeviceType>& param) const;
+};
+}  // namespace operators
+}  // namespace paddle_mobile
diff --git a/src/operators/kernel/shape_kernel.h b/src/operators/kernel/shape_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..7caf3e427a4f3b469265248708a3090c52d1ca91
--- /dev/null
+++ b/src/operators/kernel/shape_kernel.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
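Aside: CalOutputShape above picks the output extent in one of two ways: fixed Height()/Width() when IsPyramidTest() is false, or the input extent times OutHeightScale()/OutWidthScale() otherwise. Distilled into a hypothetical helper:

// Fixed extent in normal mode, scaled extent in pyramid-test mode.
int ResizedExtent(bool is_pyramid_test, int fixed_extent, float scale,
                  int in_extent) {
  return is_pyramid_test ? static_cast<int>(scale * in_extent) : fixed_extent;
}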
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SHAPE_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class ShapeKernel + : public framework::OpKernelBase> { + public: + void Compute(const ShapeParam& param) const; + bool Init(ShapeParam* param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/sigmoid_kernel.h b/src/operators/kernel/sigmoid_kernel.h index fc3eb5e1bf158c541b2f00d9e57ddd4699344006..e8cfe6cad9ce2f25b9f38e1784ded9ea0741ff9a 100644 --- a/src/operators/kernel/sigmoid_kernel.h +++ b/src/operators/kernel/sigmoid_kernel.h @@ -23,10 +23,11 @@ namespace operators { using framework::OpKernelBase; void sigmoid(const Tensor* X, Tensor* Y); template -class SigmoidKernel : public OpKernelBase { +class SigmoidKernel + : public OpKernelBase> { public: - void Compute(const SigmoidParam& param) const override; - bool Init(SigmoidParam* param); + void Compute(const SigmoidParam& param) const override; + bool Init(SigmoidParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/slice_kernel.h b/src/operators/kernel/slice_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..17f7fe4a9ebf5b78fc92c41abd4756a7bc6bff45 --- /dev/null +++ b/src/operators/kernel/slice_kernel.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "framework/operator.h" +#include "operators/op_param.h" + +#pragma once + +namespace paddle_mobile { +namespace operators { + +template +class SliceKernel + : public framework::OpKernelBase> { + public: + void Compute(const SliceParam& param) const {} +}; +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/softmax_kernel.h b/src/operators/kernel/softmax_kernel.h index 5a87d64dd9987d445b13a4fa9dc29a04e4ecc398..67bd9167e8c717355fc326d3025cde410ce66010 100644 --- a/src/operators/kernel/softmax_kernel.h +++ b/src/operators/kernel/softmax_kernel.h @@ -23,13 +23,12 @@ namespace paddle_mobile { namespace operators { using framework::OpKernelBase; -void simoid(Tensor *X, Tensor *Y); - template -class SoftmaxKernel : public OpKernelBase { +class SoftmaxKernel + : public OpKernelBase> { public: - void Compute(const SoftmaxParam ¶m) const override; - bool Init(SoftmaxParam *param); + void Compute(const SoftmaxParam ¶m) const override; + bool Init(SoftmaxParam *param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/split_kernel.h b/src/operators/kernel/split_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..03a418de59606e42684c67ca3053fa8e39b07940 --- /dev/null +++ b/src/operators/kernel/split_kernel.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SPLIT_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class SplitKernel + : public framework::OpKernelBase> { + public: + void Compute(const SplitParam& param) const; + bool Init(SplitParam* param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/transpose_kernel.h b/src/operators/kernel/transpose_kernel.h index f1a21ebbb28c2acdb905ce9f09c28f0d47e17294..56c41fd221e080a4db3b34fbd4ab208c9986c2a8 100644 --- a/src/operators/kernel/transpose_kernel.h +++ b/src/operators/kernel/transpose_kernel.h @@ -26,10 +26,10 @@ namespace operators { template class TransposeKernel - : public framework::OpKernelBase { + : public framework::OpKernelBase> { public: - void Compute(const TransposeParam& param) const; - bool Init(TransposeParam* param); + void Compute(const TransposeParam& param) const; + bool Init(TransposeParam* param); }; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/lookup_op.cpp b/src/operators/lookup_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..33f2b434adaec19acd36aab0d5157138ebd3e91e --- /dev/null +++ b/src/operators/lookup_op.cpp @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
diff --git a/src/operators/lookup_op.cpp b/src/operators/lookup_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..33f2b434adaec19acd36aab0d5157138ebd3e91e
--- /dev/null
+++ b/src/operators/lookup_op.cpp
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef LOOKUP_OP
+
+#include <vector>
+
+#include "common/enforce.h"
+#include "operators/lookup_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void LookupOp<Dtype, T>::InferShape() const {
+  PADDLE_MOBILE_ENFORCE(this->param_.InputW() != nullptr,
+                        "Input(W) of LookupTableOp should not be null.");
+  auto *ids_t = this->param_.InputIds();
+
+  PADDLE_MOBILE_ENFORCE(ids_t != nullptr,
+                        "Input(Ids) of LookupTableOp should not be null.");
+  PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr,
+                        "Output(Out) of LookupTableOp should not be null.");
+  // this->param__.InputW()->
+
+  auto table_dims = this->param_.InputW()->dims();
+  auto ids_dims = ids_t->dims();
+
+  int ids_rank = ids_dims.size();
+
+  PADDLE_MOBILE_ENFORCE(table_dims.size() == 2,
+                        "table_dims.size()==2 check failed");
+
+  PADDLE_MOBILE_ENFORCE(ids_dims[ids_rank - 1] == 1,
+                        "The last dimension of the 'Ids' tensor must be 1.");
+
+  auto output_dims =
+      framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
+  output_dims.push_back(table_dims[1]);
+
+  this->param_.Out()->Resize(framework::make_ddim(output_dims));
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(lookup_table, ops::LookupOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
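// ---- editor's sketch (not part of the patch) -----------------------------
// The shape rule InferShape() encodes above, stated on its own: with a
// [vocab_size, embedding_dim] table and ids of shape [d0, ..., dk, 1], the
// output is [d0, ..., dk, embedding_dim] -- drop the trailing 1, append the
// embedding width. Self-contained sketch:
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> LookupOutShape(const std::vector<int64_t> &table_dims,
                                    const std::vector<int64_t> &ids_dims) {
  assert(table_dims.size() == 2);  // [vocab_size, embedding_dim]
  assert(ids_dims.back() == 1);    // last ids dimension must be 1
  std::vector<int64_t> out(ids_dims.begin(), ids_dims.end() - 1);
  out.push_back(table_dims[1]);
  return out;
}

// e.g. a [10000, 64] table looked up with [32, 1] ids yields a [32, 64]
// output, matching the Resize() call above.
// ---------------------------------------------------------------------------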
diff --git a/src/operators/lookup_op.h b/src/operators/lookup_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c9d03c8d10e9b01ad958c12d31a49908075eb27
--- /dev/null
+++ b/src/operators/lookup_op.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef LOOKUP_OP
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/lookup_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class LookupOp : public framework::OperatorWithKernel<
+                     DeviceType, LookupParam<DeviceType>,
+                     operators::LookupKernel<DeviceType, T>> {
+ public:
+  LookupOp(const std::string &type, const VariableNameMap &inputs,
+           const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+           std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, LookupParam<DeviceType>,
+                                      operators::LookupKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, LookupParam<DeviceType>,
+      operators::LookupKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(lookup_table);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/lrn_op.cpp b/src/operators/lrn_op.cpp
index dc43cb022ac9d7435654cbc565c81c57ba80b350..dde9123edf3568020f933bb7375be99e40f2367b 100644
--- a/src/operators/lrn_op.cpp
+++ b/src/operators/lrn_op.cpp
@@ -24,17 +24,15 @@ void LrnOp<Dtype, T>::InferShape() const {
   auto x_dims = this->param_.InputX()->dims();
   this->param_.Out()->Resize(x_dims);
 }
-template class LrnOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(lrn);
 REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(lrn);
 REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
diff --git a/src/operators/lrn_op.h b/src/operators/lrn_op.h
index d67b9f6be741581918b09d19a8a8b26c28ceed1c..6c609c7654cca022f473dba0aad1f4214a4e43e3 100644
--- a/src/operators/lrn_op.h
+++ b/src/operators/lrn_op.h
@@ -26,17 +26,18 @@ namespace operators {
 using std::string;
 template <typename DeviceType, typename T>
 class LrnOp : public framework::OperatorWithKernel<
-                  DeviceType, LrnParam, operators::LrnKernel<DeviceType, T>> {
+                  DeviceType, LrnParam<DeviceType>,
+                  operators::LrnKernel<DeviceType, T>> {
  public:
   LrnOp(const string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const framework::AttributeMap &attrs,
         std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorWithKernel<DeviceType, LrnParam,
+      : framework::OperatorWithKernel<DeviceType, LrnParam<DeviceType>,
                                       operators::LrnKernel<DeviceType, T>>(
             type, inputs, outputs, attrs, scope) {}
 
   using framework::OperatorWithKernel<
-      DeviceType, LrnParam,
+      DeviceType, LrnParam<DeviceType>,
       operators::LrnKernel<DeviceType, T>>::OperatorWithKernel;
 
   void InferShape() const override;
@@ -46,4 +47,13 @@ class LrnOp : public framework::OperatorWithKernel<
 }  // namespace operators
 }  // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(lrn);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(lrn);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
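// ---- editor's sketch (not part of the patch) -----------------------------
// LrnOp only resizes its output above (LRN is shape-preserving). For
// reference, the usual cross-channel formula the kernel implements is
//   b[c] = a[c] / (k + alpha * sum of a[c']^2 over the n channels near c)^beta
// A scalar sketch over one pixel's channel vector; the parameter names
// follow the standard LRN definition, not a param struct from the tree.
#include <algorithm>
#include <cmath>
#include <vector>

std::vector<float> LrnAcrossChannels(const std::vector<float> &a, int n,
                                     float k, float alpha, float beta) {
  const int C = static_cast<int>(a.size());
  std::vector<float> b(C);
  for (int c = 0; c < C; ++c) {
    const int lo = std::max(0, c - n / 2);
    const int hi = std::min(C - 1, c + n / 2);
    float sum = 0.f;
    for (int i = lo; i <= hi; ++i) sum += a[i] * a[i];
    b[c] = a[c] / std::pow(k + alpha * sum, beta);
  }
  return b;
}
// ---------------------------------------------------------------------------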
diff --git a/src/operators/math/activation_functions.h b/src/operators/math/activation_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..8604065a2570cc17c970c487fcaa898f78c72a85
--- /dev/null
+++ b/src/operators/math/activation_functions.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cmath>
+#include <string>
+#include "common/enforce.h"
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MAX 13.0
+#define EXP_MAX_INPUT 40.0
+
+enum ActivationType {
+  kSigmoid,
+  kReLU,
+  kTanh,
+  kIdentity,
+};
+
+inline ActivationType GetActivationType(const std::string &type) {
+  if (type == "sigmoid") {
+    return ActivationType::kSigmoid;
+  } else if (type == "relu") {
+    return ActivationType::kReLU;
+  } else if (type == "tanh") {
+    return ActivationType::kTanh;
+  } else if (type == "identity" || type == "") {
+    return ActivationType::kIdentity;
+  }
+  PADDLE_MOBILE_THROW_EXCEPTION("Not support activation type.");
+}
+
+namespace forward {
+
+template <typename T>
+T Identity(const T a) {
+  return a;
+}
+
+template <typename T>
+T Relu(const T a) {
+  return a > static_cast<T>(0.0) ? a : static_cast<T>(0.0);
+}
+
+template <typename T>
+T Sigmoid(const T a) {
+  const T min = SIGMOID_THRESHOLD_MIN;
+  const T max = SIGMOID_THRESHOLD_MAX;
+  T tmp = (a < min) ? min : ((a > max) ? max : a);
+  return static_cast<T>(1.0) / (static_cast<T>(1.0) + exp(-tmp));
+}
+
+template <typename T>
+T Tanh(const T a) {
+  T tmp = -2.0 * a;
+  tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+  return (2.0 / (1.0 + exp(tmp))) - 1.0;
+}
+
+}  // namespace forward
+
+template <typename T>
+struct Active {
+  typedef T (*Act)(T);
+};
+
+static Active<float>::Act kActFloat[] = {
+    &forward::Sigmoid<float>, &forward::Relu<float>, &forward::Tanh<float>,
+    &forward::Identity<float>};
+
+namespace forward {
+inline float activation(float a, int index) { return kActFloat[index](a); }
+
+}  // namespace forward
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
diff --git a/src/operators/math/conv_func.h b/src/operators/math/conv_func.h
index 3d23f6c8a24be7f52e1b322e07addb47ccd8b056..d9e2da0db5c50e0b0f9b11d5584bfce8b75777cd 100644
--- a/src/operators/math/conv_func.h
+++ b/src/operators/math/conv_func.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#if __ARM_NEON
+#ifdef __ARM_NEON
 #include <arm_neon.h>
 #endif
 
@@ -49,7 +49,7 @@ inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
   auto new_ptr = bias.mutable_data<float>();
   int axis_size = dDim[axis];
 
-#if __ARM_NEON
+#ifdef __ARM_NEON
   for (int i = 0; i < outer_size; ++i) {
     int inner_num = inner_size >> 4;
     int remain = inner_size - (inner_num << 4);
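// ---- editor's sketch (not part of the patch) -----------------------------
// Usage of the dispatch table defined in activation_functions.h above. The
// kActFloat array is indexed by ActivationType (the enum and the array are
// kept in the same order), so the string -> enum -> function chain composes
// directly. A sketch, assuming the header compiles standalone with src/ on
// the include path.
#include <iostream>
#include "operators/math/activation_functions.h"

int main() {
  using namespace paddle_mobile::operators::math;
  ActivationType act = GetActivationType("relu");        // -> kReLU
  std::cout << forward::activation(-3.0f, act) << "\n";  // Relu(-3) == 0
  // Sigmoid clamps its input to [-40, 13] before calling exp(), so large
  // inputs saturate instead of overflowing:
  std::cout << forward::activation(100.0f, kSigmoid) << "\n";  // ~1.0
}
// ---------------------------------------------------------------------------
diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp
index f74e365c7e087551e55363566d3dbd6ba530bfea..716256a376a50f2ec1c4c62fa25703cabf3a0c66 100644
--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.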
*/ #include "operators/math/depthwise_conv_3x3.h" +#if __ARM_NEON #include +#endif #include namespace paddle_mobile { @@ -21,7 +23,6 @@ namespace math { void DepthwiseConv3x3(const Tensor *input, vector strides, vector paddings, const Tensor *filter, Tensor *bias, Tensor *output, bool if_bias) { -#if __ARM_NEON const int batch_size = input->dims()[0]; const int input_height = input->dims()[2]; @@ -179,7 +180,27 @@ void DepthwiseConv3x3(const Tensor *input, vector strides, } } else { -#if defined(ARMV17) +#if __ARM_NEON +#if __aarch64__ + const float32x4_t data1 = vld1q_f32(pos1); + const float32x4_t data2 = vld1q_f32(pos2); + const float32x4_t data3 = vld1q_f32(pos3); + + const float32x4_t v_filter1 = vld1q_f32(filter1); + const float32x4_t v_filter2 = vld1q_f32(filter2); + const float32x4_t v_filter3 = vld1q_f32(filter3); + float32x4_t mula = vmulq_f32(data1, v_filter1); + mula = vmlaq_f32(mula, data2, v_filter2); + mula = vmlaq_f32(mula, data3, v_filter3); + float32x2_t res = vpadd_f32( + vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula)); + res = vpadd_f32(res, res); + if (if_bias) { + output_data[ph * output_width + pw] += vget_lane_f32(res, 0); + } else { + output_data[ph * output_width + pw] = vget_lane_f32(res, 0); + } +#else asm volatile( "vld1.32 {q1}, [%[pos1]] \n\t" @@ -207,26 +228,10 @@ void DepthwiseConv3x3(const Tensor *input, vector strides, [filter2] "r"(filter2), [filter3] "r"(filter3), [output_ptr] "r"(output_ptr), [zero] "r"(zero) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); +#endif // __aarch64__ #else - const float32x4_t data1 = vld1q_f32(pos1); - const float32x4_t data2 = vld1q_f32(pos2); - const float32x4_t data3 = vld1q_f32(pos3); - const float32x4_t v_filter1 = vld1q_f32(filter1); - const float32x4_t v_filter2 = vld1q_f32(filter2); - const float32x4_t v_filter3 = vld1q_f32(filter3); - float32x4_t mula = vmulq_f32(data1, v_filter1); - mula = vmlaq_f32(mula, data2, v_filter2); - mula = vmlaq_f32(mula, data3, v_filter3); - float32x2_t res = vpadd_f32( - vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula)); - res = vpadd_f32(res, res); - if (if_bias) { - output_data[ph * output_width + pw] += vget_lane_f32(res, 0); - } else { - output_data[ph * output_width + pw] = vget_lane_f32(res, 0); - } -#endif +#endif // __ARM_NEON } } } @@ -237,15 +242,18 @@ void DepthwiseConv3x3(const Tensor *input, vector strides, input_data += input_batch_stride; output_data += output_batch_stride; } -#endif } void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, Tensor *output, Tensor *bias, bool if_bias) { +#if __ARM_NEON const float *input_data = input->data(); const float *filter_data = filter->data(); float *output_data = output->data(); - const float *bias_data = bias->data(); + const float *bias_data; + if (if_bias) { + bias_data = bias->data(); + } const int h = static_cast(input->dims()[2]); const int w = static_cast(input->dims()[3]); @@ -275,33 +283,40 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, float w22 = filter_data_tmp[8]; output_data[0] = w11 * input_data[0] + w12 * input_data[1] + - w21 * input_data[l] + w22 * input_data[l + 1] + - bias_data[j]; + w21 * input_data[l] + w22 * input_data[l + 1]; output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] + w20 * input_data[2 * l - 2] + - w21 * input_data[2 * l - 1] + bias_data[j]; + w21 * input_data[2 * l - 1]; output_data[(l - 1) * l] = w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] + - w11 * input_data[(l - 1) * l] + w12 * 
input_data[(l - 1) * l + 1] + - bias_data[j]; + w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + w01 * input_data[(l - 2) * (l + 1) + 1] + w10 * input_data[l * l - 2] + - w11 * input_data[l * l - 1] + bias_data[j]; + w11 * input_data[l * l - 1]; + if (if_bias) { + output_data[0] += bias_data[j]; + output_data[l - 1] += bias_data[j]; + output_data[(l - 1) * l] += bias_data[j]; + output_data[l * l - 1] += bias_data[j]; + } for (int i = 1; i < l - 1; ++i) { output_data[i * l] = w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] + w11 * input_data[i * l] + w12 * input_data[i * l + 1] + - w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1] + - bias_data[j]; + w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1]; + output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + w01 * input_data[i * l + l - 1 - l] + w10 * input_data[i * l + l - 1 - 1] + w11 * input_data[i * l + l - 1] + w20 * input_data[i * l + l - 1 + l - 1] + - w21 * input_data[i * l + l - 1 + l] + - bias_data[j]; + w21 * input_data[i * l + l - 1 + l]; + if (if_bias) { + output_data[i * l] += bias_data[j]; + output_data[i * l + l - 1] += bias_data[j]; + } } // top 1 row and bottom 1 row @@ -501,75 +516,135 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, filter_data_tmp += 9; } } +#endif } -void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, Tensor filter, - Tensor *output, Tensor *bias, bool if_bias, - Tensor *new_scale, Tensor *new_bias, - bool if_bn, bool if_relu) { + +void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu) { +#if __ARM_NEON const float *input_data = input->data(); - const float *filter_data = filter.data(); + const float *filter_data = filter->data(); float *output_data = output->data(); - const float *bias_data = bias->data(); const float *newscale_data = new_scale->data(); const float *newbias_data = new_bias->data(); - const int h = static_cast(input->dims()[2]); - const int w = static_cast(input->dims()[3]); - const int l = h; - const int batch_size = static_cast(input->dims()[0]); - const int c = static_cast(input->dims()[1]); - const int hxw = h * w; - float32x4_t vbias = vdupq_n_f32(0.0); - float32x4_t vnewbias = vdupq_n_f32(0.0); - float32x4_t vnewscale = vdupq_n_f32(1.0); + const int input_channel = static_cast(input->dims()[1]); + + const int input_height = static_cast(input->dims()[2]); + const int input_width = static_cast(input->dims()[3]); + const int output_height = static_cast(output->dims()[2]); + const int output_width = static_cast(output->dims()[3]); + + const int hxw = input_height * input_width; + + const int l = input_height; + float32x4_t vzero = vdupq_n_f32(0); - for (int b = 0; b < batch_size; ++b) { - const float *filter_data_tmp = filter_data; + for (int b = 0; b < batch_size; b++) { +#pragma omp parallel for + for (int c = 0; c < input_channel; c++) { + const float *filter_data = filter->data() + c * 9; + const float *input_data = input->data() + c * hxw; + float *output_data = output->data() + c * hxw; + float32x4_t vnewbias = vdupq_n_f32(newbias_data[c]); + float32x4_t vnewscale = vdupq_n_f32(newscale_data[c]); + + float w00 = filter_data[0]; + float w01 = filter_data[1]; + float w02 = filter_data[2]; + float w10 = filter_data[3]; + float w11 = filter_data[4]; + float w12 = filter_data[5]; + float w20 = filter_data[6]; + float w21 
= filter_data[7]; + float w22 = filter_data[8]; + + for (int i = 1; i < output_height - 1; i++) { + float *output_ptr; + float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3, tmp4, + tmp5, out0; + for (int m = 1; m < output_width - 4; m += 4) { + output_ptr = output_data + i * output_width + m; + in0 = vld1q_f32(input_data + (i - 1) * input_width + m - 1); + in1 = vld1q_f32(input_data + (i - 1) * input_width + m + 3); + in2 = vld1q_f32(input_data + i * input_width + m - 1); + in3 = vld1q_f32(input_data + i * input_width + m + 3); + in4 = vld1q_f32(input_data + (i + 1) * input_width + m - 1); + in5 = vld1q_f32(input_data + (i + 1) * input_width + m + 3); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + tmp4 = vextq_f32(in4, in5, 1); + tmp5 = vextq_f32(in4, in5, 2); + + out0 = vmulq_n_f32(in0, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); - for (int j = 0; j < c; ++j) { - if (if_bias) { - vbias = vdupq_n_f32(bias_data[j]); - } - if (if_bn) { - vnewbias = vdupq_n_f32(newbias_data[j]); - vnewscale = vdupq_n_f32(newscale_data[j]); - } - int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0 - float w00 = filter_data_tmp[0]; - float w01 = filter_data_tmp[1]; - float w02 = filter_data_tmp[2]; - float w10 = filter_data_tmp[3]; - float w11 = filter_data_tmp[4]; - float w12 = filter_data_tmp[5]; - float w20 = filter_data_tmp[6]; - float w21 = filter_data_tmp[7]; - float w22 = filter_data_tmp[8]; + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + vst1q_f32(output_ptr, out0); + } + int m; + for (m = 1; (m + 3) < output_width - 1; m = m + 4) { + } - output_data[0] = - (w11 * input_data[0] + w12 * input_data[1] + w21 * input_data[l] + - w22 * input_data[l + 1] + bias_data[j]) * - newscale_data[j] + - newbias_data[j]; - output_data[l - 1] = (w10 * input_data[l - 2] + w11 * input_data[l - 1] + - w20 * input_data[2 * l - 2] + - w21 * input_data[2 * l - 1] + bias_data[j]) * - newscale_data[j] + - newbias_data[j]; + for (int j = m; j < output_width - 1; j++) { + output_data[i * output_width + j] = + input_data[(i - 1) * input_width + j - 1] * w00 + + input_data[(i - 1) * input_width + j] * w01 + + input_data[(i - 1) * input_width + j + 1] * w02 + + input_data[(i)*input_width + j - 1] * w10 + + input_data[(i)*input_width + j] * w11 + + input_data[(i)*input_width + j + 1] * w12 + + input_data[(i + 1) * input_width + j - 1] * w20 + + input_data[(i + 1) * input_width + j] * w21 + + input_data[(i + 1) * input_width + j + 1] * w22; + output_data[i * output_width + j] = + newscale_data[c] * output_data[i * output_width + j] + + newbias_data[c]; + if (if_relu) { + output_data[i * output_width + j] = + output_data[i * output_width + j] < 0 + ? 
0 + : output_data[i * output_width + j]; + } + } + } + output_data[0] = w11 * input_data[0] + w12 * input_data[1] + + w21 * input_data[l] + w22 * input_data[l + 1]; + output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] + + w20 * input_data[2 * l - 2] + + w21 * input_data[2 * l - 1]; output_data[(l - 1) * l] = - (w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] + - w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1] + - bias_data[j]) * - newscale_data[j] + - newbias_data[j]; - output_data[l * l - 1] = (w00 * input_data[(l - 2) * (l + 1)] + - w01 * input_data[(l - 2) * (l + 1) + 1] + - w10 * input_data[l * l - 2] + - w11 * input_data[l * l - 1] + bias_data[j]) * - newscale_data[j] + - newbias_data[j]; + w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] + + w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; + output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + + w01 * input_data[(l - 2) * (l + 1) + 1] + + w10 * input_data[l * l - 2] + + w11 * input_data[l * l - 1]; + output_data[0] = output_data[0] * newscale_data[c] + newbias_data[c]; + output_data[l - 1] = + output_data[l - 1] * newscale_data[c] + newbias_data[c]; + output_data[(l - 1) * l] = + output_data[(l - 1) * l] * newscale_data[c] + newbias_data[c]; + output_data[l * l - 1] = + output_data[l * l - 1] * newscale_data[c] + newbias_data[c]; + if (if_relu) { output_data[0] = output_data[0] < 0 ? 0 : output_data[0]; output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - 1]; @@ -580,21 +655,21 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, Tensor filter, } for (int i = 1; i < l - 1; ++i) { output_data[i * l] = - (w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] + - w11 * input_data[i * l] + w12 * input_data[i * l + 1] + - w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1] + - bias_data[j]) * - newscale_data[j] + - newbias_data[j]; + w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] + + w11 * input_data[i * l] + w12 * input_data[i * l + 1] + + w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1]; + + output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + + w01 * input_data[i * l + l - 1 - l] + + w10 * input_data[i * l + l - 1 - 1] + + w11 * input_data[i * l + l - 1] + + w20 * input_data[i * l + l - 1 + l - 1] + + w21 * input_data[i * l + l - 1 + l]; + output_data[i * l] = + output_data[i * l] * newscale_data[c] + newbias_data[c]; output_data[i * l + l - 1] = - (w00 * input_data[i * l + l - 1 - l - 1] + - w01 * input_data[i * l + l - 1 - l] + - w10 * input_data[i * l + l - 1 - 1] + - w11 * input_data[i * l + l - 1] + - w20 * input_data[i * l + l - 1 + l - 1] + - w21 * input_data[i * l + l - 1 + l] + bias_data[j]) * - newscale_data[j] + - newbias_data[j]; + output_data[i * l + l - 1] * newscale_data[c] + newbias_data[c]; + if (if_relu) { output_data[i * l] = output_data[i * l] < 0 ? 
0 : output_data[i * l]; output_data[i * l + l - 1] = @@ -602,222 +677,1307 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, Tensor filter, } } - // top 1 row and bottom 1 row - const float *input_tmp = input_data; - - float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, - tmp3, tmp4, tmp5, out0; - in0 = vld1q_f32(input_tmp); - in2 = vld1q_f32(input_tmp + l); - const float *input_tmp_end = input_tmp + (l - 2) * l; - in4 = vld1q_f32(input_tmp_end); - in6 = vld1q_f32(input_tmp_end + l); - int c_mid = l_mid; - auto output_ptr = output_data + 1; - for (; c_mid > 3; c_mid -= 4) { - in1 = vld1q_f32(input_tmp + 4); - in3 = vld1q_f32(input_tmp + l + 4); - + int m; + for (m = 1; m < output_width - 4; m += 4) { + float *output_ptr = output_data + m; + float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; + in0 = vld1q_f32(input_data + m - 1); + in1 = vld1q_f32(input_data + m + 3); + in2 = vld1q_f32(input_data + input_width + m - 1); + in3 = vld1q_f32(input_data + input_width + m + 3); tmp0 = vextq_f32(in0, in1, 1); tmp1 = vextq_f32(in0, in1, 2); - tmp2 = vextq_f32(in2, in3, 1); tmp3 = vextq_f32(in2, in3, 2); - out0 = vmulq_n_f32(in0, w10); out0 = vmlaq_n_f32(out0, tmp0, w11); out0 = vmlaq_n_f32(out0, tmp1, w12); out0 = vmlaq_n_f32(out0, in2, w20); out0 = vmlaq_n_f32(out0, tmp2, w21); out0 = vmlaq_n_f32(out0, tmp3, w22); - out0 = vaddq_f32(out0, vbias); out0 = vmlaq_f32(vnewbias, vnewscale, out0); if (if_relu) { out0 = vmaxq_f32(out0, vzero); } vst1q_f32(output_ptr, out0); + } - in5 = vld1q_f32(input_tmp_end + 4); - in7 = vld1q_f32(input_tmp_end + l + 4); + for (m = 1; (m + 3) < output_width - 1; m += 4) { + } + for (int j = m; j < output_width - 1; j++) { + output_data[j] = input_data[j - 1] * w10 + input_data[j] * w11 + + input_data[j + 1] * w12 + + input_data[input_width + j - 1] * w20 + + input_data[input_width + j] * w21 + + input_data[input_width + j + 1] * w22; + output_data[j] = output_data[j] * newscale_data[c] + newbias_data[c]; - tmp0 = vextq_f32(in4, in5, 1); - tmp1 = vextq_f32(in4, in5, 2); - tmp2 = vextq_f32(in6, in7, 1); - tmp3 = vextq_f32(in6, in7, 2); + if (if_relu) { + output_data[j] = output_data[j] < 0 ? 
0 : output_data[j]; + } + } - out0 = vmulq_n_f32(in4, w00); + for (m = 1; m < output_width - 4; m += 4) { + float *output_ptr = + output_data + (output_height - 1) * output_width + m; + + float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; + in0 = vld1q_f32(input_data + (output_height - 2) * input_width + m - 1); + in1 = vld1q_f32(input_data + (output_height - 2) * input_width + m + 3); + in2 = vld1q_f32(input_data + (output_height - 1) * input_width + m - 1); + in3 = vld1q_f32(input_data + (output_height - 1) * input_width + m + 3); + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + out0 = vmulq_n_f32(in0, w00); out0 = vmlaq_n_f32(out0, tmp0, w01); out0 = vmlaq_n_f32(out0, tmp1, w02); - out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, in2, w10); out0 = vmlaq_n_f32(out0, tmp2, w11); out0 = vmlaq_n_f32(out0, tmp3, w12); - out0 = vaddq_f32(out0, vbias); out0 = vmlaq_f32(vnewbias, vnewscale, out0); if (if_relu) { out0 = vmaxq_f32(out0, vzero); } - vst1q_f32(output_ptr + (l - 1) * l, out0); + vst1q_f32(output_ptr, out0); + } + for (m = 1; (m + 3) < output_width - 1; m = m + 4) { + } + for (int j = m; j < output_width - 1; j++) { + output_data[(output_height - 1) * input_width + j] = + input_data[(output_height - 2) * input_width + j - 1] * w00 + + input_data[(output_height - 2) * input_width + j] * w01 + + input_data[(output_height - 2) * input_width + j + 1] * w02 + + input_data[(output_height - 1) * input_width + j - 1] * w10 + + input_data[(output_height - 1) * input_width + j] * w11 + + input_data[(output_height - 1) * input_width + j + 1] * w12; + output_data[(output_height - 1) * output_width + j] = + output_data[(output_height - 1) * output_width + j] * + newscale_data[c] + + newbias_data[c]; - // can optimize to each 8 stride. - input_tmp += 4; - input_tmp_end += 4; - output_ptr += 4; - in0 = in1; - in2 = in3; - in4 = in5; - in6 = in7; + if (if_relu) { + output_data[(output_height - 1) * output_width + j] = + output_data[(output_height - 1) * output_width + j] < 0 + ? 
0 + : output_data[(output_height - 1) * output_width + j]; + } } + } + } - // top right pad - float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]); - float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]); + /* + const float *input_data = input->data(); + const float *filter_data = filter->data(); + float *output_data = output->data(); + const float *newscale_data = new_scale->data(); + const float *newbias_data = new_bias->data(); + + const int h = static_cast(input->dims()[2]); + const int w = static_cast(input->dims()[3]); + const int l = h; + + const int batch_size = static_cast(input->dims()[0]); + const int c = static_cast(input->dims()[1]); + const int hxw = h * w; + float32x4_t vnewbias = vdupq_n_f32(0.0); + float32x4_t vnewscale = vdupq_n_f32(1.0); + float32x4_t vzero = vdupq_n_f32(0); + + for (int b = 0; b < batch_size; ++b) { + const float *filter_data_tmp = filter_data; + + for (int j = 0; j < c; ++j) { + vnewbias = vdupq_n_f32(newbias_data[j]); + vnewscale = vdupq_n_f32(newscale_data[j]); + + int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0 + float w00 = filter_data_tmp[0]; + float w01 = filter_data_tmp[1]; + float w02 = filter_data_tmp[2]; + float w10 = filter_data_tmp[3]; + float w11 = filter_data_tmp[4]; + float w12 = filter_data_tmp[5]; + float w20 = filter_data_tmp[6]; + float w21 = filter_data_tmp[7]; + float w22 = filter_data_tmp[8]; + + output_data[0] = w11 * input_data[0] + w12 * input_data[1] + + w21 * input_data[l] + w22 * input_data[l + 1]; + + output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - + 1] + w20 * input_data[2 * l - 2] + w21 * input_data[2 * l - 1]; + + output_data[(l - 1) * l] = + w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + + 1] + w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1]; + output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + + w01 * input_data[(l - 2) * (l + 1) + 1] + + w10 * input_data[l * l - 2] + + w11 * input_data[l * l - 1]; + output_data[0] = output_data[0] * newscale_data[j] + + newbias_data[j]; output_data[l - 1] = output_data[l - 1] * + newscale_data[j] + newbias_data[j]; output_data[(l - 1) * l] = + output_data[(l - 1) * l] * newscale_data[j] + newbias_data[j]; + output_data[l * l - 1] = + output_data[l * l - 1] * newscale_data[j] + newbias_data[j]; + + if (if_relu) { + output_data[0] = output_data[0] < 0 ? 0 : output_data[0]; + output_data[l - 1] = output_data[l - 1] < 0 ? 0 : output_data[l - + 1]; output_data[(l - 1) * l] = output_data[(l - 1) * l] < 0 ? 0 : + output_data[(l - 1) * l]; output_data[l * l - 1] = output_data[l * l - 1] + < 0 ? 0 : output_data[l * l - 1]; + } + for (int i = 1; i < l - 1; ++i) { + output_data[i * l] = + w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] + + w11 * input_data[i * l] + w12 * input_data[i * l + 1] + w21 * + input_data[i * l + l] + w22 * input_data[i * l + l + 1]; output_data[i * + l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + w01 * input_data[i + * l + l - 1 - l] + w10 * input_data[i * l + l - 1 - 1] + w11 * + input_data[i * l + l - 1] + w20 * input_data[i * l + l - 1 + l - 1] + w21 + * input_data[i * l + l - 1 + l]; output_data[i * l] = output_data[i * l] + * newscale_data[j] + newbias_data[j]; output_data[i * l + l - 1] = + output_data[i * l + l - 1] * newscale_data[j] + + newbias_data[j]; + + if (if_relu) { + output_data[i * l] = output_data[i * l] < 0 ? 0 : output_data[i + * l]; output_data[i * l + l - 1] = output_data[i * l + l - 1] < 0 ? 
0 : + output_data[i * l + l - 1]; + } + } - tmp0 = vextq_f32(in0, pad0, 1); - tmp1 = vextq_f32(in0, pad0, 2); - tmp2 = vextq_f32(in2, pad1, 1); - tmp3 = vextq_f32(in2, pad1, 2); + // top 1 row and bottom 1 row + const float *input_tmp = input_data; + + float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, out0; in0 = vld1q_f32(input_tmp); in2 = + vld1q_f32(input_tmp + l); const float *input_tmp_end = input_tmp + (l - + 2) * l; in4 = vld1q_f32(input_tmp_end); in6 = vld1q_f32(input_tmp_end + + l); int c_mid = l_mid; auto output_ptr = output_data + 1; for (; c_mid > + 3; c_mid -= 4) { in1 = vld1q_f32(input_tmp + 4); in3 = + vld1q_f32(input_tmp + l + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + vst1q_f32(output_ptr, out0); + + in5 = vld1q_f32(input_tmp_end + 4); + in7 = vld1q_f32(input_tmp_end + l + 4); + + tmp0 = vextq_f32(in4, in5, 1); + tmp1 = vextq_f32(in4, in5, 2); + tmp2 = vextq_f32(in6, in7, 1); + tmp3 = vextq_f32(in6, in7, 2); + + out0 = vmulq_n_f32(in4, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + vst1q_f32(output_ptr + (l - 1) * l, out0); + + // can optimize to each 8 stride. 
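// ---- editor's sketch (not part of the patch) -----------------------------
// The vext idiom these inner loops build on, in isolation: loading two
// adjacent q-registers and extracting at offsets 1 and 2 gives the three
// shifted views of a row that a 3-tap filter needs, so four horizontal
// outputs come out per iteration. Standalone sketch; compile for ARM with
// NEON enabled, and note the caller must guarantee in[0..7] are readable.
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>

// out[i] = w0*in[i] + w1*in[i+1] + w2*in[i+2], for i = 0..3
static inline float32x4_t Conv1x3Quad(const float *in, float w0, float w1,
                                      float w2) {
  float32x4_t lo = vld1q_f32(in);         // in[0..3]
  float32x4_t hi = vld1q_f32(in + 4);     // in[4..7]
  float32x4_t s1 = vextq_f32(lo, hi, 1);  // in[1..4]
  float32x4_t s2 = vextq_f32(lo, hi, 2);  // in[2..5]
  float32x4_t acc = vmulq_n_f32(lo, w0);
  acc = vmlaq_n_f32(acc, s1, w1);
  acc = vmlaq_n_f32(acc, s2, w2);
  return acc;
}
#endif  // __ARM_NEON
// ---------------------------------------------------------------------------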
+ input_tmp += 4; + input_tmp_end += 4; + output_ptr += 4; + in0 = in1; + in2 = in3; + in4 = in5; + in6 = in7; + } - out0 = vmulq_n_f32(in0, w10); - out0 = vmlaq_n_f32(out0, tmp0, w11); - out0 = vmlaq_n_f32(out0, tmp1, w12); - out0 = vmlaq_n_f32(out0, in2, w20); - out0 = vmlaq_n_f32(out0, tmp2, w21); - out0 = vmlaq_n_f32(out0, tmp3, w22); - out0 = vaddq_f32(out0, vbias); - out0 = vmlaq_f32(vnewbias, vnewscale, out0); - if (if_relu) { - out0 = vmaxq_f32(out0, vzero); - } - for (int i = 0; i < c_mid; ++i) { - if (i == 0) { - vst1q_lane_f32(output_ptr + i, out0, 0); - } - if (i == 1) { - vst1q_lane_f32(output_ptr + i, out0, 1); - } - if (i == 2) { - vst1q_lane_f32(output_ptr + i, out0, 2); - } - } + // top right pad + float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } - // bottom right pad - float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]); - float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]); + // bottom right pad + float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]); + float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]); + + tmp0 = vextq_f32(in4, pad2, 1); + tmp1 = vextq_f32(in4, pad2, 2); + tmp2 = vextq_f32(in6, pad3, 1); + tmp3 = vextq_f32(in6, pad3, 2); + + out0 = vmulq_n_f32(in4, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2); + } + } + // mid + + + for (int i = 0; i < l - 2; ++i) { + auto output_ptr = output_data + (i + 1) * l + 1; + input_tmp = input_data + i * l; + auto in0_tmp = vld1q_f32(input_tmp); + auto in2_tmp = vld1q_f32(input_tmp + l); + auto in4_tmp = vld1q_f32(input_tmp + l + l); + c_mid = l_mid; + for (; c_mid > 3; c_mid -= 4) { + auto in1_tmp = vld1q_f32(input_tmp + 4); + auto in3_tmp = vld1q_f32(input_tmp + l + 4); + auto in5_tmp = vld1q_f32(input_tmp + l + l + 4); + + tmp0 = vextq_f32(in0_tmp, in1_tmp, 1); + tmp1 = vextq_f32(in0_tmp, in1_tmp, 2); + tmp2 = vextq_f32(in2_tmp, in3_tmp, 1); + tmp3 = vextq_f32(in2_tmp, in3_tmp, 2); + tmp4 = vextq_f32(in4_tmp, in5_tmp, 1); + tmp5 = vextq_f32(in4_tmp, in5_tmp, 2); + + out0 = vmulq_n_f32(in0_tmp, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2_tmp, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4_tmp, w20); + out0 = vmlaq_n_f32(out0, 
tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + vst1q_f32(output_ptr, out0); - tmp0 = vextq_f32(in4, pad2, 1); - tmp1 = vextq_f32(in4, pad2, 2); - tmp2 = vextq_f32(in6, pad3, 1); - tmp3 = vextq_f32(in6, pad3, 2); + output_ptr += 4; + input_tmp += 4; + in0_tmp = in1_tmp; + in2_tmp = in3_tmp; + in4_tmp = in5_tmp; + } - out0 = vmulq_n_f32(in4, w00); - out0 = vmlaq_n_f32(out0, tmp0, w01); - out0 = vmlaq_n_f32(out0, tmp1, w02); - out0 = vmlaq_n_f32(out0, in6, w10); - out0 = vmlaq_n_f32(out0, tmp2, w11); - out0 = vmlaq_n_f32(out0, tmp3, w12); - out0 = vaddq_f32(out0, vbias); - out0 = vmlaq_f32(vnewbias, vnewscale, out0); - if (if_relu) { - out0 = vmaxq_f32(out0, vzero); - } - for (int i = 0; i < c_mid; ++i) { - if (i == 0) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0); - } - if (i == 1) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1); - } - if (i == 2) { - vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2); + float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]); + float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]); + + tmp0 = vextq_f32(in0_tmp, pad0, 1); + tmp1 = vextq_f32(in0_tmp, pad0, 2); + tmp2 = vextq_f32(in2_tmp, pad1, 1); + tmp3 = vextq_f32(in2_tmp, pad1, 2); + tmp4 = vextq_f32(in4_tmp, pad2, 1); + tmp5 = vextq_f32(in4_tmp, pad2, 2); + + out0 = vmulq_n_f32(in0_tmp, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2_tmp, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4_tmp, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vmlaq_f32(vnewbias, vnewscale, out0); + if (if_relu) { + out0 = vmaxq_f32(out0, vzero); + } + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + } + output_data += hxw; + input_data += hxw; + filter_data_tmp += 9; + } } - } - // mid + */ - for (int i = 0; i < l - 2; ++i) { - auto output_ptr = output_data + (i + 1) * l + 1; - input_tmp = input_data + i * l; - auto in0_tmp = vld1q_f32(input_tmp); - auto in2_tmp = vld1q_f32(input_tmp + l); - auto in4_tmp = vld1q_f32(input_tmp + l + l); - c_mid = l_mid; - for (; c_mid > 3; c_mid -= 4) { - auto in1_tmp = vld1q_f32(input_tmp + 4); - auto in3_tmp = vld1q_f32(input_tmp + l + 4); - auto in5_tmp = vld1q_f32(input_tmp + l + l + 4); +#endif +} - tmp0 = vextq_f32(in0_tmp, in1_tmp, 1); - tmp1 = vextq_f32(in0_tmp, in1_tmp, 2); - tmp2 = vextq_f32(in2_tmp, in3_tmp, 1); - tmp3 = vextq_f32(in2_tmp, in3_tmp, 2); - tmp4 = vextq_f32(in4_tmp, in5_tmp, 1); - tmp5 = vextq_f32(in4_tmp, in5_tmp, 2); +void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter, + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu) { +#if __ARM_NEON - out0 = vmulq_n_f32(in0_tmp, w00); - out0 = vmlaq_n_f32(out0, tmp0, w01); - out0 = vmlaq_n_f32(out0, tmp1, w02); - out0 = vmlaq_n_f32(out0, in2_tmp, w10); - out0 = vmlaq_n_f32(out0, tmp2, w11); - out0 = vmlaq_n_f32(out0, tmp3, w12); - out0 = vmlaq_n_f32(out0, in4_tmp, w20); - out0 = vmlaq_n_f32(out0, tmp4, w21); - out0 = vmlaq_n_f32(out0, tmp5, w22); - out0 = vaddq_f32(out0, vbias); - out0 = vmlaq_f32(vnewbias, 
vnewscale, out0); - if (if_relu) { - out0 = vmaxq_f32(out0, vzero); - } - vst1q_f32(output_ptr, out0); + const int batch_size = input->dims()[0]; - output_ptr += 4; - input_tmp += 4; - in0_tmp = in1_tmp; - in2_tmp = in3_tmp; - in4_tmp = in5_tmp; + const int input_height = input->dims()[2]; + + const int input_width = input->dims()[3]; + + const int output_channels = output->dims()[1]; + + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int _kernel_size = 3; + const int stride_height = 2; + const int stride_width = 2; + const int padding_height = 1; + const int padding_width = 1; + const float zero = 0; + const int input_channel_stride = input_height * input_width; + const int output_channel_stride = output_height * output_width; + const int filter_channel_stride = 9; + const float *newscale_data = new_scale->data(); + const float *newbias_data = new_bias->data(); + + const float *input_data = input->data(); + const float *filter_data = filter->data(); + + float *output_data = output->mutable_data(); + + const int input_batch_stride = output_channels * input_channel_stride; + const int output_batch_stride = output_channels * output_channel_stride; + const int filter_batch_stride = output_channels * output_channel_stride; + const float *pos1, *pos2, *pos3, *filter1, *filter2, *filter3, *output_ptr; + int hstart, wstart, hend, wend; + float result; + for (int i = 0; i < batch_size; ++i) { + for (int c = 0; c < output_channels; ++c) { + filter1 = filter_data; + filter2 = filter1 + 3; + filter3 = filter2 + 3; + + for (int ph = 0; ph < output_height; ph++) { + for (int pw = 0; pw < output_width; pw++) { + hstart = ph * stride_height - padding_height; + wstart = pw * stride_width - padding_width; + hend = min(hstart + _kernel_size, input_height + padding_height); + wend = min(wstart + _kernel_size, input_width + padding_width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, input_height); + wend = min(wend, input_width); + pos1 = input_data + hstart * input_width + wstart; + pos2 = input_data + (hstart + 1) * input_width + wstart; + pos3 = input_data + (hstart + 2) * input_width + wstart; + output_ptr = output_data + ph * output_width + pw; + + if (hend - hstart != 3 || wend - wstart != 3) { + result = 0; + float fake_input[9] = {0}; + if (hstart == 0 && wstart == 0) { + // 左上角 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j >= 3 - hend && k >= 3 - wend) { + fake_input[3 * j + k] = + input_data[(j - (3 - hend)) * input_width + k - + (3 - wend)]; + } + } + } + } else if (hstart == 0 && wend == input_width) { + // 右上角 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j >= 3 - hend && k <= input_width - wstart - 1) { + fake_input[3 * j + k] = + input_data[(j - (3 - hend)) * input_width + k + wstart]; + } + } + } + + } else if (hend == input_height && wstart == 0) { + // 左下角 + + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j <= input_height - 1 - hstart && k >= 3 - wend) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + k - (3 - wend)]; + } + } + } + } else if (hend == input_height && wend == input_width) { + // 右下角 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j <= input_height - hstart - 1 && + k <= input_width - wstart - 1) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + k + wstart]; + } + } + } + } else if (hstart == 0) { + // 顶部 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) 
{ + if (j >= 3 - hend) { + fake_input[3 * j + k] = + input_data[(j - (3 - hend)) * input_width + k + wstart]; + } + } + } + + } else if (hend == input_height) { + // 底部 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (j <= input_height - hstart - 1) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + k + wstart]; + } + } + } + + } else if (wstart == 0) { + // 左侧 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (k >= 3 - wend) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + + (k - (3 - wend))]; + } + } + } + + } else if (wend == input_width) { + // 右侧 + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 3; ++k) { + if (k <= input_width - wstart - 1) { + fake_input[3 * j + k] = + input_data[(j + hstart) * input_width + k + wstart]; + } + } + } + } + for (int l = 0; l < 9; ++l) { + result += fake_input[l] * filter1[l]; + } + output_data[ph * output_width + pw] = + newscale_data[c] * result + newbias_data[c]; + + if (if_relu) { + output_data[ph * output_width + pw] = + output_data[ph * output_width + pw] < 0 + ? 0 + : output_data[ph * output_width + pw]; + } + } else { + const float32x4_t data1 = vld1q_f32(pos1); + const float32x4_t data2 = vld1q_f32(pos2); + const float32x4_t data3 = vld1q_f32(pos3); + + const float32x4_t v_filter1 = vld1q_f32(filter1); + const float32x4_t v_filter2 = vld1q_f32(filter2); + const float32x4_t v_filter3 = vld1q_f32(filter3); + float32x4_t mula = vmulq_f32(data1, v_filter1); + mula = vmlaq_f32(mula, data2, v_filter2); + mula = vmlaq_f32(mula, data3, v_filter3); + float32x2_t res = vpadd_f32( + vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula)); + res = vpadd_f32(res, res); + output_data[ph * output_width + pw] = + vget_lane_f32(res, 0) * newscale_data[c] + newbias_data[c]; + + if (if_relu) { + output_data[ph * output_width + pw] = + output_data[ph * output_width + pw] < 0 + ? 0 + : output_data[ph * output_width + pw]; + } + } } + } + input_data += input_channel_stride; + output_data += output_channel_stride; + filter_data += filter_channel_stride; + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +#endif +} - float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]); - float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]); - float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]); +void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, + Tensor *output, Tensor bias, bool if_bias) { +#if __ARM_NEON + const float *input_data = input->data(); + const float *filter_data = filter->data(); + float *output_data = output->data(); + const float *bias_data = bias.data(); + + const int in_h = static_cast(input->dims()[2]); + const int in_w = static_cast(input->dims()[3]); + const int out_h = static_cast(output->dims()[2]); + const int out_w = static_cast(output->dims()[3]); + const int out_l = out_h; + const int in_l = in_h; + const int inhxw = in_h * in_w; + const int outhxw = out_h * out_w; + const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 
1 : 0; + const int batch_size = static_cast(input->dims()[0]); + const int c = static_cast(input->dims()[1]); + const float *input_row_ptr; + float *output_row_ptr; - tmp0 = vextq_f32(in0_tmp, pad0, 1); - tmp1 = vextq_f32(in0_tmp, pad0, 2); - tmp2 = vextq_f32(in2_tmp, pad1, 1); - tmp3 = vextq_f32(in2_tmp, pad1, 2); - tmp4 = vextq_f32(in4_tmp, pad2, 1); - tmp5 = vextq_f32(in4_tmp, pad2, 2); + const int w_times = (out_w - 2) / 3; - out0 = vmulq_n_f32(in0_tmp, w00); - out0 = vmlaq_n_f32(out0, tmp0, w01); - out0 = vmlaq_n_f32(out0, tmp1, w02); - out0 = vmlaq_n_f32(out0, in2_tmp, w10); - out0 = vmlaq_n_f32(out0, tmp2, w11); - out0 = vmlaq_n_f32(out0, tmp3, w12); - out0 = vmlaq_n_f32(out0, in4_tmp, w20); - out0 = vmlaq_n_f32(out0, tmp4, w21); - out0 = vmlaq_n_f32(out0, tmp5, w22); - out0 = vaddq_f32(out0, vbias); - out0 = vmlaq_f32(vnewbias, vnewscale, out0); - if (if_relu) { - out0 = vmaxq_f32(out0, vzero); + float32x4_t vbias = vdupq_n_f32(0.0); + + float32x4x2_t input_buff_mid{}, input_buff_bottom[w_times + 1]; + float32x4_t elewise_res0, elewise_res1, elewise_res2, res3; + int out2in_mid; + float32x4_t zero = vdupq_n_f32(0.0); + for (int b = batch_size; b > 0; --b) { + const float *filter_data_tmp = filter_data; + for (int j = 0; j < c; ++j) { + auto output_data_tmp = output_data + j * out_h * out_w; + auto input_data_tmp = input_data + j * in_h * in_w; + auto input_const = input_data_tmp; + + if (if_bias) { + vbias = vdupq_n_f32(bias_data[j]); + } + + float w00 = filter_data_tmp[0]; + float w01 = filter_data_tmp[1]; + float w02 = filter_data_tmp[2]; + float w10 = filter_data_tmp[3]; + float w11 = filter_data_tmp[4]; + float w12 = filter_data_tmp[5]; + float w20 = filter_data_tmp[6]; + float w21 = filter_data_tmp[7]; + float w22 = filter_data_tmp[8]; + + int h_mid = 0; + + for (; h_mid < out_h - 1; h_mid++) { + input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w; + output_row_ptr = output_data_tmp + 1 + h_mid * out_w; + + for (int w4 = 0; w4 < w_times + 1; w4++) { + if (h_mid == 0) { + elewise_res1 = zero; + elewise_res0 = zero; + elewise_res2 = zero; + } else { + elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01); + elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00); + elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02); + } + input_buff_mid = vld2q_f32(input_row_ptr); + input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w); + + elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11); + elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); + elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); + + elewise_res1 = + vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); + elewise_res0 = + vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20); + elewise_res2 = + vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22); + + res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), + vaddq_f32(elewise_res0, elewise_res1)); + res3 = vaddq_f32(res3, vbias); + vst1q_f32(output_row_ptr, res3); + + input_row_ptr += 6; + output_row_ptr += 3; } - for (int i = 0; i < c_mid; ++i) { - if (i == 0) { - vst1q_lane_f32(output_ptr + i, out0, 0); + } + clock(); + + input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w; + output_row_ptr = output_data_tmp + 1 + h_mid * out_w; + + for (int w4 = 0; w4 < w_times + 1; w4++) { + elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01); + elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00); + elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02); + + 
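// ---- editor's sketch (not part of the patch) -----------------------------
// Why the stride-2 paths use vld2q_f32: it loads eight floats and
// de-interleaves them, val[0] = even indices and val[1] = odd indices, which
// are exactly the stride-2 taps of a row. The code above shifts the partial
// products instead (vextq_f32(elewise_res2, zero, 1)); shifting the inputs,
// as below, is the same idea. Standalone sketch; the caller must guarantee
// in[0..8] are readable.
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>

// out[i] = w0*in[2i] + w1*in[2i+1] + w2*in[2i+2], for i = 0..3
static inline float32x4_t Conv1x3QuadStride2(const float *in, float w0,
                                             float w1, float w2) {
  float32x4x2_t v = vld2q_f32(in);  // val[0]=in[0,2,4,6], val[1]=in[1,3,5,7]
  float32x4_t even_next =
      vextq_f32(v.val[0], vdupq_n_f32(in[8]), 1);  // in[2,4,6,8]
  float32x4_t acc = vmulq_n_f32(v.val[0], w0);
  acc = vmlaq_n_f32(acc, v.val[1], w1);
  acc = vmlaq_n_f32(acc, even_next, w2);
  return acc;
}
#endif  // __ARM_NEON
// ---------------------------------------------------------------------------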
input_buff_mid = vld2q_f32(input_row_ptr); + input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w); + + elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11); + elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); + elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); + + if (!if_pad) { + elewise_res1 = + vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); + elewise_res0 = + vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20); + elewise_res2 = + vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22); + } + res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), + vaddq_f32(elewise_res0, elewise_res1)); + res3 = vaddq_f32(res3, vbias); + + if ((w4 != w_times)) { + vst1q_f32(output_row_ptr, res3); + } else { + if (out_l - 2 - w_times * 3 == 1) { + vst1q_lane_f32(output_row_ptr, res3, 0); + } else if (out_l - 2 - w_times * 3 == 2) { + vst1q_lane_f32(output_row_ptr, res3, 0); + vst1q_lane_f32(output_row_ptr + 1, res3, 1); } - if (i == 1) { - vst1q_lane_f32(output_ptr + i, out0, 1); + } + input_row_ptr += 6; + output_row_ptr += 3; + } + + output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 + + input_const[in_l] * w21 + + input_const[in_l + 1] * w22; + + out2in_mid = (out_l - 1) * 2; + output_data_tmp[out_l - 1] = + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + (1 - if_pad) * (w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + + out2in_mid = (out_l - 1) * 2 * in_w; + + output_data_tmp[out_l * (out_l - 1)] = + w01 * input_const[out2in_mid - in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + + (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]); + out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2; + + output_data_tmp[out_l * out_l - 1] = + w00 * input_const[out2in_mid - in_w - 1] + + w01 * input_const[out2in_mid - in_w] + + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + if (if_bias) { + output_data_tmp[0] += bias_data[j]; + output_data_tmp[out_l - 1] += bias_data[j]; + output_data_tmp[out_l * (out_l - 1)] += bias_data[j]; + output_data_tmp[out_l * out_l - 1] += bias_data[j]; + } + for (int i = 1; i < out_h - 1; i++) { + out2in_mid = i * 2 * in_w; + output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w11 * input_const[out2in_mid] + + w12 * input_const[out2in_mid + 1] + + w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]; + + out2in_mid = i * 2 * in_w + (out_l - 1) * 2; + output_data_tmp[i * out_l + out_l - 1] = + w00 * input_const[out2in_mid - in_w - 1] + + w01 * input_const[out2in_mid - in_w] + + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + if (if_bias) { + output_data_tmp[i * out_l] += bias_data[j]; + output_data_tmp[i * out_l + out_l - 1] += bias_data[j]; + } + 
} + filter_data_tmp += 9; + } + input_data += inhxw * c; + output_data += outhxw * c; + } +#endif +} + +void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, + Tensor *output, const Tensor *new_scale, + const Tensor *new_bias, bool if_relu) { +#if __ARM_NEON + //#ifdef _OPENMP + // const float *newscale_data = new_scale->data(); + // const float *newbias_data = new_bias->data(); + // + // const int batch_size = static_cast(input->dims()[0]); + // const int input_channel = static_cast(input->dims()[1]); + // + // const int input_height = static_cast(input->dims()[2]); + // const int input_width = static_cast(input->dims()[3]); + // const int output_height = static_cast(output->dims()[2]); + // const int output_width = static_cast(output->dims()[3]); + // const int inhxw = input_height * input_width; + // const int outhxw = output_height * output_width; + // + // float32x4_t zero = vdupq_n_f32(0.0); + // for (int b = 0; b < batch_size; b++) { + // #pragma omp parallel for + // for (int c = 0; c < input_channel; c++) { + // const float *filter_data = filter->data() + c * 9; + // const float *input_data = input->data() + c * inhxw; + // float *output_data = output->data() + c * outhxw; + // float32x4_t vnewbias = vdupq_n_f32(newbias_data[c]); + // float32x4_t vnewscale = vdupq_n_f32(newscale_data[c]); + // + // float w00 = filter_data[0]; + // float w01 = filter_data[1]; + // float w02 = filter_data[2]; + // float w10 = filter_data[3]; + // float w11 = filter_data[4]; + // float w12 = filter_data[5]; + // float w20 = filter_data[6]; + // float w21 = filter_data[7]; + // float w22 = filter_data[8]; + // + // int m; + // for (m = 1; m < output_width - 2; m = m + 3) { + // float *output_ptr = output_data + m; + // float32x4x2_t input_buff_mid{}, input_buff_bottom{}; + // float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; + // input_buff_mid = vld2q_f32(input_data + (2 * m - 1)); + // input_buff_bottom = vld2q_f32(input_data + input_width + (2 * m - + // 1)); + // + // in0 = input_buff_mid.val[0]; + // tmp0 = input_buff_mid.val[1]; + // tmp1 = vextq_f32(in0, zero, 1); + // + // in2 = input_buff_bottom.val[0]; + // tmp2 = input_buff_bottom.val[1]; + // tmp3 = vextq_f32(in2, zero, 1); + // + // out0 = vmulq_n_f32(in0, w10); + // out0 = vmlaq_n_f32(out0, tmp0, w11); + // out0 = vmlaq_n_f32(out0, tmp1, w12); + // out0 = vmlaq_n_f32(out0, in2, w20); + // out0 = vmlaq_n_f32(out0, tmp2, w21); + // out0 = vmlaq_n_f32(out0, tmp3, w22); + // out0 = vmlaq_f32(vnewbias, vnewscale, out0); + // if (if_relu) { + // out0 = vmaxq_f32(out0, zero); + // } + // vst1q_lane_f32(output_ptr, out0, 0); + // vst1q_lane_f32(output_ptr + 1, out0, 1); + // vst1q_lane_f32(output_ptr + 2, out0, 2); + // } + // for (m = 1; m < output_width - 2; m += 3) { + // } + // for (int j = m; j < output_width; j++) { + // output_data[j] = input_data[2 * j - 1] * w10 + input_data[2 * j] * + // w11 + + // input_data[2 * j + 1] * w12 + + // input_data[2 * j - 1 + input_width] * w20 + + // input_data[2 * j + input_width] * w21 + + // input_data[2 * j + 1 + input_width] * w22; + // output_data[j] = newscale_data[c] * output_data[j] + + // newbias_data[c]; if (if_relu) { + // output_data[j] = output_data[j] < 0 ? 
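// ---- editor's sketch (not part of the patch) -----------------------------
// Where new_scale / new_bias come from: inference-time batch norm with
// statistics (mean, var) and learned (gamma, beta) folds into a single
// per-channel multiply-add, which is what vmlaq_f32(vnewbias, vnewscale, x)
// applies in the fused kernels above:
//   new_scale[c] = gamma[c] / sqrt(var[c] + epsilon)
//   new_bias[c]  = beta[c] - mean[c] * new_scale[c]
// Scalar sketch of that folding step; the names here are illustrative, not
// fields of a struct in the tree, and epsilon is the usual BN stabilizer.
#include <cmath>
#include <cstddef>
#include <vector>

void FoldBatchNorm(const std::vector<float> &mean,
                   const std::vector<float> &var,
                   const std::vector<float> &gamma,
                   const std::vector<float> &beta, float epsilon,
                   std::vector<float> *new_scale,
                   std::vector<float> *new_bias) {
  new_scale->resize(mean.size());
  new_bias->resize(mean.size());
  for (std::size_t c = 0; c < mean.size(); ++c) {
    (*new_scale)[c] = gamma[c] / std::sqrt(var[c] + epsilon);
    (*new_bias)[c] = beta[c] - mean[c] * (*new_scale)[c];
  }
}
// ---------------------------------------------------------------------------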
0 : output_data[j]; + // } + // } + // + // for (int i = 1; i < output_height; i += 1) { + // for (int m = 1; m < output_width - 2; m += 3) { + // float *output_ptr = output_data + i * output_width + m; + // float32x4x2_t input_buff_top{}, input_buff_mid{}, + // input_buff_bottom{}; float32x4_t in0, in1, in2, in3, in4, in5, + // tmp0, tmp1, tmp2, tmp3, + // tmp4, tmp5, out0; + // input_buff_top = + // vld2q_f32(input_data + (2 * i - 1) * input_width + (2 * m - + // 1)); + // input_buff_mid = + // vld2q_f32(input_data + (2 * i) * input_width + (2 * m - 1)); + // input_buff_bottom = + // vld2q_f32(input_data + (2 * i + 1) * input_width + (2 * m - + // 1)); + // + // in0 = input_buff_top.val[0]; + // tmp0 = input_buff_top.val[1]; + // tmp1 = vextq_f32(in0, zero, 1); + // + // in2 = input_buff_mid.val[0]; + // tmp2 = input_buff_mid.val[1]; + // tmp3 = vextq_f32(in2, zero, 1); + // + // in4 = input_buff_bottom.val[0]; + // tmp4 = input_buff_bottom.val[1]; + // tmp5 = vextq_f32(in4, zero, 1); + // + // out0 = vmulq_n_f32(in0, w00); + // out0 = vmlaq_n_f32(out0, tmp0, w01); + // out0 = vmlaq_n_f32(out0, tmp1, w02); + // out0 = vmlaq_n_f32(out0, in2, w10); + // out0 = vmlaq_n_f32(out0, tmp2, w11); + // out0 = vmlaq_n_f32(out0, tmp3, w12); + // out0 = vmlaq_n_f32(out0, in4, w20); + // out0 = vmlaq_n_f32(out0, tmp4, w21); + // out0 = vmlaq_n_f32(out0, tmp5, w22); + // out0 = vmlaq_f32(vnewbias, vnewscale, out0); + // if (if_relu) { + // out0 = vmaxq_f32(out0, zero); + // } + // vst1q_lane_f32(output_ptr, out0, 0); + // vst1q_lane_f32(output_ptr + 1, out0, 1); + // vst1q_lane_f32(output_ptr + 2, out0, 2); + // } + // int m; + // for (m = 1; m < output_width - 2; m += 3) { + // } + // for (int j = m; j < output_width; j++) { + // output_data[i * output_width + j] = + // input_data[(2 * i - 1) * input_width + 2 * j - 1] * w00 + + // input_data[(2 * i - 1) * input_width + 2 * j] * w01 + + // input_data[(2 * i - 1) * input_width + 2 * j + 1] * w02 + + // input_data[(2 * i) * input_width + 2 * j - 1] * w10 + + // input_data[(2 * i) * input_width + 2 * j] * w11 + + // input_data[(2 * i) * input_width + 2 * j + 1] * w12 + + // input_data[(2 * i + 1) * input_width + 2 * j - 1] * w20 + + // input_data[(2 * i + 1) * input_width + 2 * j] * w21 + + // input_data[(2 * i + 1) * input_width + 2 * j + 1] * w22; + // output_data[i * output_width + j] = + // newscale_data[c] * output_data[i * output_width + j] + + // newbias_data[c]; + // if (if_relu) { + // output_data[i * output_width + j] = + // output_data[i * output_width + j] < 0 + // ? 0 + // : output_data[i * output_width + j]; + // } + // } + // } + // output_data[0] = input_data[0] * w11 + input_data[1] * w12 + + // input_data[input_height] * w21 + + // input_data[input_height + 1] * w22; + // + // output_data[0] = newscale_data[c] * output_data[0] + newbias_data[c]; + // if (if_relu) { + // output_data[0] = output_data[0] < 0 ? 0 : output_data[0]; + // } + // for (int i = 1; i < output_height; i++) { + // output_data[i * output_width] = + // input_data[(2 * i - 1) * input_width] * w01 + + // input_data[(2 * i - 1) * input_width + 1] * w02 + + // input_data[(2 * i) * input_width] * w11 + + // input_data[(2 * i) * input_width + 1] * w12 + + // input_data[(2 * i + 1) * input_width] * w21 + + // input_data[(2 * i + 1) * input_width + 1] * w22; + // + // output_data[i * output_width] = + // newscale_data[c] * output_data[i * output_width] + + // newbias_data[c]; + // if (if_relu) { + // output_data[i * output_width] = output_data[i * output_width] < 0 + // ? 
0
+  //                  : output_data[i *
+  //                  output_width];
+  //        }
+  //      }
+  //    }
+  //  }
+  //
+  //#else
+
+  const float *input_data = input->data<float>();
+  const float *filter_data = filter->data<float>();
+  float *output_data = output->data<float>();
+  const float *newscale_data = new_scale->data<float>();
+  const float *newbias_data = new_bias->data<float>();
+
+  const int in_h = static_cast<int>(input->dims()[2]);
+  const int in_w = static_cast<int>(input->dims()[3]);
+  const int out_h = static_cast<int>(output->dims()[2]);
+  const int out_w = static_cast<int>(output->dims()[3]);
+  const int out_l = out_h;
+  const int in_l = in_h;
+  const int inhxw = in_h * in_w;
+  const int outhxw = out_h * out_w;
+  const int if_pad = in_l - 1 == (out_l - 1) * 2 ? 1 : 0;
+  const int batch_size = static_cast<int>(input->dims()[0]);
+  const int c = static_cast<int>(input->dims()[1]);
+  const int w_times = (out_w - 2) / 3;
+  float32x4_t zero = vdupq_n_f32(0.0);
+  for (int b = batch_size; b > 0; --b) {
+#pragma omp parallel for
+    for (int j = 0; j < c; j++) {
+      const float *input_row_ptr;
+      float *output_row_ptr;
+      float32x4x2_t input_buff_mid{}, input_buff_bottom[w_times + 1];
+      float32x4_t elewise_res0, elewise_res1, elewise_res2, res3;
+      int out2in_mid;
+      float32x4_t vnewbias = vdupq_n_f32(0.0);
+      float32x4_t vnewscale = vdupq_n_f32(1.0);
+      auto output_data_tmp = output_data + j * out_h * out_w;
+      auto input_data_tmp = input_data + j * in_h * in_w;
+      auto input_const = input_data_tmp;
+      const float *filter_data_tmp = filter_data + 9 * j;
+      vnewbias = vdupq_n_f32(newbias_data[j]);
+      vnewscale = vdupq_n_f32(newscale_data[j]);
+
+      float w00 = filter_data_tmp[0];
+      float w01 = filter_data_tmp[1];
+      float w02 = filter_data_tmp[2];
+      float w10 = filter_data_tmp[3];
+      float w11 = filter_data_tmp[4];
+      float w12 = filter_data_tmp[5];
+      float w20 = filter_data_tmp[6];
+      float w21 = filter_data_tmp[7];
+      float w22 = filter_data_tmp[8];
+
+      int h_mid = 0;
+
+      for (; h_mid < out_h - 1; h_mid++) {
+        input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w;
+        output_row_ptr = output_data_tmp + 1 + h_mid * out_w;
+
+        for (int w4 = 0; w4 < w_times + 1; w4++) {
+          if (h_mid == 0) {
+            elewise_res1 = zero;
+            elewise_res0 = zero;
+            elewise_res2 = zero;
+          } else {
+            elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01);
+            elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00);
+            elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02);
           }
-          if (i == 2) {
-            vst1q_lane_f32(output_ptr + i, out0, 2);
+          input_buff_mid = vld2q_f32(input_row_ptr);
+          input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w);
+
+          elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11);
+          elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10);
+          elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12);
+
+          elewise_res1 =
+              vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21);
+          elewise_res0 =
+              vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20);
+          elewise_res2 =
+              vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22);
+
+          res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1),
+                           vaddq_f32(elewise_res0, elewise_res1));
+          res3 = vmlaq_f32(vnewbias, vnewscale, res3);
+
+          if (if_relu) {
+            res3 = vmaxq_f32(res3, zero);
           }
+          vst1q_lane_f32(output_row_ptr, res3, 0);
+          vst1q_lane_f32(output_row_ptr + 1, res3, 1);
+          vst1q_lane_f32(output_row_ptr + 2, res3, 2);
+
+          input_row_ptr += 6;
+          output_row_ptr += 3;
+        }
+      }
+      clock();
+
+      input_row_ptr = input_data_tmp + 1 + h_mid * 2 * in_w;
+      output_row_ptr = output_data_tmp + 1 + h_mid * out_w;
+
+      for (int w4 = 0; w4 <
w_times + 1; w4++) { + elewise_res1 = vmulq_n_f32(input_buff_bottom[w4].val[1], w01); + elewise_res0 = vmulq_n_f32(input_buff_bottom[w4].val[0], w00); + elewise_res2 = vmulq_n_f32(input_buff_bottom[w4].val[0], w02); + + input_buff_mid = vld2q_f32(input_row_ptr); + input_buff_bottom[w4] = vld2q_f32(input_row_ptr + in_w); + + elewise_res1 = vmlaq_n_f32(elewise_res1, input_buff_mid.val[1], w11); + elewise_res0 = vmlaq_n_f32(elewise_res0, input_buff_mid.val[0], w10); + elewise_res2 = vmlaq_n_f32(elewise_res2, input_buff_mid.val[0], w12); + + if (!if_pad) { + elewise_res1 = + vmlaq_n_f32(elewise_res1, input_buff_bottom[w4].val[1], w21); + elewise_res0 = + vmlaq_n_f32(elewise_res0, input_buff_bottom[w4].val[0], w20); + elewise_res2 = + vmlaq_n_f32(elewise_res2, input_buff_bottom[w4].val[0], w22); + } + res3 = vaddq_f32(vextq_f32(elewise_res2, zero, 1), + vaddq_f32(elewise_res0, elewise_res1)); + res3 = vmlaq_f32(vnewbias, vnewscale, res3); + + if (if_relu) { + res3 = vmaxq_f32(res3, zero); + } + if ((w4 != w_times)) { + vst1q_lane_f32(output_row_ptr, res3, 0); + vst1q_lane_f32(output_row_ptr + 1, res3, 1); + vst1q_lane_f32(output_row_ptr + 2, res3, 2); + } else { + if (out_l - 2 - w_times * 3 == 1) { + vst1q_lane_f32(output_row_ptr, res3, 0); + } else if (out_l - 2 - w_times * 3 == 2) { + vst1q_lane_f32(output_row_ptr, res3, 0); + vst1q_lane_f32(output_row_ptr + 1, res3, 1); + } + } + input_row_ptr += 6; + output_row_ptr += 3; + } + + output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 + + input_const[in_l] * w21 + + input_const[in_l + 1] * w22; + + out2in_mid = (out_l - 1) * 2; + output_data_tmp[out_l - 1] = + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + (1 - if_pad) * (w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + + out2in_mid = (out_l - 1) * 2 * in_w; + + output_data_tmp[out_l * (out_l - 1)] = + w01 * input_const[out2in_mid - in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] + + (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] + + w22 * input_const[out2in_mid + in_w + 1]); + out2in_mid = (out_l - 1) * 2 * in_w + (out_l - 1) * 2; + + output_data_tmp[out_l * out_l - 1] = + w00 * input_const[out2in_mid - in_w - 1] + + w01 * input_const[out2in_mid - in_w] + + w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] + + (1 - if_pad) * (w20 * input_const[out2in_mid + in_w - 1] + + w21 * input_const[out2in_mid + in_w] + + w02 * input_const[out2in_mid - in_w + 1] + + w12 * input_const[out2in_mid + 1] + + w22 * input_const[out2in_mid + in_w + 1]); + output_data_tmp[0] = + output_data_tmp[0] * newscale_data[j] + newbias_data[j]; + output_data_tmp[out_l - 1] = + output_data_tmp[out_l - 1] * newscale_data[j] + newbias_data[j]; + output_data_tmp[out_l * (out_l - 1)] = + output_data_tmp[out_l * (out_l - 1)] * newscale_data[j] + + newbias_data[j]; + output_data_tmp[out_l * out_l - 1] = + output_data_tmp[out_l * out_l - 1] * newscale_data[j] + + newbias_data[j]; + if (if_relu) { + output_data_tmp[0] = output_data_tmp[0] < 0 ? 0 : output_data_tmp[0]; + output_data_tmp[out_l - 1] = + output_data_tmp[out_l - 1] < 0 ? 0 : output_data_tmp[out_l - 1]; + output_data_tmp[out_l * (out_l - 1)] = + output_data_tmp[out_l * (out_l - 1)] < 0 + ? 0 + : output_data_tmp[out_l * (out_l - 1)]; + output_data_tmp[out_l * out_l - 1] = + output_data_tmp[out_l * out_l - 1] < 0 + ? 
0
+                  : output_data_tmp[out_l * out_l - 1];
+      }
+      for (int i = 1; i < out_h - 1; i++) {
+        out2in_mid = i * 2 * in_w;
+        output_data_tmp[i * out_l] = w01 * input_const[out2in_mid - in_w] +
+                                     w02 * input_const[out2in_mid - in_w + 1] +
+                                     w11 * input_const[out2in_mid] +
+                                     w12 * input_const[out2in_mid + 1] +
+                                     w21 * input_const[out2in_mid + in_w] +
+                                     w22 * input_const[out2in_mid + in_w + 1];
+
+        out2in_mid = i * 2 * in_w + (out_l - 1) * 2;
+        output_data_tmp[i * out_l + out_l - 1] =
+            w00 * input_const[out2in_mid - in_w - 1] +
+            w01 * input_const[out2in_mid - in_w] +
+            w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
+            w20 * input_const[out2in_mid + in_w - 1] +
+            w21 * input_const[out2in_mid + in_w] +
+            (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] +
+                            w12 * input_const[out2in_mid + 1] +
+                            w22 * input_const[out2in_mid + in_w + 1]);
+        output_data_tmp[i * out_l] =
+            output_data_tmp[i * out_l] * newscale_data[j] + newbias_data[j];
+        output_data_tmp[i * out_l + out_l - 1] =
+            output_data_tmp[i * out_l + out_l - 1] * newscale_data[j] +
+            newbias_data[j];
+        if (if_relu) {
+          output_data_tmp[i * out_l] =
+              output_data_tmp[i * out_l] < 0 ? 0 : output_data_tmp[i * out_l];
+          output_data_tmp[i * out_l + out_l - 1] =
+              output_data_tmp[i * out_l + out_l - 1] < 0
+                  ? 0
+                  : output_data_tmp[i * out_l + out_l - 1];
         }
       }
-      output_data += hxw;
-      input_data += hxw;
-      filter_data_tmp += 9;
     }
+    input_data += inhxw * c;
+    output_data += outhxw * c;
   }
+//#endif
+#endif
+}
+
+void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
+                          Tensor *output, Tensor bias, bool if_bias) {
+#if __ARM_NEON
+
+  const int batch_size = static_cast<int>(input->dims()[0]);
+  const int input_channel = static_cast<int>(input->dims()[1]);
+
+  const int input_height = static_cast<int>(input->dims()[2]);
+  const int input_width = static_cast<int>(input->dims()[3]);
+  const int output_height = static_cast<int>(output->dims()[2]);
+  const int output_width = static_cast<int>(output->dims()[3]);
+  const int inhxw = input_height * input_width;
+  const int outhxw = output_height * output_width;
+
+  float32x4_t zero = vdupq_n_f32(0.0);
+  for (int b = 0; b < batch_size; b++) {
+#pragma omp parallel for
+    for (int c = 0; c < input_channel; c++) {
+      const float *filter_data = filter->data<float>() + c * 9;
+      const float *input_data = input->data<float>() + c * inhxw;
+      const float *bias_data = bias.data<float>() + c;
+      float *output_data = output->data<float>() + c * outhxw;
+      float w00 = filter_data[0];
+      float w01 = filter_data[1];
+      float w02 = filter_data[2];
+      float w10 = filter_data[3];
+      float w11 = filter_data[4];
+      float w12 = filter_data[5];
+      float w20 = filter_data[6];
+      float w21 = filter_data[7];
+      float w22 = filter_data[8];
+
+      float32x4_t biasv = vld1q_dup_f32(bias_data);
+
+      for (int i = 0; i < output_height; i += 1) {
+        for (int m = 0; m < output_width - 2; m += 3) {
+          float *output_ptr = output_data + i * output_width + m;
+          float32x4x2_t input_buff_top{}, input_buff_mid{}, input_buff_bottom{};
+          float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3,
+              tmp4, tmp5, out0;
+          input_buff_top =
+              vld2q_f32(input_data + (2 * i) * input_width + (2 * m));
+          input_buff_mid =
+              vld2q_f32(input_data + (2 * i + 1) * input_width + (2 * m));
+          input_buff_bottom =
+              vld2q_f32(input_data + (2 * i + 2) * input_width + (2 * m));
+
+          in0 = input_buff_top.val[0];
+          tmp0 = input_buff_top.val[1];
+          tmp1 = vextq_f32(in0, zero, 1);
+
+          in2 = input_buff_mid.val[0];
+          tmp2 = input_buff_mid.val[1];
+          tmp3 = vextq_f32(in2, zero, 1);
+
+          in4 = input_buff_bottom.val[0];
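+          // vld2q_f32 above de-interleaves 8 consecutive floats: val[0]
+          // holds the even columns (taps 2m and, via vextq_f32, 2m+2) and
+          // val[1] the odd columns (tap 2m+1) of each stride-2 window.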
+          tmp4 = input_buff_bottom.val[1];
+          tmp5 = vextq_f32(in4, zero, 1);
+
+          out0 = vmulq_n_f32(in0, w00);
+          out0 = vmlaq_n_f32(out0, tmp0, w01);
+          out0 = vmlaq_n_f32(out0, tmp1, w02);
+          out0 = vmlaq_n_f32(out0, in2, w10);
+          out0 = vmlaq_n_f32(out0, tmp2, w11);
+          out0 = vmlaq_n_f32(out0, tmp3, w12);
+          out0 = vmlaq_n_f32(out0, in4, w20);
+          out0 = vmlaq_n_f32(out0, tmp4, w21);
+          out0 = vmlaq_n_f32(out0, tmp5, w22);
+          out0 = vaddq_f32(out0, biasv);
+
+          vst1q_lane_f32(output_ptr, out0, 0);
+          vst1q_lane_f32(output_ptr + 1, out0, 1);
+          vst1q_lane_f32(output_ptr + 2, out0, 2);
+        }
+        int m;
+        for (m = 0; m < output_width - 2; m += 3) {
+        }
+        // remainder columns use the same 3x3 window as the vector path
+        // (rows 2*i .. 2*i+2, cols 2*j .. 2*j+2 -- stride 2, no padding)
+        for (int j = m; j < output_width; j++) {
+          output_data[i * output_width + j] =
+              input_data[(2 * i) * input_width + 2 * j] * w00 +
+              input_data[(2 * i) * input_width + 2 * j + 1] * w01 +
+              input_data[(2 * i) * input_width + 2 * j + 2] * w02 +
+              input_data[(2 * i + 1) * input_width + 2 * j] * w10 +
+              input_data[(2 * i + 1) * input_width + 2 * j + 1] * w11 +
+              input_data[(2 * i + 1) * input_width + 2 * j + 2] * w12 +
+              input_data[(2 * i + 2) * input_width + 2 * j] * w20 +
+              input_data[(2 * i + 2) * input_width + 2 * j + 1] * w21 +
+              input_data[(2 * i + 2) * input_width + 2 * j + 2] * w22;
+          output_data[i * output_width + j] += *bias_data;
+        }
+      }
+    }
+  }
+
+#endif
 }
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/math/depthwise_conv_3x3.h b/src/operators/math/depthwise_conv_3x3.h
index 44299295eebad6a90fd994cf74589c09a3573aee..b146b88e737a07ea08250315fc94653f63d2ad05 100644
--- a/src/operators/math/depthwise_conv_3x3.h
+++ b/src/operators/math/depthwise_conv_3x3.h
@@ -32,10 +32,20 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
                       Tensor *output, bool if_bias);
 void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
                           Tensor *output, Tensor *bias, bool if_bias);
-void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, Tensor filter,
-                                   Tensor *output, Tensor *bias, bool if_bias,
-                                   Tensor *new_scale, Tensor *new_bias,
-                                   bool if_bn, bool if_relu);
+void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
+                                   Tensor *output, const Tensor *new_scale,
+                                   const Tensor *new_bias, bool if_relu);
+void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
+                                   Tensor *output, const Tensor *new_scale,
+                                   const Tensor *new_bias, bool if_relu);
+void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
+                            Tensor *output, Tensor bias, bool if_bias);
+void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
+                                     Tensor *output, const Tensor *new_scale,
+                                     const Tensor *new_bias, bool if_relu);
+
+void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter,
+                          Tensor *output, Tensor bias, bool if_bias);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index c35a14bf508835b120e1a4108cba0945208867dc..e3966d3290fac1d736bfa778635e2f943dfd9398 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -13,18 +13,36 @@ See the License for the specific language governing permissions and
 limitations under the License.
*/
 #include "operators/math/gemm.h"
+#include <string>
 #include "common/log.h"
 #include "memory/t_malloc.h"
-#ifndef X86
+#if __ARM_NEON
 #include <arm_neon.h>
 #endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
 
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-alignas(64) float packedA[MC * KC];
-alignas(64) float packedB[KC * NC];
-alignas(64) float ab[MR * NR];
+int MC = 0;
+int KC = 0;
+int NC = 0;
+
+float *packedA;
+float *packedB;
+float *packedC;
+float *zero;
+
+typedef void (*FnPack)(int, int, int, const float *, int, float *);
+typedef void (*FnAddDot)(int, const float *, const float *, float *, int);
+
+FnPack procPackA;
+FnPack procPackB;
+FnAddDot procAddDot;
+
+/*
 // Pack blocks of matrix A into contiguous memory (ColMajor)
 void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                  float *buffer) {
@@ -52,35 +70,6 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
   }
 }
 
-// Pack blocks of matrix A into contiguous memory (RowMajor)
-void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
-                  float *buffer) {
-  int i, j;
-  const float *Ai, *Ai1, *Ai2, *Ai3;
-  for (i = 0; i < m - m_tail; i += MR) {
-    Ai = &A(i, 0);
-    Ai1 = &A(i + 1, 0);
-    Ai2 = &A(i + 2, 0);
-    Ai3 = &A(i + 3, 0);
-    for (int j = 0; j < k; ++j) {
-      *buffer++ = *Ai++;
-      *buffer++ = *Ai1++;
-      *buffer++ = *Ai2++;
-      *buffer++ = *Ai3++;
-    }
-  }
-  if (m_tail != 0) {
-    for (j = 0; j < k; ++j) {
-      for (i = m - m_tail; i < m; ++i) {
-        *buffer++ = A(i, j);
-      }
-      for (i = m; i < m + (MR - m_tail); ++i) {
-        *buffer++ = 0;
-      }
-    }
-  }
-}
-
 // Pack blocks of matrix B into contiguous memory (ColMajor)
 void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                  float *buffer) {
@@ -109,98 +98,668 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
     }
   }
 }
+*/
+
+// Pack blocks of matrix A into contiguous memory (RowMajor)
+void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer) {
+  const float *a0, *a1, *a2, *a3;
+  for (int i = 0; i < m - m_tail; i += MR) {
+    a0 = A + i * lda;
+    a1 = A + (i + 1) * lda;
+    a2 = A + (i + 2) * lda;
+    a3 = A + (i + 3) * lda;
+    for (int j = 0; j < k; ++j) {
+      *buffer++ = *a0++;
+      *buffer++ = *a1++;
+      *buffer++ = *a2++;
+      *buffer++ = *a3++;
+    }
+  }
+
+  if (m_tail != 0) {
+    a0 = &A(m - m_tail, 0);
+    a1 = a0 + lda;
+    a2 = a0 + 2 * lda;
+    a3 = a0 + 3 * lda;
+    switch (m_tail) {
+      case 1:
+        a1 = zero;
+      case 2:
+        a2 = zero;
+      case 3:
+        a3 = zero;
+        break;
+      default:
+        break;
+    }
+    for (int j = 0; j < k; ++j) {
+      *buffer++ = *a0++;
+      *buffer++ = *a1++;
+      *buffer++ = *a2++;
+      *buffer++ = *a3++;
+    }
+  }
+}
+
+void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer) {
+  const int i_length = m - m_tail;
+  for (int i = 0; i < i_length; i += MR) {
+    const float *a0 = A + i * lda;
+    const float *a1 = A + (i + 1) * lda;
+    const float *a2 = A + (i + 2) * lda;
+    const float *a3 = A + (i + 3) * lda;
+    const float *a4 = A + (i + 4) * lda;
+    const float *a5 = A + (i + 5) * lda;
+    float *local_buffer = buffer + i * k;
+    for (int j = 0; j < k; ++j) {
+      *local_buffer++ = *a0++;
+      *local_buffer++ = *a1++;
+      *local_buffer++ = *a2++;
+      *local_buffer++ = *a3++;
+      *local_buffer++ = *a4++;
+      *local_buffer++ = *a5++;
+    }
+  }
+  if (m_tail != 0) {
+    const float *a0 = &A(i_length, 0);
+    const float *a1 = a0 + lda;
+    const float *a2 = a0 + 2 * lda;
+    const float *a3 = a0 + 3 * lda;
+    const float *a4 = a0 + 4 * lda;
+    const float *a5 = a0 + 5 * lda;
+    float *local_buffer = buffer + i_length * k;
+    switch (m_tail) {
+      case 1:
+        a1 = zero;
+      case 2:
+        a2 = zero;
+      case 3:
+        a3 = zero;
+      case 4:
+        a4 = zero;
+      case 5:
+        a5 = zero;
+        break;
+      default:
+        break;
+    }
+
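+    // NOTE: the missing `break`s in the switch above are deliberate
+    // fall-through: every tail row past m_tail is redirected to the shared
+    // `zero` buffer, so the loop below always packs a full MR-high panel.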
for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + } + } +} + +void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { + const int i_length = m - m_tail; +#pragma omp parallel for + for (int i = 0; i < i_length; i += MR) { + const float *a0 = A + i * lda; + const float *a1 = A + (i + 1) * lda; + const float *a2 = A + (i + 2) * lda; + const float *a3 = A + (i + 3) * lda; + const float *a4 = A + (i + 4) * lda; + const float *a5 = A + (i + 5) * lda; + float *local_buffer = buffer + i * k; + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + } + } + if (m_tail != 0) { + const float *a0 = &A(i_length, 0); + const float *a1 = a0 + lda; + const float *a2 = a0 + 2 * lda; + const float *a3 = a0 + 3 * lda; + const float *a4 = a0 + 4 * lda; + const float *a5 = a0 + 5 * lda; + float *local_buffer = buffer + i_length * k; + switch (m_tail) { + case 1: + a1 = zero; + case 2: + a2 = zero; + case 3: + a3 = zero; + case 4: + a4 = zero; + case 5: + a5 = zero; + break; + default: + break; + } + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + } + } +} + +void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { + const int i_length = m - m_tail; + for (int i = 0; i < i_length; i += MR) { + const float *a0 = A + i * lda; + const float *a1 = A + (i + 1) * lda; + const float *a2 = A + (i + 2) * lda; + const float *a3 = A + (i + 3) * lda; + const float *a4 = A + (i + 4) * lda; + const float *a5 = A + (i + 5) * lda; + const float *a6 = A + (i + 6) * lda; + const float *a7 = A + (i + 7) * lda; + float *local_buffer = buffer + i * k; + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + *local_buffer++ = *a6++; + *local_buffer++ = *a7++; + } + } + if (m_tail != 0) { + const float *a0 = &A(i_length, 0); + const float *a1 = a0 + lda; + const float *a2 = a0 + 2 * lda; + const float *a3 = a0 + 3 * lda; + const float *a4 = a0 + 4 * lda; + const float *a5 = a0 + 5 * lda; + const float *a6 = a0 + 6 * lda; + const float *a7 = a0 + 7 * lda; + float *local_buffer = buffer + i_length * k; + switch (m_tail) { + case 1: + a1 = zero; + case 2: + a2 = zero; + case 3: + a3 = zero; + case 4: + a4 = zero; + case 5: + a5 = zero; + case 6: + a6 = zero; + case 7: + a7 = zero; + break; + default: + break; + } + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + *local_buffer++ = *a6++; + *local_buffer++ = *a7++; + } + } +} + +void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { + const int i_length = m - m_tail; +#pragma omp parallel for + for (int i = 0; i < i_length; i += MR) { + const float *a0 = A + i * lda; + const float *a1 = A + (i + 1) * lda; + const float *a2 = A + (i + 2) * lda; + const float *a3 = A + (i + 3) * lda; + const float *a4 = A + (i + 4) * lda; + const float *a5 = A + (i + 5) * lda; + const float *a6 = A 
+ (i + 6) * lda; + const float *a7 = A + (i + 7) * lda; + float *local_buffer = buffer + i * k; + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + *local_buffer++ = *a6++; + *local_buffer++ = *a7++; + } + } + if (m_tail != 0) { + const float *a0 = &A(i_length, 0); + const float *a1 = a0 + lda; + const float *a2 = a0 + 2 * lda; + const float *a3 = a0 + 3 * lda; + const float *a4 = a0 + 4 * lda; + const float *a5 = a0 + 5 * lda; + const float *a6 = a0 + 6 * lda; + const float *a7 = a0 + 7 * lda; + float *local_buffer = buffer + i_length * k; + switch (m_tail) { + case 1: + a1 = zero; + case 2: + a2 = zero; + case 3: + a3 = zero; + case 4: + a4 = zero; + case 5: + a5 = zero; + case 6: + a6 = zero; + case 7: + a7 = zero; + break; + default: + break; + } + for (int j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + *local_buffer++ = *a6++; + *local_buffer++ = *a7++; + } + } +} // 将B矩阵分块复制到连续内存(RowMajor) -void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, - float *buffer) { - int i, j; - const float *Bij; - for (j = 0; j < n - n_tail; j += NR) { - for (i = 0; i < k; ++i) { - Bij = &B(i, j); +void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const int j_length = n - n_tail; + for (int j = 0; j < j_length; j += NR) { + float *local_buffer = buffer + j * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j); +#if __ARM_NEON +#if __aarch64__ + asm volatile( + "prfm pldl1keep, [%[b0]] \n\t" + "ld1 {v0.4s, v1.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s}, [%[local_buffer]], #32 \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1"); +#else asm volatile( - "vld1.32 {q0}, [%[Bij]] \n\t" - "vst1.32 {q0}, [%[buffer]]! \n\t" - : [buffer] "+r"(buffer) - : [Bij] "r"(Bij) - : "memory", "q0"); + // "pld [%[b0]] \n\t" + "vld1.32 {q0, q1}, [%[b0]] \n\t" + "vst1.32 {q0, q1}, [%[local_buffer]]! 
\n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "q0", "q1"); +#endif // __aarch64__ +#else + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; +#endif // __ARM_NEON } } if (n_tail != 0) { - for (i = 0; i < k; ++i) { - Bij = &B(i, n - n_tail); - for (int j = n - n_tail; j < n; ++j) { - *buffer++ = *Bij++; + float *local_buffer = buffer + j_length * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j_length); + for (int j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; } - for (int j = n; j < n + (NR - n_tail); ++j) { - *buffer++ = 0; + for (int j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; } } } } -// 分块矩阵乘法 -void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - int first_time) { - int m_block = (m + MR - 1) / MR * MR; - int n_block = (n + NR - 1) / NR * NR; +void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const int j_length = n - n_tail; +#pragma omp parallel for + for (int j = 0; j < j_length; j += NR) { + float *local_buffer = buffer + j * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j); +#if __ARM_NEON +#if __aarch64__ + asm volatile( + "prfm pldl1keep, [%[b0]] \n\t" + "ld1 {v0.4s, v1.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s}, [%[local_buffer]], #32 \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1"); +#else + asm volatile( + // "pld [%[b0]] \n\t" + "vld1.32 {q0, q1}, [%[b0]] \n\t" + "vst1.32 {q0, q1}, [%[local_buffer]]! \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "q0", "q1"); +#endif // __aarch64__ +#else + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; +#endif // __ARM_NEON + } + } + if (n_tail != 0) { + float *local_buffer = buffer + j_length * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j_length); + for (int j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; + } + for (int j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; + } + } + } +} - int m_tail = m % MR; - int n_tail = n % NR; +#if __aarch64__ +void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const int j_length = n - n_tail; + for (int j = 0; j < j_length; j += NR) { + float *local_buffer = buffer + j * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j); + asm volatile( + "prfm pldl2keep, [%[b0], #64] \n\t" + "ld1 {v0.4s, v1.4s, v2.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s, v2.4s}, [%[local_buffer]], #48 \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1", "v2"); + } + } + if (n_tail != 0) { + float *local_buffer = buffer + j_length * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j_length); + for (int j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; + } + for (int j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; + } + } + } +} + +void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const int j_length = n - n_tail; +#pragma omp parallel for + for (int j = 0; j < j_length; j += NR) { + float *local_buffer = buffer + j * k; + for (int i = 0; i < k; ++i) { + const 
float *b0 = &B(i, j); + asm volatile( + "prfm pldl2keep, [%[b0], #64] \n\t" + "ld1 {v0.4s, v1.4s, v2.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s, v2.4s}, [%[local_buffer]], #48 \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1", "v2"); + } + } + if (n_tail != 0) { + float *local_buffer = buffer + j_length * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j_length); + for (int j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; + } + for (int j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; + } + } + } +} - if (first_time) { - PackMatrixB_(k, n, n_tail, B, ldb, packedB); +void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const int j_length = n - n_tail; + for (int j = 0; j < n - n_tail; j += NR) { + float *local_buffer = buffer + j * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j); + asm volatile( + "prfm pldl2keep, [%[b0], #64] \n\t" + "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[local_buffer]], #64 \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1", "v2", "v3"); + } + } + if (n_tail != 0) { + float *local_buffer = buffer + j_length * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j_length); + for (int j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; + } + for (int j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; + } + } } - PackMatrixA_(m, k, m_tail, A, lda, packedA); +} - int i, j, mc, nc; +void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + const int j_length = n - n_tail; +#pragma omp parallel for + for (int j = 0; j < n - n_tail; j += NR) { + float *local_buffer = buffer + j * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j); + asm volatile( + "prfm pldl2keep, [%[b0], #64] \n\t" + "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[local_buffer]], #64 \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1", "v2", "v3"); + } + } + if (n_tail != 0) { + float *local_buffer = buffer + j_length * k; + for (int i = 0; i < k; ++i) { + const float *b0 = &B(i, j_length); + for (int j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; + } + for (int j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; + } + } + } +} +#endif // __aarch64__ - // B 取 4 列, 打包预热 - for (j = 0; j < n_block; j += NR) { - nc = (n - j) < NR ? n_tail : NR; - // A 取 4 行,打包预热 - for (i = 0; i < m_block; i += MR) { - mc = (m - i) < MR ? 
m_tail : MR; - AddDot4x4(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta, - &C(i, j), ldc, mc, nc); +// 分块矩阵乘法 +void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, + float beta, float *c, float *C, int ldc, bool relu) { +#pragma omp parallel for + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { +#if __aarch64__ + // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#else + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif } } + + if (alpha != 1) { + WriteWithAlphaBeta(mc, nc, c, C, ldc); + return; + } + if (beta == 0) { + WriteBasic(mc, nc, c, C, ldc); + return; + } + if (beta == 1 && !relu) { + WriteWithAdd(mc, nc, c, C, ldc); + return; + } + if (beta == 1 && relu) { + WriteWithAddRelu(mc, nc, c, C, ldc); + return; + } } // 分块矩阵乘法 -void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - int first_time, bool relu = false) { - int m_block = (m + MR - 1) / MR * MR; - int n_block = (n + NR - 1) / NR * NR; +void InnerKernelWithBias(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, + int ldc, bool relu, float *bias) { +#pragma omp parallel for + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { +#if __aarch64__ + // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#else + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif + } + } + + if (alpha != 1) { + WriteWithAlphaBeta(mc, nc, c, C, ldc); + return; + } + if (beta == 0) { + WriteBasic(mc, nc, c, C, ldc); + return; + } + if (beta == 1 && !relu) { + if (bias == nullptr) { + WriteWithAdd(mc, nc, c, C, ldc); + } else { + WriteWithAddV1(mc, nc, c, C, ldc, bias); + } + return; + } + if (beta == 1 && relu) { + if (bias == nullptr) { + WriteWithAddRelu(mc, nc, c, C, ldc); + } else { + WriteWithAddReluV1(mc, nc, c, C, ldc, bias); + } + return; + } +} - int m_tail = m % MR; - int n_tail = n % NR; +// 分块矩阵乘法 +void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, int ldc, + bool relu, float *new_scale, float *new_bias) { +#pragma omp parallel for + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { +#if __aarch64__ + // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#else + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif + } + } - if (first_time) { - PackMatrixB_(k, n, n_tail, B, ldb, packedB); + if (relu) { + WriteWithBnRelu(mc, nc, c, C, ldc, new_scale, new_bias); + } else { + WriteWithBn(mc, nc, c, C, ldc, new_scale, new_bias); } - PackMatrixA_(m, k, m_tail, A, lda, packedA); +} - int i, j, mc, nc; +// 分块矩阵乘法 +void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, + int ldc, bool relu, float *new_scale, float *new_bias, + float *bias) { +#pragma omp 
parallel for + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { +#if __aarch64__ + // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#else + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif + } + } + WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias); +} - // B 取 4 列, 打包预热 - for (j = 0; j < n_block; j += NR) { - nc = (n - j) < NR ? n_tail : NR; - // A 取 4 行,打包预热 - for (i = 0; i < m_block; i += MR) { - mc = (m - i) < MR ? m_tail : MR; - AddDot4x4_relu(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta, - &C(i, j), ldc, mc, nc, relu); +void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, + float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1) { +#pragma omp parallel for + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { +#if __aarch64__ + // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#else + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif } } + WriteWithAddPRelu(mc, nc, c, C, ldc, p, mode, bias, bias1); } -// 计算一个更小的 4 * 4 的 C 矩阵分块 -#if defined(IOS) -void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc) { +#if __ARM_NEON +#if __aarch64__ + +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { // init C float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0); @@ -227,274 +786,501 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, a += MR; b += NR; } - float32x4x4_t cv = {cv0, cv1, cv2, cv3}; - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (j == 0) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0); - } else if (j == 1) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1); - } else if (j == 2) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2); - } else if (j == 3) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); - } - } - } + + vst1q_f32(c, cv0); + vst1q_f32(c + ldc, cv1); + vst1q_f32(c + 2 * ldc, cv2); + vst1q_f32(c + 3 * ldc, cv3); + // float32x4x4_t cv = {cv0, cv1, cv2, cv3}; } -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu = false) { +void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { // init C float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0); float32x4_t cv2 = vdupq_n_f32(0.0); float32x4_t cv3 = vdupq_n_f32(0.0); + float32x4_t cv4 = vdupq_n_f32(0.0); + float32x4_t cv5 = vdupq_n_f32(0.0); + float32x4_t cv6 = vdupq_n_f32(0.0); + float32x4_t cv7 = vdupq_n_f32(0.0); float32x4_t av; - float32x4_t bv; + float32x4_t bv0; + float32x4_t bv1; float32x2_t av01; float32x2_t av23; for (int p = 0; p < k; p += 1) { av = vld1q_f32(a); - bv = vld1q_f32(b); + bv0 = vld1q_f32(b); + bv1 = vld1q_f32(b + 4); av01 = vget_low_f32(av); - cv0 = vmlaq_lane_f32(cv0, bv, av01, 0); - cv1 = vmlaq_lane_f32(cv1, bv, av01, 1); + cv0 = vmlaq_lane_f32(cv0, 
bv0, av01, 0); + cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0); + cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1); + cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1); av23 = vget_high_f32(av); - cv2 = vmlaq_lane_f32(cv2, bv, av23, 0); - cv3 = vmlaq_lane_f32(cv3, bv, av23, 1); + cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0); + cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0); + cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1); + cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1); a += MR; b += NR; } - float32x4x4_t cv = {cv0, cv1, cv2, cv3}; - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (j == 0) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0); - } else if (j == 1) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1); - } else if (j == 2) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2); - } else if (j == 3) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); - } - if (C(i, j) < 0) { - C(i, j) = 0; - } - } - } -} - -#elif defined(ARMV7) -void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc) { - int kc1 = k / 4, kc2 = k % 4; - int bytes_ldc = 4 * ldc; - int flag_alpha = (alpha == 1.0) ? 1 : 2; - int flag_beta; - if (beta == 0.0) { - flag_beta = 0; - } else if (beta == 1.0) { - flag_beta = 1; - } else { - flag_beta = 2; - } - asm volatile( - "pld [%[a]] \n\t" - "pld [%[b]] \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, #0.0 \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "blt end_kc1_%= \n\t" - "loop_kc1_%=: \n\t" - "pld [%[a], #64] \n\t" - "pld [%[b], #64] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "subs %[kc1], %[kc1], #1 \n\t" - "bge loop_kc1_%= \n\t" - "end_kc1_%=: \n\t" - - "subs %[kc2], %[kc2], #1 \n\t" - "blt end_kc2_%= \n\t" - "loop_kc2_%=: \n\t" - "vld1.32 {q0}, [%[a]]! \n\t" - "vld1.32 {q1}, [%[b]]! 
\n\t" - "vmla.f32 q10, q1, d0[0] \n\t" - "vmla.f32 q11, q1, d0[1] \n\t" - "vmla.f32 q12, q1, d1[0] \n\t" - "vmla.f32 q13, q1, d1[1] \n\t" - "subs %[kc2], %[kc2], #1 \n\t" - "bge loop_kc2_%= \n\t" - "end_kc2_%=: \n\t" - "cmp %[mc], #4 \n\t" - "bne temp_%= \n\t" - "cmp %[nc], #4 \n\t" - "bne temp_%= \n\t" - - "vmov.f32 d8[0], %[alpha] \n\t" - "vmov.f32 d8[1], %[beta] \n\t" - - "cmp %[flag_alpha], #1 \n\t" - "bne alpha_%= \n\t" - - "alpha_%=: \n\t" - "vmul.f32 q10, q10, d8[0] \n\t" - "vmul.f32 q11, q11, d8[0] \n\t" - "vmul.f32 q12, q12, d8[0] \n\t" - "vmul.f32 q13, q13, d8[0] \n\t" - - "beta_%=: \n\t" - "cmp %[flag_beta], #0 \n\t" - "beq memory_%= \n\t" - - "mov r4, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" - "vld1.32 {q0}, [r4], r6 \n\t" - "vld1.32 {q1}, [r4], r6 \n\t" - "vld1.32 {q2}, [r4], r6 \n\t" - "vld1.32 {q3}, [r4] \n\t" - "cmp %[flag_beta], #1 \n\t" - "beq beta_eq1_%= \n\t" - "bne beta_ne1_%= \n\t" - - "beta_eq1_%=: \n\t" - "vadd.f32 q10, q10, q0 \n\t" - "vadd.f32 q11, q11, q1 \n\t" - "vadd.f32 q12, q12, q2 \n\t" - "vadd.f32 q13, q13, q3 \n\t" - "b memory_%= \n\t" - - "beta_ne1_%=: \n\t" - "vmla.f32 q10, q0, d8[1] \n\t" - "vmla.f32 q11, q1, d8[1] \n\t" - "vmla.f32 q12, q2, d8[1] \n\t" - "vmla.f32 q13, q3, d8[1] \n\t" - - "memory_%=: \n\t" - "mov r5, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" - "vst1.32 {q10}, [r5], r6 \n\t" - "vst1.32 {q11}, [r5], r6 \n\t" - "vst1.32 {q12}, [r5], r6 \n\t" - "vst1.32 {q13}, [r5] \n\t" - "b end_%= \n\t" + vst1q_f32(c, cv0); + vst1q_f32(c + 4, cv1); + vst1q_f32(c + ldc, cv2); + vst1q_f32(c + ldc + 4, cv3); + vst1q_f32(c + 2 * ldc, cv4); + vst1q_f32(c + 2 * ldc + 4, cv5); + vst1q_f32(c + 3 * ldc, cv6); + vst1q_f32(c + 3 * ldc + 4, cv7); +} - "temp_%=: \n\t" - "vst1.32 {q10, q11}, [%[ab]]!\n\t" - "vst1.32 {q12, q13}, [%[ab]] \n\t" - "end_%=: \n\t" - : - : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1), - [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha), - [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc), - [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta) - : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13"); - - if (mc != MR || nc != NR) { - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - if (alpha != 1.0) { - C(i, j) = alpha * ab[i * MR + j]; - } else { - C(i, j) = ab[i * MR + j]; - } - } else { - if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * ab[i * MR + j]; - } else { - C(i, j) += ab[i * MR + j]; - } - } +// 分块矩阵乘法结果回写 +// C = A * B +void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); } } } } -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu = false) { - int kc1 = k / 4, kc2 = k % 4; - int bytes_ldc = 4 * ldc; - int flag_alpha = (alpha == 1.0) ? 
1 : 2; - int flag_beta; - if (beta == 0.0) { - flag_beta = 0; - } else if (beta == 1.0) { - flag_beta = 1; - } else { - flag_beta = 2; +// C = alpha * A * B + beta * C +void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} + +// C = A * B + C +void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } } - asm volatile( - "pld [%[a]] \n\t" - "pld [%[b]] \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, #0.0 \n\t" +} +// C = A * B + bias +void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t biasv; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + biasv = vld1q_dup_f32(bias + i); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + +// C = A * B + C, relu(C) +void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } + } +} + +// C = A * B + bias, relu(C) +void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, + float *bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t biasv; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + biasv = vld1q_dup_f32(bias + i); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 
2); + C_ptr++; + } + } + } +} + +// C = A * B + C,prelu(C) +void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + float32x4_t biasv; + float32x4_t biasv1; + float32x4_t zero = vdupq_n_f32(0.0); + float32x4_t pv; + float *ptr = p; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + biasv = vld1q_dup_f32(bias + i); + if (bias1 == nullptr) { + biasv1 = zero; + } else { + biasv1 = vld1q_dup_f32(bias1 + i); + } + + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + cv = vaddq_f32(cv, biasv1); + cv = vmaxq_f32(cv, zero); + cv1 = vminq_f32(cv, zero); + if (mode == "channel") { + cv1 = vmulq_n_f32(cv1, ptr[i]); + } else if (mode == "element") { + pv = vld1q_f32(ptr); + cv1 = vmulq_f32(cv1, pv); + ptr = ptr + 4; + } else { + cv1 = vmulq_n_f32(cv1, ptr[0]); + } + cv = vaddq_f32(cv, cv1); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + cv = vaddq_f32(cv, biasv1); + cv = vmaxq_f32(cv, zero); + cv1 = vminq_f32(cv, zero); + if (mode == "channel") { + cv1 = vmulq_n_f32(cv1, ptr[i]); + } else if (mode == "element") { + pv = vld1q_f32(ptr); + cv1 = vmulq_f32(cv1, pv); + ptr = ptr + 4; + } else { + cv1 = vmulq_n_f32(cv1, ptr[0]); + } + cv = vaddq_f32(cv, cv1); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + +// C = A * B, batchnorm(C) +void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, + float *new_bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + float32x4_t bias; + float32x2_t scale; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + bias = vld1q_dup_f32(new_bias); + scale = vld1_dup_f32(new_scale); + new_bias++; + new_scale++; + float scale0 = vget_lane_f32(scale, 0); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + +// C = A * B, batchnorm(C), relu(C) +void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t bias; + float32x2_t scale; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + bias = vld1q_dup_f32(new_bias); + scale = vld1_dup_f32(new_scale); + new_bias++; + new_scale++; + float scale0 = vget_lane_f32(scale, 0); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + 
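+        // surviving lanes are stored one at a time: a full vst1q_f32 on
+        // the last few columns would write past the end of the output row.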
vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } + } +} + +// C = A * B, batchnorm(C),C = C + bias; relu(C) +void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias, float *bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr, *bias_ptr; + float32x4_t cv; + float32x4_t nbias; + float32x2_t scale; + float32x4_t biasv; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + bias_ptr = bias + i * ldc; + nbias = vld1q_dup_f32(new_bias); + scale = vld1_dup_f32(new_scale); + new_bias++; + new_scale++; + float scale0 = vget_lane_f32(scale, 0); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + biasv = vld1q_f32(bias_ptr); + cv = vmlaq_n_f32(nbias, cv, scale0); + cv = vaddq_f32(cv, biasv); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + bias_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + biasv = vld1q_f32(bias_ptr); + cv = vmlaq_n_f32(nbias, cv, scale0); + cv = vaddq_f32(cv, biasv); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } + } +} + +#else + +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 4; + int kc2 = k % 4; + int step = 4 * ldc; + asm volatile( + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" "subs %[kc1], %[kc1], #1 \n\t" "blt end_kc1_%= \n\t" "loop_kc1_%=: \n\t" - "pld [%[a], #64] \n\t" - "pld [%[b], #64] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" + "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" "vmla.f32 q10, q2, d0[0] \n\t" "vmla.f32 q11, q2, d0[1] \n\t" "vmla.f32 q12, q2, d1[0] \n\t" @@ -503,6 +1289,16 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, "vmla.f32 q11, q3, d2[1] \n\t" "vmla.f32 q12, q3, d3[0] \n\t" "vmla.f32 q13, q3, d3[1] \n\t" + "vld1.32 {q4, q5}, [%[a_ptr]]! \n\t" + "vld1.32 {q6, q7}, [%[b_ptr]]! \n\t" + "vmla.f32 q10, q6, d8[0] \n\t" + "vmla.f32 q11, q6, d8[1] \n\t" + "vmla.f32 q12, q6, d9[0] \n\t" + "vmla.f32 q13, q6, d9[1] \n\t" + "vmla.f32 q10, q7, d10[0] \n\t" + "vmla.f32 q11, q7, d10[1] \n\t" + "vmla.f32 q12, q7, d11[0] \n\t" + "vmla.f32 q13, q7, d11[1] \n\t" "subs %[kc1], %[kc1], #1 \n\t" "bge loop_kc1_%= \n\t" "end_kc1_%=: \n\t" @@ -510,8 +1306,8 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, "subs %[kc2], %[kc2], #1 \n\t" "blt end_kc2_%= \n\t" "loop_kc2_%=: \n\t" - "vld1.32 {q0}, [%[a]]! \n\t" - "vld1.32 {q1}, [%[b]]! \n\t" + "vld1.32 {q0}, [%[a_ptr]]! \n\t" + "vld1.32 {q1}, [%[b_ptr]]! 
\n\t" "vmla.f32 q10, q1, d0[0] \n\t" "vmla.f32 q11, q1, d0[1] \n\t" "vmla.f32 q12, q1, d1[0] \n\t" @@ -520,285 +1316,23 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, "bge loop_kc2_%= \n\t" "end_kc2_%=: \n\t" - "cmp %[mc], #4 \n\t" - "bne temp_%= \n\t" - "cmp %[nc], #4 \n\t" - "bne temp_%= \n\t" - - "vmov.f32 d8[0], %[alpha] \n\t" - "vmov.f32 d8[1], %[beta] \n\t" - - "cmp %[flag_alpha], #1 \n\t" - "bne alpha_%= \n\t" - - "alpha_%=: \n\t" - "vmul.f32 q10, q10, d8[0] \n\t" - "vmul.f32 q11, q11, d8[0] \n\t" - "vmul.f32 q12, q12, d8[0] \n\t" - "vmul.f32 q13, q13, d8[0] \n\t" - - "beta_%=: \n\t" - "cmp %[flag_beta], #0 \n\t" - "beq memory_%= \n\t" - - "mov r4, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" - "vld1.32 {q0}, [r4], r6 \n\t" - "vld1.32 {q1}, [r4], r6 \n\t" - "vld1.32 {q2}, [r4], r6 \n\t" - "vld1.32 {q3}, [r4] \n\t" - "cmp %[flag_beta], #1 \n\t" - "beq beta_eq1_%= \n\t" - "bne beta_ne1_%= \n\t" - - "beta_eq1_%=: \n\t" - "vadd.f32 q10, q10, q0 \n\t" - "vadd.f32 q11, q11, q1 \n\t" - "vadd.f32 q12, q12, q2 \n\t" - "vadd.f32 q13, q13, q3 \n\t" - "b memory_%= \n\t" - - "beta_ne1_%=: \n\t" - "vmla.f32 q10, q0, d8[1] \n\t" - "vmla.f32 q11, q1, d8[1] \n\t" - "vmla.f32 q12, q2, d8[1] \n\t" - "vmla.f32 q13, q3, d8[1] \n\t" - - "memory_%=: \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vmax.f32 q12, q12, q14 \n\t" - "vmax.f32 q13, q13, q14 \n\t" - "mov r5, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" "vst1.32 {q10}, [r5], r6 \n\t" "vst1.32 {q11}, [r5], r6 \n\t" "vst1.32 {q12}, [r5], r6 \n\t" "vst1.32 {q13}, [r5] \n\t" - "b end_%= \n\t" - - "temp_%=: \n\t" - "vst1.32 {q10, q11}, [%[ab]]!\n\t" - "vst1.32 {q12, q13}, [%[ab]] \n\t" - "end_%=: \n\t" : - : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1), - [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha), - [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc), - [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta) - : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13", - "q14"); - - if (mc != MR || nc != NR) { - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - if (alpha != 1.0) { - C(i, j) = alpha * ab[i * MR + j]; - } else { - C(i, j) = ab[i * MR + j]; - } - } else { - if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * ab[i * MR + j]; - } else { - C(i, j) += ab[i * MR + j]; - } - } - if (relu) { - if (C(i, j) < 0) { - C(i, j) = 0; - } - } - } - } - } -} - -#else -void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc) { - float c[16] = {0}; - float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3; - - for (int p = 0; p < k; p += 1) { - reg_b0 = *b++; - reg_b1 = *b++; - reg_b2 = *b++; - reg_b3 = *b++; - - reg_a0 = *a++; - reg_a1 = *a++; - reg_a2 = *a++; - reg_a3 = *a++; - - // first row - c[0] += reg_a0 * reg_b0; - c[1] += reg_a0 * reg_b1; - c[2] += reg_a0 * reg_b2; - c[3] += reg_a0 * reg_b3; - - // second row - c[4] += reg_a1 * reg_b0; - c[5] += reg_a1 * reg_b1; - c[6] += reg_a1 * reg_b2; - c[7] += reg_a1 * reg_b3; - - // third row - c[8] += reg_a2 * reg_b0; - c[9] += reg_a2 * reg_b1; - c[10] += reg_a2 * reg_b2; - c[11] += reg_a2 * reg_b3; - - // fourth row - c[12] += reg_a3 * reg_b0; - c[13] += reg_a3 * reg_b1; - c[14] += reg_a3 * reg_b2; - c[15] += reg_a3 * reg_b3; - } - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if 
(beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * c[i * MR + j]; - } else { - C(i, j) += c[i * MR + j]; - } - } - } -} - -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu) { - float c[16] = {0}; - float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3; - - for (int p = 0; p < k; p += 1) { - reg_b0 = *b++; - reg_b1 = *b++; - reg_b2 = *b++; - reg_b3 = *b++; - - reg_a0 = *a++; - reg_a1 = *a++; - reg_a2 = *a++; - reg_a3 = *a++; - - // first row - c[0] += reg_a0 * reg_b0; - c[1] += reg_a0 * reg_b1; - c[2] += reg_a0 * reg_b2; - c[3] += reg_a0 * reg_b3; - - // second row - c[4] += reg_a1 * reg_b0; - c[5] += reg_a1 * reg_b1; - c[6] += reg_a1 * reg_b2; - c[7] += reg_a1 * reg_b3; - - // third row - c[8] += reg_a2 * reg_b0; - c[9] += reg_a2 * reg_b1; - c[10] += reg_a2 * reg_b2; - c[11] += reg_a2 * reg_b3; - - // fourth row - c[12] += reg_a3 * reg_b0; - c[13] += reg_a3 * reg_b1; - c[14] += reg_a3 * reg_b2; - c[15] += reg_a3 * reg_b3; - } - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * c[i * MR + j]; - } else { - C(i, j) += c[i * MR + j]; - } - if (relu) { - if (C(i, j) < 0) { - C(i, j) = 0; - } - } - } - } -} - -#endif - -// 32位 float 矩阵乘法 -void sgemm(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc) { - int i, j, p, mc, nc, kc; - float beta_; - if (m == 1) { - VectorKernel(1, n, k, alpha, A, lda, B, ldb, beta, C, ldc); - return; - } - for (j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); - for (p = 0; p < k; p += KC) { - kc = s_min(k - p, KC); - for (i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - if (p != 0) { - beta_ = 1.0; - } else { - beta_ = beta; - } - InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_, - &C(i, j), ldc, i == 0); - } - } - } -} - -void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc) { - int i, j, p, mc, nc, kc; - float beta_; - for (j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); - for (p = 0; p < k; p += KC) { - kc = s_min(k - p, KC); - for (i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - if (p != 0) { - beta_ = 1.0; - } else { - beta_ = beta; - } - - if (p + KC >= k) { - InnerKernel_relu(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, - beta_, &C(i, j), ldc, i == 0, true); - } else { - InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_, - &C(i, j), ldc, i == 0); - } - } - } - } + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q10", "q11", "q12", "q13"); } +/* void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc) { + const float *B, int ldb, float beta, float *C, int ldc, + bool relu) { float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); const float *a0, *b0, *b1, *b2, *b3; @@ -1000,17 +1534,2524 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, } } - c0 = bufferC; - C0 = C; - for (int i = 0; i < n; i++) { - if (beta == 1.0) { - *C0++ += *c0++; - } else { - *C0++ = *c0++; - } + if (alpha != 1) { + 
VecWriteWithAlphaBeta(n, bufferC, C, ldc); + return; + } + if (beta == 0) { + VecWriteBasic(n, bufferC, C, ldc); + return; + } + if (beta == 1 && !relu) { + VecWriteWithAdd(n, bufferC, C, ldc); + return; + } + if (beta == 1 && relu) { + VecWriteWithAddRelu(n, bufferC, C, ldc); + return; } } +void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, + int lda, const float *B, int ldb, float beta, float *C, + int ldc, bool relu, float *new_scale, float *new_bias) { + float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); + + const float *a0, *b0, *b1, *b2, *b3; + float *c0, *C0; + + int volatile kc1 = k / 4; + int volatile kc2 = k % 4; + int volatile nc1 = n / 16; + int _nc1 = n % 16; + int volatile nc2 = _nc1 / 4; + int volatile nc3 = _nc1 % 4; + for (int i = 0; i < kc1; i++) { + a0 = A + i * 4; + b0 = B + i * 4 * ldb; + b1 = b0 + ldb; + b2 = b1 + ldb; + b3 = b2 + ldb; + c0 = bufferC; + asm volatile( + "pld [%[a0], #16] \n\t" + "vld1.32 {q0}, [%[a0]] \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "cmp %[i], #0 \n\t" + "beq i_eq0_%= \n\t" + "bne i_ne0_%= \n\t" + + "i_eq0_%=: \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "b gemm_nc1_%= \n\t" + + "i_ne0_%=: \n\t" + "pld [%[c0], #64] \n\t" + "vld1.32 {q10, q11}, [%[c0]]! \n\t" + "vld1.32 {q12, q13}, [%[c0]] \n\t" + "sub %[c0], %[c0], #32 \n\t" + + "gemm_nc1_%=: \n\t" + "pld [%[b0], #64] \n\t" + "vld1.32 {q2, q3}, [%[b0]]! \n\t" + "vld1.32 {q4, q5}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q3, d0[0] \n\t" + "vmla.f32 q12, q4, d0[0] \n\t" + "vmla.f32 q13, q5, d0[0] \n\t" + + "pld [%[b1], #64] \n\t" + "vld1.32 {q2, q3}, [%[b1]]! \n\t" + "vld1.32 {q4, q5}, [%[b1]]! \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q4, d0[1] \n\t" + "vmla.f32 q13, q5, d0[1] \n\t" + + "pld [%[b2], #64] \n\t" + "vld1.32 {q2, q3}, [%[b2]]! \n\t" + "vld1.32 {q4, q5}, [%[b2]]! \n\t" + "vmla.f32 q10, q2, d1[0] \n\t" + "vmla.f32 q11, q3, d1[0] \n\t" + "vmla.f32 q12, q4, d1[0] \n\t" + "vmla.f32 q13, q5, d1[0] \n\t" + + "pld [%[b3], #64] \n\t" + "vld1.32 {q2, q3}, [%[b3]]! \n\t" + "vld1.32 {q4, q5}, [%[b3]]! \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q4, d1[1] \n\t" + "vmla.f32 q13, q5, d1[1] \n\t" + + "vst1.32 {q10, q11}, [%[c0]]! \n\t" + "vst1.32 {q12, q13}, [%[c0]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "cmp %[i], #0 \n\t" + "beq ii_eq0_%= \n\t" + "bne ii_ne0_%= \n\t" + + "ii_eq0_%=: \n\t" + "vmov.f32 q10, #0.0 \n\t" + "b gemm_nc2_%= \n\t" + + "ii_ne0_%=: \n\t" + "pld [%[c0], #16] \n\t" + "vld1.32 {q10}, [%[c0]] \n\t" + + "gemm_nc2_%=: \n\t" + "pld [%[b0], #16] \n\t" + "vld1.32 {q2}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + + "pld [%[b1], #16] \n\t" + "vld1.32 {q3}, [%[b1]]! \n\t" + "vmla.f32 q10, q3, d0[1] \n\t" + + "pld [%[b2], #16] \n\t" + "vld1.32 {q4}, [%[b2]]! \n\t" + "vmla.f32 q10, q4, d1[0] \n\t" + + "pld [%[b3], #16] \n\t" + "vld1.32 {q5}, [%[b3]]! \n\t" + "vmla.f32 q10, q5, d1[1] \n\t" + + "vst1.32 {q10}, [%[c0]]! 
\n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), + [c0] "+r"(c0) + : [a0] "r"(a0), [i] "r"(i), [nc1] "r"(nc1), [nc2] "r"(nc2) + : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); + + for (int j = 0; j < nc3; j++) { + if (i == 0) { + *c0 = (*a0) * (*b0++); + } else { + *c0 += (*a0) * (*b0++); + } + *c0 += (*(a0 + 1)) * (*b1++); + *c0 += (*(a0 + 2)) * (*b2++); + *c0 += (*(a0 + 3)) * (*b3++); + c0++; + } + } + + for (int i = 0; i < kc2; ++i) { + a0 = A + 4 * kc1 + i; + b0 = B + (4 * kc1 + i) * ldb; + c0 = bufferC; + asm volatile( + "pld [%[a0], #16] \n\t" + "vld1.32 {d0}, [%[a0]] \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "pld [%[c0], #64] \n\t" + "vld1.32 {q10, q11}, [%[c0]]! \n\t" + "vld1.32 {q12, q13}, [%[c0]] \n\t" + "sub %[c0], %[c0], #32 \n\t" + + "gemm_nc1_%=: \n\t" + "pld [%[b0], #64] \n\t" + "vld1.32 {q2, q3}, [%[b0]]! \n\t" + "vld1.32 {q4, q5}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q3, d0[0] \n\t" + "vmla.f32 q12, q4, d0[0] \n\t" + "vmla.f32 q13, q5, d0[0] \n\t" + + "vst1.32 {q10, q11}, [%[c0]]! \n\t" + "vst1.32 {q12, q13}, [%[c0]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "pld [%[c0], #16] \n\t" + "vld1.32 {q10}, [%[c0]] \n\t" + + "gemm_nc2_%=: \n\t" + "vld1.32 {q2}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + + "vst1.32 {q10}, [%[c0]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), + [c0] "+r"(c0) + : [a0] "r"(a0), [nc1] "r"(nc1), [nc2] "r"(nc2) + : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); + + for (int j = 0; j < nc3; j++) { + *c0 += (*a0) * (*b0++); + c0++; + } + } + + if (relu) { + VecWriteWithBnRelu(n, bufferC, C, ldc, new_scale, new_bias); + } else { + VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias); + } +} +*/ + +void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 4; + int kc2 = k % 4; + int step = 4 * ldc; + asm volatile( + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" + + "vmov.f32 q8, #0.0 \n\t" + "vmov.f32 q9, #0.0 \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "vmov.f32 q14, #0.0 \n\t" + "vmov.f32 q15, #0.0 \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" + + "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[b_ptr]]! \n\t" + + "vmla.f32 q8, q2, d0[0] \n\t" + "vmla.f32 q9, q3, d0[0] \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q3, d1[0] \n\t" + "vmla.f32 q14, q2, d1[1] \n\t" + "vmla.f32 q15, q3, d1[1] \n\t" + + "vmla.f32 q8, q4, d2[0] \n\t" + "vmla.f32 q9, q5, d2[0] \n\t" + "vmla.f32 q10, q4, d2[1] \n\t" + "vmla.f32 q11, q5, d2[1] \n\t" + "vmla.f32 q12, q4, d3[0] \n\t" + "vmla.f32 q13, q5, d3[0] \n\t" + "vmla.f32 q14, q4, d3[1] \n\t" + "vmla.f32 q15, q5, d3[1] \n\t" + + "pld [%[b_ptr], #64] \n\t" + + "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[b_ptr]]! 
\n\t" + + "vmla.f32 q8, q2, d0[0] \n\t" + "vmla.f32 q9, q3, d0[0] \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q3, d1[0] \n\t" + "vmla.f32 q14, q2, d1[1] \n\t" + "vmla.f32 q15, q3, d1[1] \n\t" + + "vmla.f32 q8, q4, d2[0] \n\t" + "vmla.f32 q9, q5, d2[0] \n\t" + "vmla.f32 q10, q4, d2[1] \n\t" + "vmla.f32 q11, q5, d2[1] \n\t" + "vmla.f32 q12, q4, d3[0] \n\t" + "vmla.f32 q13, q5, d3[0] \n\t" + "vmla.f32 q14, q4, d3[1] \n\t" + "vmla.f32 q15, q5, d3[1] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "blt end_kc2_%= \n\t" + "loop_kc2_%=: \n\t" + "vld1.32 {q0}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + "vmla.f32 q8, q2, d0[0] \n\t" + "vmla.f32 q9, q3, d0[0] \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q3, d1[0] \n\t" + "vmla.f32 q14, q2, d1[1] \n\t" + "vmla.f32 q15, q3, d1[1] \n\t" + "subs %[kc2], %[kc2], #1 \n\t" + "bge loop_kc2_%= \n\t" + "end_kc2_%=: \n\t" + + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" + "vst1.32 {q8, q9}, [r5], r6 \n\t" + "vst1.32 {q10, q11}, [r5], r6 \n\t" + "vst1.32 {q12, q13}, [r5], r6 \n\t" + "vst1.32 {q14, q15}, [r5] \n\t" + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// C = A * B +void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int step = 4 * ldc; + int step1 = 4 * (NC - 16 * nc1); + int volatile m = mc; + + float *volatile c_ptr, *volatile C_ptr; + float *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" + "vst1.32 {q0, q1}, [r6]! \n\t" + + "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" + "vst1.32 {q2, q3}, [r6]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3"); + } + + if (_nc1 != 0) { + for (int i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int j = 0; j < _nc1; j++) { + *C0++ = *c0++; + } + } + } +} + +// C = alpha * A * B + beta * C +void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} + +// C = A * B + C +void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int step = 4 * ldc; + int step1 = 4 * (NC - 16 * nc1); + int volatile m = mc; + + float *volatile c_ptr, *volatile C_ptr; + float *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [r6] \n\t" + "vld1.32 {q2, q3}, [%[c_ptr]]! 
\n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [r6]! \n\t" + + "vld1.32 {q4, q5}, [r6] \n\t" + "vld1.32 {q6, q7}, [%[c_ptr]]! \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [r6]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q10", "q11", "q12", "q13"); + } + + if (_nc1 != 0) { + for (int i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int j = 0; j < _nc1; j++) { + *C0++ += *c0++; + } + } + } +} + +// C = A * B + bias +void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t biasv; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + biasv = vld1q_dup_f32(bias + i); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + +// C = A * B + C, relu(C) +void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int step = 4 * ldc; + int step1 = 4 * (NC - 16 * nc1); + int volatile m = mc; + + float *volatile c_ptr, *volatile C_ptr; + float *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [r6] \n\t" + "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [r6]! \n\t" + + "vld1.32 {q4, q5}, [r6] \n\t" + "vld1.32 {q6, q7}, [%[c_ptr]]! \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [r6]! 
\n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q10", "q11", "q12", "q13"); + } + + if (_nc1 != 0) { + for (int i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int j = 0; j < _nc1; j++) { + *C0 += *c0; + if (*C0 < 0) { + *C0 = 0; + } + C0++; + c0++; + } + } + } +} + +// C = A * B + bias, relu(C) +void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, + float *bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t biasv; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + biasv = vld1q_dup_f32(bias + i); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vaddq_f32(cv, biasv); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + +void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1) { + if (nc < 4) { + if (bias1 == nullptr) { + for (int i = 0; i < mc; ++i) { + for (int j = 0; j < nc; ++j) { + float r = c[i * NC + j] + bias[i]; + if (r < 0) { + r *= p[i]; + } + C[i * ldc + j] = r; + } + } + } else { + for (int i = 0; i < mc; ++i) { + for (int j = 0; j < nc; ++j) { + float r = c[i * NC + j] + bias[i]; + r += bias1[i * ldc + j]; + if (r < 0) { + r *= p[i]; + } + C[i * ldc + j] = r; + } + } + } + return; + } + + int nc1 = nc / 16; + int _nc1 = nc % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + int step = 4 * (ldc - nc); + int step1 = 4 * (NC - nc); + + if (bias1 == nullptr) { + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r5, %[nc1] \n\t" + "mov r6, %[nc2] \n\t" + "vld1.32 {d0}, [%[bias]] \n\t" + "vld1.32 {d1}, [%[p]] \n\t" + "vdup.32 q1, d0[0] \n\t" + "vdup.32 q2, d1[0] \n\t" + + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "pld [%[c], #32] \n\t" + "vld1.32 {q3, q4}, [%[c]]! \n\t" + "vld1.32 {q9, q10}, [%[c]]! \n\t" + + "vadd.f32 q3, q3, q1 \n\t" + "vadd.f32 q4, q4, q1 \n\t" + "vadd.f32 q9, q9, q1 \n\t" + "vadd.f32 q10, q10, q1 \n\t" + + "vmax.f32 q5, q3, q14 \n\t" + "vmin.f32 q7, q3, q14 \n\t" + "vmax.f32 q6, q4, q14 \n\t" + "vmin.f32 q8, q4, q14 \n\t" + + "vmax.f32 q11, q9, q14 \n\t" + "vmin.f32 q13, q9, q14 \n\t" + "vmax.f32 q12, q10, q14 \n\t" + "vmin.f32 q15, q10, q14 \n\t" + + "vmla.f32 q5, q7, q2 \n\t" + "vmla.f32 q6, q8, q2 \n\t" + "vmla.f32 q11, q13, q2 \n\t" + "vmla.f32 q12, q15, q2 \n\t" + + "vst1.32 {q5, q6}, [%[C]]! \n\t" + "vst1.32 {q11, q12}, [%[C]]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs r6, r6, #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q3}, [%[c]]! 
\n\t" + "vadd.f32 q3, q3, q1 \n\t" + "vmax.f32 q5, q3, q14 \n\t" + "vmin.f32 q7, q3, q14 \n\t" + "vmla.f32 q5, q7, q2 \n\t" + "vst1.32 {q5}, [%[C]]! \n\t" + + "subs r6, r6, #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q4}, [%[c]]! \n\t" + "vadd.f32 q4, q4, q1 \n\t" + "vmax.f32 q6, q4, q14 \n\t" + "vmin.f32 q8, q4, q14 \n\t" + "vmla.f32 q6, q8, q2 \n\t" + "vst1.32 {q6}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + "add %[p], %[p], #4 \n\t" + "add %[bias], %[bias], #4 \n\t" + "add %[c], %[c], %[step1] \n\t" + "add %[C], %[C], %[step] \n\t" + + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), + [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), [p] "r"(p), + [bias] "r"(bias), [bias1] "r"(bias1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8"); + } else { + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r5, %[nc1] \n\t" + "mov r6, %[nc2] \n\t" + "vld1.32 {d0}, [%[bias]] \n\t" + "vld1.32 {d1}, [%[p]] \n\t" + "vdup.32 q1, d0[0] \n\t" + "vdup.32 q2, d1[0] \n\t" + + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "pld [%[c], #32] \n\t" + "pld [%[bias1], #32] \n\t" + "vld1.32 {q3, q4}, [%[c]]! \n\t" + "vld1.32 {q9, q10}, [%[bias1]]! \n\t" + "vadd.f32 q3, q3, q1 \n\t" + "vadd.f32 q4, q4, q1 \n\t" + "vadd.f32 q3, q3, q9 \n\t" + "vadd.f32 q4, q4, q10 \n\t" + "vmax.f32 q5, q3, q14 \n\t" + "vmin.f32 q7, q3, q14 \n\t" + "vmax.f32 q6, q4, q14 \n\t" + "vmin.f32 q8, q4, q14 \n\t" + "vmla.f32 q5, q7, q2 \n\t" + "vmla.f32 q6, q8, q2 \n\t" + "vst1.32 {q5, q6}, [%[C]]! \n\t" + + "vld1.32 {q3, q4}, [%[c]]! \n\t" + "vld1.32 {q9, q10}, [%[bias1]]! \n\t" + "vadd.f32 q3, q3, q1 \n\t" + "vadd.f32 q4, q4, q1 \n\t" + "vadd.f32 q3, q3, q9 \n\t" + "vadd.f32 q4, q4, q10 \n\t" + "vmax.f32 q5, q3, q14 \n\t" + "vmin.f32 q7, q3, q14 \n\t" + "vmax.f32 q6, q4, q14 \n\t" + "vmin.f32 q8, q4, q14 \n\t" + "vmla.f32 q5, q7, q2 \n\t" + "vmla.f32 q6, q8, q2 \n\t" + "vst1.32 {q5, q6}, [%[C]]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs r6, r6, #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q3}, [%[c]]! \n\t" + "vld1.32 {q9}, [%[bias1]]! \n\t" + "vadd.f32 q3, q3, q1 \n\t" + "vadd.f32 q3, q3, q9 \n\t" + "vmax.f32 q5, q3, q14 \n\t" + "vmin.f32 q7, q3, q14 \n\t" + "vmla.f32 q5, q7, q2 \n\t" + "vst1.32 {q5}, [%[C]]! \n\t" + + "subs r6, r6, #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + "sub %[bias1], %[bias1], %[nc3] \n\t" + + "vld1.32 {q4}, [%[c]]! \n\t" + "vld1.32 {q10}, [%[bias1]]! \n\t" + "vadd.f32 q4, q4, q1 \n\t" + "vadd.f32 q4, q4, q10 \n\t" + "vmax.f32 q6, q4, q14 \n\t" + "vmin.f32 q8, q4, q14 \n\t" + "vmla.f32 q6, q8, q2 \n\t" + "vst1.32 {q6}, [%[C]]! 
\n\t" + "end_nc3_%=: \n\t" + + "add %[p], %[p], #4 \n\t" + "add %[bias], %[bias], #4 \n\t" + "add %[c], %[c], %[step1] \n\t" + "add %[C], %[C], %[step] \n\t" + "add %[bias1], %[bias1], %[step] \n\t" + + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), + [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), [p] "r"(p), + [bias] "r"(bias), [bias1] "r"(bias1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10"); + } +} + +// C = A * B, batchnorm(C) +void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, + float *bias) { + if (nc < 4) { + for (int i = 0; i < mc; ++i) { + for (int j = 0; j < nc; ++j) { + *C = (*c) * (*scale) + (*bias); + C++; + c++; + } + C += (ldc - nc); + c += (NC - nc); + scale++; + bias++; + } + return; + } + + int volatile nc1 = nc / 16; + int _nc1 = nc % 16; + int volatile nc2 = _nc1 / 4; + int volatile nc3 = 16 - 4 * (_nc1 % 4); + int volatile step = 4 * (ldc - nc); + int volatile step1 = 4 * (NC - nc); + + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r5, %[nc1] \n\t" + "mov r6, %[nc2] \n\t" + "vld1.32 {d0}, [%[scale]] \n\t" + "vld1.32 {d1}, [%[bias]] \n\t" + "vdup.32 q1, d0[0] \n\t" + "vdup.32 q2, d1[0] \n\t" + + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q3, q4}, [%[c]]! \n\t" + "vmul.f32 q10, q3, q1 \n\t" + "vmul.f32 q11, q4, q1 \n\t" + "vadd.f32 q10, q10, q2 \n\t" + "vadd.f32 q11, q11, q2 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q5, q6}, [%[c]]! \n\t" + "vmul.f32 q12, q5, q1 \n\t" + "vmul.f32 q13, q6, q1 \n\t" + "vadd.f32 q12, q12, q2 \n\t" + "vadd.f32 q13, q13, q2 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs r6, r6, #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q7}, [%[c]]! \n\t" + "vmul.f32 q10, q7, q1 \n\t" + "vadd.f32 q10, q10, q2 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs r6, r6, #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q8}, [%[c]]! \n\t" + "vmul.f32 q11, q8, q1 \n\t" + "vadd.f32 q11, q11, q2 \n\t" + "vst1.32 {q11}, [%[C]]! 
\n\t" + "end_nc3_%=: \n\t" + + "add %[scale], %[scale], #4 \n\t" + "add %[bias], %[bias], #4 \n\t" + "add %[c], %[c], %[step1] \n\t" + "add %[C], %[C], %[step] \n\t" + + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), + [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), + [scale] "r"(scale), [bias] "r"(bias) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q10", "q11", "q12", "q13"); +} + +// C = A * B, batchnorm(C), relu(C) +void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, + float *bias) { + if (nc < 4) { + for (int i = 0; i < mc; ++i) { + for (int j = 0; j < nc; ++j) { + *C = (*c) * (*scale) + (*bias); + if (*C < 0) { + *C = 0; + } + C++; + c++; + } + C += (ldc - nc); + c += (NC - nc); + scale++; + bias++; + } + return; + } + + int nc1 = nc / 16; + int _nc1 = nc % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + int step = 4 * (ldc - nc); + int step1 = 4 * (NC - nc); + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r5, %[nc1] \n\t" + "mov r6, %[nc2] \n\t" + "vld1.32 {d0}, [%[scale]] \n\t" + "vld1.32 {d1}, [%[bias]] \n\t" + "vdup.32 q1, d0[0] \n\t" + "vdup.32 q2, d1[0] \n\t" + + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q3, q4}, [%[c]]! \n\t" + "vmul.f32 q10, q3, q1 \n\t" + "vmul.f32 q11, q4, q1 \n\t" + "vadd.f32 q10, q10, q2 \n\t" + "vadd.f32 q11, q11, q2 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q5, q6}, [%[c]]! \n\t" + "vmul.f32 q12, q5, q1 \n\t" + "vmul.f32 q13, q6, q1 \n\t" + "vadd.f32 q12, q12, q2 \n\t" + "vadd.f32 q13, q13, q2 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs r6, r6, #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q7}, [%[c]]! \n\t" + "vmul.f32 q10, q7, q1 \n\t" + "vadd.f32 q10, q10, q2 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs r6, r6, #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q8}, [%[c]]! \n\t" + "vmul.f32 q11, q8, q1 \n\t" + "vadd.f32 q11, q11, q2 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q11}, [%[C]]! 
\n\t" + "end_nc3_%=: \n\t" + + "add %[scale], %[scale], #4 \n\t" + "add %[bias], %[bias], #4 \n\t" + "add %[c], %[c], %[step1] \n\t" + "add %[C], %[C], %[step] \n\t" + + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), + [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), + [scale] "r"(scale), [bias] "r"(bias) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q10", "q11", "q12", "q13", "q14"); +} + +// C = A * B, batchnorm(C),C = C + bias; relu(C) +void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias, float *bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr, *bias_ptr; + float32x4_t cv; + float32x4_t nbias; + float32x2_t scale; + float32x4_t biasv; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + bias_ptr = bias + i * ldc; + nbias = vld1q_dup_f32(new_bias); + scale = vld1_dup_f32(new_scale); + new_bias++; + new_scale++; + float scale0 = vget_lane_f32(scale, 0); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + biasv = vld1q_f32(bias_ptr); + cv = vmlaq_n_f32(nbias, cv, scale0); + cv = vaddq_f32(cv, biasv); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + bias_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + biasv = vld1q_f32(bias_ptr); + cv = vmlaq_n_f32(nbias, cv, scale0); + cv = vaddq_f32(cv, biasv); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } + } +} + + /* + // C = A * B + void VecWriteBasic(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vst1.32 {q0, q1}, [%[C]]! \n\t" + + "vld1.32 {q2, q3}, [%[c]]! \n\t" + "vst1.32 {q2, q3}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q4}, [%[c]]! \n\t" + "vst1.32 {q4}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + "sub %[c], %[c], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + "vld1.32 {q5}, [%[c]]! \n\t" + "vst1.32 {q5}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5"); + } + + // C = alpha * A * B + beta * C + void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} + + // C = A * B + C + void VecWriteWithAdd(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[C]] \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[C]] \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! 
\n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + : [C] "+r"(C), [c] "+r"(c) + : [nc1] "r"(nc1) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", + "q11", "q12", "q13"); + + if (_nc1 != 0) { + for (int j = 0; j < _nc1; j++) { + *C++ += *c++; + } + } + } + + // C = A * B + C, relu(C) + void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[C]] \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[C]] \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + : [C] "+r"(C), [c] "+r"(c) + : [nc1] "r"(nc1) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", + "q11", "q12", "q13"); + + if (_nc1 != 0) { + for (int j = 0; j < _nc1; j++) { + *C += *c; + if (*C < 0) { + *C = 0; + } + C++; + c++; + } + } + } + + // C = A * B, batchnorm(C) + void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! \n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] + "r"(nc3), [scale] "r"(scale), [bias] "r"(bias) : "memory", "q0", "q1", "q2", + "q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13"); + } + + // C = A * B, batchnorm(C), relu(C) + void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! 
\n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] + "r"(nc3), [scale] "r"(scale), [bias] "r"(bias) : "memory", "q0", "q1", "q2", + "q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13", "q14"); + } + */ + +#endif // __aarch64__ +#else + +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { + float *c0, *c1, *c2, *c3; + c0 = c; + c1 = c + ldc; + c2 = c + 2 * ldc; + c3 = c + 3 * ldc; + for (int p = 0; p < k; p += 1) { + // first row + c0[0] += a[0] * b[0]; + c0[1] += a[0] * b[1]; + c0[2] += a[0] * b[2]; + c0[3] += a[0] * b[3]; + + // second row + c1[0] += a[1] * b[0]; + c1[1] += a[1] * b[1]; + c1[2] += a[1] * b[2]; + c1[3] += a[1] * b[3]; + + // third row + c2[0] += a[2] * b[0]; + c2[1] += a[2] * b[1]; + c2[2] += a[2] * b[2]; + c2[3] += a[2] * b[3]; + + // fourth row + c3[0] += a[3] * b[0]; + c3[1] += a[3] * b[1]; + c3[2] += a[3] * b[2]; + c3[3] += a[3] * b[3]; + + a += 4; + b += 4; + } +} + +void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {} + +void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {} + +void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} + +void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {} + +void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias) {} + +void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {} + +void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc, + float *bias) {} + +void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p, + std::string mode, float *bias, float *bias1) {} + +void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, + float *new_bias) {} + +void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias) {} +void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias, float *bias1) {} + +#endif // __ARM_NEON + +// 32位 float 矩阵乘法 +void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, bool relu, + float *bias) { + // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, 
+// 32-bit float matrix multiplication
+void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+ const float *B, int ldb, float beta, float *C, int ldc, bool relu,
+ float *bias) {
+ // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
+ // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
+ int L1 = 32 * 1024;
+ int L2 = 512 * 1024;
+
+ KC = k;
+ MC = L1 / (KC * sizeof(float));
+ NC = L2 / (KC * sizeof(float));
+
+ // make sure MC is multiple of MR, and NC is multiple of NR
+ if (MC == 0) {
+ MC = MR;
+ } else {
+ int mblock_num = (m + MC - 1) / MC;
+ MC = (m + mblock_num - 1) / mblock_num;
+ MC = (MC + MR - 1) / MR * MR;
+ }
+ // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
+ if (NC == 0) {
+ NC = NR;
+ } else {
+ int nblock_num = (n + NC - 1) / NC;
+ NC = (n + nblock_num - 1) / nblock_num;
+ NC = (NC + NR - 1) / NR * NR;
+ }
+ // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
+
+ packedA = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+ packedB = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+ packedC = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
+ zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+ memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
+
+ int mc, nc;
+ for (int j = 0; j < n; j += NC) {
+ nc = s_min(n - j, NC);
+#if __aarch64__
+ // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+ PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#else
+ PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#endif
+ for (int i = 0; i < m; i += MC) {
+ mc = s_min(m - i, MC);
+#if __aarch64__
+ PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+ // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#else
+ PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#endif
+ if (bias == nullptr) {
+ InnerKernelWithBias(mc, nc, alpha, packedA, packedB, beta, packedC,
+ &C(i, j), ldc, relu, nullptr);
+ } else {
+ InnerKernelWithBias(mc, nc, alpha, packedA, packedB, beta, packedC,
+ &C(i, j), ldc, relu, bias + i);
+ }
+ }
+ }
+
+ paddle_mobile::memory::Free(packedA);
+ paddle_mobile::memory::Free(packedB);
+ paddle_mobile::memory::Free(packedC);
+ paddle_mobile::memory::Free(zero);
+}
+
+void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+ const float *B, int ldb, float beta, float *C, int ldc,
+ bool relu, float *new_scale, float *new_bias, float *bias) {
+ // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
+ // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
+ int L1 = 32 * 1024;
+ int L2 = 512 * 1024;
+
+ KC = k;
+ MC = L1 / (KC * sizeof(float));
+ NC = L2 / (KC * sizeof(float));
+
+ // make sure MC is multiple of MR, and NC is multiple of NR
+ if (MC == 0) {
+ MC = MR;
+ } else {
+ int mblock_num = (m + MC - 1) / MC;
+ MC = (m + mblock_num - 1) / mblock_num;
+ MC = (MC + MR - 1) / MR * MR;
+ }
+ // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
+ if (NC == 0) {
+ NC = NR;
+ } else {
+ int nblock_num = (n + NC - 1) / NC;
+ NC = (n + nblock_num - 1) / nblock_num;
+ NC = (NC + NR - 1) / NR * NR;
+ }
+ // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
+
+ packedA = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+ packedB = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+ packedC = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
+ zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+ memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
+
+ int mc, nc;
+ for (int j = 0; j < n; j += NC) {
+ nc = s_min(n - j, NC);
+#if __aarch64__
+ // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
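// [Editor's note] Packing step: the KC x nc panel of B is copied into
// NR-wide column strips (16 floats per strip on aarch64, 8 on ARMv7) so the
// micro-kernel can stream it with unit stride; nc % NR selects the partial
// tail strip. A is likewise packed into MR-row strips inside the i loop.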
+ PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#else
+ PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#endif
+ for (int i = 0; i < m; i += MC) {
+ mc = s_min(m - i, MC);
+#if __aarch64__
+ PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+ // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#else
+ PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#endif
+ if (bias == nullptr) {
+ InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
+ &C(i, j), ldc, relu, new_scale + i, new_bias + i);
+ } else {
+ InnerKernelWithBnAdd(mc, nc, alpha, packedA, packedB, beta, packedC,
+ &C(i, j), ldc, relu, new_scale + i, new_bias + i,
+ bias + i * ldc + j);
+ }
+ }
+ }
+
+ paddle_mobile::memory::Free(packedA);
+ paddle_mobile::memory::Free(packedB);
+ paddle_mobile::memory::Free(packedC);
+ paddle_mobile::memory::Free(zero);
+}
+
+void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
+ const float *B, int ldb, float *C, int ldc, float *p,
+ std::string mode, float *bias, float *bias1) {
+ // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
+ // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
+ int L1 = 32 * 1024;
+ int L2 = 0.5 * 1024 * 1024;
+
+ KC = k;
+ MC = L1 / (KC * sizeof(float));
+ NC = L2 / (KC * sizeof(float));
+
+ // make sure MC is multiple of MR, and NC is multiple of NR
+ if (MC == 0) {
+ MC = MR;
+ } else {
+ int mblock_num = (m + MC - 1) / MC;
+ MC = (m + mblock_num - 1) / mblock_num;
+ MC = (MC + MR - 1) / MR * MR;
+ }
+ // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
+ if (NC == 0) {
+ NC = NR;
+ } else {
+ int nblock_num = (n + NC - 1) / NC;
+ NC = (n + nblock_num - 1) / nblock_num;
+ NC = (NC + NR - 1) / NR * NR;
+ }
+ // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
+
+ packedA = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+ packedB = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+ packedC = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
+ zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+
+ for (int l = 0; l < KC; ++l) {
+ zero[l] = 0;
+ }
+
+ int mc, nc;
+ for (int j = 0; j < n; j += NC) {
+ nc = s_min(n - j, NC);
+#if __aarch64__
+ // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+ PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#else
+ PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#endif
+ for (int i = 0; i < m; i += MC) {
+ mc = s_min(m - i, MC);
+#if __aarch64__
+ PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+ // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#else
+ PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#endif
+ if (bias1 == nullptr) {
+ InnerKernelWithPRelu(mc, nc, packedA, packedB, packedC, &C(i, j), ldc,
+ p + i, mode, bias + i, nullptr);
+ } else {
+ InnerKernelWithPRelu(mc, nc, packedA, packedB, packedC, &C(i, j), ldc,
+ p + i, mode, bias + i, bias1 + i * ldc + j);
+ }
+ }
+ }
+
+ paddle_mobile::memory::Free(packedA);
+ paddle_mobile::memory::Free(packedB);
+ paddle_mobile::memory::Free(packedC);
+ paddle_mobile::memory::Free(zero);
+}
+
+// 32-bit float matrix multiplication
+void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
+ const float *B, int ldb, float beta, float *C, int ldc,
+ bool relu, float *bias) {
+#ifdef _OPENMP
+ int max_threads = omp_get_max_threads();
+#else
+ int max_threads = 1;
+#endif
+
+ int L1 = 64 / max_threads * 1024;
+ KC = k;
+ if (m > n) {
+ // tile A
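// [Editor's note] Worked example of the sizing below: with k = 256 and
// max_threads = 4, L1 = 64 / 4 * 1024 = 16384 bytes, so
// MC = 16384 / (256 * sizeof(float)) = 16 rows per block, before the
// rounding to a multiple of MR that follows.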
+ MC = L1 / (KC * sizeof(float));
+ if (MC == 0) {
+ MC = MR;
+ } else {
+ int mblock_num = (m + MC - 1) / MC;
+ MC = (m + mblock_num - 1) / mblock_num;
+ MC = (MC + MR - 1) / MR * MR;
+ }
+ // pad B to a multiple of NR
+ NC = (n + NR - 1) / NR * NR;
+
+#if __aarch64__
+ procPackA = PackMatrixA_6r;
+ procPackB = PackMatrixB_omp_16c;
+ procAddDot = AddDot6x16;
+#else
+ procPackA = PackMatrixA_6r;
+ procPackB = PackMatrixB_omp_8c;
+ procAddDot = AddDot6x8;
+#endif
+
+ packedB = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+ procPackB(KC, NC, NC % NR, B, ldb, packedB);
+ packedA = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
+ } else {
+ // tile B
+ NC = L1 / (KC * sizeof(float));
+ if (NC == 0) {
+ NC = NR;
+ } else {
+ int nblock_num = (n + NC - 1) / NC;
+ NC = (n + nblock_num - 1) / nblock_num;
+ NC = (NC + NR - 1) / NR * NR;
+ }
+ // pad A to a multiple of MR
+ MC = (m + MR - 1) / MR * MR;
+
+#if __aarch64__
+ procPackA = PackMatrixA_omp_6r;
+ procPackB = PackMatrixB_16c;
+ procAddDot = AddDot6x16;
+#else
+ procPackA = PackMatrixA_omp_6r;
+ procPackB = PackMatrixB_8c;
+ procAddDot = AddDot6x8;
+#endif
+
+ packedA = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+ procPackA(MC, KC, MC % MR, A, lda, packedA);
+ packedB = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
+ }
+ zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+ memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
+ packedC = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));
+
+ if (m > n) {
+#pragma omp parallel for
+ for (int i = 0; i < m; i += MC) {
+#ifdef _OPENMP
+ int local_threads = omp_get_thread_num();
+#else
+ int local_threads = 0;
+#endif
+
+ int mc;
+ mc = s_min(m - i, MC);
+ float *local_A = packedA + MC * KC * local_threads;
+ float *local_C = packedC + MC * NC * local_threads;
+ procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
+ InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
+ &C(i, 0), ldc, relu, bias + i);
+ }
+ } else {
+#pragma omp parallel for
+ for (int j = 0; j < n; j += NC) {
+#ifdef _OPENMP
+ int local_threads = omp_get_thread_num();
+#else
+ int local_threads = 0;
+#endif
+
+ int nc;
+ nc = s_min(n - j, NC);
+ float *local_B = packedB + KC * NC * local_threads;
+ float *local_C = packedC + MC * NC * local_threads;
+ procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
+ InnerKernelWithBias(m, nc, alpha, packedA, local_B, beta, local_C,
+ &C(0, j), ldc, relu, bias);
+ }
+ }
+
+ paddle_mobile::memory::Free(packedA);
+ paddle_mobile::memory::Free(packedB);
+ paddle_mobile::memory::Free(packedC);
+ paddle_mobile::memory::Free(zero);
+}
+
+void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
+ const float *B, int ldb, float beta, float *C, int ldc,
+ bool relu, float *new_scale, float *new_bias,
+ float *bias) {
+#ifdef _OPENMP
+ int max_threads = omp_get_max_threads();
+#else
+ int max_threads = 1;
+#endif
+
+ int L1 = 64 / max_threads * 1024;
+ KC = k;
+ if (m > n) {
+ // tile A
+ MC = L1 / (KC * sizeof(float));
+ if (MC == 0) {
+ MC = MR;
+ } else {
+ int mblock_num = (m + MC - 1) / MC;
+ MC = (m + mblock_num - 1) / mblock_num;
+ MC = (MC + MR - 1) / MR * MR;
+ }
+ // pad B to a multiple of NR
+ NC = (n + NR - 1) / NR * NR;
+
+#if __aarch64__
+ procPackA = PackMatrixA_6r;
+ procPackB = PackMatrixB_omp_16c;
+ procAddDot = AddDot6x16;
+#else
+ procPackA = PackMatrixA_6r;
+ procPackB = PackMatrixB_omp_8c;
+ procAddDot = AddDot6x8;
+#endif
+
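// [Editor's note] m > n path: B is packed once, before the parallel loop,
// and shared read-only by all threads; packedA and packedC are allocated as
// max_threads slabs (MC * KC and MC * NC floats each) so every OpenMP
// thread writes only its own slab, indexed by omp_get_thread_num().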
+ packedB = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+ procPackB(KC, NC, NC % NR, B, ldb, packedB);
+ packedA = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
+ } else {
+ // tile B
+ NC = L1 / (KC * sizeof(float));
+ if (NC == 0) {
+ NC = NR;
+ } else {
+ int nblock_num = (n + NC - 1) / NC;
+ NC = (n + nblock_num - 1) / nblock_num;
+ NC = (NC + NR - 1) / NR * NR;
+ }
+ // pad A to a multiple of MR
+ MC = (m + MR - 1) / MR * MR;
+
+#if __aarch64__
+ procPackA = PackMatrixA_omp_6r;
+ procPackB = PackMatrixB_16c;
+ procAddDot = AddDot6x16;
+#else
+ procPackA = PackMatrixA_omp_6r;
+ procPackB = PackMatrixB_8c;
+ procAddDot = AddDot6x8;
+#endif
+
+ packedA = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+ procPackA(MC, KC, MC % MR, A, lda, packedA);
+ packedB = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
+ }
+ zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+ memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
+ packedC = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));
+
+ if (m > n) {
+#pragma omp parallel for
+ for (int i = 0; i < m; i += MC) {
+#ifdef _OPENMP
+ int local_threads = omp_get_thread_num();
+#else
+ int local_threads = 0;
+#endif
+
+ int mc;
+ mc = s_min(m - i, MC);
+ float *local_A = packedA + MC * KC * local_threads;
+ float *local_C = packedC + MC * NC * local_threads;
+ procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
+ if (bias == nullptr) {
+ InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C,
+ &C(i, 0), ldc, relu, new_scale + i, new_bias + i);
+ } else {
+ InnerKernelWithBnAdd(mc, n, alpha, local_A, packedB, beta, local_C,
+ &C(i, 0), ldc, relu, new_scale + i, new_bias + i,
+ bias + i * ldc);
+ }
+ }
+ } else {
+#pragma omp parallel for
+ for (int j = 0; j < n; j += NC) {
+#ifdef _OPENMP
+ int local_threads = omp_get_thread_num();
+#else
+ int local_threads = 0;
+#endif
+
+ int nc;
+ nc = s_min(n - j, NC);
+ float *local_B = packedB + KC * NC * local_threads;
+ float *local_C = packedC + MC * NC * local_threads;
+ procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
+ if (bias == nullptr) {
+ InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C,
+ &C(0, j), ldc, relu, new_scale, new_bias);
+ } else {
+ InnerKernelWithBnAdd(m, nc, alpha, packedA, local_B, beta, local_C,
+ &C(0, j), ldc, relu, new_scale, new_bias,
+ bias + j);
+ }
+ }
+ }
+
+ paddle_mobile::memory::Free(packedA);
+ paddle_mobile::memory::Free(packedB);
+ paddle_mobile::memory::Free(packedC);
+ paddle_mobile::memory::Free(zero);
+}
+
+void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
+ const float *B, int ldb, float *C, int ldc, float *p,
+ std::string mode, float *bias, float *bias1) {
+#ifdef _OPENMP
+ int max_threads = omp_get_max_threads();
+#else
+ int max_threads = 1;
+#endif
+
+ int L1 = 8 * 1024;
+ KC = k;
+ if (m > n) {
+ // tile A
+ MC = L1 / (KC * sizeof(float));
+ if (MC == 0) {
+ MC = MR;
+ } else {
+ int mblock_num = (m + MC - 1) / MC;
+ MC = (m + mblock_num - 1) / mblock_num;
+ MC = (MC + MR - 1) / MR * MR;
+ }
+ // pad B to a multiple of NR
+ NC = (n + NR - 1) / NR * NR;
+
+#if __aarch64__
+ procPackA = PackMatrixA_6r;
+ procPackB = PackMatrixB_omp_16c;
+ procAddDot = AddDot6x16;
+#else
+ procPackA = PackMatrixA_6r;
+ procPackB = PackMatrixB_omp_8c;
+ procAddDot = AddDot6x8;
+#endif
+
+ packedB = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+ procPackB(KC, NC, NC % NR, B, ldb, packedB);
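// [Editor's note] Same scheme as Sgemm_omp: the operand that is not tiled
// across threads (here B) is packed up front and shared, while the A and C
// work buffers are replicated per thread below.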
+ packedA = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
+ } else {
+ // tile B
+ NC = L1 / (KC * sizeof(float));
+ if (NC == 0) {
+ NC = NR;
+ } else {
+ int nblock_num = (n + NC - 1) / NC;
+ NC = (n + nblock_num - 1) / nblock_num;
+ NC = (NC + NR - 1) / NR * NR;
+ }
+ // pad A to a multiple of MR
+ MC = (m + MR - 1) / MR * MR;
+
+#if __aarch64__
+ procPackA = PackMatrixA_omp_6r;
+ procPackB = PackMatrixB_16c;
+ procAddDot = AddDot6x16;
+#else
+ procPackA = PackMatrixA_omp_6r;
+ procPackB = PackMatrixB_8c;
+ procAddDot = AddDot6x8;
+#endif
+
+ packedA = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+ procPackA(MC, KC, MC % MR, A, lda, packedA);
+ packedB = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
+ }
+ zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+ memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
+ packedC = static_cast<float *>(
+ paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));
+
+ if (m > n) {
+#pragma omp parallel for
+ for (int i = 0; i < m; i += MC) {
+#ifdef _OPENMP
+ int local_threads = omp_get_thread_num();
+#else
+ int local_threads = 0;
+#endif
+
+ int mc;
+ mc = s_min(m - i, MC);
+ float *local_A = packedA + MC * KC * local_threads;
+ float *local_C = packedC + MC * NC * local_threads;
+ procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);
+ if (bias1 == nullptr) {
+ InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc,
+ p + i, mode, bias + i, nullptr);
+ } else {
+ InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc,
+ p + i, mode, bias + i, bias1 + i * ldc);
+ }
+ }
+ } else {
+#pragma omp parallel for
+ for (int j = 0; j < n; j += NC) {
+#ifdef _OPENMP
+ int local_threads = omp_get_thread_num();
+#else
+ int local_threads = 0;
+#endif
+
+ int nc;
+ nc = s_min(n - j, NC);
+ float *local_B = packedB + KC * NC * local_threads;
+ float *local_C = packedC + MC * NC * local_threads;
+ procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);
+ if (bias1 == nullptr) {
+ InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p,
+ mode, bias, nullptr);
+ } else {
+ InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p,
+ mode, bias, bias1 + j);
+ }
+ }
+ }
+
+ paddle_mobile::memory::Free(packedA);
+ paddle_mobile::memory::Free(packedB);
+ paddle_mobile::memory::Free(packedC);
+ paddle_mobile::memory::Free(zero);
+}
+
+void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
+#if __ARM_NEON
+#if __aarch64__
+
+ // init C
+ float32x4_t cv0 = vdupq_n_f32(0.0);
+ float32x4_t cv1 = vdupq_n_f32(0.0);
+ float32x4_t cv2 = vdupq_n_f32(0.0);
+ float32x4_t cv3 = vdupq_n_f32(0.0);
+ float32x4_t cv4 = vdupq_n_f32(0.0);
+ float32x4_t cv5 = vdupq_n_f32(0.0);
+ float32x4_t cv6 = vdupq_n_f32(0.0);
+ float32x4_t cv7 = vdupq_n_f32(0.0);
+ float32x4_t cv8 = vdupq_n_f32(0.0);
+ float32x4_t cv9 = vdupq_n_f32(0.0);
+ float32x4_t cv10 = vdupq_n_f32(0.0);
+ float32x4_t cv11 = vdupq_n_f32(0.0);
+
+ float32x4_t av;
+ float32x4_t bv0;
+ float32x4_t bv1;
+
+ float32x2_t av01;
+ float32x2_t av23;
+ float32x2_t av45;
+
+ for (int p = 0; p < k; p += 1) {
+ av = vld1q_f32(a);
+ av01 = vget_low_f32(av);
+ av23 = vget_high_f32(av);
+ av45 = vld1_f32(a + 4);
+ bv0 = vld1q_f32(b);
+ bv1 = vld1q_f32(b + 4);
+
+ cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0);
+ cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0);
+ cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1);
+ cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1);
+
+ cv4 =
vmlaq_lane_f32(cv4, bv0, av23, 0); + cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0); + cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1); + cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1); + + cv8 = vmlaq_lane_f32(cv8, bv0, av45, 0); + cv9 = vmlaq_lane_f32(cv9, bv1, av45, 0); + cv10 = vmlaq_lane_f32(cv10, bv0, av45, 1); + cv11 = vmlaq_lane_f32(cv11, bv1, av45, 1); + + a += MR; + b += NR; + } + + vst1q_f32(c, cv0); + vst1q_f32(c + 4, cv1); + vst1q_f32(c + ldc, cv2); + vst1q_f32(c + ldc + 4, cv3); + vst1q_f32(c + 2 * ldc, cv4); + vst1q_f32(c + 2 * ldc + 4, cv5); + vst1q_f32(c + 3 * ldc, cv6); + vst1q_f32(c + 3 * ldc + 4, cv7); + vst1q_f32(c + 4 * ldc, cv8); + vst1q_f32(c + 4 * ldc + 4, cv9); + vst1q_f32(c + 5 * ldc, cv10); + vst1q_f32(c + 5 * ldc + 4, cv11); + +#else + + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 8; + int kc2 = k % 8; + int step = 4 * ldc; + asm volatile( + "pld [%[a_ptr]] \n\t" + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr]] \n\t" + "pld [%[b_ptr], #64] \n\t" + + "vmov.f32 q4, #0.0 \n\t" + "vmov.f32 q5, #0.0 \n\t" + "vmov.f32 q6, #0.0 \n\t" + "vmov.f32 q7, #0.0 \n\t" + "vmov.f32 q8, #0.0 \n\t" + "vmov.f32 q9, #0.0 \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "vmov.f32 q14, #0.0 \n\t" + "vmov.f32 q15, #0.0 \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt 2f \n\t" + "1: \n\t" + + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! 
\n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "pld [%[a_ptr], #128] \n\t" + "pld [%[b_ptr], #128] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge 1b \n\t" + "2: \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "blt 4f \n\t" + "3: \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! 
\n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "bge 3b \n\t" + "4: \n\t" + + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" + "vst1.32 {q4, q5}, [r5], r6 \n\t" + "vst1.32 {q6, q7}, [r5], r6 \n\t" + "vst1.32 {q8, q9}, [r5], r6 \n\t" + "vst1.32 {q10, q11}, [r5], r6 \n\t" + "vst1.32 {q12, q13}, [r5], r6 \n\t" + "vst1.32 {q14, q15}, [r5] \n\t" + + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + +#endif // __aarch64__ +#else + +#endif // __ARM_NEON +} + +#if __aarch64__ +void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k; + int step = 4 * ldc; + asm volatile( + "dup v5.4s, wzr \n\t" + "dup v6.4s, wzr \n\t" + "dup v7.4s, wzr \n\t" + "dup v8.4s, wzr \n\t" + "dup v9.4s, wzr \n\t" + "dup v10.4s, wzr \n\t" + "dup v11.4s, wzr \n\t" + "dup v12.4s, wzr \n\t" + "dup v13.4s, wzr \n\t" + "dup v14.4s, wzr \n\t" + "dup v15.4s, wzr \n\t" + "dup v16.4s, wzr \n\t" + + "dup v17.4s, wzr \n\t" + "dup v18.4s, wzr \n\t" + "dup v19.4s, wzr \n\t" + "dup v20.4s, wzr \n\t" + "dup v21.4s, wzr \n\t" + "dup v22.4s, wzr \n\t" + "dup v23.4s, wzr \n\t" + "dup v24.4s, wzr \n\t" + "dup v25.4s, wzr \n\t" + "dup v26.4s, wzr \n\t" + "dup v27.4s, wzr \n\t" + "dup v28.4s, wzr \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt 2f \n\t" + "1: \n\t" + + "prfm pldl1keep, [%[a_ptr], #32] \n\t" + "prfm pldl1keep, [%[b_ptr], #48] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[a_ptr]], #32 \n\t" + "ld1 {v2.4s, v3.4s, v4.4s}, [%[b_ptr]], #48 \n\t" + + "fmla v5.4s, v2.4s, v0.s[0] \n\t" + "fmla v6.4s, v3.4s, v0.s[0] \n\t" + "fmla v7.4s, v4.4s, v0.s[0] \n\t" + "fmla v8.4s, v2.4s, v0.s[1] \n\t" + "fmla v9.4s, v3.4s, v0.s[1] \n\t" + "fmla v10.4s, v4.4s, v0.s[1] \n\t" + "fmla v11.4s, v2.4s, v0.s[2] \n\t" + "fmla v12.4s, v3.4s, v0.s[2] \n\t" + "fmla v13.4s, v4.4s, v0.s[2] \n\t" + "fmla v14.4s, v2.4s, v0.s[3] \n\t" + "fmla v15.4s, v3.4s, v0.s[3] \n\t" + "fmla v16.4s, v4.4s, v0.s[3] \n\t" + + "fmla v17.4s, v2.4s, v1.s[0] \n\t" + "fmla v18.4s, v3.4s, v1.s[0] \n\t" + "fmla v19.4s, v4.4s, v1.s[0] \n\t" + "fmla v20.4s, v2.4s, v1.s[1] \n\t" + "fmla v21.4s, v3.4s, v1.s[1] \n\t" + "fmla v22.4s, v4.4s, v1.s[1] \n\t" + "fmla v23.4s, v2.4s, v1.s[2] \n\t" + "fmla v24.4s, v3.4s, v1.s[2] \n\t" + "fmla v25.4s, v4.4s, v1.s[2] \n\t" + "fmla v26.4s, v2.4s, v1.s[3] \n\t" + "fmla v27.4s, v3.4s, v1.s[3] \n\t" + "fmla v28.4s, v4.4s, v1.s[3] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge 1b \n\t" + "2: \n\t" + + "st1 {v5.4s, v6.4s, v7.4s}, [%[c]], %[step] \n\t" + "st1 {v8.4s, v9.4s, v10.4s}, [%[c]], %[step] \n\t" + "st1 {v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t" + "st1 {v14.4s, v15.4s, v16.4s}, [%[c]], %[step] \n\t" + "st1 {v17.4s, v18.4s, v19.4s}, [%[c]], %[step] \n\t" + "st1 {v20.4s, v21.4s, v22.4s}, [%[c]], %[step] \n\t" + "st1 {v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t" + "st1 {v26.4s, v27.4s, v28.4s}, [%[c]], %[step] \n\t" + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [step] 
"r"(step) + : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); +} + +void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k; + int step = 4 * ldc; + int step1 = 4 * 6; + asm volatile( + + "dup v6.4s, wzr \n\t" + "dup v7.4s, wzr \n\t" + "dup v8.4s, wzr \n\t" + "dup v9.4s, wzr \n\t" + "dup v10.4s, wzr \n\t" + "dup v11.4s, wzr \n\t" + "dup v12.4s, wzr \n\t" + "dup v13.4s, wzr \n\t" + + "dup v14.4s, wzr \n\t" + "dup v15.4s, wzr \n\t" + "dup v16.4s, wzr \n\t" + "dup v17.4s, wzr \n\t" + "dup v18.4s, wzr \n\t" + "dup v19.4s, wzr \n\t" + "dup v20.4s, wzr \n\t" + "dup v21.4s, wzr \n\t" + + "dup v22.4s, wzr \n\t" + "dup v23.4s, wzr \n\t" + "dup v24.4s, wzr \n\t" + "dup v25.4s, wzr \n\t" + "dup v26.4s, wzr \n\t" + "dup v27.4s, wzr \n\t" + "dup v28.4s, wzr \n\t" + "dup v29.4s, wzr \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt 2f \n\t" + "1: \n\t" + + "prfm pldl1keep, [%[a_ptr], #24] \n\t" + "prfm pldl1keep, [%[b_ptr], #64] \n\t" + + "ld1 {v0.4s, v1.4s}, [%[a_ptr]], %[step1] \n\t" + "ld1 {v2.4s, v3.4s, v4.4s, v5.4s}, [%[b_ptr]], #64 \n\t" + + "fmla v6.4s, v2.4s, v0.s[0] \n\t" + "fmla v7.4s, v3.4s, v0.s[0] \n\t" + "fmla v8.4s, v4.4s, v0.s[0] \n\t" + "fmla v9.4s, v5.4s, v0.s[0] \n\t" + + "fmla v10.4s, v2.4s, v0.s[1] \n\t" + "fmla v11.4s, v3.4s, v0.s[1] \n\t" + "fmla v12.4s, v4.4s, v0.s[1] \n\t" + "fmla v13.4s, v5.4s, v0.s[1] \n\t" + + "fmla v14.4s, v2.4s, v0.s[2] \n\t" + "fmla v15.4s, v3.4s, v0.s[2] \n\t" + "fmla v16.4s, v4.4s, v0.s[2] \n\t" + "fmla v17.4s, v5.4s, v0.s[2] \n\t" + + "fmla v18.4s, v2.4s, v0.s[3] \n\t" + "fmla v19.4s, v3.4s, v0.s[3] \n\t" + "fmla v20.4s, v4.4s, v0.s[3] \n\t" + "fmla v21.4s, v5.4s, v0.s[3] \n\t" + + "fmla v22.4s, v2.4s, v1.s[0] \n\t" + "fmla v23.4s, v3.4s, v1.s[0] \n\t" + "fmla v24.4s, v4.4s, v1.s[0] \n\t" + "fmla v25.4s, v5.4s, v1.s[0] \n\t" + + "fmla v26.4s, v2.4s, v1.s[1] \n\t" + "fmla v27.4s, v3.4s, v1.s[1] \n\t" + "fmla v28.4s, v4.4s, v1.s[1] \n\t" + "fmla v29.4s, v5.4s, v1.s[1] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge 1b \n\t" + "2: \n\t" + + "st1 {v6.4s, v7.4s, v8.4s, v9.4s}, [%[c]], %[step] \n\t" + "st1 {v10.4s, v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t" + "st1 {v14.4s, v15.4s, v16.4s, v17.4s}, [%[c]], %[step] \n\t" + "st1 {v18.4s, v19.4s, v20.4s, v21.4s}, [%[c]], %[step] \n\t" + "st1 {v22.4s, v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t" + "st1 {v26.4s, v27.4s, v28.4s, v29.4s}, [%[c]], %[step] \n\t" + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29"); +} + +#endif // __aarch64__ + } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index 6d7ae6d2bcdbd7e24cb3c2389dd3cdf09a807892..abd209bb45c650363b7d19c495bea4d9848fc834 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -13,18 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. 
*/
#pragma once
+#include <string>
+#include "common/log.h"

// Element access macros; matrices are assumed to be stored row-major
#define A(i, j) A[(i)*lda + (j)]
#define B(i, j) B[(i)*ldb + (j)]
#define C(i, j) C[(i)*ldc + (j)]

-// Block sizes for tiling; mc and kc correspond to the m and k of one block
-#define MC 128
-#define KC 128
-#define NC 1024
-#define MR 4
-#define NR 4
+#if __aarch64__
+#define MR 6
+#define NR 16
+#else
+#define MR 6
+#define NR 8
+#endif

#define s_min(i, j) ((i) < (j) ? (i) : (j))

@@ -32,6 +35,7 @@ namespace paddle_mobile {
namespace operators {
namespace math {

+/*
// Pack a block of matrix A into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                 float *buffer);

@@ -39,42 +43,138 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
// Pack a block of matrix B into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                 float *buffer);
+*/

// Pack a block of matrix A into contiguous memory (RowMajor)
-void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
-                  float *buffer);
+void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer);
+void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer);
+void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer);
+void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
+                        float *buffer);
+void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
+                        float *buffer);

// Pack a block of matrix B into contiguous memory (RowMajor)
-void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
-                  float *buffer);
+void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
+                    float *buffer);
+void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
+                     float *buffer);
+void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
+                     float *buffer);
+void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
+                        float *buffer);
+void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
+                         float *buffer);
+void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
+                         float *buffer);

// Blocked matrix multiplication
-void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
-                 const float *B, int ldb, float beta, float *C, int ldc,
-                 int first_time);
-
+void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
+                 float beta, float *c, float *C, int ldc, bool relu);
+void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
+                         const float *b, float beta, float *c, float *C,
+                         int ldc, bool relu, float *bias);
+
+void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
+                       const float *b, float beta, float *c, float *C, int ldc,
+                       bool relu, float *new_scale, float *new_bias);
+void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
+                          const float *b, float beta, float *c, float *C,
+                          int ldc, bool relu, float *new_scale, float *new_bias,
+                          float *bias);
+void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
+                          float *c, float *C, int ldc, float *p,
+                          std::string mode, float *bias, float *bias1);
+/*
// Vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
-                  const float *B, int ldb, float beta, float *C, int ldc);
-
-// Compute a smaller 4 * 4 block of C
-void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,
-               int ldb, float beta, float *C, int ldc, int mc, int nc);
-
-void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
-                    int ldb, float beta, float *C, int ldc, int mc, int nc,
-                    bool relu);
+                  const float *B, int ldb, float beta, float *C, int ldc,
+                  bool relu);
+
+void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
+                        int lda, const float *B, int ldb, float beta, float *C,
+                        int ldc, bool relu, float *new_scale, float *new_bias);
+*/
+
+// Compute a smaller block of C
+void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
+
+// Write back the results of the blocked multiplication
+// C = A * B
+void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
+// C = alpha * A * B + beta * C
+void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B + C
+void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B + bias
+void WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc, float *bias);
+// C = A * B + C, relu(C)
+void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B + C, prelu(C)
+void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
+                       std::string mode, float *bias, float *bias1);
+// C = A * B + bias, relu(C)
+void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
+                        float *bias);
+// C = A * B, batchnorm(C)
+void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
+                 float *new_bias);
+// C = A * B, batchnorm(C), relu(C)
+void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+                     float *new_scale, float *new_bias);
+void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
+                        float *new_scale, float *new_bias, float *bias1);
+/*
+// Write back the results of the vector-matrix multiplication
+// C = A * B
+void VecWriteBasic(int n, float *c, float *C, int ldc);
+// C = alpha * A * B + beta * C
+void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
+// C = A * B + C
+void VecWriteWithAdd(int n, float *c, float *C, int ldc);
+// C = A * B + C, relu(C)
+void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
+// C = A * B, batchnorm(C)
+void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
+                    float *new_bias);
+// C = A * B, batchnorm(C), relu(C)
+void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
+                        float *new_bias);
+*/

// 32-bit float matrix multiplication
-void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc);
-
-void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
-                const float *B, int ldb, float beta, float *C, int ldc);
+void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+           const float *B, int ldb, float beta, float *C, int ldc, bool relu,
+           float *bias);

-// 64-bit double matrix multiplication
-void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
-           const double *B, int ldb, float beta, double *C, int ldc);
+// 32-bit float matrix multiplication, with batchnorm applied to the result
+void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+                 const float *B, int ldb, float beta, float *C, int ldc,
+                 bool relu, float *new_scale, float *new_bias, float *bias);
+void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
+                    const float *B, int ldb, float *C, int ldc, float *p,
+                    std::string mode, float *bias, float *bias1);
+
+// 32-bit float matrix multiplication (OpenMP multithreaded version)
+void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
+               const float *B, int ldb, float beta, float *C, int ldc,
+               bool relu, float *bias);
+
+// 32-bit float matrix multiplication, with batchnorm applied to the result
+// (OpenMP multithreaded version)
+void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
+                     const float *B, int ldb, float beta, float *C, int ldc,
+                     bool relu, float *new_scale, float *new_bias, float *bias);
+
+void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
+                        const float *B, int ldb, float *C, int ldc, float *p,
+                        std::string mode, float *bias, float *bias1);
}  // namespace math
}  // namespace operators
diff --git a/src/operators/math/gru_compute.cpp b/src/operators/math/gru_compute.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2f71ec3a34d83cd65626c671ace41ae071c95ce2
--- /dev/null
+++ b/src/operators/math/gru_compute.cpp
@@ -0,0 +1,55 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef GRU_OP
+#include "operators/math/gru_compute.h"
+#include "common/types.h"
+#include "operators/math/activation_functions.h"
+#include "operators/math/gemm.h"
+#include "operators/math/gru_cpu_kernel.h"
+#include "operators/math/gru_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct GRUUnitFunctor {
+  static void compute(GRUMetaValue<T> value, int frame_size, int batch_size,
+                      const ActivationType active_node,
+                      const ActivationType active_gate) {
+    if (value.prev_out_value) {
+      Sgemm(batch_size, frame_size * 2, frame_size, 1, value.prev_out_value,
+            frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value,
+            frame_size * 3, false, nullptr);
+    }
+
+    forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size,
+                         batch_size, active_gate);
+
+    if (value.prev_out_value) {
+      Sgemm(batch_size, frame_size, frame_size, 1, value.reset_output_value,
+            frame_size, value.state_weight, frame_size, 1,
+            value.gate_value + frame_size * 2, frame_size * 3, false, nullptr);
+    }
+
+    forward_final_output(forward::gru_finalOutput<T>(), value, frame_size,
+                         batch_size, active_node);
+  }
+};
+
+template struct GRUUnitFunctor<float>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
diff --git a/src/operators/math/gru_compute.h b/src/operators/math/gru_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..89cac1b8e49cd11eec551ba60f54e72f3912c846
--- /dev/null
+++ b/src/operators/math/gru_compute.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
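The pair of Sgemm calls plus the forward_reset_output / forward_final_output passes in GRUUnitFunctor::compute implement the standard GRU recurrence h = (1 - u) * h_prev + u * c. A minimal scalar sketch of the per-element math (an illustrative helper, not part of the patch; sigmoid and tanh stand in for the configured active_gate / active_node):

```cpp
#include <cmath>

// Stage 1 mirrors gru_resetOutput, stage 2 mirrors gru_finalOutput. gate_c is
// assumed to be updated in between by the second Sgemm
// (gate_c += U_c * reset_out), exactly as compute() arranges.
struct GruScalarRef {
  static float sigmoid(float x) { return 1.f / (1.f + std::exp(-x)); }

  // r * h_prev, the value the second Sgemm multiplies by state_weight
  static float reset_output(float gate_r, float prev) {
    return sigmoid(gate_r) * prev;
  }

  // h = (1 - u) * h_prev + u * tanh(gate_c)
  static float final_output(float gate_u, float gate_c, float prev) {
    float u = sigmoid(gate_u);
    float c = std::tanh(gate_c);
    return prev - u * prev + u * c;
  }
};
```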
*/ +#ifdef GRU_OP +#pragma once + +#include "operators/math/activation_functions.h" + +namespace paddle_mobile { +namespace operators { +namespace math { + +template +struct GRUMetaValue { + T *gate_weight; + T *state_weight; + T *gate_value; + T *reset_output_value; + T *output_value; + T *prev_out_value; +}; + +template +struct GRUUnitFunctor { + static void compute(GRUMetaValue value, int frame_size, int batch_size, + const ActivationType active_node, + const ActivationType active_gate); +}; + +} // namespace math +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/math/gru_cpu_kernel.h b/src/operators/math/gru_cpu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ea24c4f1d97ebfbc5454e118121a3c79f28008c6 --- /dev/null +++ b/src/operators/math/gru_cpu_kernel.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef GRU_OP +#pragma once +#include +#include "operators/math/activation_functions.h" +#include "operators/math/gru_compute.h" + +namespace paddle_mobile { +namespace operators { +namespace math { + +template +void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + ActivationType active_gate) { + T r_value_update_gate; + T r_value_reset_gate; + T r_value_reset_output; + T r_prev_out = 0; + T *update_gate = gate_value; + T *reset_gate = gate_value + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; + } + + op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, + &r_value_reset_output, active_gate); + + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + reset_output_value[i] = r_value_reset_output; + } +} + +template +void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + ActivationType active_node) { + T r_value_update_gate; + T r_value_frame_state; + T r_prev_out = 0; + T r_output; + T *update_gate = gate_value; + T *frame_state = gate_value + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; + } + + op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, + &r_output, active_node); + + frame_state[i] = r_value_frame_state; + output_value[i] = r_output; + } +} + +template +inline void forward_reset_output(OpResetOutput op_reset_output, + GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_gate) { + for (int b = 0; b < batch_size; b++) { + hl_naive_gru_forward_reset_output( + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); + + 
value.gate_value += frame_size * 3; + value.reset_output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + } +} + +template +inline void forward_final_output(OpFinalOutput op_final_output, + GRUMetaValue value, int frame_size, + int batch_size, ActivationType active_node) { + for (int b = 0; b < batch_size; b++) { + hl_naive_gru_forward_final_output(op_final_output, value.gate_value, + value.prev_out_value, value.output_value, + frame_size, active_node); + + value.gate_value += frame_size * 3; + value.output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; + } + } +} + +} // namespace math +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/math/gru_kernel.h b/src/operators/math/gru_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6113ce8da997eaa5720886d637a9cc9261ea5227 --- /dev/null +++ b/src/operators/math/gru_kernel.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef GRU_OP +#pragma once +#include +#include "operators/math/activation_functions.h" + +namespace paddle_mobile { +namespace operators { +namespace math { + +namespace forward { + +template +class gru_resetOutput { + public: + void operator()(T *value_update_gate, T *value_reset_gate, T *prev_out, + T *value_reset_output, ActivationType act_gate) { + *value_update_gate = activation(*value_update_gate, act_gate); + *value_reset_gate = activation(*value_reset_gate, act_gate); + *value_reset_output = (*prev_out) * (*value_reset_gate); + } +}; + +template +class gru_finalOutput { + public: + void operator()(T *value_update_gate, T *value_frame_state, T *prev_out, + T *value_output, ActivationType act_input) { + *value_frame_state = activation(*value_frame_state, act_input); + *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) + + ((*value_update_gate) * (*value_frame_state)); + } +}; +} // namespace forward + +} // namespace math +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp index 625d120705aab8fcc3ea8d232b4077e213941ec4..090ccdf24e214fc86b8a4032df228d50caa65ef9 100644 --- a/src/operators/math/im2col.cpp +++ b/src/operators/math/im2col.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "operators/math/im2col.h" #include #ifdef __ARM_NEON -#include "arm_neon.h" +#include #endif #include "common/types.h" namespace paddle_mobile { @@ -69,16 +69,16 @@ class Im2ColFunctor { int channels_col = im_channels * filter_height * filter_width; const T *im_data = im.data(); T *col_data = col->data(); -#ifdef __ARM_NEON +#if __ARM_NEON const int osize = col_height; const int isize = im_height; bool pad1 = padding[0] > 0; bool pad2 = - (pad1 && + (pad1 && padding[1] && (((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 
1 : 0)); int fill = isize % 2; if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 && - dilation[0] == 1) { + dilation[0] == 1 && im_height > 2) { for (int c = 0; c < im_channels; ++c) { int oosize = osize * osize; int nk4 = osize / 4; @@ -250,7 +250,7 @@ class Im2ColFunctor { im_data += isize * isize; } } else if (stride[0] == 2 && filter_height == 3 && pad1 && - dilation[0] == 1) { + dilation[0] == 1 && im_height > 2) { for (int c = 0; c < im_channels; ++c) { int oosize = osize * osize; int nk4 = osize / 4; @@ -481,6 +481,7 @@ class Col2ImFunctor { T *im_data = im->data(); const T *col_data = col.data(); + memset(static_cast(im_data), 0, sizeof(T) * im->numel()); for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp index fd4106038c7446e659736c6b3c61b5aa05127e72..14269817ededd097c4c9ade20be5ee773c02d692 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "operators/math/math_function.h" +#include #include "operators/math/gemm.h" namespace paddle_mobile { @@ -22,58 +23,138 @@ namespace math { template <> void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, float alpha, - framework::Tensor *matrix_out, float beta, bool relu) { + framework::Tensor *matrix_out, float beta, bool relu, + float *bias) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); - // PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && - // dim_out.size() == - // 2, - // "The input and output of matmul be matrix"); - // - // PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && - // platform::is_cpu_place(matrix_b.place()) - // && - // platform::is_cpu_place(matrix_out->place()), - // "Matrix must all be in CPUPlace"); + PADDLE_MOBILE_ENFORCE( + dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); int M = dim_out[0]; int N = dim_out[1]; - int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + int K = (!trans_a) ? 
dim_a[1] : dim_a[0]; - if (relu) { - sgemm_relu(M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, matrix_out->data(), N); + if (trans_a) { + int numel = matrix_a.numel(); + int m = matrix_a.dims()[0]; + int n = matrix_a.dims()[1]; + float *tmp = (float *)(matrix_a.data()); + float *a = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * numel)); + int index = 0; + for (int j = 0; j < n; j++) { + for (int i = 0; i < m; i++) { + a[index++] = tmp[i * n + j]; + } + } +#ifdef _OPENMP + Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data(), N, beta, + matrix_out->data(), N, relu, bias); +#else + Sgemm(M, N, K, alpha, a, K, matrix_b.data(), N, beta, + matrix_out->data(), N, relu, bias); +#endif } else { - sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, - beta, matrix_out->data(), N); +#ifdef _OPENMP + Sgemm_omp(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), + N, beta, matrix_out->data(), N, relu, bias); +#else + Sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, + beta, matrix_out->data(), N, relu, bias); +#endif } } template <> -void matmul(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, - double alpha, framework::Tensor *matrix_out, double beta, - bool relu) { +void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, + const framework::Tensor &matrix_b, bool trans_b, + float alpha, framework::Tensor *matrix_out, float beta, + bool relu, framework::Tensor *new_scale, + framework::Tensor *new_bias, int group, float *bias) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); - // PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && - // dim_out.size() == - // 2, - // "The input and output of matmul be matrix"); - // - // PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && - // platform::is_cpu_place(matrix_b.place()) - // && - // platform::is_cpu_place(matrix_out->place()), - // "Matrix must all be in CPUPlace"); + PADDLE_MOBILE_ENFORCE( + dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); int M = dim_out[0]; int N = dim_out[1]; - int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + int K = (!trans_a) ? dim_a[1] : dim_a[0]; + +#ifdef _OPENMP + SgemmWithBn_omp(M, N, K, alpha, matrix_a.data(), K, + matrix_b.data(), N, beta, matrix_out->data(), N, + relu, new_scale->data() + group, + new_bias->data() + group, bias); +#else + SgemmWithBn(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), + N, beta, matrix_out->data(), N, relu, + new_scale->data() + group, new_bias->data() + group, + bias); +#endif +} +void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, + const framework::Tensor &matrix_b, bool trans_b, + framework::Tensor *matrix_out, float *p, std::string mode, + float *bias, float *bias1) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_MOBILE_ENFORCE( + dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (!trans_a) ? 
dim_a[1] : dim_a[0]; + +#ifdef _OPENMP + SgemmWithPRelu_omp(M, N, K, matrix_a.data(), K, matrix_b.data(), + N, matrix_out->data(), N, p, mode, bias, bias1); +#else + SgemmWithPRelu(M, N, K, matrix_a.data(), K, matrix_b.data(), N, + matrix_out->data(), N, p, mode, bias, bias1); + +#endif } +template +struct ClearTensor { + void operator()(framework::Tensor *tensor) { + auto size = tensor->numel(); + auto *tensor_data = tensor->data(); + memset((void *)tensor_data, 0, sizeof(T) * size); + } +}; + +template +struct RowwiseAdd { + void operator()(const framework::Tensor &input, + const framework::Tensor &vector, framework::Tensor *output) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_MOBILE_ENFORCE((vector.numel() == size), + "vector.numel() must be equal to size."); + PADDLE_MOBILE_ENFORCE((output->dims() == in_dims), + "output->dims() must be equal to in_dims."); + + auto *input_data = input.data(); + auto *out_data = output->data(); + auto *vec_data = vector.data(); + for (int64_t i = 0; i < in_dims[0]; ++i) { + for (int64_t j = 0; j < size; ++j) { + out_data[i * size + j] = input_data[i * size + j] + vec_data[j]; + } + } + } +}; + +template struct RowwiseAdd; +template struct ClearTensor; + } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h index 0b953ec6a3b2a03a94a91884b9daf3ed88523a22..de19e3df2ab69c8ac490b09af2852bf2fa806c64 100644 --- a/src/operators/math/math_function.h +++ b/src/operators/math/math_function.h @@ -21,11 +21,34 @@ namespace paddle_mobile { namespace operators { namespace math { -// matrix multiply with continuous memory template void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, T alpha, - framework::Tensor *matrix_out, T beta, bool relu = false); + framework::Tensor *matrix_out, T beta, bool relu = false, + float *bias = nullptr); + +template +void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, + const framework::Tensor &matrix_b, bool trans_b, T alpha, + framework::Tensor *matrix_out, T beta, bool relu, + framework::Tensor *new_scale, framework::Tensor *new_bias, + int group, float *bias = nullptr); + +void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, + const framework::Tensor &matrix_b, bool trans_b, + framework::Tensor *matrix_out, float *p, std::string mode, + float *bias, float *bias1); +template +struct ClearTensor { + void operator()(framework::Tensor *tensor); +}; + +template +struct RowwiseAdd { + void operator()(const framework::Tensor &input, const framework::Tensor &vec, + framework::Tensor *output); +}; + } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp index 96d277c136b4656dbb1fd682489bd7dee5c3af0e..9dc3dbafed990de2f4057d98a2accdd8ce2fd7db 100644 --- a/src/operators/math/pool_2x2.cpp +++ b/src/operators/math/pool_2x2.cpp @@ -13,23 +13,22 @@ See the License for the specific language governing permissions and limitations under the License. 
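A note on the matmul<float> change above: when trans_a is set, A-transpose is materialized into a temporary buffer before the plain row-major Sgemm runs. A minimal sketch of that copy, using std::vector for ownership instead of paddle_mobile::memory::Alloc (illustrative helper, not part of the patch):

```cpp
#include <cstddef>
#include <vector>

// Transpose an m x n row-major matrix into an n x m buffer; the index
// arithmetic matches the j-outer/i-inner loop in matmul<float>.
std::vector<float> transpose_copy(const float *src, int m, int n) {
  std::vector<float> dst(static_cast<std::size_t>(m) * n);
  int index = 0;
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i < m; ++i) {
      dst[index++] = src[i * n + j];
    }
  }
  return dst;
}
```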
*/
#ifdef POOL_OP
-#include "pool_2x2.h"
+#include "operators/math/pool_2x2.h"
+#include <cstring>
+#include <vector>
namespace paddle_mobile {
namespace operators {
namespace math {
+#define FLT_MAX __FLT_MAX__

-void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
-                Tensor *output) {
-#if __ARM_NEON
+void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
+                    const Tensor *input, Tensor *output) {
  const int batch_size = input->dims()[0];

-  const int input_height = input->dims()[2];

-  const int input_width = input->dims()[3];
  const int output_channels = output->dims()[1];

-  int output_height = output->dims()[2];
  const int output_width = output->dims()[3];

  const int ksize_height = 2;
@@ -42,67 +41,115 @@ void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
  const int input_channel_stride = input_height * input_width;
  const int output_channel_stride = output_height * output_width;

+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+
  const float *input_data = input->data<float>();
  float *output_data = output->mutable_data<float>();

-  int out_w_num = output_width >> 2;
-  const int in_h_num = output_height >> 1;
-  const int input_batch_stride = output_channels * input_channel_stride;
-  const int output_batch_stride = output_channels * output_channel_stride;
-  int remain = output_width - out_w_num << 2;
+  int w1 = input_width / 16;
+  int _w1 = input_width % 16;
+  int w2 = _w1 / 4;
+  int _w2 = _w1 % 4;
+
  for (int i = 0; i < batch_size; ++i) {
    for (int c = 0; c < output_channels; ++c) {
-      const float *input_data_chanel_row_next = input_data + input_width;
-      for (; output_height > 0; output_height--) {
-        if (out_w_num > 0) {
-          asm volatile(
-              "max_loop:                          \n\t"
-              "vld1.f32 {q0,q1}, [%[in_ptr1]]!    \n\t"
-              "vld1.f32 {q2,q3}, [%[in_ptr2]]!    \n\t"
-              "vmax.f32 q0, q0, q2                \n\t"
-              "vmax.f32 q1, q1, q3                \n\t"
-              "vpmax.f32 d4, d0, d1               \n\t"
-              "vpmax.f32 d5, d2, d3               \n\t"
-              "subs %[out_w_num], #1              \n\t"
-              "vst1.32 {q2}, [%[out_ptr]]!        \n\t"
-              "bne max_loop                       \n\t"
-              : [in_ptr1] "+r"(input_data),
-                [in_ptr2] "+r"(input_data_chanel_row_next),
-                [out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num)
-              :
-              : "memory", "q0", "q1", "q2", "q3");
+      for (int ph = 0; ph < input_height; ph += 2) {
+        const float *in_ptr1 = input_data + i * input_batch_stride +
+                               c * input_channel_stride + ph * input_width;
+        const float *in_ptr2 = in_ptr1 + input_width;
+        if (ph + 1 >= input_height) {
+          float *pad_row = static_cast<float *>(
+              paddle_mobile::memory::Alloc(sizeof(float) * input_width));
+          // memset writes bytes and cannot produce the float value -FLT_MAX,
+          // so the padding row is filled element by element
+          for (int pw = 0; pw < input_width; ++pw) {
+            pad_row[pw] = -FLT_MAX;
+          }
+          in_ptr2 = pad_row;
+        }
+        float *out_ptr = output_data + i * output_batch_stride +
+                         c * output_channel_stride + ph / 2 * output_width;
+#if __ARM_NEON
+#if __aarch64__
+#else
+        asm volatile(
+            "subs %[w1], %[w1], #1              \n\t"
+            "blt end_w1_%=                      \n\t"
+            "loop_w1_%=:                        \n\t"
+
+            "pld [%[in_ptr1], #64]              \n\t"
+            "pld [%[in_ptr2], #64]              \n\t"
+
+            "vld1.f32 {q0, q1}, [%[in_ptr1]]!   \n\t"
+            "vld1.f32 {q2, q3}, [%[in_ptr2]]!   \n\t"
+            "vld1.f32 {q6, q7}, [%[in_ptr1]]!   \n\t"
+            "vld1.f32 {q8, q9}, [%[in_ptr2]]!   \n\t"

-        for (; remain > 0; remain--) {
-          float max_row1 = std::max(input_data[0], input_data[1]);
-          float max_row2 = std::max(input_data_chanel_row_next[0],
-                                    input_data_chanel_row_next[1]);
-          *output_data = std::max(max_row1, max_row2);
-          input_data += 2;
-          input_data_chanel_row_next += 2;
-          output_data++;
+            "vmax.f32 q0, q0, q2                \n\t"
+            "vmax.f32 q1, q1, q3                \n\t"
+
+            "vmax.f32 q6, q6, q8                \n\t"
+            "vmax.f32 q7, q7, q9                \n\t"
+
+            "vpmax.f32 d8, d0, d1               \n\t"
+            "vpmax.f32 d9, d2, d3               \n\t"
+
+            "vpmax.f32 d10, d12, d13            \n\t"
+            "vpmax.f32 d11, d14, d15            \n\t"
+
+            "vst1.32 {q4, q5}, [%[out_ptr]]!    \n\t"
+
+            "subs %[w1], %[w1], #1              \n\t"
+            "bge loop_w1_%=                     \n\t"
+            "end_w1_%=:                         \n\t"
+
+            "subs %[w2], %[w2], #1              \n\t"
+            "blt end_w2_%=                      \n\t"
+            "loop_w2_%=:                        \n\t"
+
+            "vld1.f32 {q0}, [%[in_ptr1]]!       \n\t"
+            "vld1.f32 {q1}, [%[in_ptr2]]!       \n\t"
+            "vmax.f32 q0, q0, q1                \n\t"
+            "vpmax.f32 d4, d0, d1               \n\t"
+            "vst1.32 {d4}, [%[out_ptr]]!        \n\t"
+
+            "subs %[w2], %[w2], #1              \n\t"
+            "bge loop_w2_%=                     \n\t"
+            "end_w2_%=:                         \n\t"
+            :
+            : [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
+              [in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr)
+            : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+              "q9");
+#endif
+#endif
+
+        if (_w2 != 0) {
+          in_ptr1 += 16 * w1 + 4 * w2;
+          in_ptr2 += 16 * w1 + 4 * w2;
+          out_ptr += 8 * w1 + 2 * w2;
+          if (_w2 == 1) {
+            *out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
+          } else if (_w2 == 2) {
+            // column-wise maxima of the two remaining columns, then reduce
+            float temp = (in_ptr1[0] > in_ptr2[0]) ? in_ptr1[0] : in_ptr2[0];
+            float temp1 = (in_ptr1[1] > in_ptr2[1]) ? in_ptr1[1] : in_ptr2[1];
+            *out_ptr = (temp > temp1) ? temp : temp1;
+          } else if (_w2 == 3) {
+            float temp = (in_ptr1[0] > in_ptr2[0]) ? in_ptr1[0] : in_ptr2[0];
+            float temp1 = (in_ptr1[1] > in_ptr2[1]) ? in_ptr1[1] : in_ptr2[1];
+            *out_ptr++ = (temp > temp1) ? temp : temp1;
+            *out_ptr = (in_ptr1[2] > in_ptr2[2]) ? in_ptr1[2] : in_ptr2[2];
+          }
+        }
+      }
-      input_data += input_channel_stride;
-      output_data += output_channel_stride;
    }
-    input_data += input_batch_stride;
-    output_data += output_batch_stride;
  }
-#endif
}

-void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
-                Tensor *output) {
-#if __ARM_NEON
+void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
+                    const Tensor *input, Tensor *output) {
  const int batch_size = input->dims()[0];

-  const int input_height = input->dims()[2];

-  const int input_width = input->dims()[3];
  const int output_channels = output->dims()[1];

-  int output_height = output->dims()[2];
  const int output_width = output->dims()[3];

  const int ksize_height = 2;
@@ -115,56 +162,119 @@ void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
  const int input_channel_stride = input_height * input_width;
  const int output_channel_stride = output_height * output_width;

+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+
  const float *input_data = input->data<float>();
  float *output_data = output->mutable_data<float>();

-  int out_w_num = output_width >> 2;
-  const int input_batch_stride = output_channels * input_channel_stride;
-  const int output_batch_stride = output_channels * output_channel_stride;
-  float vqua[] = {0.25f, 0.25f, 0.25f, 0.25f};
-  int remain = output_width - out_w_num << 2;
+  int w1 = input_width / 16;
+  int _w1 = input_width % 16;
+  int w2 = _w1 / 4;
+  int _w2 = _w1 % 4;
+
+  float quarter = 1.f / 4;  // a float literal is required: 1 / 4 is integer 0
  for (int i = 0; i < batch_size; ++i) {
    for (int c = 0; c < output_channels; ++c) {
-      const float *input_data_chanel_row_next = input_data + input_width;
-      for (; output_height > 0; output_height--) {
-        if (out_w_num > 0) {
-          asm volatile(
-              "avg_loop:                          \n\t"
-              "vld1.32 {q0,q1}, [%[in_ptr1]]!     \n\t"
-              "vld1.32 {q2,q3}, [%[in_ptr2]]!     \n\t"
-              "vadd.f32 q0, q0, q2                \n\t"
-              "vadd.f32 q1, q1, q3                \n\t"
-              "vpadd.f32 d4, d0, d1               \n\t"
-              "vpadd.f32 d5, d2, d3               \n\t"
-              "vld1.32 {q4}, [%[vqua]]!           \n\t"
-              "vmul.f32 q2, q2, q4                \n\t"
-              "subs %[out_w_num], #1              \n\t"
-              "vst1.32 {q2}, [%[out_ptr]]!        \n\t"
-              "bne avg_loop                       \n\t"
-              : [in_ptr1] "+r"(input_data),
-                [in_ptr2] "+r"(input_data_chanel_row_next),
-                [out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num)
-              : [vqua] "r"(vqua)
-              : "memory", "q0", "q1", "q2", "q3", "q4");
+      for (int ph = 0; ph < input_height; ph += 2) {
+        const float *in_ptr1 = input_data + i * input_batch_stride +
+                               c * input_channel_stride + ph * input_width;
+        const float *in_ptr2 = in_ptr1 + input_width;
+        if (ph + 1 >= input_height) {
+          in_ptr2 = static_cast<float *>(
+              paddle_mobile::memory::Alloc(sizeof(float) * input_width));
+          memset(static_cast<void *>(const_cast<float *>(in_ptr2)), 0,
+                 sizeof(float) * input_width);
+        }
+        float *out_ptr = output_data + i * output_batch_stride +
+                         c * output_channel_stride + ph / 2 * output_width;
+#if __ARM_NEON
+#if __aarch64__
+#else
+        asm volatile(
+            "subs %[w1], %[w1], #1              \n\t"
+            "blt end_w1_%=                      \n\t"
+            "loop_w1_%=:                        \n\t"
+
+            "pld [%[in_ptr1], #64]              \n\t"
+            "pld [%[in_ptr2], #64]              \n\t"
+
+            "vmov.f32 d0[0], %[quarter]         \n\t"
+            "vld1.f32 {q1, q2}, [%[in_ptr1]]!   \n\t"
+            "vld1.f32 {q3, q4}, [%[in_ptr2]]!   \n\t"
+            "vld1.f32 {q7, q8}, [%[in_ptr1]]!   \n\t"
+            "vld1.f32 {q9, q10}, [%[in_ptr2]]!  \n\t"
+
+            "vadd.f32 q1, q1, q3                \n\t"
+            "vadd.f32 q2, q2, q4                \n\t"

-        for (; remain > 0; remain--) {
-          float max_row1 = std::max(input_data[0], input_data[1]);
-          float max_row2 = std::max(input_data_chanel_row_next[0],
-                                    input_data_chanel_row_next[1]);
-          *output_data = std::max(max_row1, max_row2);
-          input_data += 2;
-          input_data_chanel_row_next += 2;
-          output_data++;
+            "vadd.f32 q7, q7, q9                \n\t"
+            "vadd.f32 q8, q8, q10               \n\t"
+
+            "vpadd.f32 d10, d2, d3              \n\t"
+            "vpadd.f32 d11, d4, d5               \n\t"
+
+            "vpadd.f32 d12, d14, d15            \n\t"
+            "vpadd.f32 d13, d16, d17            \n\t"
+
+            "vmul.f32 q5, q5, d0[0]             \n\t"
+            "vmul.f32 q6, q6, d0[0]             \n\t"
+
+            "vst1.32 {q5, q6}, [%[out_ptr]]!    \n\t"
+
+            "subs %[w1], %[w1], #1              \n\t"
+            "bge loop_w1_%=                     \n\t"
+            "end_w1_%=:                         \n\t"
+
+            "subs %[w2], %[w2], #1              \n\t"
+            "blt end_w2_%=                      \n\t"
+            "loop_w2_%=:                        \n\t"
+
+            "vld1.f32 {q1}, [%[in_ptr1]]!       \n\t"
+            "vld1.f32 {q2}, [%[in_ptr2]]!       \n\t"
+            "vadd.f32 q1, q1, q2                \n\t"
+            "vpadd.f32 d4, d2, d3               \n\t"
+            "vmul.f32 d4, d4, d0[0]             \n\t"
+            "vst1.32 {d4}, [%[out_ptr]]!        \n\t"
+
+            "subs %[w2], %[w2], #1              \n\t"
+            "bge loop_w2_%=                     \n\t"
+            "end_w2_%=:                         \n\t"
+            :
+            : [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
+              [in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr),
+              [quarter] "r"(quarter)
+            : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+              "q9", "q10");
+#endif
+#endif
+
+        if (_w2 != 0) {
+          in_ptr1 += 16 * w1 + 4 * w2;
+          in_ptr2 += 16 * w1 + 4 * w2;
+          out_ptr += 8 * w1 + 2 * w2;
+          if (_w2 == 1) {
+            *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
+          } else if (_w2 == 2) {
+            float temp = 0;
+            temp += *in_ptr1++;
+            temp += *in_ptr2++;
+            temp += *in_ptr1;
+            temp += *in_ptr2;
+            *out_ptr = 0.25 * temp;  // full 2x2 window: divide by four
+          } else if (_w2 == 3) {
+            float temp = 0;
+            temp += *in_ptr1++;
+            temp += *in_ptr2++;
+            temp += *in_ptr1++;
+            temp += *in_ptr2++;
+            *out_ptr++ = 0.25 * temp;  // full 2x2 window: divide by four
+            *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);  // trailing 1x2 window
+          }
+        }
+      }
-      input_data += input_channel_stride;
-      output_data += output_channel_stride;
    }
-    input_data += input_batch_stride;
-    output_data += output_batch_stride;
  }
-#endif
}
//}
diff --git a/src/operators/math/pool_2x2.h b/src/operators/math/pool_2x2.h
index 3fb0d24ba2ce854e8e63c066222e355e2c84dabb..bd5e48482607cc868408b6371f47e0cb55caf499 100644
--- a/src/operators/math/pool_2x2.h
+++ b/src/operators/math/pool_2x2.h
@@ -17,7 +17,7 @@ limitations under the License. */
#pragma once
#include "framework/tensor.h"
-#if __ARM_NEON
+#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
namespace paddle_mobile {
@@ -26,11 +26,11 @@ namespace math {
using framework::Tensor;
using std::vector;

-void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
-                Tensor *output);
+void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
+                    const Tensor *input, Tensor *output);

-void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
-                Tensor *out);
+void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
+                    const Tensor *in_x, Tensor *out);
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp
index 0259565377386a1415d27b0794580a6a223a88d4..f8b52c59f5689461ef9b4171b9e33c0d49529eed 100644
--- a/src/operators/math/pool_3x3.cpp
+++ b/src/operators/math/pool_3x3.cpp
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License.
*/ #ifdef POOL_OP -#define __ARM_NEON true -#include "pool_3x3.h" +#ifdef _OPENMP +#include +#endif #include "framework/tensor.h" +#include "operators/math/pool_3x3.h" #if __ARM_NEON #include #endif // __ARM_NEON @@ -27,6 +29,676 @@ using framework::Tensor; using std::max; using std::min; using std::vector; +void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { +#if __ARM_NEON + const int batch_size = static_cast(input->dims()[0]); + const int input_channel = static_cast(input->dims()[1]); + + const int input_height = static_cast(input->dims()[2]); + const int input_width = static_cast(input->dims()[3]); + const int output_height = static_cast(output->dims()[2]); + const int output_width = static_cast(output->dims()[3]); + + const int hxw = input_height * input_width; + + const int l = input_height; + + const float coef = 1.0 / 9.0; + const float coef1 = 1.0 / 6.0; + const float coef2 = 1.0 / 4.0; + + float32x4_t v_coef = vdupq_n_f32(coef); + float32x4_t v_coef1 = vdupq_n_f32(coef1); + + for (int b = 0; b < batch_size; b++) { +#pragma omp parallel for + for (int c = 0; c < input_channel; c++) { + const float *input_data = input->data() + c * hxw; + float *output_data = output->data() + c * hxw; + + for (int i = 1; i < output_height - 1; i++) { + float *output_ptr; + float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3, tmp4, + tmp5, out0; + for (int m = 1; m < output_width - 4; m += 4) { + output_ptr = output_data + i * output_width + m; + in0 = vld1q_f32(input_data + (i - 1) * input_width + m - 1); + in1 = vld1q_f32(input_data + (i - 1) * input_width + m + 3); + in2 = vld1q_f32(input_data + i * input_width + m - 1); + in3 = vld1q_f32(input_data + i * input_width + m + 3); + in4 = vld1q_f32(input_data + (i + 1) * input_width + m - 1); + in5 = vld1q_f32(input_data + (i + 1) * input_width + m + 3); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + tmp4 = vextq_f32(in4, in5, 1); + tmp5 = vextq_f32(in4, in5, 2); + + out0 = in0; + out0 = vaddq_f32(out0, tmp0); + out0 = vaddq_f32(out0, tmp1); + out0 = vaddq_f32(out0, in2); + out0 = vaddq_f32(out0, tmp2); + out0 = vaddq_f32(out0, tmp3); + out0 = vaddq_f32(out0, in4); + out0 = vaddq_f32(out0, tmp4); + out0 = vaddq_f32(out0, tmp5); + + vst1q_f32(output_ptr, vmulq_f32(out0, v_coef)); + } + int m; + for (m = 1; (m + 3) < output_width - 1; m = m + 4) { + } + + for (int j = m; j < output_width - 1; j++) { + output_data[i * output_width + j] = + input_data[(i - 1) * input_width + j - 1] + + input_data[(i - 1) * input_width + j] + + input_data[(i - 1) * input_width + j + 1] + + input_data[(i)*input_width + j - 1] + + input_data[(i)*input_width + j] + + input_data[(i)*input_width + j + 1] + + input_data[(i + 1) * input_width + j - 1] + + input_data[(i + 1) * input_width + j] + + input_data[(i + 1) * input_width + j + 1]; + output_data[i * output_width + j] = + output_data[i * output_width + j] * coef; + } + } + + output_data[0] = + input_data[0] + input_data[1] + input_data[l] + input_data[l + 1]; + output_data[l - 1] = input_data[l - 2] + input_data[l - 1] + + input_data[2 * l - 2] + input_data[2 * l - 1]; + output_data[(l - 1) * l] = + input_data[(l - 2) * l] + input_data[(l - 2) * l + 1] + + input_data[(l - 1) * l] + input_data[(l - 1) * l + 1]; + output_data[l * l - 1] = input_data[(l - 2) * (l + 1)] + + input_data[(l - 2) * (l + 1) + 1] + + input_data[l * l - 2] + input_data[l * l - 1]; + output_data[0] = output_data[0] * coef2; + output_data[l - 1] 
= output_data[l - 1] * coef2; + output_data[(l - 1) * l] = output_data[(l - 1) * l] * coef2; + output_data[l * l - 1] = output_data[l * l - 1] * coef2; + + for (int i = 1; i < l - 1; ++i) { + output_data[i * l] = input_data[i * l - l] + input_data[i * l - l + 1] + + input_data[i * l] + input_data[i * l + 1] + + input_data[i * l + l] + input_data[i * l + l + 1]; + + output_data[i * l + l - 1] = + input_data[i * l + l - 1 - l - 1] + input_data[i * l + l - 1 - l] + + input_data[i * l + l - 1 - 1] + input_data[i * l + l - 1] + + input_data[i * l + l - 1 + l - 1] + input_data[i * l + l - 1 + l]; + output_data[i * l] = output_data[i * l] * coef1; + output_data[i * l + l - 1] = output_data[i * l + l - 1] * coef1; + } + + int m; + for (m = 1; m < output_width - 4; m += 4) { + float *output_ptr = output_data + m; + float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; + in0 = vld1q_f32(input_data + m - 1); + in1 = vld1q_f32(input_data + m + 3); + in2 = vld1q_f32(input_data + input_width + m - 1); + in3 = vld1q_f32(input_data + input_width + m + 3); + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + out0 = in0; + out0 = vaddq_f32(out0, tmp0); + out0 = vaddq_f32(out0, tmp1); + out0 = vaddq_f32(out0, in2); + out0 = vaddq_f32(out0, tmp2); + out0 = vaddq_f32(out0, tmp3); + + vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1)); + } + + for (m = 1; (m + 3) < output_width - 1; m += 4) { + } + for (int j = m; j < output_width - 1; j++) { + output_data[j] = input_data[j - 1] + input_data[j] + input_data[j + 1] + + input_data[input_width + j - 1] + + input_data[input_width + j] + + input_data[input_width + j + 1]; + output_data[j] = output_data[j] * coef1; + } + + for (m = 1; m < output_width - 4; m += 4) { + float *output_ptr = + output_data + (output_height - 1) * output_width + m; + + float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0; + in0 = vld1q_f32(input_data + (output_height - 2) * input_width + m - 1); + in1 = vld1q_f32(input_data + (output_height - 2) * input_width + m + 3); + in2 = vld1q_f32(input_data + (output_height - 1) * input_width + m - 1); + in3 = vld1q_f32(input_data + (output_height - 1) * input_width + m + 3); + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + out0 = in0; + out0 = vaddq_f32(out0, tmp0); + out0 = vaddq_f32(out0, tmp1); + out0 = vaddq_f32(out0, in2); + out0 = vaddq_f32(out0, tmp2); + out0 = vaddq_f32(out0, tmp3); + + vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1)); + } + for (m = 1; (m + 3) < output_width - 1; m = m + 4) { + } + for (int j = m; j < output_width - 1; j++) { + output_data[(output_height - 1) * input_width + j] = + input_data[(output_height - 2) * input_width + j - 1] + + input_data[(output_height - 2) * input_width + j] + + input_data[(output_height - 2) * input_width + j + 1] + + input_data[(output_height - 1) * input_width + j - 1] + + input_data[(output_height - 1) * input_width + j] + + input_data[(output_height - 1) * input_width + j + 1]; + output_data[(output_height - 1) * output_width + j] = + output_data[(output_height - 1) * output_width + j] * coef1; + } + } + } + +// const int batch_size = input->dims()[0]; +// +// const int h_in = input->dims()[2]; +// +// const int w_in = input->dims()[3]; +// +// const int output_channels = output->dims()[1]; +// +// const int h_out = output->dims()[2]; +// const int w_out = output->dims()[3]; +// const int outputdata_channel_stride 
= h_out * w_out; +// const int inputdata_channel_stride = h_in * w_in; +// const int input_batch_stride = output_channels * inputdata_channel_stride; +// const int output_batch_stride = output_channels * +// outputdata_channel_stride; float *out_data = output->data(); const +// float *input_data = input->data(); +// +// const float coef = 1.0 / 9.0; +// for (int k = 0; k < batch_size; ++k) { +//#pragma omp parallel for +// for (int c = 0; c < output_channels; ++c) { +// const float *input_seg = input_data + c * inputdata_channel_stride; +// float *output_seg = out_data + c * outputdata_channel_stride; +// // four corner point +// output_seg[0] = (input_seg[0] + input_seg[1] + input_seg[w_in] + +// input_seg[w_in + 1]) * +// coef; +// output_seg[w_out - 1] = +// (input_seg[w_in - 2] + input_seg[w_in - 1] + input_seg[w_in * 2 - +// 2] + +// input_seg[2 * w_in - 1]) * +// coef; +// output_seg[(h_out - 1) * w_out] = +// (input_seg[(h_in - 2) * w_in] + input_seg[(h_in - 2) * w_in + 1] + +// input_seg[(h_in - 1) * w_in] + input_seg[(h_in - 1) * w_in + 1]) +// * +// coef; +// output_seg[h_out * w_out - 1] = +// (input_seg[h_in * w_in - 1] + input_seg[h_in * w_in - 2] + +// input_seg[(h_in - 1) * w_in - 1] + +// input_seg[(h_in - 1) * w_in - 2]) * +// coef; +// // left side & right side +// for (int i = 1; i < h_in - 1; ++i) { +// output_seg[i * w_out] = +// (input_seg[i * w_in - w_in] + input_seg[i * w_in - w_in + 1] + +// input_seg[i * w_in] + input_seg[i * w_in + 1] + +// input_seg[i * w_in + w_in] + input_seg[i * w_in + w_in + 1]) * +// coef; +// output_seg[i * w_out + w_out - 1] = +// (input_seg[i * w_in - w_in + w_in - 2] + +// input_seg[i * w_in - w_in + 1 + w_in - 2] + +// input_seg[i * w_in + w_in - 2] + +// input_seg[i * w_in + 1 + w_in - 2] + +// input_seg[i * w_in + w_in + w_in - 2] + +// input_seg[i * w_in + w_in + 1 + w_in - 2]) * +// coef; +// } +// // top 1 row & bottom 1 row +// const float *input_tmp = input_seg; +// +// float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, +// tmp3, tmp4, tmp5, sum, out0; +// float32x4_t v_coef = vdupq_n_f32(coef); +// in0 = vld1q_f32(input_tmp); +// in2 = vld1q_f32(input_tmp + w_in); +// const float *input_tmp_end = input_tmp + (h_in - 2) * w_in; +// in4 = vld1q_f32(input_tmp_end); +// in6 = vld1q_f32(input_tmp_end + w_in); +// int c_mid = w_out - 2; +// auto output_ptr = output_seg + 1; +// for (; c_mid > 3; c_mid -= 4) { +// in1 = vld1q_f32(input_tmp + 4); +// in3 = vld1q_f32(input_tmp + w_in + 4); +// +// tmp0 = vextq_f32(in0, in1, 1); +// tmp1 = vextq_f32(in0, in1, 2); +// +// tmp2 = vextq_f32(in2, in3, 1); +// tmp3 = vextq_f32(in2, in3, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// +// vst1q_f32(output_ptr, vmulq_f32(sum, v_coef)); +// +// in5 = vld1q_f32(input_tmp_end + 4); +// in7 = vld1q_f32(input_tmp_end + w_in + 4); +// +// tmp0 = vextq_f32(in4, in5, 1); +// tmp1 = vextq_f32(in4, in5, 2); +// tmp2 = vextq_f32(in6, in7, 1); +// tmp3 = vextq_f32(in6, in7, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// +// vst1q_f32(output_ptr + (h_out - 1) * w_out, vmulq_f32(sum, v_coef)); +// +// // can optimize to each 8 stride. 
+// input_tmp += 4; +// input_tmp_end += 4; +// output_ptr += 4; +// in0 = in1; +// in2 = in3; +// in4 = in5; +// in6 = in7; +// } +// // top right remain +// float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]); +// float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]); +// +// tmp0 = vextq_f32(in0, pad0, 1); +// tmp1 = vextq_f32(in0, pad0, 2); +// tmp2 = vextq_f32(in2, pad1, 2); +// tmp3 = vextq_f32(in2, pad1, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// out0 = vmulq_f32(sum, v_coef); +// +// for (int i = 0; i < c_mid; ++i) { +// if (i == 0) { +// vst1q_lane_f32(output_ptr + i, out0, 0); +// } +// if (i == 1) { +// vst1q_lane_f32(output_ptr + i, out0, 1); +// } +// if (i == 2) { +// vst1q_lane_f32(output_ptr + i, out0, 2); +// } +// } +// +// // bottom_right remain +// float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]); +// float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]); +// +// tmp0 = vextq_f32(in4, pad2, 1); +// tmp1 = vextq_f32(in4, pad2, 2); +// tmp2 = vextq_f32(in6, pad3, 2); +// tmp3 = vextq_f32(in6, pad3, 2); +// +// sum = vaddq_f32(in4, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in6); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// out0 = vmulq_f32(sum, v_coef); +// +// for (int i = 0; i < c_mid; ++i) { +// if (i == 0) { +// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 0); +// } +// if (i == 1) { +// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 1); +// } +// if (i == 2) { +// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 2); +// } +// } +// // mid +// for (int j = 0; j < h_out - 2; ++j) { +// output_ptr = output_seg + w_out * (j + 1) + 1; +// input_tmp = input_seg + j * w_in; +// +// in0 = vld1q_f32(input_tmp); +// in2 = vld1q_f32(input_tmp + w_in); +// in4 = vld1q_f32(input_tmp + 2 * w_in); +// c_mid = w_out - 2; +// for (; c_mid > 3; c_mid -= 4) { +// in1 = vld1q_f32(input_tmp + 4); +// in3 = vld1q_f32(input_tmp + w_in + 4); +// in5 = vld1q_f32(input_tmp + 2 * w_in + 4); +// +// tmp0 = vextq_f32(in0, in1, 1); +// tmp1 = vextq_f32(in0, in1, 2); +// tmp2 = vextq_f32(in2, in3, 1); +// tmp3 = vextq_f32(in2, in3, 2); +// tmp4 = vextq_f32(in4, in5, 1); +// tmp5 = vextq_f32(in4, in5, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// sum = vaddq_f32(sum, in4); +// sum = vaddq_f32(sum, tmp4); +// sum = vaddq_f32(sum, tmp5); +// +// out0 = vmulq_f32(sum, v_coef); +// vst1q_f32(output_ptr, out0); +// output_ptr += 4; +// input_tmp += 4; +// in0 = in1; +// in2 = in3; +// in4 = in5; +// } +// // mid remain +// float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]); +// float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); +// float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); +// +// tmp0 = vextq_f32(in0, pad0, 1); +// tmp1 = vextq_f32(in0, pad0, 2); +// tmp2 = vextq_f32(in2, pad1, 1); +// tmp3 = vextq_f32(in2, pad1, 2); +// tmp4 = vextq_f32(in4, pad2, 1); +// tmp5 = vextq_f32(in4, pad2, 2); +// +// sum = vaddq_f32(in0, tmp0); +// sum = vaddq_f32(sum, tmp1); +// sum = vaddq_f32(sum, in2); +// sum = vaddq_f32(sum, tmp2); +// sum = vaddq_f32(sum, tmp3); +// sum = vaddq_f32(sum, in4); +// sum = vaddq_f32(sum, tmp4); +// sum = vaddq_f32(sum, tmp5); +// out0 = vmulq_f32(sum, v_coef); +// +// for (int i = 0; i < c_mid; 
++i) { +// if (i == 0) { +// vst1q_lane_f32(output_ptr + i, out0, 0); +// } +// if (i == 1) { +// vst1q_lane_f32(output_ptr + i, out0, 1); +// } +// if (i == 2) { +// vst1q_lane_f32(output_ptr + i, out0, 2); +// } +// } +// } +// // input_data += inputdata_channel_stride; +// // out_data += outputdata_channel_stride; +// } +// input_data += input_batch_stride; +// out_data += output_batch_stride; +// } +#endif +} + +void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) { +#if __ARM_NEON + const int batch_size = input->dims()[0]; + + const int h_in = input->dims()[2]; + + const int w_in = input->dims()[3]; + + const int output_channels = output->dims()[1]; + + const int h_out = output->dims()[2]; + const int w_out = output->dims()[3]; + const int outputdata_channel_stride = h_out * w_out; + const int inputdata_channel_stride = h_in * w_in; + const int input_batch_stride = output_channels * inputdata_channel_stride; + const int output_batch_stride = output_channels * outputdata_channel_stride; + float *out_data = output->data(); + const float *input_data = input->data(); + for (int k = 0; k < batch_size; ++k) { +#pragma omp parallel for + for (int c = 0; c < output_channels; ++c) { + const float *input_seg = input_data + c * inputdata_channel_stride; + float *output_seg = out_data + c * outputdata_channel_stride; + // four corner point + output_seg[0] = std::max(std::max(input_seg[0], input_seg[1]), + std::max(input_seg[w_in], input_seg[w_in + 1])); + output_seg[w_out - 1] = + std::max(std::max(input_seg[w_in - 2], input_seg[w_in - 1]), + std::max(input_seg[w_in * 2 - 2], input_seg[2 * w_in - 1])); + output_seg[(h_out - 1) * w_out] = + std::max(std::max(input_seg[(h_in - 2) * w_in], + input_seg[(h_in - 2) * w_in + 1]), + std::max(input_seg[(h_in - 1) * w_in], + input_seg[(h_in - 1) * w_in + 1])); + output_seg[h_out * w_out - 1] = std::max( + std::max(input_seg[(h_in - 1) * w_in - 1], + input_seg[(h_in - 1) * w_in - 2]), + std::max(input_seg[h_in * w_in - 1], input_seg[h_in * w_in - 2])); + // left side & right side + for (int i = 1; i < h_in - 1; ++i) { + float max1 = std::max(input_seg[i * w_in - w_in], + input_seg[i * w_in - w_in + 1]); + float max2 = std::max(input_seg[i * w_in], input_seg[i * w_in + 1]); + float max3 = std::max(input_seg[i * w_in + w_in], + input_seg[i * w_in + w_in + 1]); + output_seg[i * w_out] = std::max(std::max(max1, max2), max3); + + max1 = std::max(input_seg[i * w_in - w_in + w_in - 2], + input_seg[i * w_in - w_in + 1 + w_in - 2]); + max2 = std::max(input_seg[i * w_in + w_in - 2], + input_seg[i * w_in + 1 + w_in - 2]); + max3 = std::max(input_seg[i * w_in + w_in + w_in - 2], + input_seg[i * w_in + w_in + 1 + w_in - 2]); + output_seg[i * w_out + w_out - 1] = + std::max(std::max(max1, max2), max3); + } + // top 1 row & bottom 1 row + const float *input_tmp = input_seg; + + float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, + tmp3, tmp4, tmp5, max; + in0 = vld1q_f32(input_tmp); + in2 = vld1q_f32(input_tmp + w_in); + const float *input_tmp_end = input_tmp + (h_in - 2) * w_in; + in4 = vld1q_f32(input_tmp_end); + in6 = vld1q_f32(input_tmp_end + w_in); + int c_mid = w_out - 2; + auto output_ptr = output_seg + 1; + for (; c_mid > 3; c_mid -= 4) { + in1 = vld1q_f32(input_tmp + 4); + in3 = vld1q_f32(input_tmp + w_in + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + + max = vmaxq_f32(in0, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in2); + max = 
vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + + vst1q_f32(output_ptr, max); + + in5 = vld1q_f32(input_tmp_end + 4); + in7 = vld1q_f32(input_tmp_end + w_in + 4); + + tmp0 = vextq_f32(in4, in5, 1); + tmp1 = vextq_f32(in4, in5, 2); + tmp2 = vextq_f32(in6, in7, 1); + tmp3 = vextq_f32(in6, in7, 2); + + max = vmaxq_f32(in4, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in6); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + + vst1q_f32(output_ptr + (h_out - 1) * w_out, max); + + input_tmp += 4; + input_tmp_end += 4; + output_ptr += 4; + in0 = in1; + in2 = in3; + in4 = in5; + in6 = in7; + } + // top right remain + float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]); + float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + + max = vmaxq_f32(in0, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in2); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, max, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, max, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, max, 2); + } + } + + // bottom_right remain + float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]); + float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]); + + tmp0 = vextq_f32(in4, pad2, 1); + tmp1 = vextq_f32(in4, pad2, 2); + tmp2 = vextq_f32(in6, pad3, 1); + tmp3 = vextq_f32(in6, pad3, 2); + + max = vmaxq_f32(in4, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in6); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 2); + } + } + // mid + for (int j = 0; j < h_out - 2; ++j) { + output_ptr = output_seg + (j + 1) * w_out + 1; + input_tmp = input_seg + j * w_in; + + in0 = vld1q_f32(input_tmp); + in2 = vld1q_f32(input_tmp + w_in); + in4 = vld1q_f32(input_tmp + 2 * w_in); + c_mid = w_out - 2; + for (; c_mid > 3; c_mid -= 4) { + in1 = vld1q_f32(input_tmp + 4); + in3 = vld1q_f32(input_tmp + w_in + 4); + in5 = vld1q_f32(input_tmp + 2 * w_in + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + tmp4 = vextq_f32(in4, in5, 1); + tmp5 = vextq_f32(in4, in5, 2); + + max = vmaxq_f32(in0, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in2); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, tmp3); + max = vmaxq_f32(max, in4); + max = vmaxq_f32(max, tmp4); + max = vmaxq_f32(max, tmp5); + + vst1q_f32(output_ptr, max); + output_ptr += 4; + input_tmp += 4; + in0 = in1; + in2 = in3; + in4 = in5; + } + // mid remain + float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]); + float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]); + float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 3) * w_in - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + tmp4 = vextq_f32(in4, pad2, 1); + tmp5 = vextq_f32(in4, pad2, 2); + + max = vmaxq_f32(in0, tmp0); + max = vmaxq_f32(max, tmp1); + max = vmaxq_f32(max, in2); + max = vmaxq_f32(max, tmp2); + max = vmaxq_f32(max, 
tmp3); + max = vmaxq_f32(max, in4); + max = vmaxq_f32(max, tmp4); + max = vmaxq_f32(max, tmp5); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, max, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, max, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, max, 2); + } + } + } + // input_data += inputdata_channel_stride; + // out_data += outputdata_channel_stride; + } + input_data += input_batch_stride; + out_data += output_batch_stride; + } +#else + +#endif +} void Pool3x3Max(vector strides, vector paddings, const Tensor *input, Tensor *output) { @@ -41,11 +713,11 @@ void Pool3x3Max(vector strides, vector paddings, const Tensor *input, const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; - const int _kernel_size = 3; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + // const int _kernel_size = 3; + const int stride = strides[0]; + // const int stride_width = strides[1]; + const int padding = paddings[0]; + // const int padding_width = paddings[1]; const float negative_max = -INT_MAX; const int input_channel_stride = input_height * input_width; const int output_channel_stride = output_height * output_width; @@ -55,38 +727,50 @@ void Pool3x3Max(vector strides, vector paddings, const Tensor *input, const int input_batch_stride = output_channels * input_channel_stride; const int output_batch_stride = output_channels * output_channel_stride; - const float *pos1, *pos2, *pos3, *output_ptr; + const float *pos1, *output_ptr; int hstart, wstart, hend, wend; for (int i = 0; i < batch_size; ++i) { +#pragma omp parallel for for (int c = 0; c < output_channels; ++c) { + const float *input_seg = input_data + c * input_channel_stride; + float *output_seg = output_data + c * output_channel_stride; for (int ph = 0; ph < output_height; ph++) { + int hstart = ph * stride - padding; + int hend = min(hstart + 3, input_height); + hstart = max(hstart, 0); for (int pw = 0; pw < output_width; pw++) { - hstart = ph * stride_height - padding_height; - wstart = pw * stride_width - padding_width; - hend = min(hstart + _kernel_size, input_height + padding_height); - wend = min(wstart + _kernel_size, input_width + padding_width); - hstart = max(hstart, 0); + int wstart = pw * stride - padding; + int wend = min(wstart + 3, input_width); wstart = max(wstart, 0); - hend = min(hend, input_height); - wend = min(wend, input_width); - pos1 = input_data + hstart * input_width + wstart; - pos2 = input_data + (hstart + 1) * input_width + wstart; - pos3 = input_data + (hstart + 2) * input_width + wstart; - output_ptr = output_data + ph * output_width + pw; + const float *pos1 = input_seg + hstart * input_width + wstart; + const float *pos2 = input_seg + (hstart + 1) * input_width + wstart; + const float *pos3 = input_seg + (hstart + 2) * input_width + wstart; + output_ptr = output_seg + ph * output_width + pw; if (hend - hstart != 3 || wend - wstart != 3) { float max_value = -INT_MAX; for (int h = hstart; h < hend; h++) { for (int w = wstart; w < wend; w++) { - float value = input_data[h * input_width + w]; + float value = input_seg[h * input_width + w]; if (value > max_value) { max_value = value; } } } - output_data[ph * output_width + pw] = max_value; + output_seg[ph * output_width + pw] = max_value; } else { -#if defined(ARMV7) +#if __aarch64__ + const float32x4_t data1 = vld1q_f32(pos1); + const float32x4_t data2 = vld1q_f32(pos1 
+ input_width); + const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width); + const float32x4_t max_data = + vmaxq_f32(vmaxq_f32(data1, data2), data3); + float32x2_t res = + vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)), + vget_low_f32(max_data)); + res = vpmax_f32(res, res); + output_seg[ph * output_width + pw] = vget_lane_f32(res, 0); +#else asm volatile( "vld1.32 {q1}, [%[pos1]] \n\t" "vld1.32 {q2}, [%[pos2]] \n\t" @@ -98,27 +782,14 @@ void Pool3x3Max(vector strides, vector paddings, const Tensor *input, "vpmax.f32 d7, d6, d6 \n\t" "vst1.32 {d7[0]},[%[output_ptr]] \n\t" : - : [input_data] "r"(input_data), [pos1] "r"(pos1), + : [input_seg] "r"(input_seg), [pos1] "r"(pos1), [pos2] "r"(pos2), [pos3] "r"(pos3), [output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max) : "memory", "q1", "q2", "q3", "q4"); -#else - const float32x4_t data1 = vld1q_f32(pos1); - const float32x4_t data2 = vld1q_f32(pos2); - const float32x4_t data3 = vld1q_f32(pos3); - const float32x4_t max_data = - vmaxq_f32(vmaxq_f32(data1, data3), data2); - float32x2_t res = - vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)), - vget_low_f32(max_data)); - res = vpmax_f32(res, res); - output_data[ph * output_width + pw] = vget_lane_f32(res, 0); #endif } } } - input_data += input_channel_stride; - output_data += output_channel_stride; } input_data += input_batch_stride; output_data += output_batch_stride; @@ -139,11 +810,8 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; - const int _kernel_size = 3; - const int stride_height = strides[0]; - const int stride_width = strides[1]; - const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int stride = strides[0]; + const int padding = paddings[0]; const int input_channel_stride = input_height * input_width; const int output_channel_stride = output_height * output_width; @@ -157,33 +825,38 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, const int input_batch_stride = output_channels * input_channel_stride; const int output_batch_stride = output_channels * output_channel_stride; for (int i = 0; i < batch_size; ++i) { +#pragma omp parallel for for (int c = 0; c < output_channels; ++c) { + const float *input_seg = input_data + c * input_channel_stride; + float *output_seg = output_data + c * output_channel_stride; for (int ph = 0; ph < output_height; ph++) { for (int pw = 0; pw < output_width; pw++) { - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int hend = min(hstart + _kernel_size, input_height + padding_height); - int wend = min(wstart + _kernel_size, input_width + padding_width); + int hstart = ph * stride - padding; + int wstart = pw * stride - padding; + int hend = min(hstart + 3, input_height + padding); + int wend = min(wstart + 3, input_width + padding); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, input_height); wend = min(wend, input_width); - const float *pos1 = input_data + hstart * input_width + wstart; - const float *pos2 = input_data + (hstart + 1) * input_width + wstart; - const float *pos3 = input_data + (hstart + 2) * input_width + wstart; - const float *output_ptr = output_data + ph * output_width + pw; + + const float *pos1 = input_seg + hstart * input_width + wstart; + const float *pos2 = input_seg + (hstart + 1) * input_width + wstart; + const float *pos3 = input_seg + (hstart + 2) 
* input_width + wstart; + float *output_ptr = output_seg + ph * output_width + pw; if (hend - hstart != 3 || wend - wstart != 3) { float sum = 0; for (int h = hstart; h < hend; h++) { for (int w = wstart; w < wend; w++) { - sum += input_data[h * input_width + w]; + sum += input_seg[h * input_width + w]; } } - output_data[ph * output_width + pw] = sum / 9.0; + output_seg[ph * output_width + pw] = + sum / ((hend - hstart) * (wend - wstart) * 1.0); } else { -#if defined(ARMV7) - +#if __aarch64__ +#else asm volatile( "vld1.32 {q1}, [%[pos1]] \n\t" "vld1.32 {q2}, [%[pos2]] \n\t" @@ -197,12 +870,12 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, "vmul.f32 d6,d7 \n\t" "vst1.32 {d6[0]},[%[output_ptr]] \n\t" : - : [input_data] "r"(input_data), [pos1] "r"(pos1), + : [input_seg] "r"(input_seg), [pos1] "r"(pos1), [pos2] "r"(pos2), [pos3] "r"(pos3), [output_ptr] "r"(output_ptr), [zero] "r"(zero), [nine_ptr] "r"(nine_ptr) : "memory", "r6", "q1", "q2", "q3", "q4"); -#else +#endif const float32x4_t data1 = vld1q_f32(pos1); const float32x4_t data2 = vld1q_f32(pos2); const float32x4_t data3 = vld1q_f32(pos3); @@ -212,17 +885,15 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, vpadd_f32(vget_high_f32(vsetq_lane_f32(0, sum_data, 3)), vget_low_f32(sum_data)); res = vpadd_f32(res, res); - output_data[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0; -#endif + output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0; } } } - input_data += input_channel_stride; - output_data += output_channel_stride; } input_data += input_batch_stride; output_data += output_batch_stride; } +#else #endif } } // namespace math diff --git a/src/operators/math/pool_3x3.h b/src/operators/math/pool_3x3.h index 22a398084390701aefc8815c9aa93b82b4c4ec7b..ac1eb16a4c0e077c625267545767b8f29144b8f1 100644 --- a/src/operators/math/pool_3x3.h +++ b/src/operators/math/pool_3x3.h @@ -15,7 +15,11 @@ limitations under the License. */ #ifdef POOL_OP #pragma once - +#ifdef _OPENMP +#include +#endif +#include +#include #include "framework/tensor.h" #if __ARM_NEON #include @@ -26,7 +30,8 @@ namespace operators { namespace math { using framework::Tensor; using std::vector; - +void Pool3x3Avgs1p1(const Tensor *input, Tensor *output); +void Pool3x3Maxs1p1(const Tensor *input, Tensor *output); void Pool3x3Max(vector strides, vector paddings, const Tensor *input, Tensor *output); diff --git a/src/operators/math/pooling.cpp b/src/operators/math/pooling.cpp index 4287408394f1a7f407154938f3e83e9fac3543a2..f5bcdf7fdb6b9245eda7d3557b293395bce23b24 100644 --- a/src/operators/math/pooling.cpp +++ b/src/operators/math/pooling.cpp @@ -16,6 +16,9 @@ limitations under the License. 
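The s1p1 ("stride 1, pad 1") fast paths above lean on vextq_f32 to build the horizontally shifted views of each row, so one vmaxq_f32 (or vaddq_f32) chain yields four adjacent 3x3 results per iteration, with the corners and edges handled as explicit scalar cases. A minimal scalar sketch of the contract those kernels must satisfy, useful for cross-checking in a unit test; the helper name is hypothetical and not part of the patch:

#include <algorithm>
#include <cfloat>

// Reference 3x3 max pooling with stride 1 and pad 1: the output has the
// same height/width as the input, and out-of-range taps are skipped,
// matching the corner/edge special cases written out in Pool3x3Maxs1p1.
void Pool3x3MaxS1P1Ref(const float *in, float *out, int h, int w) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      float m = -FLT_MAX;
      for (int dy = -1; dy <= 1; ++dy) {
        for (int dx = -1; dx <= 1; ++dx) {
          const int sy = y + dy, sx = x + dx;
          if (sy >= 0 && sy < h && sx >= 0 && sx < w) {
            m = std::max(m, in[sy * w + sx]);
          }
        }
      }
      out[y * w + x] = m;
    }
  }
}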
*/ #include "pooling.h" #include "common/types.h" +#ifdef _OPENMP +#include +#endif namespace paddle_mobile { namespace operators { @@ -55,10 +58,9 @@ class PoolFunctor { const T *input_data = input.data(); T *output_data = output->mutable_data(); - for (int i = 0; i < batch_size; i++) { - #pragma omp parallel for for (int c = 0; c < output_channels; ++c) { +#pragma omp parallel for for (int ph = 0; ph < output_height; ++ph) { int hstart = ph * stride_height - padding_height; int hend = std::min(hstart + ksize_height, input_height); diff --git a/src/operators/math/pooling.h b/src/operators/math/pooling.h index bc2ecf41d224c2b0fd518d44fecc3f688d98ee19..3ca868fa4de4b9fefdcd8c18c0d7107cc9f60b4f 100644 --- a/src/operators/math/pooling.h +++ b/src/operators/math/pooling.h @@ -65,7 +65,7 @@ class PoolFunctor { const std::vector &paddings, PoolProcess pool_compute, framework::Tensor *output); }; -} +} // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/sequence2batch.cpp b/src/operators/math/sequence2batch.cpp new file mode 100644 index 0000000000000000000000000000000000000000..097a258dddd513294cd1c1d2f4c9ddb0dd530052 --- /dev/null +++ b/src/operators/math/sequence2batch.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/math/sequence2batch.h" +#include +#include "common/types.h" + +namespace paddle_mobile { +namespace operators { +namespace math { + +template +class CopyMatrixRowsFunctor { + public: + void operator()(const framework::Tensor& src, std::vector index_lod, + framework::Tensor* dst, bool is_src_index) { + size_t* index = index_lod.data(); + auto src_dims = src.dims(); + auto dst_dims = dst->dims(); + PADDLE_MOBILE_ENFORCE((src_dims.size() == 2UL), + "The src must be matrix with rank 2."); + PADDLE_MOBILE_ENFORCE((dst_dims.size() == 2UL), + "The dst must be matrix with rank 2."); + PADDLE_MOBILE_ENFORCE((src_dims[1] == dst_dims[1]), + "The width of src and dst must be same."); + auto height = dst_dims[0]; + auto width = dst_dims[1]; + auto* src_data = src.data(); + auto* dst_data = dst->data(); + for (int i = 0; i < height; ++i) { + if (is_src_index) { + memcpy(dst_data + i * width, src_data + index[i] * width, + width * sizeof(T)); + } else { + memcpy(dst_data + index[i] * width, src_data + i * width, + width * sizeof(T)); + } + } + } +}; + +template class CopyMatrixRowsFunctor; + +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/sequence2batch.h b/src/operators/math/sequence2batch.h new file mode 100644 index 0000000000000000000000000000000000000000..42b369f7dc48718846b7d8e039b876693f9770df --- /dev/null +++ b/src/operators/math/sequence2batch.h @@ -0,0 +1,169 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "framework/lod_tensor.h"
+#include "framework/tensor.h"
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+template <typename DeviceType, typename T>
+class CopyMatrixRowsFunctor {
+ public:
+  // If is_src_index is true,
+  // copy the indexed rows of input src to the output dst.
+  // If is_src_index is false,
+  // copy the input src to the indexed rows of output dst.
+  // The indexed rows are based on the input index.
+  void operator()(const framework::Tensor& src, std::vector<size_t> index_lod,
+                  framework::Tensor* dst, bool is_src_index);
+};
+
+template <typename DeviceType, typename T>
+class LoDTensor2BatchFunctor {
+  // Calculate the length of each sequence and
+  // sort sequence index by the length.
+  // example: sequences = {s0, s1, s2}
+  //          s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
+  //          seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
+  //
+  struct SeqInfo {
+    SeqInfo(int start, int length, int seq_idx)
+        : start(start), length(length), seq_idx(seq_idx) {}
+    int start;
+    int length;
+    int seq_idx;
+  };
+
+ public:
+  void operator()(const framework::LoDTensor& lod_tensor,
+                  framework::LoDTensor* batch, bool is_cal_batch_lod,
+                  bool is_reverse = false) {
+    if (!is_cal_batch_lod) {
+      auto lods = batch->lod();
+      PADDLE_MOBILE_ENFORCE(
+          (lods.size() > 2UL),
+          "The LoD of LoDTensor should include at least 2-level "
+          "sequence information.");
+      PADDLE_MOBILE_ENFORCE(
+          (lods[1].size() == static_cast<size_t>(lod_tensor.dims()[0])),
+          "The LoD information should be consistent with the dims.");
+      CopyMatrixRowsFunctor<DeviceType, T> to_batch;
+      to_batch(lod_tensor, lods[1], batch, true);
+      return;
+    }
+
+    auto lods = lod_tensor.lod();
+    PADDLE_MOBILE_ENFORCE((lods.size() == 1UL),
+                          "Only support one level sequence now.");
+
+    const auto& lod = lods[0];
+
+    std::vector<SeqInfo> seq_info;
+    for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
+      int length = lod[seq_id + 1] - lod[seq_id];
+      seq_info.emplace_back(lod[seq_id], length, seq_id);
+    }
+
+    std::sort(seq_info.begin(), seq_info.end(),
+              [](SeqInfo a, SeqInfo b) { return a.length > b.length; });
+
+    // Calculate the start position of each batch.
+    // example: sequences = {s0, s1, s2}
+    //          s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
+    //          num_batch = 5,
+    //          batchIndex = {b0, b1, b2, b3, b4}
+    //          b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
+    //          batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
+    //             batch_start_positions[1] = len(b0)
+    //             batch_start_positions[2] = len(b0) + len(b1)
+    //             batch_start_positions[3] = len(b0) + len(b1) + len(b2)
+    //             ...
+    //          seq2batch_idx[12] = {4, 0, 9,
+    //                               5, 1, 10,
+    //                               6, 2, 11,
+    //                               7, 3,
+    //                               8}
+    //          seq_order = {1, 0, 2}, the sort order.
+    //              where 1 is the second sequence,
+    //                    0 is the first sequence,
+    //                    2 is the third sequence.
+    // The num_batch represents batch size after rearranging the
+    // input LodTensor. It is also the maximum length of input sequence.
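A standalone sketch of the same reordering over plain (start, length) pairs — the helper name is hypothetical, not part of the patch — which reproduces the walkthrough above and mirrors the loop that follows:

#include <cstddef>
#include <utility>
#include <vector>

// seqs holds (start, length) pairs already sorted by descending length,
// i.e. the seq_info computed above.
std::vector<size_t> SeqToBatchIdx(const std::vector<std::pair<int, int>> &seqs,
                                  bool is_reverse) {
  std::vector<size_t> idx;
  const int num_batch = seqs.empty() ? 0 : seqs.front().second;
  for (int n = 0; n < num_batch; ++n) {  // one pass per time step
    for (const auto &s : seqs) {
      if (n >= s.second) break;  // remaining sequences are all shorter
      idx.push_back(is_reverse ? s.first + s.second - 1 - n : s.first + n);
    }
  }
  return idx;
}
// SeqToBatchIdx({{4, 5}, {0, 4}, {9, 3}}, false) yields
// {4, 0, 9, 5, 1, 10, 6, 2, 11, 7, 3, 8}, matching seq2batch_idx above.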
+
+    framework::LoD batch_lods;
+    batch_lods.emplace_back(std::vector<size_t>{0});
+    batch_lods.emplace_back(std::vector<size_t>{0});
+    batch_lods.emplace_back(std::vector<size_t>{0});
+
+    // batch_lods[0] is the start positions for batch LoDTensor
+    int num_batch = seq_info[0].length;
+    batch_lods[0].resize(static_cast<size_t>(num_batch + 1));
+    // batch_lods[1] is the raw index in the input LoDTensor
+    batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
+    // batch_lods[2] is the sort order for the input LoDTensor.
+    batch_lods[2].resize(seq_info.size());
+
+    size_t* batch_starts = batch_lods[0].data();
+    size_t* seq2batch_idx = batch_lods[1].data();
+    batch_starts[0] = 0;
+    for (int n = 0; n < num_batch; n++) {
+      auto batch_id = static_cast<int>(batch_starts[n]);
+      for (size_t i = 0; i < seq_info.size(); ++i) {
+        int seq_len = seq_info[i].length;
+        int start = seq_info[i].start;
+        if (n < seq_len) {
+          seq2batch_idx[batch_id] =
+              is_reverse ? start + seq_len - 1 - n : start + n;
+          batch_id++;
+        } else {
+          break;
+        }
+      }
+      batch_starts[n + 1] = static_cast<size_t>(batch_id);
+    }
+    size_t* seq_order = batch_lods[2].data();
+    for (size_t i = 0; i < seq_info.size(); ++i) {
+      seq_order[i] = seq_info[i].seq_idx;
+    }
+    batch->set_lod(batch_lods);
+
+    CopyMatrixRowsFunctor<DeviceType, T> to_batch;
+    to_batch(lod_tensor, batch_lods[1], batch, true);
+  }
+};
+
+template <typename DeviceType, typename T>
+class Batch2LoDTensorFunctor {
+ public:
+  void operator()(const framework::LoDTensor& batch,
+                  framework::LoDTensor* lod_tensor) {
+    auto in_lod = batch.lod();
+    PADDLE_MOBILE_ENFORCE(
+        (in_lod.size() > 2UL),
+        "The LoD of LoDTensor should include at least 2-level "
+        "sequence information.");
+    PADDLE_MOBILE_ENFORCE(
+        (in_lod[1].size() == static_cast<size_t>(lod_tensor->dims()[0])),
+        "The LoD information should be consistent with the dims.");
+    CopyMatrixRowsFunctor<DeviceType, T> to_seq;
+    to_seq(batch, in_lod[1], lod_tensor, false);
+  }
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
diff --git a/src/operators/math/softmax.cpp b/src/operators/math/softmax.cpp
index a1eb4f13d82376d86da258101b15e6ae5e8bdc97..dba88c93969014f2ad0d2636b4141c734dbc2ed5 100644
--- a/src/operators/math/softmax.cpp
+++ b/src/operators/math/softmax.cpp
@@ -16,7 +16,7 @@ limitations under the License.
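Taken together, the two functors give the batch/sequence round trip a recurrent kernel needs. A minimal usage sketch; the template arguments, includes, and tensor dimensions are assumptions for illustration, not taken from the patch:

// Reorder a 3-sequence LoDTensor into time-step batches and back.
framework::LoDTensor seq, batch;
seq.set_lod({{0, 4, 9, 12}});  // sequence lengths 4, 5, 3
seq.mutable_data<float>(framework::make_ddim({12, 8}));    // 12 steps x 8 dims
batch.mutable_data<float>(framework::make_ddim({12, 8}));

math::LoDTensor2BatchFunctor<CPU, float> to_batch;
to_batch(seq, &batch, /*is_cal_batch_lod=*/true, /*is_reverse=*/false);
// ... run one recurrent step per batch_lods[0] segment of `batch` ...

math::Batch2LoDTensorFunctor<CPU, float> to_seq;
to_seq(batch, &seq);  // scatter the rows back into sequence order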
*/ #include "operators/math/softmax.h" #include "common/types.h" -#if __ARM_NEON +#ifdef __ARM_NEON #include #include #include "operators/math/math_func_neon.h" @@ -29,7 +29,7 @@ using framework::DDim; using framework::Tensor; template class SoftmaxFuntor { -#if __ARM_NEON +#ifdef __ARM_NEON void sum(float *input, float *sumptr, int inner_size, int outter_size) { float32x4_t acc = vdupq_n_f32(0); float sum_ = 0; @@ -135,6 +135,7 @@ class SoftmaxFuntor { } } } +#else #endif // ARM_NEON public: @@ -144,7 +145,7 @@ class SoftmaxFuntor { framework::Tensor sub_X = X->Slice(i, i + 1); framework::Tensor sub_Y = Y->Slice(i, i + 1); -#if __ARM_NEON +#ifdef __ARM_NEON SoftmaxCacl(&sub_X, &sub_Y); #endif } diff --git a/src/operators/mul_op.cpp b/src/operators/mul_op.cpp index 49ae3a5e8484cb2f6628eb53cabd9321ae5705b8..044da7012eccde57a87d417f4f3c00b82e01da42 100644 --- a/src/operators/mul_op.cpp +++ b/src/operators/mul_op.cpp @@ -50,17 +50,15 @@ void MulOp::InferShape() const { framework::DDim ddim = framework::make_ddim(output_dims); this->param_.Out()->Resize(ddim); } -template class MulOp; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(mul); REGISTER_OPERATOR_CPU(mul, ops::MulOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU -USE_OP_MALI_GPU(mul); REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp); #endif #ifdef PADDLE_MOBILE_FPGA diff --git a/src/operators/mul_op.h b/src/operators/mul_op.h index ad5c9a3702348455cb559c28453df82d81e1c4c8..127048efbacf2da87de9371cd8e54875f8554d61 100644 --- a/src/operators/mul_op.h +++ b/src/operators/mul_op.h @@ -26,17 +26,18 @@ namespace operators { template class MulOp : public framework::OperatorWithKernel< - DeviceType, MulParam, operators::MulKernel> { + DeviceType, MulParam, + operators::MulKernel> { public: MulOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel, operators::MulKernel>( type, inputs, outputs, attrs, scope) {} using framework::OperatorWithKernel< - DeviceType, MulParam, + DeviceType, MulParam, operators::MulKernel>::OperatorWithKernel; void InferShape() const override; @@ -46,4 +47,13 @@ class MulOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(mul); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(mul); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/multiclass_nms_op.cpp b/src/operators/multiclass_nms_op.cpp index 52adf6cc627d76b18b3b48928c344545327ca99e..4324cab35298a45ece7e375299909994648a27a4 100644 --- a/src/operators/multiclass_nms_op.cpp +++ b/src/operators/multiclass_nms_op.cpp @@ -34,13 +34,12 @@ void MultiClassNMSOp::InferShape() const { // pre size, will change in Compute. 
this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6})); } -template class MultiClassNMSOp; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(multiclass_nms); REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/multiclass_nms_op.h b/src/operators/multiclass_nms_op.h index 30cf8f67942f7888599e8f0057baff1ddd5d6cea..b40ef5ee009f6c16c685479ffcf58186958bb4cc 100644 --- a/src/operators/multiclass_nms_op.h +++ b/src/operators/multiclass_nms_op.h @@ -29,7 +29,7 @@ using paddle_mobile::framework::Tensor; template class MultiClassNMSOp : public framework::OperatorWithKernel< - DeviceType, MultiClassNMSParam, + DeviceType, MultiClassNMSParam, operators::MultiClassNMSKernel> { public: MultiClassNMSOp(const std::string &type, const VariableNameMap &inputs, @@ -37,12 +37,12 @@ class MultiClassNMSOp : public framework::OperatorWithKernel< const framework::AttributeMap &attrs, std::shared_ptr scope) : framework::OperatorWithKernel< - DeviceType, MultiClassNMSParam, + DeviceType, MultiClassNMSParam, operators::MultiClassNMSKernel>( type, inputs, outputs, attrs, scope) {} using framework::OperatorWithKernel< - DeviceType, MultiClassNMSParam, + DeviceType, MultiClassNMSParam, operators::MultiClassNMSKernel>::OperatorWithKernel; void InferShape() const override; @@ -52,4 +52,12 @@ class MultiClassNMSOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(multiclass_nms); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/op_param.cpp b/src/operators/op_param.cpp index 4ad7685731bdb44794c235c639be7eed4a0c812b..4d1689911686198612eb4df4dfe8f99450ba503d 100644 --- a/src/operators/op_param.cpp +++ b/src/operators/op_param.cpp @@ -17,7 +17,8 @@ namespace paddle_mobile { namespace operators { #ifdef CONV_OP -Print &operator<<(Print &printer, const ConvParam &conv_param) { +template <> +Print &operator<<(Print &printer, const ConvParam &conv_param) { printer << "parameter of conv: " << "\n"; printer << " stride: " @@ -37,11 +38,37 @@ Print &operator<<(Print &printer, const ConvParam &conv_param) { printer << " output dims: " << conv_param.Output()->dims(); return printer; } + +template class ConvParam; +template class ConvParam; +template class ConvParam; +#endif + +template class ElementwiseAddParam; +template class ElementwiseAddParam; +template class ElementwiseAddParam; + +#ifdef MUL_OP +template class MulParam; +template class MulParam; +template class MulParam; +#endif + +#ifdef CONCAT_OP +template class ConcatParam; +template class ConcatParam; +template class ConcatParam; +#endif + +#ifdef LRN_OP +template class LrnParam; +template class LrnParam; +template class LrnParam; #endif #ifdef FUSION_CONVADD_OP -Print &operator<<(Print &printer, const FusionConvAddParam &conv_param) { +Print &operator<<(Print &printer, const FusionConvAddParam &conv_param) { printer << "parameter of conv_add: " << "\n"; printer << " stride: " diff --git a/src/operators/op_param.h b/src/operators/op_param.h index c0f0fbc8a9939bc4609e64359835a685dd4c67f9..5b53743b75bfe65a9e029e44114b339603388c08 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -18,10 +18,14 @@ limitations under the License. 
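Moving USE_OP_CPU(mul) and USE_OP_CPU(multiclass_nms) out of the .cpp files and into the op headers changes who forces the registration symbol to link: every translation unit that includes the header now references the op's registrar, so a statically linked app cannot dead-strip the operator object file. Roughly, the macro pair works like the touch-function pattern below — a simplified sketch, not the project's exact macro expansion, with hypothetical symbol names:

// In mul_op.cpp, REGISTER_OPERATOR_CPU(mul, ops::MulOp) boils down to
// defining a registrar object plus a "touch" hook in that object file:
int TouchOpRegistrar_mul_cpu() { return 0; }

// In mul_op.h, USE_OP_CPU(mul) declares the hook and takes its value, so
// any file including the header keeps mul_op.o alive at link time:
extern int TouchOpRegistrar_mul_cpu();
static int use_op_mul_cpu __attribute__((unused)) = TouchOpRegistrar_mul_cpu();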
*/ #include #include "common/log.h" #include "common/type_define.h" +#include "common/types.h" #include "framework/lod_tensor.h" #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" +#ifdef PADDLE_MOBILE_FPGA +#include "fpga/api.h" +#endif namespace paddle_mobile { namespace operators { @@ -34,8 +38,50 @@ using framework::Tensor; using std::string; using std::vector; +template +struct DtypeTensorTrait { + typedef void ptype; + typedef void rtype; +}; + +template <> +struct DtypeTensorTrait { + // This is the type we obtained in variable. + typedef framework::LoDTensor gtype; + // This type will be the parent class type + // or the same type. + typedef framework::Tensor rtype; +}; + +template <> +struct DtypeTensorTrait { + // This is the type we obtained in variable. + typedef framework::LoDTensor gtype; + // This type will be the parent class type + // or the same type. + typedef framework::Tensor rtype; +}; + +template <> +struct DtypeTensorTrait { + // This is the type we obtained in variable. + typedef framework::LoDTensor gtype; + // This type will be the parent class type + // or the same type. + typedef framework::Tensor rtype; +}; + class OpParam { protected: + template + static T *InputH0From(const VariableNameMap &inputs, const Scope &scope) { + return GetVarValue("H0", inputs, scope); + } + template + static T *InputAlphaFrom(const VariableNameMap &inputs, const Scope &scope) { + return GetVarValue("Alpha", inputs, scope); + } + template static T *InputFrom(const VariableNameMap &inputs, const Scope &scope) { return GetVarValue("Input", inputs, scope); @@ -45,12 +91,53 @@ class OpParam { static T *InputXFrom(const VariableNameMap &inputs, const Scope &scope) { return GetVarValue("X", inputs, scope); } + template + static T *InputOutSizeFrom(const VariableNameMap &inputs, + const Scope &scope) { + return GetVarValue("OutSize", inputs, scope); + } + + template + static T *InputWFrom(const VariableNameMap &inputs, const Scope &scope) { + return GetVarValue("W", inputs, scope); + } + + template + static T *InputIdsFrom(const VariableNameMap &inputs, const Scope &scope) { + return GetVarValue("Ids", inputs, scope); + } + + template + static T *InputEmissionFrom(const VariableNameMap &inputs, + const Scope &scope) { + return GetVarValue("Emission", inputs, scope); + } + + template + static T *InputTransitionFrom(const VariableNameMap &inputs, + const Scope &scope) { + return GetVarValue("Transition", inputs, scope); + } + template + static T *InputLabelFrom(const VariableNameMap &inputs, const Scope &scope) { + return GetVarValue("Label", inputs, scope); + } + + template + static T *InputXFrom1(const VariableNameMap &inputs, const Scope &scope) { + return GetVarValue1("addX", inputs, scope); + } template static T *InputYFrom(const VariableNameMap &inputs, const Scope &scope) { return GetVarValue("Y", inputs, scope); } + template + static T *InputYFrom1(const VariableNameMap &inputs, const Scope &scope) { + return GetVarValue1("Y", inputs, scope); + } + template static T *InputZFrom(const VariableNameMap &inputs, const Scope &scope) { return GetVarValue("Z", inputs, scope); @@ -61,6 +148,10 @@ class OpParam { return GetVarValue("Bias", inputs, scope); } template + static T *InputWeightFrom(const VariableNameMap &inputs, const Scope &scope) { + return GetVarValue("Weight", inputs, scope); + } + template static T *InputVarianceFrom(const VariableNameMap &inputs, const Scope &scope) { return GetVarValue("Variance", inputs, scope); @@ -115,6 +206,35 @@ 
class OpParam { return GetMultiVarValue("X", inputs, scope); } + template + static T *OutputBatchGateFrom(const VariableNameMap &outputs, + const Scope &scope) { + return GetVarValue("BatchGate", outputs, scope); + } + + template + static T *OutputViterbiPathFrom(const VariableNameMap &outputs, + const Scope &scope) { + return GetVarValue("ViterbiPath", outputs, scope); + } + template + static T *OutputBatchResetHiddenPrevFrom(const VariableNameMap &outputs, + const Scope &scope) { + return GetVarValue("BatchResetHiddenPrev", outputs, scope); + } + + template + static T *OutputBatchHiddenFrom(const VariableNameMap &outputs, + const Scope &scope) { + return GetVarValue("BatchHidden", outputs, scope); + } + + template + static T *OutputHiddenFrom(const VariableNameMap &outputs, + const Scope &scope) { + return GetVarValue("Hidden", outputs, scope); + } + template static T *OutputFrom(const VariableNameMap &outputs, const Scope &scope) { return GetVarValue("Output", outputs, scope); @@ -125,6 +245,12 @@ class OpParam { return GetVarValue("Out", outputs, scope); } + template + static vector OutMultiFrom(const VariableNameMap &outputs, + const Scope &scope) { + return GetMultiVarValue("Out", outputs, scope); + } + template static T *OutputYFrom(const VariableNameMap &outputs, const Scope &scope) { return GetVarValue("Y", outputs, scope); @@ -162,6 +288,10 @@ class OpParam { return ((Attribute)map.at(key)).Get(); } + static const bool HasAttr(const string &key, const AttributeMap &map) { + return map.count(key) > 0; + } + template static T *GetVarValue(const string &key, const VariableNameMap &var_map, const Scope &scope) { @@ -176,6 +306,26 @@ class OpParam { } } + static std::string getkey(const string &key, const VariableNameMap &var_map, + int index) { + auto var_vec = var_map.at(key); + return var_vec[index]; + } + + template + static T *GetVarValue1(const string &key, const VariableNameMap &var_map, + const Scope &scope) { + PADDLE_MOBILE_ENFORCE(var_map.count(key) > 0, + "%s is not contained in var_map", key.c_str()) + auto var_vec = var_map.at(key); + if (!var_vec.empty()) { + auto var = scope.FindVar(var_vec[1]); + return var->GetMutable(); + } else { + return nullptr; + } + } + template static vector GetMultiVarValue(const string &key, const VariableNameMap &var_map, @@ -191,26 +341,30 @@ class OpParam { } }; -#ifdef CONV_OP -class ConvParam : OpParam { +template +class ConvParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: ConvParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); - output_ = OutputFrom(outputs, scope); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); + const AttributeMap &attrs, const Scope &scope) { + filter_ = OpParam::FilterFrom(inputs, scope); + input_ = OpParam::InputFrom(inputs, scope); + if (outputs.count("Output")) { + output_ = OpParam::OutputFrom(outputs, scope); + } + strides_ = OpParam::GetAttr>("strides", attrs); + paddings_ = OpParam::GetAttr>("paddings", attrs); + dilations_ = OpParam::GetAttr>("dilations", attrs); + groups = OpParam::GetAttr("groups", attrs); } - const Tensor *Input() const { return input_; } + const RType *Input() const { return input_; } - const Tensor *Filter() const { 
return filter_; } + RType *Filter() const { return filter_; } - Tensor *Output() const { return output_; } + RType *Output() const { return output_; } const vector &Strides() const { return strides_; } @@ -221,113 +375,144 @@ class ConvParam : OpParam { const int &Groups() const { return groups; } private: - Tensor *input_; - Tensor *output_; - Tensor *filter_; + RType *input_; + RType *output_; + RType *filter_; vector strides_; vector paddings_; vector dilations_; int groups; }; +template +Print &operator<<(Print &printer, const ConvParam &conv_param); -Print &operator<<(Print &printer, const ConvParam &conv_param); -#endif - -#ifdef ELEMENTWISEADD_OP +template class ElementwiseAddParam : OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: ElementwiseAddParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_y_ = InputYFrom(inputs, scope); + out_ = OutFrom(outputs, scope); axis_ = GetAttr("axis", attrs); } - const Tensor *InputX() const { return input_x_; } + const GType *InputX() const { return input_x_; } - const Tensor *InputY() const { return input_y_; } + const GType *InputY() const { return input_y_; } - Tensor *Out() const { return out_; } + GType *Out() const { return out_; } const int &Axis() const { return axis_; } private: - Tensor *input_x_; - Tensor *input_y_; - Tensor *out_; + GType *input_x_; + GType *input_y_; + GType *out_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::EWAddArgs fpga_EW_add_args; + + public: + const fpga::EWAddArgs &FpgaArgs() const { return fpga_EW_add_args; } + void SetFpgaArgs(const fpga::EWAddArgs &args) { fpga_EW_add_args = args; } +#endif }; +#ifdef FUSION_ELEMENTWISEADDRELU_OP +template +using ElementwiseAddReluParam = ElementwiseAddParam; #endif #ifdef MUL_OP +template class MulParam : OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: MulParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_y_ = InputYFrom(inputs, scope); + out_ = OutFrom(outputs, scope); x_num_col_dims_ = GetAttr("x_num_col_dims", attrs); y_num_col_dims_ = GetAttr("y_num_col_dims", attrs); } - const Tensor *InputX() const { return input_x_; } + const GType *InputX() const { return input_x_; } - const Tensor *InputY() const { return input_y_; } + const GType *InputY() const { return input_y_; } - Tensor *Out() const { return out_; } + GType *Out() const { return out_; } const int &XNumColDims() const { return x_num_col_dims_; } const int &YNumColDims() const { return y_num_col_dims_; } private: - Tensor *input_x_; - Tensor *input_y_; - Tensor *out_; + GType *input_x_; + GType *input_y_; + GType *out_; int x_num_col_dims_; int y_num_col_dims_; }; #endif #ifdef CONCAT_OP +template class ConcatParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename 
DtypeTensorTrait::rtype RType; + public: ConcatParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - inputs_ = InputMultiFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + inputs_ = InputMultiFrom(inputs, scope); + out_ = OutFrom(outputs, scope); axis_ = GetAttr("axis", attrs); } - vector Inputs() const { return inputs_; } + vector Inputs() const { return inputs_; } - Tensor *Out() const { return out_; } + GType *Out() const { return out_; } const int &Axis() const { return axis_; } private: - vector inputs_; - Tensor *out_; + vector inputs_; + GType *out_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::ConcatArgs fpga_concat_args; + + public: + const fpga::ConcatArgs &FpgaArgs() const { return fpga_concat_args; } + void SetFpgaArgs(const fpga::ConcatArgs &args) { fpga_concat_args = args; } +#endif }; #endif #ifdef LRN_OP +template class LrnParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: LrnParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); - mid_out_ = MidOutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + mid_out_ = MidOutFrom(outputs, scope); n_ = GetAttr("n", attrs); alpha_ = GetAttr("alpha", attrs); beta_ = GetAttr("beta", attrs); @@ -335,11 +520,11 @@ class LrnParam : public OpParam { data_format_ = GetAttr("data_format", attrs); } - const Tensor *InputX() const { return input_x_; } + const RType *InputX() const { return input_x_; } - Tensor *Out() const { return out_; } + RType *Out() const { return out_; } - Tensor *MidOut() const { return mid_out_; } + RType *MidOut() const { return mid_out_; } const int &N() const { return n_; } @@ -352,9 +537,9 @@ class LrnParam : public OpParam { const string &DataFormat() const { return data_format_; } private: - Tensor *input_x_; - Tensor *out_; - Tensor *mid_out_; + RType *input_x_; + RType *out_; + RType *mid_out_; int n_; float alpha_; float beta_; @@ -364,33 +549,36 @@ class LrnParam : public OpParam { #endif #ifdef BATCHNORM_OP +template class BatchNormParam : OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: BatchNormParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - output_y_ = OutputYFrom(outputs, scope); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, scope); - input_variance_ = InputVarianceFrom(inputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + output_y_ = OutputYFrom(outputs, scope); + input_bias_ = InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); epsilon_ = GetAttr("epsilon", attrs); momentum_ = GetAttr("momentum", attrs); - is_test_ = GetAttr("is_test", attrs); + // is_test_ = GetAttr("is_test", attrs); } - const Tensor *InputX() const { return input_x_; } 
+ const RType *InputX() const { return input_x_; } - Tensor *OutputY() const { return output_y_; } + RType *OutputY() const { return output_y_; } - const Tensor *InputBias() const { return input_bias_; } + const RType *InputBias() const { return input_bias_; } - const Tensor *InputMean() const { return input_mean_; } + const RType *InputMean() const { return input_mean_; } - const Tensor *InputScale() const { return input_scale_; } + const RType *InputScale() const { return input_scale_; } - const Tensor *InputVariance() const { return input_variance_; } + const RType *InputVariance() const { return input_variance_; } const float &Epsilon() const { return epsilon_; } @@ -401,12 +589,12 @@ class BatchNormParam : OpParam { const string &DataFormat() const { return data_format_; } private: - Tensor *input_x_; - Tensor *output_y_; - Tensor *input_bias_; - Tensor *input_mean_; - Tensor *input_scale_; - Tensor *input_variance_; + RType *input_x_; + RType *output_y_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; float epsilon_; float momentum_; bool is_test_; @@ -415,25 +603,28 @@ class BatchNormParam : OpParam { #endif #ifdef POOL_OP +template class PoolParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: PoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_ = InputXFrom(inputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_ = InputXFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + output_ = OutFrom(outputs, scope); pooling_type_ = GetAttr("pooling_type", attrs); ksize_ = GetAttr>("ksize", attrs); strides_ = GetAttr>("strides", attrs); paddings_ = GetAttr>("paddings", attrs); ceil_mode_ = GetAttr("ceil_mode", attrs); - gloabal_pooling_ = GetAttr("global_pooling", attrs); + global_pooling_ = GetAttr("global_pooling", attrs); } - const Tensor *Input() const { return input_; } + const RType *Input() const { return input_; } - Tensor *Output() const { return output_; } + RType *Output() const { return output_; } const string &PoolingType() const { return pooling_type_; } @@ -445,49 +636,64 @@ class PoolParam : public OpParam { bool isCeilMode() const { return ceil_mode_; } - bool isGlobalPooling() const { return gloabal_pooling_; } + bool isGlobalPooling() const { return global_pooling_; } private: - Tensor *input_; - Tensor *output_; + RType *input_; + RType *output_; string pooling_type_; vector ksize_; vector strides_; vector paddings_; bool ceil_mode_; - bool gloabal_pooling_ = false; -}; + bool global_pooling_ = false; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::PoolingArgs fpga_pool_args; + public: + const fpga::PoolingArgs &FpgaArgs() const { return fpga_pool_args; } + void SetFpgaArgs(const fpga::PoolingArgs &args) { fpga_pool_args = args; } +#endif +}; #endif #ifdef PRIORBOX_OP +template class PriorBoxParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: PriorBoxParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_ = InputFrom(inputs, scope); - input_image_ = InputImageFrom(inputs, scope); - output_boxes_ = OutputBoxesFrom(outputs, scope); - output_variances_ = - OutputVariancesFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + 
input_ = InputFrom(inputs, scope); + input_image_ = InputImageFrom(inputs, scope); + output_boxes_ = OutputBoxesFrom(outputs, scope); + output_variances_ = OutputVariancesFrom(outputs, scope); min_sizes_ = GetAttr>("min_sizes", attrs); max_sizes_ = GetAttr>("max_sizes", attrs); aspect_ratios_ = GetAttr>("aspect_ratios", attrs); variances_ = GetAttr>("variances", attrs); + + if (HasAttr("min_max_aspect_ratios_order", attrs)) { + min_max_aspect_ratios_order_ = + GetAttr("min_max_aspect_ratios_order", attrs); + } flip_ = GetAttr("flip", attrs); clip_ = GetAttr("clip", attrs); step_w_ = GetAttr("step_w", attrs); step_h_ = GetAttr("step_h", attrs); offset_ = GetAttr("offset", attrs); } - const Tensor *Input() const { return input_; } + const RType *Input() const { return input_; } - const Tensor *InputImage() const { return input_image_; } + const RType *InputImage() const { return input_image_; } - Tensor *OutputBoxes() const { return output_boxes_; } + RType *OutputBoxes() const { return output_boxes_; } - Tensor *OutputVariances() const { return output_variances_; } + RType *OutputVariances() const { return output_variances_; } const vector &MinSizes() const { return min_sizes_; } @@ -507,11 +713,15 @@ class PriorBoxParam : public OpParam { const float &Offset() const { return offset_; } + const bool &MinMaxAspectRatiosOrder() const { + return min_max_aspect_ratios_order_; + } + private: - Tensor *input_; - Tensor *input_image_; - Tensor *output_boxes_; - Tensor *output_variances_; + RType *input_; + RType *input_image_; + RType *output_boxes_; + RType *output_variances_; vector min_sizes_; vector max_sizes_; vector aspect_ratios_; @@ -521,86 +731,114 @@ class PriorBoxParam : public OpParam { float step_w_; float step_h_; float offset_; + bool min_max_aspect_ratios_order_; }; #endif #ifdef BOXCODER_OP +template class BoxCoderParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: BoxCoderParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_priorbox_ = InputPriorBoxFrom(inputs, scope); - input_priorboxvar_ = - InputPriorBoxVarFrom(inputs, scope); - input_targetbox_ = InputTargetBoxFrom(inputs, scope); - output_box_ = OutputBoxFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_priorbox_ = InputPriorBoxFrom(inputs, scope); + input_priorboxvar_ = InputPriorBoxVarFrom(inputs, scope); + input_targetbox_ = InputTargetBoxFrom(inputs, scope); + output_box_ = OutputBoxFrom(outputs, scope); code_type_ = GetAttr("code_type", attrs); } - const Tensor *InputPriorBox() const { return input_priorbox_; } + const RType *InputPriorBox() const { return input_priorbox_; } - const Tensor *InputPriorBoxVar() const { return input_priorboxvar_; } + const RType *InputPriorBoxVar() const { return input_priorboxvar_; } - const Tensor *InputTargetBox() const { return input_targetbox_; } + const RType *InputTargetBox() const { return input_targetbox_; } - Tensor *OutputBox() const { return output_box_; } + RType *OutputBox() const { return output_box_; } const std::string &CodeType() const { return code_type_; } private: - Tensor *input_priorbox_; - Tensor *input_priorboxvar_; - Tensor *input_targetbox_; - Tensor *output_box_; + RType *input_priorbox_; + RType *input_priorboxvar_; + RType *input_targetbox_; + RType *output_box_; std::string code_type_; }; #endif #ifdef SOFTMAX_OP +template class SoftmaxParam : public 
OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: SoftmaxParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); } - const Tensor *InputX() const { return input_x_; } - Tensor *Out() const { return out_; } + const RType *InputX() const { return input_x_; } + RType *Out() const { return out_; } + + private: + RType *input_x_; + RType *out_; + +#ifdef PADDLE_MOBILE_FPGA private: - Tensor *input_x_; - Tensor *out_; + std::shared_ptr float_input_x_; + fpga::BypassArgs fpga_bypass_args; + + public: + RType *FloatInput() const { + return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); + } + void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } + const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } + void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } +#endif }; #endif #ifdef SIGMOID_OP +template class SigmoidParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: SigmoidParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); } - const Tensor *InputX() const { return input_x_; } - Tensor *Out() const { return out_; } + const RType *InputX() const { return input_x_; } + RType *Out() const { return out_; } private: - Tensor *input_x_; - Tensor *out_; + RType *input_x_; + RType *out_; }; #endif #ifdef MULTICLASSNMS_OP +template class MultiClassNMSParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: MultiClassNMSParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) { - input_bboxes_ = InputBBoxesFrom(inputs, scope); - input_scores_ = InputScoresFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + input_bboxes_ = InputBBoxesFrom(inputs, scope); + input_scores_ = InputScoresFrom(inputs, scope); + out_ = OutFrom(outputs, scope); background_label_ = GetAttr("background_label", attrs); nms_top_k_ = GetAttr("nms_top_k", attrs); keep_top_k_ = GetAttr("keep_top_k", attrs); @@ -609,11 +847,11 @@ class MultiClassNMSParam : public OpParam { score_threshold_ = GetAttr("score_threshold", attrs); } - const Tensor *InputBBoxes() const { return input_bboxes_; } + const RType *InputBBoxes() const { return input_bboxes_; } - const Tensor *InputScores() const { return input_scores_; } + const RType *InputScores() const { return input_scores_; } - Tensor *Out() const { return out_; } + RType *Out() const { return out_; } const int &BackGroundLabel() const { return background_label_; } @@ -628,9 +866,9 @@ class MultiClassNMSParam : public OpParam { const float &ScoreThreshold() const { return score_threshold_; } private: - Tensor *input_bboxes_; - Tensor *input_scores_; - Tensor *out_; + RType *input_bboxes_; + RType *input_scores_; + RType *out_; int background_label_; int nms_top_k_; int 
keep_top_k_; @@ -640,137 +878,390 @@ class MultiClassNMSParam : public OpParam { }; #endif +template class FeedParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); - auto var = scope.Var("batch_size"); + const AttributeMap &attrs, Scope *scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); + auto var = scope->Var("batch_size"); batch_size = var->GetValue(); } - const Tensor *InputX() const { return input_x_; } - Tensor *Out() const { return out_; } + const GType *InputX() const { return input_x_; } + GType *Out() const { return out_; } const int BatchSize() const { return batch_size; } private: - Tensor *input_x_; - Tensor *out_; + GType *input_x_; + GType *out_; int batch_size; }; +template class FetchParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap &attrs, - const framework::Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); } - const Tensor *InputX() const { return input_x_; } - Tensor *Out() const { return out_; } + const RType *InputX() const { return input_x_; } + RType *Out() const { return out_; } private: - Tensor *input_x_; - Tensor *out_; + RType *input_x_; + RType *out_; }; #ifdef TRANSPOSE_OP +template class TransposeParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: TransposeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); axis_ = GetAttr>("axis", attrs); } - const Tensor *InputX() const { return input_x_; } + const RType *InputX() const { return input_x_; } - Tensor *Out() const { return out_; } + RType *Out() const { return out_; } const vector &Axis() const { return axis_; } private: - Tensor *input_x_; - Tensor *out_; + RType *input_x_; + RType *out_; vector axis_; }; #endif +#ifdef LOOKUP_OP +template +class LookupParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + LookupParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_w_ = InputWFrom(inputs, scope); + input_ids_ = InputIdsFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + padding_idx_ = GetAttr("padding_idx", attrs); + } + + const GType *InputW() const { return input_w_; } + const GType *InputIds() const { return input_ids_; } + GType *Out() const { return out_; } + int64_t PaddingIdx() const { return padding_idx_; } + + private: + GType *input_w_; + GType *input_ids_; + GType *out_; + int64_t padding_idx_; +}; +#endif + +#ifdef CRF_OP +template +class CrfParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + 
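// Note added for clarity (not in the original patch): CrfParam wires up
// linear-chain CRF decoding. "Emission" and "Transition" are the score
// tensors, "Label" is the optional gold tag sequence, and "ViterbiPath"
// receives the decoded best path, matching the op-type mapping quoted in
// the comment just below.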
public:
+  // {G_OP_TYPE_CRF, {{"Emission", "Transition", "Label"}, {"ViterbiPath"}}},
+
+  CrfParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+           const AttributeMap &attrs, const Scope &scope) {
+    // TODO: wire up the remaining crf params
+    input_emission_ = InputEmissionFrom<GType>(inputs, scope);
+    input_transition_ = InputTransitionFrom<GType>(inputs, scope);
+    input_label_ = InputLabelFrom<GType>(inputs, scope);
+    output_viterbipath_ = OutputViterbiPathFrom<GType>(outputs, scope);
+    //  padding_idx_ = GetAttr<int64_t>("padding_idx", attrs);
+  }
+  const GType *InputEmission() const { return input_emission_; }
+  const GType *InputTransition() const { return input_transition_; }
+  const GType *InputLabel() const { return input_label_; }
+  GType *outputVBP() const { return output_viterbipath_; }
+  //  const RType *InputIds() const { return input_ids_; }
+  //  RType *Out() const { return out_; }
+  //  int64_t PaddingIdx() const { return padding_idx_; }
+
+ private:
+  GType *input_emission_;
+  GType *input_transition_;
+  GType *input_label_;
+  GType *output_viterbipath_;
+
+  //  RType *input_ids_;
+  //  RType *out_;
+  //  int64_t padding_idx_;
+};
+#endif
+
#ifdef RESHAPE_OP
+template <typename Dtype>
class ReshapeParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
 public:
  ReshapeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
               const AttributeMap &attrs, const Scope &scope) {
-    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    input_shape_ = InputShapeFrom<LoDTensor>(inputs, scope);
-    out_ = OutFrom<LoDTensor>(outputs, scope);
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    input_shape_ = InputShapeFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
    shape_ = GetAttr<vector<int>>("shape", attrs);
-    inplace_ = GetAttr<bool>("inplace", attrs);
+
+    if (HasAttr("inplace", attrs)) {
+      inplace_ = GetAttr<bool>("inplace", attrs);
+    } else {
+      inplace_ = false;
+      DLOG << "ReshapeParam: the model has no 'inplace' attribute; "
+              "the fluid model format may have been updated";
+    }
  }
-  const Tensor *InputX() const { return input_x_; }
+  const RType *InputX() const { return input_x_; }
-  const Tensor *InputShape() const { return input_shape_; }
+  const RType *InputShape() const { return input_shape_; }
-  Tensor *Out() const { return out_; }
+  RType *Out() const { return out_; }
   const vector<int> &Shape() const { return shape_; }
   const bool &Inplace() const { return inplace_; }
 private:
-  Tensor *input_x_;
-  Tensor *input_shape_;
-  Tensor *out_;
+  RType *input_x_;
+  RType *input_shape_;
+  RType *out_;
   vector<int> shape_;
   bool inplace_;
};
#endif

+#ifdef SCALE_OP
+template <typename Dtype>
+class ScaleParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  ScaleParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+             const AttributeMap &attrs, const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    input_bias_ = InputBiasFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+    inplace_ = GetAttr<bool>("inplace", attrs);
+    has_bias_ = GetAttr<bool>("has_bias", attrs);
+    scales_ = GetAttr<vector<float>>("scales", attrs);
+    biases_ = GetAttr<vector<float>>("biases", attrs);
+  }
+
+  const RType *InputX() const { return input_x_; }
+  const RType *InputBias() const { return input_bias_; }
+  RType *Out() const { return out_; }
+  const bool &Inplace() const { return inplace_; }
+  const bool &HasBias() const { return has_bias_; }
+  const vector<float> &Scales() const { return scales_; }
+  const vector<float> &Biases() const { return biases_; }
+
+ private:
+  RType *input_x_;
+  RType *input_bias_;
+  RType *out_;
+  bool inplace_;
+  bool has_bias_;
+  vector<float> scales_;
+  vector<float> biases_;
+};
+#endif

+#ifdef SLICE_OP
+template <typename Dtype>
+class SliceParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  SliceParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+             const AttributeMap &attrs, const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    input_shape_ = InputShapeFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+    axis_ = GetAttr<int>("axis", attrs);
+    slice_points_ = GetAttr<vector<int>>("slice_points", attrs);
+    inplace_ = GetAttr<bool>("inplace", attrs);
+  }
+
+  const RType *InputX() const { return input_x_; }
+  const RType *InputShape() const { return input_shape_; }
+  RType *Out() const { return out_; }
+  const int &Axis() const { return axis_; }
+  const vector<int> &SlicePoints() const { return slice_points_; }
+  const bool &Inplace() const { return inplace_; }
+
+ private:
+  RType *input_x_;
+  RType *input_shape_;
+  RType *out_;
+  int axis_;
+  vector<int> slice_points_;
+  bool inplace_;
+};
+#endif

+#ifdef RESIZE_OP
+template <typename Dtype>
+class ResizeParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  ResizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+              const AttributeMap &attrs, const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    input_shape_ = InputShapeFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+    is_pyramid_test_ = GetAttr<bool>("is_pyramid_test", attrs);
+    height_ = GetAttr<int>("height", attrs);
+    width_ = GetAttr<int>("width", attrs);
+    out_height_scale_ = GetAttr<float>("out_height_scale", attrs);
+    out_width_scale_ = GetAttr<float>("out_width_scale", attrs);
+  }
+
+  const RType *InputX() const { return input_x_; }
+  const RType *InputShape() const { return input_shape_; }
+  RType *Out() const { return out_; }
+  const bool &IsPyramidTest() const { return is_pyramid_test_; }
+  const int &Height() const { return height_; }
+  const int &Width() const { return width_; }
+  const float &OutHeightScale() const { return out_height_scale_; }
+  const float &OutWidthScale() const { return out_width_scale_; }
+
+ private:
+  RType *input_x_;
+  RType *input_shape_;
+  RType *out_;
+  bool is_pyramid_test_;
+  int height_;
+  int width_;
+  float out_height_scale_;
+  float out_width_scale_;
+};
+#endif
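All of the param classes rewritten in this patch resolve their tensor types through the same DtypeTensorTrait device trait. A minimal sketch of the idea, for illustration only (the real trait lives in the framework headers, and the exact CPU typedefs shown here are an assumption):

template <typename Dtype>
struct DtypeTensorTrait;

// Hypothetical CPU specialization: gtype is the type fetched out of a
// Variable (it may carry LoD information), rtype is the type the kernels
// actually compute on.
template <>
struct DtypeTensorTrait<CPU> {
  typedef framework::LoDTensor gtype;
  typedef framework::LoDTensor rtype;
};

// Each param class then resolves its member types once per device:
//   typedef typename DtypeTensorTrait<Dtype>::gtype GType;
//   typedef typename DtypeTensorTrait<Dtype>::rtype RType;

This is why the diff can swap every bare Tensor member for RType (or GType) without touching any accessor logic.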
#ifdef RELU_OP
/*
 * @b The op layer constructs this param and hands it to the kernel layer.
 * */
+template <typename Dtype>
class ReluParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
 public:
  ReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
            const AttributeMap &attrs, const Scope &scope) {
-    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    out_ = OutFrom<LoDTensor>(outputs, scope);
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
  }
-  const Tensor *InputX() const { return input_x_; }
+  const RType *InputX() const { return input_x_; }
+
+  RType *Out() const { return out_; }
-  Tensor *Out() const { return out_; }
+ private:
+  RType *input_x_;
+  RType *out_;
+};
+#endif
+
+#ifdef PRELU_OP
+template <typename Dtype>
+class PReluParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  PReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+             const AttributeMap &attrs, const Scope &scope) {
+    DLOG << "PReluParam inputs before";
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    alpha_ = InputAlphaFrom<GType>(inputs, scope);
+    framework::DDim dims = alpha_->dims();
+    out_ = OutFrom<GType>(outputs, scope);
+    mode_ = GetAttr<std::string>("mode", attrs);
+    DLOG << "PReluParam mode after " << mode_;
+  }
+  const RType *InputX() const { return input_x_; }
+  const RType *InputAlpha() const { return alpha_; }
+  RType *Out() const { return out_; }
+  const std::string &Mode() const { return mode_; }
 private:
-  Tensor *input_x_;
-  Tensor *out_;
+  RType *input_x_;
+  RType *out_;
+  RType *alpha_;
+  std::string mode_;
};
#endif

-#ifdef FUSION_FC_OP
+template <typename Dtype>
class FusionFcParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
 public:
  FusionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
                const AttributeMap &attrs, const Scope &scope) {
-    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    input_y_ = InputYFrom<LoDTensor>(inputs, scope);
-    input_z_ = InputZFrom<LoDTensor>(inputs, scope);
-    out_ = OutFrom<LoDTensor>(outputs, scope);
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    input_y_ = InputYFrom<RType>(inputs, scope);
+    input_z_ = InputZFrom<RType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
    x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
    y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
    axis_ = GetAttr<int>("axis", attrs);
  }
-  const Tensor *InputX() const { return input_x_; }
+  const GType *InputX() const { return input_x_; }
-  const Tensor *InputY() const { return input_y_; }
+  const RType *InputY() const { return input_y_; }
-  const Tensor *InputZ() const { return input_z_; }
+  const RType *InputZ() const { return input_z_; }
-  Tensor *Out() const { return out_; }
+  GType *Out() const { return out_; }
   const int &XNumColDims() const { return x_num_col_dims_; }
@@ -779,123 +1270,213 @@ class FusionFcParam : public OpParam {
   const int &Axis() const { return axis_; }
 private:
-  Tensor *input_x_;
-  Tensor *input_y_;
-  Tensor *input_z_;
-  Tensor *out_;
+  GType *input_x_;
+  RType *input_y_;
+  RType *input_z_;
+  
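// Note, not part of the patch: out_ just below stays a GType like input_x_,
// presumably because activations can carry LoD metadata, while the fc weight
// and bias (input_y_, input_z_) remain plain RType tensors.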
GType *out_; int x_num_col_dims_; int y_num_col_dims_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::WrapperConvArgs fpga_conv_args; + + public: + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } +#endif }; + +#ifdef FUSION_FCRELU_OP +template +using FusionFcReluParam = FusionFcParam; #endif -#ifdef FUSION_CONVADD_OP -class FusionConvAddParam : public OpParam { +template +class FusionConvAddParam : public ConvParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: FusionConvAddParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - bias_ = InputYFrom(inputs, scope); - axis_ = GetAttr("axis", attrs); - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); - output_ = OutFrom(outputs, scope); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); + const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + bias_ = OpParam::InputYFrom(inputs, scope); + axis_ = OpParam::GetAttr("axis", attrs); + output_ = OpParam::OutFrom(outputs, scope); } - Tensor *Bias() const { return bias_; } + RType *Bias() const { return bias_; } const int &Axis() const { return axis_; } - const Tensor *Input() const { return input_; } - - const Tensor *Filter() const { return filter_; } - - Tensor *Output() const { return output_; } - - const vector &Strides() const { return strides_; } - - const vector &Paddings() const { return paddings_; } - - const vector &Dilations() const { return dilations_; } - - const int &Groups() const { return groups; } + RType *Output() const { return output_; } protected: - Tensor *bias_; + RType *bias_; int axis_; - Tensor *input_; - Tensor *output_; - Tensor *filter_; - vector strides_; - vector paddings_; - vector dilations_; - int groups; -}; + RType *output_; +#ifdef PADDLE_MOBILE_FPGA -Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); + private: + fpga::WrapperConvArgs fpga_conv_args; + + public: + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } #endif +}; -#ifdef FUSION_CONVADD_RELU_OP -class FusionConvAddReluParam : public FusionConvAddParam { +template +Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); + +#ifdef FUSION_CONVADDRELU_OP +template +class FusionConvAddReluParam : public FusionConvAddParam { public: FusionConvAddReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) - : FusionConvAddParam(inputs, outputs, attrs, scope) {} + : FusionConvAddParam(inputs, outputs, attrs, scope) {} }; #endif -#ifdef FUSION_CONVADDBNRELU_OP -class FusionConvAddBNReluParam : public OpParam { +#ifdef FUSION_CONVADDPRELU_OP +template +class FusionConvAddPReluParam : public ConvParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: - FusionConvAddBNReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - bias_ = InputYFrom(inputs, scope); - axis_ = GetAttr("axis", attrs); - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); - output_ = 
OutFrom(outputs, scope); - strides_ = GetAttr>("strides", attrs); - paddings_ = GetAttr>("paddings", attrs); - dilations_ = GetAttr>("dilations", attrs); - groups = GetAttr("groups", attrs); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, scope); - input_variance_ = InputVarianceFrom(inputs, scope); - epsilon_ = GetAttr("epsilon", attrs); - momentum_ = GetAttr("momentum", attrs); - is_test_ = GetAttr("is_test", attrs); + FusionConvAddPReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + alpha_ = OpParam::InputAlphaFrom(inputs, scope); + mode_ = OpParam::GetAttr("mode", attrs); + framework::DDim dims = alpha_->dims(); + bias_ = OpParam::InputYFrom(inputs, scope); + axis_ = OpParam::GetAttr("axis", attrs); + output_ = OpParam::OutFrom(outputs, scope); } - Tensor *Bias() const { return bias_; } + const RType *InputAlpha() const { return alpha_; } + const std::string &Mode() const { return mode_; } + RType *Bias() const { return bias_; } + const int &Axis() const { return axis_; } + RType *Output() const { return output_; } + + protected: + RType *bias_; + int axis_; + RType *output_; + RType *alpha_; + std::string mode_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::WrapperConvArgs fpga_conv_args; + + public: + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } +#endif +}; +#endif + +#ifdef FUSION_CONVADDADDPRELU_OP +template +class FusionConvAddAddPReluParam : public ConvParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionConvAddAddPReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + bias1_ = OpParam::InputYFrom1(inputs, scope); + alpha_ = OpParam::InputAlphaFrom(inputs, scope); + mode_ = OpParam::GetAttr("mode", attrs); + framework::DDim dims = alpha_->dims(); + bias_ = OpParam::InputYFrom(inputs, scope); + output_ = OpParam::OutFrom(outputs, scope); + axis_ = OpParam::GetAttr("axis", attrs); + keyOutput_ = OpParam::getkey("addOut", inputs, 0); + keyX1_ = OpParam::getkey("addX", inputs, 1); + keyY1_ = OpParam::getkey("Y", inputs, 1); + if (keyX1_ == keyOutput_) { + bias1_ = OpParam::InputYFrom1(inputs, scope); + } else if (keyY1_ == keyOutput_) { + bias1_ = OpParam::InputXFrom1(inputs, scope); + } + } + const RType *InputAlpha() const { return alpha_; } + const std::string &Mode() const { return mode_; } + const RType *Bias1() const { return bias1_; } + + RType *Bias() const { return bias_; } const int &Axis() const { return axis_; } + RType *Output() const { return output_; } - const Tensor *Input() const { return input_; } + protected: + RType *bias_; + int axis_; + RType *output_; + RType *alpha_; + std::string mode_; + RType *bias1_; + std::string keyOutput_; + std::string keyX1_; + std::string keyY1_; +#ifdef PADDLE_MOBILE_FPGA - const Tensor *Filter() const { return filter_; } + private: + fpga::WrapperConvArgs fpga_conv_args; - Tensor *Output() const { return output_; } + public: + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } +#endif +}; +#endif - const vector &Strides() const { 
return strides_; } +#ifdef FUSION_CONVADDBNRELU_OP +template +class FusionConvAddBNReluParam : public ConvParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; - const vector &Paddings() const { return paddings_; } + public: + FusionConvAddBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + bias_ = OpParam::InputYFrom(inputs, scope); + axis_ = OpParam::GetAttr("axis", attrs); + output_ = OpParam::OutFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); + } + RType *Bias() const { return bias_; } - const vector &Dilations() const { return dilations_; } + const int &Axis() const { return axis_; } - const int &Groups() const { return groups; } + RType *Output() const { return output_; } - const Tensor *InputBias() const { return input_bias_; } + const RType *InputBias() const { return input_bias_; } - const Tensor *InputMean() const { return input_mean_; } + const RType *InputMean() const { return input_mean_; } - const Tensor *InputScale() const { return input_scale_; } + const RType *InputScale() const { return input_scale_; } - const Tensor *InputVariance() const { return input_variance_; } + const RType *InputVariance() const { return input_variance_; } const float &Epsilon() const { return epsilon_; } @@ -903,54 +1484,413 @@ class FusionConvAddBNReluParam : public OpParam { const bool &IsTest() const { return is_test_; } - void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; } + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } - void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; } + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } - const Tensor *NewScale() const { return new_scale_; } + const RType *NewScale() const { return new_scale_; } - const Tensor *NewBias() const { return new_bias_; } + const RType *NewBias() const { return new_bias_; } protected: - Tensor *bias_; + RType *bias_; int axis_; - Tensor *input_; - Tensor *output_; - Tensor *filter_; - vector strides_; - vector paddings_; - vector dilations_; - int groups; - Tensor *input_bias_; - Tensor *input_mean_; - Tensor *input_scale_; - Tensor *input_variance_; + RType *output_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; float epsilon_; float momentum_; bool is_test_; - Tensor *new_bias_; - Tensor *new_scale_; + RType *new_bias_; + RType *new_scale_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::WrapperConvArgs fpga_conv_args; + + public: + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } +#endif }; +#endif + +#ifdef FUSION_CONVBNADDRELU_OP +template +class FusionConvBNAddReluParam : public ConvParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionConvBNAddReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + bias_ = 
OpParam::InputYFrom(inputs, scope); + axis_ = OpParam::GetAttr("axis", attrs); + output_ = OpParam::OutFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + keyBNY_ = OpParam::getkey("BNY", inputs, 0); + keyX_ = OpParam::getkey("X", inputs, 0); + keyY_ = OpParam::getkey("Y", inputs, 0); + if (keyX_ == keyBNY_) { + bias_ = OpParam::InputYFrom(inputs, scope); + } else if (keyY_ == keyBNY_) { + bias_ = OpParam::InputXFrom(inputs, scope); + } + // is_test_ = OpParam::GetAttr("is_test", attrs); + } + RType *Bias() const { return bias_; } + + const int &Axis() const { return axis_; } + + RType *Output() const { return output_; } + + const RType *InputBias() const { return input_bias_; } + + const RType *InputMean() const { return input_mean_; } + + const RType *InputScale() const { return input_scale_; } + + const RType *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } -Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); + const RType *NewBias() const { return new_bias_; } + + protected: + RType *bias_; + int axis_; + RType *output_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + RType *new_bias_; + RType *new_scale_; + std::string keyBNY_; + std::string keyX_; + std::string keyY_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::WrapperConvArgs fpga_conv_args; + + public: + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } +#endif +}; +#endif + +#ifdef FUSION_CONVBN_OP +template +class FusionConvBNParam : public ConvParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionConvBNParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + output_y_ = OpParam::OutputYFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); + } + RType *Output() const { return output_y_; } + + const RType *InputBias() const { return input_bias_; } + + const RType *InputMean() const { return input_mean_; } + + const RType *InputScale() const { return input_scale_; } + + const RType *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(RType *new_scale) { 
new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + + protected: + RType *output_y_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + RType *new_bias_; + RType *new_scale_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::WrapperConvArgs fpga_conv_args; + + public: + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } +#endif +}; +#endif + +#ifdef FUSION_CONVADDBN_OP +template +class FusionConvAddBNParam : public ConvParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionConvAddBNParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + bias_ = OpParam::InputYFrom(inputs, scope); + axis_ = OpParam::GetAttr("axis", attrs); + output_y_ = OpParam::OutputYFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); + } + RType *Bias() const { return bias_; } + + const int &Axis() const { return axis_; } + + RType *Output() const { return output_y_; } + + const RType *InputBias() const { return input_bias_; } + + const RType *InputMean() const { return input_mean_; } + + const RType *InputScale() const { return input_scale_; } + + const RType *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + + protected: + RType *bias_; + int axis_; + RType *output_y_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + RType *new_bias_; + RType *new_scale_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::WrapperConvArgs fpga_conv_args; + + public: + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } +#endif +}; +#endif + +#ifdef FUSION_DWCONVBNRELU_OP +template +class FusionDWConvBNReluParam : public ConvParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionDWConvBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + output_ = OpParam::OutFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = 
OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); + } + RType *Output() const { return output_; } + + const RType *InputBias() const { return input_bias_; } + + const RType *InputMean() const { return input_mean_; } + + const RType *InputScale() const { return input_scale_; } + + const RType *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + + protected: + RType *output_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + RType *new_bias_; + RType *new_scale_; +}; + +#endif + +#ifdef FUSION_CONVBNRELU_OP +template +class FusionConvBNReluParam : public ConvParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionConvBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : ConvParam(inputs, outputs, attrs, scope) { + output_ = OpParam::OutFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); + } + RType *Output() const { return output_; } + + const RType *InputBias() const { return input_bias_; } + + const RType *InputMean() const { return input_mean_; } + + const RType *InputScale() const { return input_scale_; } + + const RType *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + + protected: + RType *output_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + RType *new_bias_; + RType *new_scale_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::WrapperConvArgs fpga_conv_args; + + public: + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } +#endif +}; #endif #ifdef IM2SEQUENCE_OP +template class Im2SequenceParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: Im2SequenceParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + input_x_ = InputXFrom(inputs, 
scope); + out_ = OutFrom(outputs, scope); kernels_ = GetAttr>("kernels", attrs); strides_ = GetAttr>("strides", attrs); paddings_ = GetAttr>("paddings", attrs); } - const Tensor *Input() const { return input_x_; } + const RType *Input() const { return input_x_; } - Tensor *Output() const { return out_; } + RType *Output() const { return out_; } const vector &Kernels() const { return kernels_; } @@ -959,8 +1899,8 @@ class Im2SequenceParam : public OpParam { const vector &Paddings() const { return paddings_; } private: - Tensor *input_x_; - Tensor *out_; + RType *input_x_; + RType *out_; vector kernels_; vector strides_; vector paddings_; @@ -968,21 +1908,246 @@ class Im2SequenceParam : public OpParam { #endif #ifdef DROPOUT_OP +template class DropoutParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + public: DropoutParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + + dropout_prob_ = GetAttr("dropout_prob", attrs); + } + + const RType *InputX() const { return input_x_; } + + RType *Out() const { return out_; } + + float DropoutProb() const { return dropout_prob_; } + + private: + RType *input_x_; + RType *out_; + float dropout_prob_; +}; +#endif + +#ifdef CONV_TRANSPOSE +template +class ConvTransposeParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + ConvTransposeParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_ = OutputFrom(outputs, scope); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + } + + const RType *Input() const { return input_; } + + const RType *Filter() const { return filter_; } + + RType *Output() const { return output_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() const { return groups; } + + private: + RType *input_; + RType *output_; + RType *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; +}; +#endif + +#ifdef GRU_OP +template +class GruParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + + public: + /** + * + * @param inputs + * @param outputs + * @param attrs + * @param scope + * */ + GruParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_input_ = InputFrom(inputs, scope); + input_h0_ = InputH0From(inputs, scope); + input_bias_ = InputBiasFrom(inputs, scope); + input_weight_ = InputWeightFrom(inputs, scope); + + output_batch_gate_ = OutputBatchGateFrom(outputs, scope); + output_batch_reset_hidden_prev_ = + OutputBatchResetHiddenPrevFrom(outputs, scope); + output_batch_hidden_ = OutputBatchHiddenFrom(outputs, scope); + output_hidden_ = OutputHiddenFrom(outputs, scope); + activation_ = GetAttr("activation", attrs); + gate_activation_ = GetAttr("gate_activation", attrs); + is_reverse_ = GetAttr("is_reverse", attrs); + } + const GType *InputInput() 
const { return input_input_; } + const GType *InputWeight() const { return input_weight_; } + const GType *InputH0() const { return input_h0_; } + const GType *InputBias() const { return input_bias_; } + const std::string &Activation() const { return activation_; } + const std::string &GateActivation() const { return gate_activation_; } + const bool &IsReverse() const { return is_reverse_; } + + GType *OutBatchGate() const { return output_batch_gate_; } + GType *OutBatchResetHiddenPrev() const { + return output_batch_reset_hidden_prev_; + } + GType *OutBatchHidden() const { return output_batch_hidden_; } + GType *OutHidden() const { return output_hidden_; } + + private: + GType *input_input_; + GType *input_h0_; + GType *input_bias_; + GType *input_weight_; + + GType *output_batch_gate_; + GType *output_batch_reset_hidden_prev_; + GType *output_batch_hidden_; + GType *output_hidden_; + std::string activation_; + std::string gate_activation_; + bool is_reverse_; +}; +#endif + +#ifdef FLATTEN_OP +template +class FlattenParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FlattenParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + axis = GetAttr("axis", attrs); + } + const RType *InputX() const { return input_x_; } + RType *Out() const { return out_; } + const int &Axis() const { return axis; } + + private: + RType *input_x_; + RType *out_; + int axis; +}; +#endif + +#ifdef SPLIT_OP +template +class SplitParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + SplitParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + outs_ = OutMultiFrom(outputs, scope); + axis = GetAttr("axis", attrs); + num = GetAttr("num", attrs); + sections = GetAttr>("sections", attrs); + + // for (int i = 0; i < outs_.size(); ++i) { + // out_ts_.push_back(*scope.FindVar(outs_[i])->GetMutable()); + // } } + const RType *InputX() const { return input_x_; } + std::vector Outs() const { return outs_; } + int Axis() const { return axis; } + int Num() const { return num; } + std::vector Sections() const { return sections; } + // std::vector OutTs() const { return out_ts_; } - const Tensor *InputX() const { return input_x_; } + private: + RType *input_x_; + std::vector outs_; + int axis; + int num; + std::vector sections; + // std::vector out_ts_; +}; +#endif + +#ifdef BILINEAR_INTERP_OP +template +class BilinearInterpParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; - Tensor *Out() const { return out_; } + public: + BilinearInterpParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_outsize_ = InputOutSizeFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + out_h_ = GetAttr("out_h", attrs); + out_w_ = GetAttr("out_w", attrs); + } + const RType *InputX() const { return input_x_; } + const RType *InputOutPutSize() const { return input_outsize_; } + RType *Out() const { return out_; } + int OutH() const { return out_h_; } + int OutW() const { return out_w_; } + + private: + RType *input_x_; + RType *input_outsize_; + RType 
*out_;
+  int out_h_;
+  int out_w_;
+};
+#endif
+
+#ifdef SHAPE_OP
+template <typename Dtype>
+class ShapeParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  ShapeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
+             const AttributeMap &attrs, const Scope &scope) {
+    input_ = InputFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+  }
+  const RType *Input() const { return input_; }
+  RType *Out() const { return out_; }
 private:
-  Tensor *input_x_;
-  Tensor *out_;
+  RType *input_;
+  RType *out_;
};
#endif
diff --git a/src/operators/pool_op.cpp b/src/operators/pool_op.cpp
index 62eaf6b5f8105c4d2ab63f2f883445705b815860..dd23059ea01a332aff45137b7f7ed4c9f6c2e1bb 100644
--- a/src/operators/pool_op.cpp
+++ b/src/operators/pool_op.cpp
@@ -54,20 +54,19 @@ void PoolOp<DeviceType, T>::InferShape() const {
   }
   this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
-template class PoolOp<CPU, float>;
+
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(pool2d);
REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(pool2d);
REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(pool2d, ops::PoolOp);
#endif
#endif
diff --git a/src/operators/pool_op.h b/src/operators/pool_op.h
index 5b436fb18bdc055add21acd37e5a1a9c7b6e5b02..4f76fb8f800dea43432b48562cca563505a1af76 100644
--- a/src/operators/pool_op.h
+++ b/src/operators/pool_op.h
@@ -29,17 +29,17 @@ using framework::OperatorWithKernel;
using framework::Scope;
using std::string;
template <typename DeviceType, typename T>
-class PoolOp : public OperatorWithKernel<DeviceType, PoolParam,
-                                         operators::PoolKernel<DeviceType, T>> {
+class PoolOp : public OperatorWithKernel<DeviceType, PoolParam<DeviceType>,
+                                         operators::PoolKernel<DeviceType, T>> {
 public:
  PoolOp(const string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const AttributeMap &attrs,
         std::shared_ptr<Scope> scope)
-      : OperatorWithKernel<DeviceType, PoolParam,
+      : OperatorWithKernel<DeviceType, PoolParam<DeviceType>,
                           operators::PoolKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  using OperatorWithKernel<
-      DeviceType, PoolParam,
+      DeviceType, PoolParam<DeviceType>,
      operators::PoolKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
@@ -48,4 +48,14 @@ class PoolOp : public OperatorWithKernel
+template <typename Dtype>
+void PReluOp<Dtype>::InferShape() const {
+  auto input_dims = this->param_.InputX()->dims();
+  this->param_.Out()->Resize(input_dims);
+}
+
+} // namespace operators
+} // namespace paddle_mobile
+
+/*
+ * @b Every op has to be registered like this. The argument of USE_OP and the
+ * first argument of REGISTER_OPERATOR must both match the op type string
+ * recorded in the model.
+ * */
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(prelu, ops::PReluOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(prelu, ops::PReluOp);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/prelu_op.h b/src/operators/prelu_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b6b778fa6e8f0951faffda6803b25b6b23ea17c
--- /dev/null
+++ b/src/operators/prelu_op.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PRELU_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/prelu_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using paddle_mobile::framework::Tensor; + +template +class PReluOp : public framework::OperatorWithKernel< + DeviceType, PReluParam, + operators::PReluKernel> { + public: + PReluOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel, + operators::PReluKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, PReluParam, + operators::PReluKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(prelu); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(prelu); +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(prelu); +#endif + +#endif diff --git a/src/operators/prior_box_op.cpp b/src/operators/prior_box_op.cpp index 44e1741b66f301aee55f1f4d33b9bb1173e6004d..a05a0ddcec5ba9d442b58846468a121e9b655a6a 100644 --- a/src/operators/prior_box_op.cpp +++ b/src/operators/prior_box_op.cpp @@ -44,13 +44,12 @@ void PriorBoxOp::InferShape() const { this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec)); this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec)); } -template class PriorBoxOp; + } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU -USE_OP_CPU(prior_box); REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp); #endif #ifdef PADDLE_MOBILE_MALI_GPU diff --git a/src/operators/prior_box_op.h b/src/operators/prior_box_op.h index 5b3e3fffd6787360b69ff3af2d19bc8e05549c04..00fc8e039c9958e4b43653d6360c0f54c78648a1 100644 --- a/src/operators/prior_box_op.h +++ b/src/operators/prior_box_op.h @@ -28,20 +28,20 @@ namespace operators { using paddle_mobile::framework::Tensor; template -class PriorBoxOp - : public framework::OperatorWithKernel< - DeviceType, PriorBoxParam, operators::PriorBoxKernel> { +class PriorBoxOp : public framework::OperatorWithKernel< + DeviceType, PriorBoxParam, + operators::PriorBoxKernel> { public: PriorBoxOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel, operators::PriorBoxKernel>( type, inputs, outputs, attrs, scope) {} using framework::OperatorWithKernel< - DeviceType, PriorBoxParam, + DeviceType, PriorBoxParam, operators::PriorBoxKernel>::OperatorWithKernel; void InferShape() const override; @@ -51,4 +51,12 @@ class PriorBoxOp } // namespace operators } // namespace paddle_mobile +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(prior_box); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp index 877dcee1a7f4a5a75d013031235d3a216c35f854..2a771e81e7a5a0e869984990b52b98d15036543a 100644 --- a/src/operators/relu_op.cpp +++ b/src/operators/relu_op.cpp @@ -23,7 +23,7 @@ void ReluOp::InferShape() const { auto input_dims = this->param_.InputX()->dims(); this->param_.Out()->Resize(input_dims); } -template class ReluOp; + } // namespace operators } // 
namespace paddle_mobile
@@ -34,11 +34,9 @@ template class ReluOp<CPU, float>;
 * */
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(relu);
REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(relu);
REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
diff --git a/src/operators/relu_op.h b/src/operators/relu_op.h
index 8f9e55cf8a2d5bb58e85c21cd2cee3647b00fa22..0364dd7f8ec4b3861200380597e18ede0819e8b6 100644
--- a/src/operators/relu_op.h
+++ b/src/operators/relu_op.h
@@ -28,9 +28,9 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
-class ReluOp
-    : public framework::OperatorWithKernel<
-          DeviceType, ReluParam, operators::ReluKernel<DeviceType, T>> {
+class ReluOp : public framework::OperatorWithKernel<
+                   DeviceType, ReluParam<DeviceType>,
+                   operators::ReluKernel<DeviceType, T>> {
 public:
  /*
   * @b The op's constructor: it must invoke the parent-class constructor and
   * construct the op's own param struct.
   * */
  ReluOp(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const framework::AttributeMap &attrs,
         std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorWithKernel<DeviceType, ReluParam,
+      : framework::OperatorWithKernel<DeviceType, ReluParam<DeviceType>,
                                      operators::ReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  using framework::OperatorWithKernel<
-      DeviceType, ReluParam,
+      DeviceType, ReluParam<DeviceType>,
      operators::ReluKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
@@ -53,4 +53,13 @@ class ReluOp
} // namespace operators
} // namespace paddle_mobile
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(relu);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(relu);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
#endif
diff --git a/src/operators/reshape_op.cpp b/src/operators/reshape_op.cpp
index c7294079b26250770006aeb1b79c15469489b988..dcc15009af2b23129552d58b3fa22c3c67684dce 100644
--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
@@ -27,17 +27,15 @@ void ReshapeOp<DeviceType, T>::InferShape() const {
   auto out_dims = ValidateShape(shape, input_x_dims);
   this->param_.Out()->Resize(out_dims);
}
-template class ReshapeOp<CPU, float>;
+
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(reshape);
REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(reshape);
REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
diff --git a/src/operators/reshape_op.h b/src/operators/reshape_op.h
index 90d31153135f629585d56eb89ae12830215900d8..9284e94f346ed0f225d6dabe16077b1fb2034c64 100644
--- a/src/operators/reshape_op.h
+++ b/src/operators/reshape_op.h
@@ -28,20 +28,20 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
-class ReshapeOp
-    : public framework::OperatorWithKernel<
-          DeviceType, ReshapeParam, operators::ReshapeKernel<DeviceType, T>> {
+class ReshapeOp : public framework::OperatorWithKernel<
+                      DeviceType, ReshapeParam<DeviceType>,
+                      operators::ReshapeKernel<DeviceType, T>> {
 public:
  ReshapeOp(const std::string &type, const VariableNameMap &inputs,
            const VariableNameMap &outputs, const framework::AttributeMap &attrs,
            std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorWithKernel<DeviceType, ReshapeParam,
+      : framework::OperatorWithKernel<DeviceType, ReshapeParam<DeviceType>,
                                      operators::ReshapeKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  using framework::OperatorWithKernel<
-      DeviceType, ReshapeParam,
+      DeviceType, ReshapeParam<DeviceType>,
      operators::ReshapeKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
@@ -51,4 +51,14 @@ class ReshapeOp
} // namespace operators
} // namespace paddle_mobile
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(reshape);
+#endif 
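(Illustrative aside, not part of the patch: across this PR every USE_OP declaration moves out of the .cpp files and into the op headers, so pulling an op into a build only requires including its header. For a hypothetical op the two macros pair up as below; `my_op` and `MyOp` are placeholder names.)

// my_op.cpp: ties the C++ operator class to the op type string that
// appears in the fluid model file.
REGISTER_OPERATOR_CPU(my_op, ops::MyOp);

// my_op.h: referencing this from any translation unit forces the linker to
// keep the op's static registration entry.
USE_OP_CPU(my_op);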
+#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(reshape); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + #endif diff --git a/src/operators/resize_op.cpp b/src/operators/resize_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..02c50b662665fc9bd2f662922cb88dbce9fc5d53 --- /dev/null +++ b/src/operators/resize_op.cpp @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RESIZE_OP + +#include "operators/resize_op.h" +#include +namespace paddle_mobile { +namespace operators { + +template +void ResizeOp::InferShape() const { + auto out_dims = CalOutputShape(this->param_); + this->param_.Out()->Resize(out_dims); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(resize); +REGISTER_OPERATOR_CPU(resize, ops::ResizeOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(resize); +REGISTER_OPERATOR_MALI_GPU(resize, ops::ResizeOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/resize_op.h b/src/operators/resize_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c0b38bb1cf4048af4b07d05f28a88a5ac8056ea3 --- /dev/null +++ b/src/operators/resize_op.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef RESIZE_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/resize_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using paddle_mobile::framework::Tensor; + +template +class ResizeOp : public framework::OperatorWithKernel< + DeviceType, ResizeParam, + operators::ResizeKernel> { + public: + ResizeOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel, + operators::ResizeKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, ResizeParam, + operators::ResizeKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/scale_op.cpp b/src/operators/scale_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..968fcd4098e92a47899c9a733c0261d91c314c29 --- /dev/null +++ b/src/operators/scale_op.cpp @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SCALE_OP + +#include "operators/scale_op.h" +#include +namespace paddle_mobile { +namespace operators { + +template +void ScaleOp::InferShape() const { + auto input_dims = this->param_.InputX()->dims(); + this->param_.Out()->Resize(input_dims); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(scale); +REGISTER_OPERATOR_CPU(scale, ops::ScaleOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(scale); +REGISTER_OPERATOR_MALI_GPU(scale, ops::ScaleOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/scale_op.h b/src/operators/scale_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4c5f5e620f25bef88533e80cdd78b243fef9bc70 --- /dev/null +++ b/src/operators/scale_op.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/src/operators/scale_op.h b/src/operators/scale_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c5f5e620f25bef88533e80cdd78b243fef9bc70
--- /dev/null
+++ b/src/operators/scale_op.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SCALE_OP
+
+#pragma once
+
+#include <string>
+
+#include "framework/operator.h"
+#include "operators/kernel/scale_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class ScaleOp : public framework::OperatorWithKernel<
+                    DeviceType, ScaleParam,
+                    operators::ScaleKernel<DeviceType, T>> {
+ public:
+  ScaleOp(const std::string &type, const VariableNameMap &inputs,
+          const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+          std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, ScaleParam,
+                                      operators::ScaleKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, ScaleParam,
+      operators::ScaleKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/shape_op.cpp b/src/operators/shape_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b50a9c4507bff31ee753980c93917b93a4e1f42f
--- /dev/null
+++ b/src/operators/shape_op.cpp
@@ -0,0 +1,42 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SHAPE_OP
+
+#include "operators/shape_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+template <typename Dtype, typename T>
+void ShapeOp<Dtype, T>::InferShape() const {
+  PADDLE_MOBILE_ENFORCE(this->param_.Input() != nullptr,
+                        "Input (Input) of get_shape op should not be null.");
+  PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr,
+                        "Output (Out) of get_shape op should not be null.");
+  this->param_.Out()->Resize({this->param_.Input()->dims().size()});
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(shape, ops::ShapeOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
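ShapeOp's InferShape only fixes the output's static shape: Out becomes a rank-1 tensor whose single extent is the rank of the input, and the shape kernel writes the actual dimension values at run time. A concrete example of that contract (illustrative comment only):

// input dims {1, 3, 224, 224} (rank 4)  ->  InferShape resizes Out to {4};
// the shape kernel then fills Out with the values 1, 3, 224, 224.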
diff --git a/src/operators/shape_op.h b/src/operators/shape_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f88c807d3c331f83cf87e6c77a65fa5d90a9f4e
--- /dev/null
+++ b/src/operators/shape_op.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SHAPE_OP
+
+#pragma once
+
+#include <string>
+
+#include "framework/operator.h"
+#include "operators/kernel/shape_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class ShapeOp : public framework::OperatorWithKernel<
+                    DeviceType, ShapeParam,
+                    operators::ShapeKernel<DeviceType, T>> {
+ public:
+  ShapeOp(const std::string &type, const VariableNameMap &inputs,
+          const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+          std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, ShapeParam,
+                                      operators::ShapeKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, ShapeParam,
+      operators::ShapeKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(shape);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/sigmoid_op.cpp b/src/operators/sigmoid_op.cpp
index 79190e6c3368b9d375770062d948580779393f04..8ea4c98942e0630f5b69133991583ee1192c8153 100644
--- a/src/operators/sigmoid_op.cpp
+++ b/src/operators/sigmoid_op.cpp
@@ -22,13 +22,12 @@ template <typename DeviceType, typename T>
 void SigmoidOp<DeviceType, T>::InferShape() const {
   this->param_.Out()->Resize(this->param_.InputX()->dims());
 }
-template class SigmoidOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(sigmoid);
 REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
diff --git a/src/operators/sigmoid_op.h b/src/operators/sigmoid_op.h
index bd914a63783f65c7b55d783f2bbcdf19c303c00f..533ea587958e8766b1469c73b909cfa2fcb60696 100644
--- a/src/operators/sigmoid_op.h
+++ b/src/operators/sigmoid_op.h
@@ -25,20 +25,20 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 template <typename DeviceType, typename T>
-class SigmoidOp
-    : public framework::OperatorWithKernel<
-          DeviceType, SigmoidParam, operators::SigmoidKernel<DeviceType, T>> {
+class SigmoidOp : public framework::OperatorWithKernel<
+                      DeviceType, SigmoidParam,
+                      operators::SigmoidKernel<DeviceType, T>> {
  public:
   SigmoidOp(const std::string &type, const VariableNameMap &inputs,
             const VariableNameMap &outputs,
             const framework::AttributeMap &attrs,
             std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorWithKernel<
+      : framework::OperatorWithKernel<DeviceType, SigmoidParam,
                                       operators::SigmoidKernel<DeviceType, T>>(
             type, inputs, outputs, attrs, scope) {}
 
   using framework::OperatorWithKernel<
-     DeviceType, SigmoidParam,
+      DeviceType, SigmoidParam,
       operators::SigmoidKernel<DeviceType, T>>::OperatorWithKernel;
   void InferShape() const override;
 
@@ -46,4 +46,12 @@ class SigmoidOp
 }  // namespace operators
 }  // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(sigmoid);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
diff --git a/src/operators/slice_op.cpp b/src/operators/slice_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b77a675e10ed030443e1d4074239a715ddedf772
--- /dev/null
+++ b/src/operators/slice_op.cpp
@@ -0,0 +1,42 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SLICE_OP
+
+#include "operators/slice_op.h"
+#include <vector>
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void SliceOp<Dtype, T>::InferShape() const {
+  /// todo: add InputShape() detection.
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(slice);
+REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(slice);
+REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
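Note that SliceOp::InferShape above is an empty todo, so the output tensor's shape is not established until the kernel runs. A sketch of the shape logic a complete implementation would need, assuming the conventional slice attributes (the axes/starts/ends accessors are hypothetical, not SliceParam's verified API):

// auto dims = this->param_.InputX()->dims();
// for (size_t i = 0; i < axes.size(); ++i) {
//   // clamp starts/ends into [0, dims[axes[i]]] before subtracting
//   dims[axes[i]] = ends[i] - starts[i];
// }
// this->param_.Out()->Resize(dims);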
diff --git a/src/operators/slice_op.h b/src/operators/slice_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..6bcb6fa0b9e88cefb3c88dfc096e1073ad261c1b
--- /dev/null
+++ b/src/operators/slice_op.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SLICE_OP
+
+#pragma once
+
+#include <string>
+
+#include "framework/operator.h"
+#include "operators/kernel/slice_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class SliceOp : public framework::OperatorWithKernel<
+                    DeviceType, SliceParam,
+                    operators::SliceKernel<DeviceType, T>> {
+ public:
+  SliceOp(const std::string &type, const VariableNameMap &inputs,
+          const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+          std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, SliceParam,
+                                      operators::SliceKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, SliceParam,
+      operators::SliceKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp
index 296e3ef30f7c0260cca169bcfe2f6b445493792a..e85edc69c3291c794f2eeb8119b91b2926c4d870 100644
--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -22,20 +22,19 @@ template <typename DeviceType, typename T>
 void SoftmaxOp<DeviceType, T>::InferShape() const {
   this->param_.Out()->Resize(this->param_.InputX()->dims());
 }
-template class SoftmaxOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(softmax);
 REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(softmax);
 REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp);
 #endif
 
 #endif
diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h
index 1445ca055ea0472cdaa02d7496ff895feb9174bc..579a2ed605cb3f3c8c4a3d0c2f1ccc7bd9595fc2 100644
--- a/src/operators/softmax_op.h
+++ b/src/operators/softmax_op.h
@@ -25,20 +25,20 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 template <typename DeviceType, typename T>
-class SoftmaxOp
-    : public framework::OperatorWithKernel<
-          DeviceType, SoftmaxParam, operators::SoftmaxKernel<DeviceType, T>> {
+class SoftmaxOp : public framework::OperatorWithKernel<
+                      DeviceType, SoftmaxParam,
+                      operators::SoftmaxKernel<DeviceType, T>> {
  public:
   SoftmaxOp(const std::string &type, const VariableNameMap &inputs,
             const VariableNameMap &outputs,
             const framework::AttributeMap &attrs,
             std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorWithKernel<
+      : framework::OperatorWithKernel<DeviceType, SoftmaxParam,
                                       operators::SoftmaxKernel<DeviceType, T>>(
             type, inputs, outputs, attrs, scope) {}
 
   using framework::OperatorWithKernel<
-     DeviceType, SoftmaxParam,
+      DeviceType, SoftmaxParam,
       operators::SoftmaxKernel<DeviceType, T>>::OperatorWithKernel;
   void InferShape() const override;
 
@@ -48,4 +48,14 @@ class SoftmaxOp
 }  // namespace operators
 }  // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(softmax);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(softmax);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(softmax);
+#endif
+
 #endif
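A pattern change worth noting in the sigmoid and softmax diffs: the USE_OP_* declarations move out of the .cpp files into the op headers, while the REGISTER_OPERATOR_* definitions stay in the .cpp. Including the header is now sufficient to keep the op's registration symbol linked:

// before: callers paired the include with an explicit use declaration
//   #include "operators/softmax_op.h"
//   USE_OP_CPU(softmax);
// after: the header itself carries USE_OP_CPU(softmax) behind
// PADDLE_MOBILE_CPU, so a plain #include pulls in the registration.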
diff --git a/src/operators/split_op.cpp b/src/operators/split_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b7fadc1a64d1a6f7549e5875b543c871b385e6d
--- /dev/null
+++ b/src/operators/split_op.cpp
@@ -0,0 +1,91 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SPLIT_OP
+
+#include "operators/split_op.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void SplitOp<Dtype, T>::InferShape() const {
+  PADDLE_MOBILE_ENFORCE(this->param_.InputX() != nullptr,
+                        "Input(X) of SplitOp should not be null.");
+  const auto &outs = this->param_.Outs();
+  PADDLE_MOBILE_ENFORCE(outs.size() >= 1UL,
+                        "Outputs(Out) of SplitOp should not be empty.");
+
+  auto in_dims = this->param_.InputX()->dims();
+  size_t axis = static_cast<size_t>(this->param_.Axis());
+  size_t num = static_cast<size_t>(this->param_.Num());
+
+  const auto &sections = this->param_.Sections();
+
+  const size_t outs_number = outs.size();
+  std::vector<framework::DDim> outs_dims;
+  outs_dims.reserve(outs_number);
+
+  if (num > 0) {
+    int64_t in_axis_dim = in_dims[axis];
+    PADDLE_MOBILE_ENFORCE(in_axis_dim % num == 0,
+                          "tensor split does not result"
+                          " in an equal division");
+    size_t out_axis_dim = in_axis_dim / num;
+    for (size_t i = 0; i < outs_number; ++i) {
+      auto dim = in_dims;
+      dim[axis] = out_axis_dim;
+      outs_dims.push_back(dim);
+    }
+  } else if (sections.size() > 0) {
+    PADDLE_MOBILE_ENFORCE(sections.size() == outs_number,
+                          "tensor split sections size "
+                          "should be equal to output size.");
+    for (size_t i = 0; i < outs_number; ++i) {
+      auto dim = in_dims;
+      dim[axis] = sections[i];
+      outs_dims.push_back(dim);
+    }
+  }
+
+  PADDLE_MOBILE_ENFORCE(outs_dims.size() == outs.size(),
+                        "length == dims.size() must be true!");
+  for (size_t j = 0; j < outs_dims.size(); ++j) {
+    outs[j]->Resize(outs_dims[j]);
+  }
+
+  // todo lod impl
+  //  if (axis != 0) {
+  //    // Only pass LoD when not splitting along the first dim.
+  //    for (size_t i = 0; i < outs_number; ++i) {
+  //      ctx->ShareLoD("X", "Out", 0, i);
+  //    }
+  //  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(split, ops::SplitOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
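The shape arithmetic above, as a standalone, runnable sketch (framework types replaced with std::vector for illustration):

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<std::vector<int64_t>> SplitDims(const std::vector<int64_t> &in,
                                            int axis, int num,
                                            const std::vector<int> &sections) {
  std::vector<std::vector<int64_t>> out;
  if (num > 0) {  // equal split: the axis extent must divide evenly
    assert(in[axis] % num == 0);
    for (int i = 0; i < num; ++i) {
      auto d = in;
      d[axis] = in[axis] / num;
      out.push_back(d);
    }
  } else {  // explicit sections: one output per section length
    for (int s : sections) {
      auto d = in;
      d[axis] = s;
      out.push_back(d);
    }
  }
  return out;
}

// e.g. SplitDims({4, 100, 2, 2}, 1, 4, {}) yields four outputs of {4, 25, 2, 2};
// SplitDims({4, 100, 2, 2}, 1, 0, {10, 30, 60}) yields {4, 10, 2, 2},
// {4, 30, 2, 2} and {4, 60, 2, 2}.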
diff --git a/src/operators/split_op.h b/src/operators/split_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7d60b37441e77c5d47ac6040404535a841bcf8e
--- /dev/null
+++ b/src/operators/split_op.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SPLIT_OP
+
+#pragma once
+
+#include <string>
+
+#include "framework/operator.h"
+#include "operators/kernel/split_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class SplitOp : public framework::OperatorWithKernel<
+                    DeviceType, SplitParam,
+                    operators::SplitKernel<DeviceType, T>> {
+ public:
+  SplitOp(const std::string &type, const VariableNameMap &inputs,
+          const VariableNameMap &outputs, const framework::AttributeMap &attrs,
+          std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<DeviceType, SplitParam,
+                                      operators::SplitKernel<DeviceType, T>>(
+            type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, SplitParam,
+      operators::SplitKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(split);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/transpose_op.cpp b/src/operators/transpose_op.cpp
index 989b277b9d58a8c029e041a89a1982f8994bae44..5f193f96396c8d4d7cb58143573015384e7a7c28 100644
--- a/src/operators/transpose_op.cpp
+++ b/src/operators/transpose_op.cpp
@@ -47,13 +47,12 @@ void TransposeOp<DeviceType, T>::InferShape() const {
   }
   this->param_.Out()->Resize(out_dims);
 }
-template class TransposeOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(transpose);
 REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
diff --git a/src/operators/transpose_op.h b/src/operators/transpose_op.h
index 349220b58ff3e0daec8c7dc2e2dec969ced8b289..b96ce4e17ca4b0d0e321cefb3175b973cd7df307 100644
--- a/src/operators/transpose_op.h
+++ b/src/operators/transpose_op.h
@@ -29,7 +29,7 @@ using paddle_mobile::framework::Tensor;
 
 template <typename DeviceType, typename T>
 class TransposeOp : public framework::OperatorWithKernel<
-                    DeviceType, TransposeParam,
+                        DeviceType, TransposeParam,
                         operators::TransposeKernel<DeviceType, T>> {
  public:
   TransposeOp(const std::string &type, const VariableNameMap &inputs,
@@ -37,12 +37,12 @@ class TransposeOp : public framework::OperatorWithKernel<
               const framework::AttributeMap &attrs,
               std::shared_ptr<framework::Scope> scope)
       : framework::OperatorWithKernel<
-          DeviceType, TransposeParam,
+            DeviceType, TransposeParam,
             operators::TransposeKernel<DeviceType, T>>(type, inputs, outputs,
                                                        attrs, scope) {}
 
   using framework::OperatorWithKernel<
-     DeviceType, TransposeParam,
+      DeviceType, TransposeParam,
       operators::TransposeKernel<DeviceType, T>>::OperatorWithKernel;
   void InferShape() const override;
 };
@@ -50,4 +50,12 @@ class TransposeOp : public framework::OperatorWithKernel<
 }  // namespace operators
 }  // namespace paddle_mobile
 
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(transpose);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
 #endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 9bfc55c93daa2f69200941bfb49a8a6312fa9eb1..d68a8c1fb1a2cd0584d80d5afa8ed8f439d5d5d4 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,94 +1,215 @@
 set(dir ${CMAKE_CURRENT_SOURCE_DIR})
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build")
+set(FOUND_MATCH OFF)
 
-if (googlenet)
+set(CON -1)
+
+message(STATUS "nets :${NET}")
+
+list(FIND NET "googlenet" CON)
+if (CON GREATER -1)
     # gen test
-    ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
+    ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h
executor_for_test.h) target_link_libraries(test-googlenet paddle-mobile) -elseif (mobilenet) + + set(FOUND_MATCH ON) + +endif () + +list(FIND NET "mobilenet" CON) +if (CON GREATER -1) # gen test - ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-mobilenet paddle-mobile) -elseif (yolo) + # gen test - ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-mobilenet-combine net/test_mobilenet_combine.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-mobilenet-combine paddle-mobile) + set(FOUND_MATCH ON) + +endif () + +list(FIND NET "yolo" CON) +if (CON GREATER -1) + # gen test + ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-yolo paddle-mobile) -elseif (squeezenet) # gen test - ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test_yolo_combined paddle-mobile) + set(FOUND_MATCH ON) + +endif () + +list(FIND NET "squeezenet" CON) +if (CON GREATER -1) + # gen test + ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-squeezenet paddle-mobile) -elseif(resnet) + set(FOUND_MATCH ON) + +endif () + +list(FIND NET "resnet" CON) +if (CON GREATER -1) # gen test - ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet paddle-mobile) -else () + set(FOUND_MATCH ON) + +endif () + +list(FIND NET "FPGAnets" CON) +if (CON GREATER -1) + ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-resnet paddle-mobile) + + ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-resnet50 paddle-mobile) + + ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-fpga-EW paddle-mobile) + ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-fpga-conv paddle-mobile) + + ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-fpga-pooling paddle-mobile) + + ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-fpga-bypass paddle-mobile) + + ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-fpga-softmax paddle-mobile) + + ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-fpga-concat paddle-mobile) + + ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-tensor-quant paddle-mobile) + + ADD_EXECUTABLE(test-fpga-concat-op 
fpga/test_concat_op.cpp test_helper.h test_include.h) + target_link_libraries(test-fpga-concat-op paddle-mobile) + + ADD_EXECUTABLE(test-format-data fpga/test_format_data.cpp test_helper.h test_include.h) + target_link_libraries(test-format-data paddle-mobile) + set(FOUND_MATCH ON) + +endif () + +list(FIND NET "mobilenetssd" CON) +if (CON GREATER -1) # gen test - ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-mobilenetssd paddle-mobile) + + set(FOUND_MATCH ON) + +endif () + +list(FIND NET "nlp" CON) +if (CON GREATER -1) + # gen test + ADD_EXECUTABLE(test-nlp net/test_nlp.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-nlp paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-gru-op operators/test_gru_op.cpp test_helper.h test_include.h) + target_link_libraries(test-gru-op paddle-mobile) + set(FOUND_MATCH ON) + +endif () + +list(FIND NET "mobilenetfssd" CON) +if (CON GREATER -1) + # gen test + ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h) + target_link_libraries(test-fssd paddle-mobile) + + set(FOUND_MATCH ON) + +endif () + +list(FIND NET "genet" CON) +if (CON GREATER -1) + # gen test + ADD_EXECUTABLE(test-genet net/test_genet_combine.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-genet paddle-mobile) + set(FOUND_MATCH ON) + +endif () + +if (NOT FOUND_MATCH) + # gen test + ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet paddle-mobile) # gen test - ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-squeezenet paddle-mobile) # gen test - ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-yolo paddle-mobile) # gen test - ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test_yolo_combined paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-googlenet paddle-mobile) # gen test - ADD_EXECUTABLE(test-conv-op operators/test_cov_op.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-conv-op operators/test_cov_op.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-conv-op paddle-mobile) # gen test - ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h) + ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h) target_link_libraries(test-mul-op paddle-mobile) # gen test - ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h) + ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h) target_link_libraries(test-elementwiseadd-op paddle-mobile) # gen test - 
ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h) + ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h) target_link_libraries(test-concat-op paddle-mobile) # gen test - ADD_EXECUTABLE(test-lrn-op operators/test_lrn_op.cpp test_helper.h test_include.h) + ADD_EXECUTABLE(test-lrn-op operators/test_lrn_op.cpp test_helper.h test_include.h) target_link_libraries(test-lrn-op paddle-mobile) # gen test - ADD_EXECUTABLE(test-batchnorm-op operators/test_batchnorm_op.cpp test_helper.h test_include.h) + ADD_EXECUTABLE(test-batchnorm-op operators/test_batchnorm_op.cpp test_helper.h test_include.h) target_link_libraries(test-batchnorm-op paddle-mobile) # gen test - ADD_EXECUTABLE(test-priorbox-op operators/test_prior_box_op.cpp test_helper.h test_include.h) + ADD_EXECUTABLE(test-priorbox-op operators/test_prior_box_op.cpp test_helper.h test_include.h) target_link_libraries(test-priorbox-op paddle-mobile) # gen test - ADD_EXECUTABLE(test-boxcoder-op operators/test_box_coder_op.cpp test_helper.h test_include.h) + ADD_EXECUTABLE(test-boxcoder-op operators/test_box_coder_op.cpp test_helper.h test_include.h) target_link_libraries(test-boxcoder-op paddle-mobile) # gen test - ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h) + ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h) target_link_libraries(test-transpose-op paddle-mobile) # gen test - ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h) + ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h) target_link_libraries(test-multiclassnms-op paddle-mobile) # gen test - ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h) + ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h) target_link_libraries(test-reshape-op paddle-mobile) # gen test - ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h) + ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h) target_link_libraries(test-relu-op paddle-mobile) # gen test - ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h) + ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h) target_link_libraries(test-fc-op paddle-mobile) # gen test log @@ -99,6 +220,14 @@ else () ADD_EXECUTABLE(test-load framework/test_load.cpp) target_link_libraries(test-load paddle-mobile) + # gen test log + ADD_EXECUTABLE(test-loadmemory framework/test_load_memory.cpp) + target_link_libraries(test-loadmemory paddle-mobile) + + ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp) + target_link_libraries(test-inference-api paddle-mobile) + + # gen test log # gen test ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp) @@ -114,8 +243,12 @@ else () target_link_libraries(test-softmax paddle-mobile) # gen test - ADD_EXECUTABLE(test-gemm common/test_gemm.cpp) - target_link_libraries(test-gemm paddle-mobile) + ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp) + target_link_libraries(test-gemm-accuracy paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp) + target_link_libraries(test-gemm-perf paddle-mobile) # gen test ADD_EXECUTABLE(test-enforce common/test_enforce.cpp) @@ -126,11 +259,19 @@ else () 
target_link_libraries(test-openmp paddle-mobile) # gen test - ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-mobilenetssd paddle-mobile) # gen test - ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h) + ADD_EXECUTABLE(test-mobilenet-combine net/test_mobilenet_combine.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-mobilenet-combine paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-genet net/test_genet_combine.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-genet paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h) target_link_libraries(test-sigmoid paddle-mobile) # gen test @@ -138,13 +279,43 @@ else () target_link_libraries(test-depthwise-conv-op paddle-mobile) # gen test - ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-mobilenet paddle-mobile) # gen test - ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) + ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-conv-add-relu-op paddle-mobile) + # gen test + ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-conv-add-bn-relu-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-nlp net/test_nlp.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-nlp paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-gru-op operators/test_gru_op.cpp test_helper.h test_include.h) + target_link_libraries(test-gru-op paddle-mobile) + + # gen test + + ADD_EXECUTABLE(test-inceptionv4 net/test_inceptionv4.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-inceptionv4 paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-alexnet net/test_alexnet.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-alexnet paddle-mobile) + + ADD_EXECUTABLE(test-googlenetv1 net/test_googlenetv1_combine.cpp test_helper.h test_include.h) + target_link_libraries(test-googlenetv1 paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h) + target_link_libraries(test-fssd paddle-mobile) + + #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) -endif() + +endif () diff --git a/test/common/test_gemm.cpp b/test/common/test_gemm.cpp deleted file mode 100644 index aaf3c183f3e125f09695fad8a41cfb5360e9da13..0000000000000000000000000000000000000000 --- a/test/common/test_gemm.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <iostream>
-#include "../test_helper.h"
-#include "common/log.h"
-#include "memory/t_malloc.h"
-#include "operators/math/gemm.h"
-
-#define a(i, j) a[(i)*lda + (j)]
-#define b(i, j) b[(i)*ldb + (j)]
-#define c1(i, j) c1[(i)*ldc + (j)]
-
-#define m 62
-#define n 63
-#define k 74
-
-int main() {
-  int lda = k;
-  int ldb = n;
-  int ldc = n;
-
-  float *a =
-      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * k));
-  float *b =
-      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * k * n));
-  float *c =
-      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
-  float *c1 =
-      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
-
-  for (int i = 0; i < m * k; ++i) {
-    a[i] = 2;
-  }
-  for (int i = 0; i < k * n; ++i) {
-    b[i] = 2;
-  }
-  for (int i = 0; i < m * n; ++i) {
-    c[i] = 2;
-    c1[i] = 2;
-  }
-
-  auto time1 = time();
-  paddle_mobile::operators::math::sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3, c,
-                                        ldc);
-  auto time2 = time();
-  DLOG << "gemm cost :" << time_diff(time1, time2) << "ms\n";
-  for (int i = 0; i < m * n; ++i) {
-    std::cout << c[i] << " | ";
-    if (i % n == (n - 1)) {
-      std::cout << std::endl;
-    }
-  }
-  for (int j = 0; j < n; ++j) {
-    for (int i = 0; i < m; ++i) {
-      c1(i, j) *= 0.3;
-      for (int p = 0; p < k; ++p) {
-        c1(i, j) += 0.9 * a(i, p) * b(p, j);
-      }
-    }
-  }
-  std::cout << "reference result comparison:" << std::endl;
-  for (int i = 0; i < m * n; ++i) {
-    std::cout << c1[i] << " | ";
-    if (i % n == (n - 1)) {
-      std::cout << std::endl;
-    }
-  }
-  return 0;
-}
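The single test_gemm.cpp deleted above is replaced by two focused tests below: test_gemm_accuracy.cpp checks SgemmWithBn against a naive triple-loop reference on randomized inputs, and test_gemm_perf.cpp times the matmul path. Restating the property the accuracy test verifies (a comment-level summary of its reference loop, not new behavior):

// for every (i, j):
//   r = scale[i] * (sum over p of a(i, p) * b(p, j)) + bias[i]
//   c1(i, j) = relu ? max(r, 0) : r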
diff --git a/test/common/test_gemm_accuracy.cpp b/test/common/test_gemm_accuracy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e31a5f2fe9b41f90f9aebfe44db908682f83ce1
--- /dev/null
+++ b/test/common/test_gemm_accuracy.cpp
@@ -0,0 +1,136 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cstdlib>
+#include <ctime>
+#include <iostream>
+#include "../test_helper.h"
+#include "common/log.h"
+#include "memory/t_malloc.h"
+#include "operators/math/gemm.h"
+
+#define a(i, j) a[(i)*lda + (j)]
+#define b(i, j) b[(i)*ldb + (j)]
+#define c(i, j) c[(i)*ldc + (j)]
+#define c1(i, j) c1[(i)*ldc + (j)]
+
+void print_matrix(int m, int n, int ldc, float *c) {
+  for (int i = 0; i < m; ++i) {
+    std::cout << c(i, 0);
+    for (int j = 1; j < n; ++j) {
+      std::cout << " | " << c(i, j);
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+}
+
+int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
+  int lda = k;
+  int ldb = n;
+  int ldc = n;
+
+  float *a =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * k));
+  float *b =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * k * n));
+  float *c =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
+  float *c1 =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
+  float *scale =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
+  float *bias =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
+
+  srand(unsigned(time(0)));
+  for (int i = 0; i < m * k; ++i) {
+    a[i] = t1 + rand() % t2;
+  }
+  for (int i = 0; i < k * n; ++i) {
+    b[i] = t1 + rand() % t2;
+  }
+  for (int i = 0; i < m; ++i) {
+    scale[i] = t1 + rand() % t2;
+  }
+  for (int i = 0; i < m; ++i) {
+    bias[i] = t1 + rand() % t2;
+  }
+
+  for (int i = 0; i < m; ++i) {
+    for (int j = 0; j < n; ++j) {
+      float r = 0;
+      for (int p = 0; p < k; p++) {
+        r += a(i, p) * b(p, j);
+      }
+      r *= scale[i];
+      r += bias[i];
+      if (relu && (r < 0)) {
+        r = 0;
+      }
+      c1(i, j) = r;
+    }
+  }
+
+  paddle_mobile::operators::math::SgemmWithBn(
+      m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, nullptr);
+  int eq = 0;
+  int neq = 0;
+  for (int i = 0; i < m * n; ++i) {
+    if (static_cast<int>(c[i]) == static_cast<int>(c1[i])) {
+      ++eq;
+    } else {
+      ++neq;
+    }
+  }
+
+  if (pr > 0) {
+    std::cout << "A:" << std::endl;
+    print_matrix(m, k, lda, a);
+    std::cout << "B:" << std::endl;
+    print_matrix(k, n, ldb, b);
+    std::cout << "C:" << std::endl;
+    print_matrix(m, n, ldc, c);
+    std::cout << "C1:" << std::endl;
+    print_matrix(m, n, ldc, c1);
+  }
+
+  std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
+            << " eq=" << eq << " neq=" << neq << std::endl;
+
+  paddle_mobile::memory::Free(a);
+  paddle_mobile::memory::Free(b);
+  paddle_mobile::memory::Free(c);
+  paddle_mobile::memory::Free(c1);
+  paddle_mobile::memory::Free(scale);
+  paddle_mobile::memory::Free(bias);
+
+  return 0;
+}
+
+int main() {
+  do_sgemm(9, 9, 9, true, 10, 10, 10);
+  do_sgemm(10, 6, 12, false, 10, 10, 0);
+  do_sgemm(512, 256, 384, false, 10, 10, 0);
+  do_sgemm(1366, 768, 256, false, 10, 10, 0);
+  do_sgemm(1255, 755, 333, false, 10, 10, 0);
+  do_sgemm(555, 777, 999, false, 10, 10, 0);
+
+  do_sgemm(10, 6, 12, true, -4, 10, 0);
+  do_sgemm(512, 256, 384, true, -4, 10, 0);
+  do_sgemm(1366, 768, 256, true, -4, 10, 0);
+  do_sgemm(1255, 755, 333, true, -4, 10, 0);
+  do_sgemm(555, 777, 999, true, -4, 10, 0);
+  return 0;
+}
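One caveat when reading the eq/neq counts above: the comparison truncates both results to int, so values that differ by less than one whole unit count as equal. A relative-tolerance check is a stricter alternative (sketch; the 1e-4 threshold is an assumption, not a project constant):

#include <cmath>

static bool nearly_equal(float x, float y, float rel = 1e-4f) {
  return std::fabs(x - y) <=
         rel * std::fmax(1.0f, std::fmax(std::fabs(x), std::fabs(y)));
}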
diff --git a/test/common/test_gemm_perf.cpp b/test/common/test_gemm_perf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..386c09d71a3d5709842991bffd2e8ea039edc940
--- /dev/null
+++ b/test/common/test_gemm_perf.cpp
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include "../test_helper.h"
+#include "../test_include.h"
+#include "operators/math/gemm.h"
+#include "operators/math/math_function.h"
+
+#define a(i, j) a[(i)*lda + (j)]
+#define b(i, j) b[(i)*ldb + (j)]
+#define c1(i, j) c1[(i)*ldc + (j)]
+
+#define m 1024
+#define n 1024
+#define k 1024
+
+int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
+  Tensor aa, bb, cc, scale, bias;
+  auto aaptr = aa.mutable_data<float>({m, k});
+  auto bbptr = bb.mutable_data<float>({k, n});
+  auto ccptr = cc.mutable_data<float>({m, n});
+  auto scaleptr = scale.mutable_data<float>({m});
+  auto biasptr = bias.mutable_data<float>({m});
+
+  for (int i = 0; i < m * k; ++i) {
+    aaptr[i] = 2;
+  }
+  for (int i = 0; i < k * n; ++i) {
+    bbptr[i] = 2;
+  }
+  for (int i = 0; i < m * n; ++i) {
+    ccptr[i] = 2;
+  }
+  for (int i = 0; i < m; ++i) {
+    scaleptr[i] = 1;
+    biasptr[i] = 0;
+  }
+
+  auto time1 = time();
+  for (int j = 0; j < 10; ++j) {
+    paddle_mobile::operators::math::matmul<float>(
+        aa, false, bb, false, static_cast<float>(1), &cc,
+        static_cast<float>(0), false, biasptr);
+
+    //    paddle_mobile::operators::math::matmulWithBn<float>(
+    //        aa, false, bb, false, static_cast<float>(1), &cc,
+    //        static_cast<float>(0), true, &scale, &bias, 0);
+  }
+  auto time2 = time();
+  std::cout << "gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
+
+  return 0;
+}
diff --git a/test/executor_for_test.h b/test/executor_for_test.h
index 0d3051327a57202e2b8d1dcbdda571fd244de108..93847af20a6d48a6df33dc50f6c6a1db76facf51 100644
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
@@ -19,7 +19,7 @@ limitations under the License. */
 
 #include "common/log.h"
 #include "framework/op_registry.h"
-#include "io/io.h"
+#include "io/executor.h"
 #include "operators/conv_op.h"
 #include "operators/elementwise_add_op.h"
 #include "operators/pool_op.h"
@@ -43,7 +43,7 @@ template <typename DeviceType, typename OpType>
 class Executor4Test : public Executor<DeviceType> {
  public:
   Executor4Test(Program<DeviceType> p, string op_type,
-                bool use_optimize = false)
+                bool use_optimize = false, int predict_op_count = 1)
       : Executor<DeviceType>() {
     this->use_optimize_ = use_optimize;
     this->program_ = p;
@@ -57,12 +57,14 @@ class Executor4Test : public Executor<DeviceType> {
       LOG(paddle_mobile::LogLevel::kLOG_ERROR)
           << "to_predict_program_ == nullptr";
     }
+
     const std::vector<std::shared_ptr<BlockDesc>> blocks =
         this->to_predict_program_->Blocks();
     for (std::shared_ptr<BlockDesc> block_desc : blocks) {
       std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-      for (std::shared_ptr<OpDesc> op : ops) {
-        if (op->Type() == op_type) {
+      for (int i = 0; i < ops.size(); ++i) {
+        auto op = ops[i];
+        if (op->Type() == op_type && i < predict_op_count) {
           DLOG << "matched: " << op->Type();
           /// test first meeting op in program
@@ -72,11 +74,17 @@ class Executor4Test : public Executor<DeviceType> {
               op->Type(), op->GetInputs(), op->GetOutputs(),
               op->GetAttrMap(), this->program_.scope);
           this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
-          break;
         }
       }
     }
     this->InitMemory();
+
+    std::shared_ptr<BlockDesc> to_predict_block =
+        this->to_predict_program_->Block(0);
+    auto &ops = this->ops_of_block_[*to_predict_block.get()];
+    for (const auto &op : ops) {
+      op->Init();
+    }
   }
 
   template <typename T = LoDTensor>
@@ -130,9 +138,6 @@ class Executor4Test : public Executor<DeviceType> {
     auto *output_tensor = con_output->GetMutable<LoDTensor>();
     output_tensor->mutable_data<float>(dDim);
 
-    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
-    out_tensor.reset(output_tensor);
-
     std::shared_ptr<BlockDesc> to_predict_block =
         this->to_predict_program_->Block(0);
     for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size();
@@ -141,6 +146,7 @@ class Executor4Test : public Executor<DeviceType> {
       op->Run();
     }
 
-    return out_tensor;
+    return std::make_shared<paddle_mobile::framework::Tensor>(
+        paddle_mobile::framework::Tensor(*output_tensor));
   }
 };
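The Executor4Test change above adds a predict_op_count parameter: instead of stopping at the first op whose type matches (the old break), it now collects every matching op among the first predict_op_count ops of the block, and calls Init() on each constructed op before any Run(). Usage sketch (types abridged; CPU/float instantiation assumed):

// Run the first two "concat" ops of the program instead of just the first:
// Executor4Test<paddle_mobile::CPU,
//               paddle_mobile::operators::ConcatOp<paddle_mobile::CPU, float>>
//     executor(program, "concat", false, 2);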
diff --git a/test/fpga/test_concat_op.cpp b/test/fpga/test_concat_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5d1a5828b36b3d9ed371a271af6db82657ff1596
--- /dev/null
+++ b/test/fpga/test_concat_op.cpp
@@ -0,0 +1,87 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/concat_op.h"
+
+int main() {
+  paddle_mobile::Loader<paddle_mobile::FPGA> loader;
+  auto program = loader.Load(g_googlenet);
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::FPGA,
+                paddle_mobile::operators::ConcatOp<paddle_mobile::FPGA, float>>
+      executor(program, "concat");
+
+  // 1. input_tensors;
+  vector<Tensor> input_tensors;
+
+  Tensor input1;
+  auto input1_data = CreateInput<float>(&input1, {4, 10, 2, 2}, 0, 1);
+  input_tensors.push_back(input1);
+  Tensor input2;
+  auto input2_data = CreateInput<float>(&input2, {4, 20, 2, 2}, 0, 1);
+  input_tensors.push_back(input2);
+  Tensor input3;
+  auto input3_data = CreateInput<float>(&input3, {4, 30, 2, 2}, 0, 1);
+  input_tensors.push_back(input3);
+  Tensor input4;
+  auto input4_data = CreateInput<float>(&input4, {4, 40, 2, 2}, 0, 1);
+  input_tensors.push_back(input4);
+  // 2. input_names
+  vector<string> input_names({
+      "conv2d_3.tmp_1",
+      "conv2d_5.tmp_1",
+      "conv2d_7.tmp_1",
+      "conv2d_8.tmp_1",
+  });
+
+  // 3. output_names
+  vector<string> output_names({"concat_0.tmp_0"});
+
+  // 4. out_dims;
+  vector<DDim> out_ddims;
+  auto out_ddim = paddle_mobile::framework::make_ddim({4, 100, 2, 2});
+  out_ddims.push_back(out_ddim);
+
+  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
+                                            output_names, out_ddims);
+
+  auto output0_data = output[0]->data<float>();
+
+  // 5. test one example.
+  int input_n = 1;
+  int input_c = 2;
+  int input_h = 0;
+  int input_w = 1;
+  int stride0 = input3.numel() / input3.dims()[0];
+  int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1];
+  int stride2 = input3.dims()[3];
+  /// inputx1 (4,10,2,2),
+  /// inputx2 (4,20,2,2),
+  /// inputx3 (4,30,2,2),
+  /// inputx4 (4,40,2,2),
+  /// axis = 1
+  /// output (4,100,2,2)
+  int input_index =
+      input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w;
+  int output_index = input_n * 100 * 2 * 2 +
+                     (input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 +
+                     input_h * 2 + input_w;
+
+  DLOG << " input3 [1, 2,0,1] = " << input3_data[input_index];
+  DLOG << " output [1,32,0,1] = " << output0_data[output_index];
+  return 0;
+}
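The element check at the end of the concat test is plain NCHW flat indexing; written as a standalone helper it is (sketch):

#include <cstdint>

inline int64_t FlatIndex(int64_t n, int64_t c, int64_t h, int64_t w,
                         int64_t C, int64_t H, int64_t W) {
  return ((n * C + c) * H + h) * W + w;
}

// Element (1, 2, 0, 1) of input3 {4, 30, 2, 2} must equal element
// (1, 2 + 10 + 20, 0, 1) of the axis-1 concatenation {4, 100, 2, 2}:
// FlatIndex(1, 32, 0, 1, 100, 2, 2) == 1 * 400 + 32 * 4 + 0 * 2 + 1 == 529.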
diff --git a/test/fpga/test_format_data.cpp b/test/fpga/test_format_data.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d67c3110ff86dc6fba2d49412edb70ab1c9c16d
--- /dev/null
+++ b/test/fpga/test_format_data.cpp
@@ -0,0 +1,93 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include "../test_helper.h"
+#include "../test_include.h"
+#include "fpga/api.h"
+
+namespace frame = paddle_mobile::framework;
+namespace fpga = paddle_mobile::fpga;
+using std::cout;
+using std::endl;
+
+void test_format_image() {
+  std::vector<int> dims{1, 1, 3, 3};
+  std::vector<float> elements{1, 2, 3, 4, 5, 6, 7, 8, 9};
+  frame::DDim ddim = frame::make_ddim(dims);
+  frame::Tensor image(elements, ddim);
+  int num = image.numel();
+  float *data_ptr = image.mutable_data<float>();
+
+  for (int i = 0; i < num; i++) {
+    cout << data_ptr[i] << " ";
+  }
+  cout << endl;
+
+  fpga::format_image(&image);
+  data_ptr = image.mutable_data<float>();
+
+  for (int i = 0; i < 48; i++) {
+    cout << data_ptr[i] << " ";
+  }
+  cout << endl;
+  auto dd = image.dims();
+  cout << dd[0] << dd[1] << dd[2] << dd[3] << endl;
+}
+
+void test_fill_conv_arg() {
+  Tensor input, out, filter;
+  DLOG << "Setup input";
+  SetupTensor<float>(&input, {1, 250, 32, 30}, static_cast<float>(0),
+                     static_cast<float>(1));
+
+  DLOG << "Setup filter";
+  SetupTensor<float>(&filter, {1001, 250, 3, 3}, static_cast<float>(0),
+                     static_cast<float>(1));
+
+  DLOG << "Setup output";
+  SetupTensor<float>(&out, {1, 1001, 32, 30}, static_cast<float>(0),
+                     static_cast<float>(1));
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * 1001 * sizeof(float));
+
+  DLOG << "find max";
+  float max_value = fpga::filter_find_max(&filter);
+  DLOG << "format filter";
+  fpga::format_filter(&filter, max_value, 1);
+
+  DLOG << "format bs_ptr";
+  int element_num_per_div = fpga::get_filter_num_per_div(&filter, 1);
+  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, 1001);
+
+  DLOG << "format ofm";
+  fpga::format_fp16_ofm(&out);
+  DLOG << "Build arg";
+
+  fpga::WrapperConvArgs arg;
+  fpga::fill_conv_arg(&arg, &input, &out, &filter, true, 1, 1, 1, 1, 1, bs_ptr);
+  DLOG << "splitNum: " << arg.split_num << " group_num:" << arg.group_num
+       << " filter_num:" << arg.filter_num;
+
+  for (int i = 0; i < arg.split_num; i++) {
+    DLOG << arg.conv_args[i].filter_num << " " << arg.conv_args[i].sb_address
+         << " " << arg.conv_args[i].filter_address << " "
+         << arg.conv_args[i].filter_scale_address;
+  }
+}
+
+int main() {
+  test_format_image();
+  test_fill_conv_arg();
+  return 0;
+}
diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cca6793f10da5a0784cf8a3ba2d0104f3508028d
--- /dev/null
+++ b/test/fpga/test_resnet50.cpp
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+static const char *g_resnet_combine = "../models/resnet50";
+
+int main() {
+  DLOG << paddle_mobile::fpga::open_device();
+  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
+  if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
+                         std::string(g_resnet_combine) + "/params", true)) {
+    std::vector<int64_t> dims{1, 3, 224, 224};
+    Tensor input_tensor;
+    SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
+                       static_cast<float>(1));
+
+    std::vector<float> input(input_tensor.data<float>(),
+                             input_tensor.data<float>() + input_tensor.numel());
+
+    paddle_mobile.FeedData(input_tensor);
+    paddle_mobile.Predict_To(-1);
+    //    paddle_mobile.Predict_From(73);
+    //    paddle_mobile.Predict_From_To(72, 73);
+
+    DLOG << "Computation done";
+    return 0;
+  }
+}
diff --git a/test/fpga/test_tensor_quant.cpp b/test/fpga/test_tensor_quant.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6cfc27e91ced109e41bf5420649dbb762ee94d66
--- /dev/null
+++ b/test/fpga/test_tensor_quant.cpp
@@ -0,0 +1,45 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include "../test_helper.h"
+#include "../test_include.h"
+
+int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
+  auto time1 = time();
+  if (paddle_mobile.Load(g_resnet, true)) {
+    auto time2 = time();
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+    std::vector<int64_t> dims{1, 3, 32, 32};
+    Tensor input_tensor;
+    SetupTensor<float>(&input_tensor, {1, 3, 32, 32}, static_cast<float>(0),
+                       static_cast<float>(1));
+
+    std::vector<float> input(input_tensor.data<float>(),
+                             input_tensor.data<float>() + input_tensor.numel());
+    // warm up once
+    paddle_mobile.Predict(input, dims);
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      paddle_mobile.Predict(input, dims);
+    }
+    auto time4 = time();
+    std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
+              << std::endl;
+  }
+
+  return 0;
+}
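All the timing tests in this patch follow the same pattern: one (or ten) warm-up predictions, then ten timed iterations averaged. Factored out, the pattern looks like this (sketch; assumes the time()/time_diff() helpers from test_helper.h):

template <typename F>
double average_predict_ms(F &&predict, int iters = 10) {
  predict();  // warm-up run pays one-time setup and allocation costs
  auto start = time();
  for (int i = 0; i < iters; ++i) {
    predict();
  }
  return time_diff(start, time()) / iters;
}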
diff --git a/test/framework/test_inference_api.cpp b/test/framework/test_inference_api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7dec2fe29753c75ee70f31428d104450acce9404
--- /dev/null
+++ b/test/framework/test_inference_api.cpp
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include "io/paddle_inference_api.h"
+
+using namespace paddle_mobile;
+
+PaddleMobileConfig GetConfig() {
+  PaddleMobileConfig config;
+  config.precision = PaddleMobileConfig::FP32;
+  config.device = PaddleMobileConfig::kCPU;
+  config.model_dir = "../models/mobilenet/";
+  config.thread_num = 4;
+  return config;
+}
+
+int main() {
+  PaddleMobileConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<PaddleMobileConfig,
+                            PaddleEngineKind::kPaddleMobile>(config);
+
+  float data[1 * 3 * 224 * 224] = {1.0f};
+
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({1, 3, 224, 224});
+  tensor.data = PaddleBuf(data, sizeof(data));
+  tensor.dtype = PaddleDType::FLOAT32;
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+
+  PaddleTensor tensor_out;
+  tensor_out.shape = std::vector<int>({});
+  tensor_out.data = PaddleBuf();
+  tensor_out.dtype = PaddleDType::FLOAT32;
+  std::vector<PaddleTensor> outputs(1, tensor_out);
+
+  assert(predictor->Run(paddle_tensor_feeds, &outputs));
+
+  float* data_o = static_cast<float*>(outputs[0].data.data());
+  for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
+    std::cout << "output[" << j << "]: " << data_o[j] << std::endl;
+  }
+
+  return 0;
+}
diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp
index 05b60d3e9105fc7576d29b7c24c94e2a28c85a70..64fa42658be6b39fabe9bb26296a426949d31197 100644
--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -12,18 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <string>
+
 #include "../test_helper.h"
-#include "io/io.h"
+#include "io/loader.h"
 
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
-  auto program = loader.Load(g_googlenet, true);
-  //  auto program = loader.Load(g_googlenet_combine + "/model",
-  //                             g_googlenet_combine +
-  //                             "/params", true);
+  //  auto program = loader.Load(g_googlenet, true);
+  //  auto program = loader.Load(g_mobilenet_ssd, true);
+
+  auto program = loader.Load(std::string(g_ocr) + "/model",
+                             std::string(g_ocr) + "/params", false);
   //  program.originProgram->Description("program desc: ");
   return 0;
 }
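test_load_memory.cpp below reads the whole model and params files into memory and hands the buffers to LoadCombinedMemory. Its ReadBuffer helper uses C stdio with a manual fread loop; an iostream equivalent with RAII ownership would be (sketch, not the project's API):

#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

static std::vector<uint8_t> ReadFile(const std::string &path) {
  std::ifstream in(path, std::ios::binary | std::ios::ate);
  std::vector<uint8_t> buf(static_cast<size_t>(in.tellg()));
  in.seekg(0);
  in.read(reinterpret_cast<char *>(buf.data()), buf.size());
  return buf;
}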
diff --git a/test/framework/test_load_memory.cpp b/test/framework/test_load_memory.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4be7aaa82f53bd8c5ccfb531339827534b2736ab
--- /dev/null
+++ b/test/framework/test_load_memory.cpp
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+
+#include "../test_helper.h"
+#include "../test_include.h"
+static size_t ReadBuffer(const char *file_name, uint8_t **out) {
+  FILE *fp;
+  fp = fopen(file_name, "rb");
+  PADDLE_MOBILE_ENFORCE(fp != nullptr, " %s open failed !", file_name);
+  fseek(fp, 0, SEEK_END);
+  auto size = static_cast<size_t>(ftell(fp));
+  rewind(fp);
+  DLOG << "model size: " << size;
+  *out = reinterpret_cast<uint8_t *>(malloc(size));
+  size_t cur_len = 0;
+  size_t nread;
+  while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
+    cur_len += nread;
+  }
+  fclose(fp);
+  return cur_len;
+}
+
+static char *Get_binary_data(std::string filename) {
+  FILE *file = fopen(filename.c_str(), "rb");
+  PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
+                        filename.c_str());
+  fseek(file, 0, SEEK_END);
+  int64_t size = ftell(file);
+  PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
+  rewind(file);
+  auto *data = new char[size];
+  size_t bytes_read = fread(data, 1, size, file);
+  PADDLE_MOBILE_ENFORCE(bytes_read == size,
+                        "read binary file bytes do not match with fseek");
+  fclose(file);
+  return data;
+}
+
+int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  auto model_path = std::string(g_genet_combine) + "/model";
+  auto params_path = std::string(g_genet_combine) + "/params";
+  uint8_t *bufModel = nullptr;
+  size_t sizeBuf = ReadBuffer(model_path.c_str(), &bufModel);
+  uint8_t *bufParams = nullptr;
+
+  DLOG << "sizeBuf: " << sizeBuf;
+  size_t sizeParams = ReadBuffer(params_path.c_str(), &bufParams);
+  DLOG << "sizeParams: " << sizeParams;
+
+  paddle_mobile.LoadCombinedMemory(sizeBuf, bufModel, sizeParams, bufParams);
+  return 0;
+}
diff --git a/test/framework/test_optimize.cpp b/test/framework/test_optimize.cpp
index 32574764e1ba538ab0bea31d1e238096e7098dfc..3cae963eca048da221d69c4c336dd4fdfecbb584 100644
--- a/test/framework/test_optimize.cpp
+++ b/test/framework/test_optimize.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "../test_helper.h"
 #include "framework/program/program-optimize/node.h"
 #include "framework/program/program-optimize/program_optimize.h"
-#include "io/io.h"
+#include "io/loader.h"
 
 int main() {
   paddle_mobile::Loader<paddle_mobile::CPU> loader;
diff --git a/test/net/test_alexnet.cpp b/test/net/test_alexnet.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..50053fe82f95177fd786c1c8f8f5c9b7a521b888
--- /dev/null
+++ b/test/net/test_alexnet.cpp
@@ -0,0 +1,59 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
*/ + +#include <iostream> +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; + paddle_mobile.SetThreadNum(4); + auto time1 = time(); + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", + // std::string(g_mobilenet_detect) + "/params", true); + + auto isok = paddle_mobile.Load(g_alexnet, true); + if (isok) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + + std::vector<float> input; + std::vector<int64_t> dims{1, 3, 224, 224}; + GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims); + + auto vec_result = paddle_mobile.Predict(input, dims); + std::vector<float>::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + + // Warm up ten times + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + DLOG << vec_result; + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } + + std::cout << "If the result is NaN, check whether test/images/g_test_image_1x3x224x224_banana " + "exists." + << std::endl; + return 0; +} diff --git a/test/net/test_genet_combine.cpp b/test/net/test_genet_combine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e6b0505a670f1a58ed7d09cc4854ef52b05b0649 --- /dev/null +++ b/test/net/test_genet_combine.cpp @@ -0,0 +1,51 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <iostream> +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; + paddle_mobile.SetThreadNum(4); + auto time1 = time(); + if (paddle_mobile.Load(std::string(g_genet_combine) + "/model", + std::string(g_genet_combine) + "/params", true)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + + std::vector<float> input; + std::vector<int64_t> dims{1, 3, 128, 128}; + GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims); + + // Warm up once + auto vec_result = paddle_mobile.Predict(input, dims); + std::vector<float>::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } + std::cout + << "If the result is NaN, check whether test/images/test_image_1x3x224x224_float exists."
+ << std::endl; + return 0; +} diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index 1695995a8d60d20e0d6c5f8911c39a948426a82a..a2f030eeac5c2584b33fad2b082b9d5513707260 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -12,30 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include <fstream> +#include <iostream> #include "../test_helper.h" #include "../test_include.h" int main() { - paddle_mobile::Loader<paddle_mobile::CPU> loader; +#ifdef PADDLE_MOBILE_FPGA + paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile; +#endif + +#ifdef PADDLE_MOBILE_CPU + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; +#endif + + paddle_mobile.SetThreadNum(4); bool optimize = true; auto time1 = time(); - // auto program = loader.Load(g_googlenet, optimize); - auto program = loader.Load(g_googlenet_combine + "/model", - g_googlenet_combine + "/params", optimize); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time2) << "ms\n"; - paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize); - std::vector<float> input; - std::vector<int64_t> dims{1, 3, 224, 224}; - GetInput<float>(g_test_image_1x3x224x224, &input, dims); - auto time3 = time(); - - for (int i = 0; i < 10; ++i) { - executor.Predict(input, dims); - } + if (paddle_mobile.Load(g_googlenet, optimize)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + std::vector<float> input; + std::vector<int64_t> dims{1, 3, 224, 224}; + GetInput<float>(g_test_image_1x3x224x224, &input, dims); + // Warm up ten times + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = time(); - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n"; + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } return 0; } diff --git a/test/net/test_googlenetv1_combine.cpp b/test/net/test_googlenetv1_combine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9aab25afd2aa6ece4e6b99bbd368b8a5be2e3106 --- /dev/null +++ b/test/net/test_googlenetv1_combine.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include <iostream> +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; + paddle_mobile.SetThreadNum(4); + auto time1 = time(); + if (paddle_mobile.Load(std::string(g_googlenetv1_combined) + "/model", + std::string(g_googlenetv1_combined) + "/params", + false)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + + std::vector<float> input; + std::vector<int64_t> dims{1, 3, 160, 160}; + GetInput<float>(g_img, &input, dims); + + for (int i = 0; i < input.size(); i += 1000) { + std::cout << input[i] << std::endl; + } + // auto vec_result = paddle_mobile.Predict(input, dims); + // std::vector<float>::iterator biggest = + // std::max_element(std::begin(vec_result), std::end(vec_result)); + // std::cout << " Max element is " << *biggest << " at position " + // << std::distance(std::begin(vec_result), biggest) << + // std::endl; + + // // Warm up + // for (int i = 0; i < 1; ++i) { + // auto vec_result = paddle_mobile.Predict(input, dims); + // } + auto time3 = time(); + + auto vec_result = paddle_mobile.Predict(input, dims); + + for (int j = 0; j < vec_result.size(); ++j) { + std::cout << j << " : " << vec_result[j] << std::endl; + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms" + << std::endl; + } + + return 0; +} diff --git a/test/net/test_inceptionv4.cpp b/test/net/test_inceptionv4.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fbbc9dd39e64f7a8ea745cf7489e46f00ffe1413 --- /dev/null +++ b/test/net/test_inceptionv4.cpp @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include <iostream> +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; + paddle_mobile.SetThreadNum(4); + auto time1 = time(); + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", + // std::string(g_mobilenet_detect) + "/params", true); + + auto isok = paddle_mobile.Load(g_inceptionv4, true); + if (isok) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + + std::vector<float> input; + std::vector<int64_t> dims{1, 3, 224, 224}; + GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims); + + auto vec_result = paddle_mobile.Predict(input, dims); + std::vector<float>::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + + // Warm up ten times + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + // DLOG << vec_result; + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } + + std::cout << "If the result is NaN, check whether test/images/g_test_image_1x3x224x224_banana " + "exists." + << std::endl; + return 0; +} diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp index 097d03ad710468a881050ff729e8352f029d664f..85083ca441ad242ffb5b63dd612a0e35e3589f99 100644 --- a/test/net/test_mobilenet+ssd.cpp +++ b/test/net/test_mobilenet+ssd.cpp @@ -12,28 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#include <fstream> +#include <iostream> #include "../test_helper.h" #include "../test_include.h" int main() { - paddle_mobile::Loader<paddle_mobile::CPU> loader; + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; + paddle_mobile.SetThreadNum(4); auto time1 = time(); - auto program = loader.Load(g_mobilenet_ssd, true); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, true); + auto isok = paddle_mobile.Load( + std::string(g_mobilenet_ssd_gesture) + "/model", + std::string(g_mobilenet_ssd_gesture) + "/params", true); + // auto isok = paddle_mobile.Load(g_mobilenet_ssd, false); + if (isok) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; - std::vector<int64_t> dims{1, 3, 300, 300}; - Tensor input_tensor; - SetupTensor<float>(&input_tensor, {1, 3, 300, 300}, static_cast<float>(0), - static_cast<float>(1)); + std::vector<float> input; + std::vector<int64_t> dims{1, 3, 300, 300}; + GetInput<float>(g_hand, &input, dims); - std::vector<float> input(input_tensor.data<float>(), - input_tensor.data<float>() + input_tensor.numel()); - auto time3 = time(); - executor.Predict(input, dims); - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + // Warm up ten times + for (int i = 0; i < 10; ++i) { + auto output = paddle_mobile.Predict(input, dims); + } + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto output = paddle_mobile.Predict(input, dims); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } return 0; } diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 2495fb497e679d75128f3a74fdbb8da98b927f9f..4ed7d3b756cfef9554028e1d33f4dd86bf58e4b8 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -12,29 +12,48 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#include <fstream> +#include <iostream> #include "../test_helper.h" #include "../test_include.h" int main() { - paddle_mobile::Loader<paddle_mobile::CPU> loader; + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; + paddle_mobile.SetThreadNum(4); auto time1 = time(); - auto program = loader.Load(g_mobilenet, true); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, true); - - std::vector<int64_t> dims{1, 3, 224, 224}; - Tensor input_tensor; - SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0), - static_cast<float>(1)); - - std::vector<float> input(input_tensor.data<float>(), - input_tensor.data<float>() + input_tensor.numel()); - auto time3 = time(); - auto vec_result = executor.Predict(input, dims); - auto time4 = time(); - - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", + // std::string(g_mobilenet_detect) + "/params", true); + + auto isok = paddle_mobile.Load(g_mobilenet, true); + if (isok) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + + std::vector<float> input; + std::vector<int64_t> dims{1, 3, 224, 224}; + GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims); + + auto vec_result = paddle_mobile.Predict(input, dims); + std::vector<float>::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + + // Warm up ten times + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + DLOG << vec_result; + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } + + std::cout << "If the result is NaN, check whether test/images/g_test_image_1x3x224x224_banana " + "exists." + << std::endl; return 0; } diff --git a/test/net/test_mobilenet_025_fssd.cpp b/test/net/test_mobilenet_025_fssd.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c0d037ceb05f57361f1385cb9959beed66186e4f --- /dev/null +++ b/test/net/test_mobilenet_025_fssd.cpp @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include <iostream> +#include "../test_helper.h" +#include "../test_include.h" + +int main(int argc, char **argv) { + int times = 10; + if (argc <= 1) { + times = 10; + std::cout << "no input given, using the default of " << times << " runs" << std::endl; + } else { + std::string arstr = argv[1]; + times = std::stoi(arstr); + std::cout << "input times: " << times << std::endl; + } + + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; + paddle_mobile.SetThreadNum(1); + auto isok = + paddle_mobile.Load(std::string(g_fluid_fssd_new) + "/model", + std::string(g_fluid_fssd_new) + "/params", true); + if (isok) { + std::vector<float> input; + std::vector<int64_t> dims{1, 3, 160, 160}; + GetInput<float>(g_imgfssd_ar1, &input, dims); + std::cout << "warming up 10 times....." << std::endl; + + // Warm up ten times + for (int i = 0; i < 10; ++i) { + auto output = paddle_mobile.Predict(input, dims); + } + std::cout << "starting....." << std::endl; + + double time_sum = 0; + + for (int i = 0; i < times; ++i) { + auto time3 = time(); + auto output = paddle_mobile.Predict(input, dims); + auto time4 = time(); + double timeDiff = time_diff(time3, time4); + time_sum += timeDiff; + std::cout << "run " << i << " " + << "predict cost :" << timeDiff << "ms" << std::endl; + } + std::cout << "average time: " << time_sum / times << "ms" << std::endl; + } + return 0; +} diff --git a/test/net/test_mobilenet_combine.cpp b/test/net/test_mobilenet_combine.cpp new file mode 100644 index 0000000000000000000000000000000000000000..073607795967af09c81bc0a0c492d065bce7ed72 --- /dev/null +++ b/test/net/test_mobilenet_combine.cpp @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <iostream> +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; + paddle_mobile.SetThreadNum(4); + auto time1 = time(); + if (paddle_mobile.Load(std::string(g_mobilenet_combined) + "/model", + std::string(g_mobilenet_combined) + "/params", true)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + + std::vector<float> input; + std::vector<int64_t> dims{1, 3, 224, 224}; + GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims); + + auto vec_result = paddle_mobile.Predict(input, dims); + std::vector<float>::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + + // Warm up ten times + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } + std::cout + << "If the result is NaN, check whether test/images/test_image_1x3x224x224_float exists."
+ << std::endl; + return 0; +} diff --git a/test/net/test_nlp.cpp b/test/net/test_nlp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..edf5cd623a94d348a5a213115821202b447ae648 --- /dev/null +++ b/test/net/test_nlp.cpp @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <iostream> +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; + paddle_mobile.SetThreadNum(4); + auto time1 = time(); + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", + // std::string(g_mobilenet_detect) + "/params", true); + + auto isok = paddle_mobile.Load(g_nlp, true, false, 1, true); + + // auto isok = paddle_mobile.Load(std::string(g_nlp) + "/model", + // std::string(g_nlp) + "/params", false); + if (isok) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + // 1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479 + + std::vector<int64_t> ids{1918, 117, 55, 97, 1352, 4272, 1656, 903}; + + paddle_mobile::framework::LoDTensor words; + auto size = static_cast<int64_t>(ids.size()); + paddle_mobile::framework::LoD lod{{0, ids.size()}}; + DDim dims{size, 1}; + words.Resize(dims); + words.set_lod(lod); + DLOG << "words lod : " << words.lod(); + auto *pdata = words.mutable_data<int64_t>(); + size_t n = words.numel() * sizeof(int64_t); + DLOG << "n :" << n; + memcpy(pdata, ids.data(), n); + DLOG << "words lod 22: " << words.lod(); + auto time3 = time(); + for (int i = 0; i < 1; ++i) { + auto vec_result = paddle_mobile.PredictLod(words); + DLOG << *vec_result; + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms" + << std::endl; + } + + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + // 1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479 + + std::vector<int64_t> ids{1791, 656, 1549, 281, 96}; + + paddle_mobile::framework::LoDTensor words; + auto size = static_cast<int64_t>(ids.size()); + paddle_mobile::framework::LoD lod{{0, ids.size()}}; + DDim dims{size, 1}; + words.Resize(dims); + words.set_lod(lod); + DLOG << "words lod : " << words.lod(); + auto *pdata = words.mutable_data<int64_t>(); + size_t n = words.numel() * sizeof(int64_t); + DLOG << "n :" << n; + memcpy(pdata, ids.data(), n); + DLOG << "words lod 22: " << words.lod(); + auto time3 = time(); + for (int i = 0; i < 1; ++i) { + auto vec_result = paddle_mobile.PredictLod(words); + DLOG << *vec_result; + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms" + << std::endl; + return 0; +} diff --git a/test/net/test_resnet.cpp b/test/net/test_resnet.cpp index 55f4c5efef209c421fc550c1f17422acd64b11b9..d2a4abbbfd2c023f1e8220e74f815eda44acb6db 100644 --- a/test/net/test_resnet.cpp +++ b/test/net/test_resnet.cpp @@ -12,28 +12,59 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License. */ -#include <fstream> +#include <iostream> #include "../test_helper.h" #include "../test_include.h" int main() { - paddle_mobile::Loader<paddle_mobile::CPU> loader; +#ifdef PADDLE_MOBILE_FPGA + paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile; +#endif + +#ifdef PADDLE_MOBILE_CPU + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; +#endif + paddle_mobile.SetThreadNum(4); auto time1 = time(); - auto program = loader.Load(g_resnet, false); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false); - - std::vector<int64_t> dims{1, 3, 32, 32}; - Tensor input_tensor; - SetupTensor<float>(&input_tensor, {1, 3, 32, 32}, static_cast<float>(0), - static_cast<float>(1)); - - std::vector<float> input(input_tensor.data<float>(), - input_tensor.data<float>() + input_tensor.numel()); - auto time3 = time(); - executor.Predict(input, dims); - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + if (paddle_mobile.Load(g_resnet, true)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + std::vector<int64_t> dims{1, 3, 32, 32}; + Tensor input_tensor; + SetupTensor<float>(&input_tensor, {1, 3, 32, 32}, static_cast<float>(0), + static_cast<float>(1)); + + std::vector<float> input(input_tensor.data<float>(), + input_tensor.data<float>() + input_tensor.numel()); +#ifndef PADDLE_MOBILE_FPGA + // Warm up ten times + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) << "ms" + << std::endl; + +#else + auto time3 = time(); + paddle_mobile.FeedData(input_tensor); + paddle_mobile.Predict_To(10); + paddle_mobile.Predict_From(10); + auto tensor_ptr = paddle_mobile.FetchResult(9); + std::cout << "Tensor element number for op[9]: " << tensor_ptr->numel() + << std::endl; + auto result_ptr = paddle_mobile.FetchResult(); + std::cout << "Result tensor element number: " << result_ptr->numel() + << std::endl; + + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) << "ms" + << std::endl; +#endif + } return 0; } diff --git a/test/net/test_squeezenet.cpp b/test/net/test_squeezenet.cpp index 30460018fe8cc008e0031c1c713150745767fa28..02ec8691febbad5ec0e811f7d7bebde1bef54a79 100644 --- a/test/net/test_squeezenet.cpp +++ b/test/net/test_squeezenet.cpp @@ -12,30 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#include <fstream> +#include <iostream> #include "../test_helper.h" #include "../test_include.h" int main() { - paddle_mobile::Loader<paddle_mobile::CPU> loader; + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; + paddle_mobile.SetThreadNum(4); // ../../../test/models/googlenet // ../../../test/models/mobilenet auto time1 = time(); - auto program = loader.Load(g_squeezenet, false); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false); + if (paddle_mobile.Load(g_squeezenet, true)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + std::vector<int64_t> dims{1, 3, 227, 227}; + Tensor input_tensor; + SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0), + static_cast<float>(1)); - std::vector<int64_t> dims{1, 3, 227, 227}; - Tensor input_tensor; - SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0), - static_cast<float>(1)); + std::vector<float> input(input_tensor.data<float>(), + input_tensor.data<float>() + input_tensor.numel()); + // Warm up ten times + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } - std::vector<float> input(input_tensor.data<float>(), - input_tensor.data<float>() + input_tensor.numel()); - auto time3 = time(); - executor.Predict(input, dims); - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; return 0; } diff --git a/test/net/test_yolo.cpp b/test/net/test_yolo.cpp index c82443e23953def917826fe4ec3b2c484b588f59..700eb10cac6f0b80595d8c53866c7f675d2b56fb 100644 --- a/test/net/test_yolo.cpp +++ b/test/net/test_yolo.cpp @@ -12,30 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#include <fstream> +#include <iostream> #include "../test_helper.h" #include "../test_include.h" int main() { - paddle_mobile::Loader<paddle_mobile::CPU> loader; + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; + paddle_mobile.SetThreadNum(4); // ../../../test/models/googlenet // ../../../test/models/mobilenet auto time1 = time(); - auto program = loader.Load(g_yolo, false); - auto time2 = time(); - DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false); + if (paddle_mobile.Load(g_yolo, true)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; - std::vector<int64_t> dims{1, 3, 227, 227}; - Tensor input_tensor; - SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0), - static_cast<float>(1)); + std::vector<int64_t> dims{1, 3, 227, 227}; + Tensor input_tensor; + SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0), + static_cast<float>(1)); - std::vector<float> input(input_tensor.data<float>(), - input_tensor.data<float>() + input_tensor.numel()); - auto time3 = time(); - executor.Predict(input, dims); - auto time4 = time(); - DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; + std::vector<float> input(input_tensor.data<float>(), + input_tensor.data<float>() + input_tensor.numel()); + // Warm up ten times + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + paddle_mobile.Predict(input, dims); + } + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } return 0; } diff --git a/test/net/test_yolo_combined.cpp b/test/net/test_yolo_combined.cpp new file mode 100644 index 0000000000000000000000000000000000000000..88b889daa946cfaef1d86ff36f416b4643532c89 --- /dev/null +++ b/test/net/test_yolo_combined.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include <iostream> +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; + paddle_mobile.SetThreadNum(4); + // ../../../test/models/googlenet + // ../../../test/models/mobilenet + auto time1 = time(); + + if (paddle_mobile.Load(std::string(g_yolo_combined) + "/model", + std::string(g_yolo_combined) + "/params", true)) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + + std::vector<int64_t> dims{1, 3, 416, 416}; + std::vector<float> input; + + GetInput<float>(g_test_image_desktop_1_3_416_416_nchw_float, &input, dims); + std::cout << "input.size(): " << input.size() << std::endl; + for (int j = 0; j < 100; ++j) { + std::cout << j << " : " << input[j] << std::endl; + } + // // Warm up ten times + // for (int i = 0; i < 10; ++i) { + // paddle_mobile.Predict(input, dims); + // } + auto time3 = time(); + const vector<float> vector_out = paddle_mobile.Predict(input, dims); + std::cout << "--------------------------------------------" << std::endl; + + for (float i : vector_out) { + std::cout << i << std::endl; + } + + std::cout << "--------------------------------------------" << std::endl; + + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) << "ms" + << std::endl; + } + return 0; +} diff --git a/test/operators/test_fusion_conv_add_bn_relu_op.cpp b/test/operators/test_fusion_conv_add_bn_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7764d95ed72da613459233bd55ddcffdc444318f --- /dev/null +++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <iostream> +#include "../test_include.h" +#include "operators/fusion_conv_add_bn_relu_op.h" + +int main() { + paddle_mobile::Loader<paddle_mobile::CPU> loader; + // ../models/image_classification_resnet.inference.model + auto program = loader.Load(g_mobilenet, true); + + PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, + "program file read fail"); + + Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::FusionConvAddBNReluOp<paddle_mobile::CPU, float>> + executor(program, "fusion_conv_add_bn_relu", true); + + std::cout << "executor 4 test: " << std::endl; + + paddle_mobile::framework::Tensor input; + GetInput<float>(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224}); + // // use SetupTensor if there is no local input image.
+ // SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0), + // static_cast<float>(1)); + + DLOG << " input: " << input; + + auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112}); + std::cout << "before predict: " << std::endl; + auto output = + executor.Predict(input, "data", "conv2_1_dw_bn.tmp_2", out_ddim); + std::cout << "after predict " << std::endl; + auto output_ptr = output->data<float>(); + + int stride = output->numel() / 100; + for (int i = 0; i < 100; i++) { + DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride]; + } + + // for (int i = 0; i < 100; i++) { + // DLOG << " index:" << i << " value: "<< output_ptr[i]; + // } + + // for (int j = 0; j < output->numel(); ++j) { + // std::cout << " (index: " << j << " value: " << output_ptr[j] << ") "; + // } + std::cout << std::endl; + return 0; +} diff --git a/test/operators/test_gru_op.cpp b/test/operators/test_gru_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..52ab8b54d709391ea263b74a395a635ce50a18af --- /dev/null +++ b/test/operators/test_gru_op.cpp @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../test_include.h" +#include "operators/gru_op.h" + +int main() { + paddle_mobile::Loader<paddle_mobile::CPU> loader; + auto program = loader.Load(g_nlp); + PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, + "program file read fail"); + + Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::GruOp<paddle_mobile::CPU, float>> + executor(program, "gru"); + + return 0; +} diff --git a/test/operators/test_prelu_op.cpp b/test/operators/test_prelu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e93d8732d18496721b24cfba1df296250169f8b2 --- /dev/null +++ b/test/operators/test_prelu_op.cpp @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../executor_for_test.h" +#include "../test_include.h" +#include "operators/prelu_op.h" + +int main() { + paddle_mobile::Loader<paddle_mobile::CPU> loader; + auto program = loader.Load(g_resnet); + PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, + "program file read fail"); + + Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::PReluOp<paddle_mobile::CPU, float>> + executor(program, "prelu"); + + // 1. input_tensors; + vector<Tensor> input_tensors; + + Tensor input1; + auto input1_data = CreateInput<float>(&input1, {1, 2, 3, 4}, -1, 1); + input_tensors.push_back(input1); + + // 2. input_names + vector<string> input_names({ + "batch_norm_0.tmp_2", + }); + + // 3. output_names + vector<string> output_names({"batch_norm_0.tmp_3"}); + + // 4.
out_dims; + vector<DDim> out_ddims; + auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4}); + out_ddims.push_back(out_ddim); + + auto output = executor.Predict(input_tensors, input_names, + output_names, out_ddims); + + auto output0_data = output[0]->data<float>(); + + for (int j = 0; j < output[0]->numel(); ++j) { + DLOG << " value of output: " << output0_data[j]; + } + return 0; +} diff --git a/test/operators/test_resize_op.cpp b/test/operators/test_resize_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f4dcaa6885d92a727e8c97d5106c3b6913a4ab33 --- /dev/null +++ b/test/operators/test_resize_op.cpp @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../test_include.h" +#include "operators/resize_op.h" + +int main() { + paddle_mobile::Loader<paddle_mobile::CPU> loader; + auto program = loader.Load(std::string(g_mobilenet_ssd)); + if (program.originProgram == nullptr) { + DLOG << "program file read fail"; + } + Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::ResizeOp<paddle_mobile::CPU, float>> + executor(program, "resize"); + paddle_mobile::framework::Tensor input; + SetupTensor<float>(&input, {2, 3, 3, 2}, static_cast<float>(0), + static_cast<float>(1)); + auto input_ptr = input.data<float>(); + auto out_ddim = paddle_mobile::framework::make_ddim({2, 9, 2}); + auto output = + executor.Predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim); + auto *output_ptr = output->data<float>(); + + DLOG << "input : "; + for (int j = 0; j < input.numel(); ++j) { + DLOG << " index " << j << " : " << input_ptr[j]; + } + + DLOG << "output : "; + for (int j = 0; j < output->numel(); ++j) { + DLOG << " index " << j << " : " << output_ptr[j]; + } + + return 0; +} diff --git a/test/operators/test_scale_op.cpp b/test/operators/test_scale_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..574779d71e5ebc5f06fe5cd8fb33422726f39464 --- /dev/null +++ b/test/operators/test_scale_op.cpp @@ -0,0 +1,18 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../test_include.h" +#include "operators/scale_op.h" + +int main() {} diff --git a/test/operators/test_sigmoid_op.cpp b/test/operators/test_sigmoid_op.cpp index 4ed3efaf28aa986f0b679729c46cb386150583e3..c8fac6b9eee5c5777ddb0147bc81d361d4dd09f5 100644 --- a/test/operators/test_sigmoid_op.cpp +++ b/test/operators/test_sigmoid_op.cpp @@ -14,7 +14,7 @@ limitations under the License.
*/ #include "../../src/operators/kernel/sigmoid_kernel.h" #include "../test_helper.h" -#include "io/io.h" +#include "io/executor.h" int main() { paddle_mobile::framework::Tensor input; diff --git a/test/operators/test_slice_op.cpp b/test/operators/test_slice_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9306bc53c6ae23b10c27a71071c11c9ddf1c0d25 --- /dev/null +++ b/test/operators/test_slice_op.cpp @@ -0,0 +1,18 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../test_include.h" +#include "operators/slice_op.h" + +int main() {} diff --git a/test/test_helper.h b/test/test_helper.h index 81ad23ff3b4e53db0225630eebaa34878ad4c139..ecbc251a815e343f75b1247ffc430e9c52d6abfd 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -16,22 +16,45 @@ limitations under the License. */ #include <fstream> #include <random> +#include <string> +#include <vector> #include "common/common.h" #include "common/log.h" #include "framework/ddim.h" #include "framework/tensor.h" -static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd"; -static const std::string g_squeezenet = "../models/squeezenet"; -static const std::string g_googlenet = "../models/googlenet"; -static const std::string g_mobilenet = "../models/mobilenet"; -static const std::string g_resnet_50 = "../models/resnet_50"; -static const std::string g_resnet = "../models/resnet"; -static const std::string g_googlenet_combine = "../models/googlenet_combine"; -static const std::string g_yolo = "../models/yolo"; -static const std::string g_test_image_1x3x224x224 = +static const char *g_ocr = "../models/ocr"; +static const char *g_mobilenet_ssd = "../models/mobilenet+ssd"; +static const char *g_genet_combine = "../models/enet"; +static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture"; +static const char *g_mobilenet_combined = "../models/mobilenet_combine"; +static const char *g_googlenetv1_combined = "../models/googlenetv1_combine"; +static const char *g_mobilenet_detect = "../models/mobilenet-detect"; +static const char *g_squeezenet = "../models/squeezenet"; +static const char *g_googlenet = "../models/googlenet"; +static const char *g_mobilenet = "../models/mobilenet"; +static const char *g_alexnet = "../models/alexnet"; +static const char *g_inceptionv4 = "../models/inceptionv4"; +static const char *g_nlp = "../models/nlp"; +static const char *g_resnet_50 = "../models/resnet_50"; +static const char *g_resnet = "../models/resnet"; +static const char *g_googlenet_combine = "../models/googlenet_combine"; +static const char *g_yolo = "../models/yolo"; +static const char *g_yolo_combined = "../models/yolo_combined"; +static const char *g_fluid_fssd_new = "../models/fluid_fssd_new"; + +static const char *g_test_image_1x3x224x224 = "../images/test_image_1x3x224x224_float"; +static const char *g_test_image_1x3x224x224_banana = + "../images/input_3x224x224_banana"; +static const char *g_test_image_desktop_1_3_416_416_nchw_float = + "../images/in_put_1_3_416_416_2"; +static
const char *g_hand = "../images/hand_image"; +static const char *g_imgfssd_ar = "../images/test_image_ssd_ar"; +static const char *g_imgfssd_ar1 = "../images/003_0001.txt"; +static const char *g_img = "../images/img.bin"; + using paddle_mobile::framework::DDim; using paddle_mobile::framework::Tensor; @@ -62,9 +85,9 @@ void GetInput(const std::string &input_name, std::vector<T> *input, size *= dim; } - T *input_ptr = (T *)malloc(sizeof(T) * size); + T *input_ptr = reinterpret_cast<T *>(malloc(sizeof(T) * size)); std::ifstream in(input_name, std::ios::in | std::ios::binary); - in.read((char *)(input_ptr), size * sizeof(T)); + in.read(reinterpret_cast<char *>(input_ptr), size * sizeof(T)); in.close(); for (int i = 0; i < size; ++i) { input->push_back(input_ptr[i]); @@ -79,6 +102,6 @@ void GetInput(const std::string &input_name, T *input_ptr = input->mutable_data<T>(dims); std::ifstream in(input_name, std::ios::in | std::ios::binary); - in.read((char *)(input_ptr), input->numel() * sizeof(T)); + in.read(reinterpret_cast<char *>(input_ptr), input->numel() * sizeof(T)); in.close(); } diff --git a/test/test_include.h b/test/test_include.h index 2d89dc8c9ed1de1ad49ebca07724b6649e2a12a7..4728a469334010e7353e6ab1f3695ec23f3e7456 100644 --- a/test/test_include.h +++ b/test/test_include.h @@ -30,4 +30,4 @@ limitations under the License. */ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" -#include "io/io.h" +#include "io/paddle_mobile.h" diff --git a/tools/android-cmake/android.toolchain.cmake b/tools/android-cmake/android.toolchain.cmake index a57d9c102ff65d4c10cc9bd3773ffa4c87e482fa..55b90ba65260b99d9af4a29832ed6f8ff5b235c8 100644 --- a/tools/android-cmake/android.toolchain.cmake +++ b/tools/android-cmake/android.toolchain.cmake @@ -65,6 +65,8 @@ endif() file(TO_CMAKE_PATH "${ANDROID_NDK}" ANDROID_NDK) # Android NDK revision +message("${ANDROID_NDK}") + file(READ "${ANDROID_NDK}/source.properties" ANDROID_NDK_SOURCE_PROPERTIES) set(ANDROID_NDK_SOURCE_PROPERTIES_REGEX "^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.") @@ -159,7 +161,7 @@ endif() # Default values for configurable variables. if(NOT ANDROID_TOOLCHAIN) - set(ANDROID_TOOLCHAIN clang) + set(ANDROID_TOOLCHAIN gcc) endif() if(NOT ANDROID_ABI) set(ANDROID_ABI armeabi-v7a) diff --git a/tools/build.sh b/tools/build.sh index 86ae8b5e1aa16c7cab66580bc2eaa6e1e526fc17..1408822e46850752bcd448350fc483c25f70ae9a 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +NETS="" +declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet" "mobilenetssd" "nlp" "mobilenetfssd" "genet") build_for_mac() { if [ ! `which brew` ]; then @@ -32,13 +34,14 @@ build_for_mac() { build_for_android() { #rm -rf "../build" - if [ -z "${ANDROID_NDK}" ]; then - echo "ANDROID_NDK not found!" + if [ -z "${NDK_ROOT}" ]; then + echo "NDK_ROOT not found!" exit -1 fi if [ -z "$PLATFORM" ]; then - PLATFORM="arm-v7a" # Users could choose "arm-v8a" or other platforms from the command line. + PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform. +# PLATFORM="arm-v8a" fi if [ "${PLATFORM}" = "arm-v7a" ]; then @@ -59,8 +62,8 @@ build_for_android() { ANDROID_PLATFORM_VERSION="android-22" TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" ANDROID_ARM_MODE="arm" - if [ $# -eq 1 ]; then - NET=$1 + + if [ "${#NETS}" -gt 1 ]; then cmake ..
\ -B"../build/release/${PLATFORM}" \ -DANDROID_ABI="${ABI}" \ @@ -70,7 +73,7 @@ build_for_android() { -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ -DANDROID_STL=c++_static \ -DANDROID=true \ - -D"${NET}=true" \ + -DNET="${NETS}" \ -D"${ARM_PLATFORM}"=true else @@ -89,86 +92,122 @@ build_for_android() { make -j 8 } + +build_for_arm_linux() { + MODE="Release" + ARM_LINUX="arm-linux" + + if [ "${#NETS}" -gt 1 ]; then + cmake .. \ + -B"../build/release/arm-linux" \ + -DCMAKE_BUILD_TYPE="${MODE}" \ + -DCMAKE_TOOLCHAIN_FILE="./tools/toolchains/arm-linux-gnueabihf.cmake" \ + -DCMAKE_CXX_FLAGS="-std=c++14 -mcpu=cortex-a53 -mtune=cortex-a53 -mfpu=neon-vfpv4 -mfloat-abi=hard -ftree-vectorize -funsafe-math-optimizations -pipe -mlittle-endian -munaligned-access" \ + -DNET="${NETS}" \ + -D"V7"=true + else + cmake .. \ + -B"../build/release/arm-linux" \ + -DCMAKE_BUILD_TYPE="${MODE}" \ + -DCMAKE_TOOLCHAIN_FILE="./tools/toolchains/arm-linux-gnueabihf.cmake" \ + -DCMAKE_CXX_FLAGS="-std=c++14 -mcpu=cortex-a53 -mtune=cortex-a53 -mfpu=neon-vfpv4 -mfloat-abi=hard -ftree-vectorize -funsafe-math-optimizations -pipe -mlittle-endian -munaligned-access" \ + -DNET="${NETS}" \ + -D"V7"=true + fi + + cd "../build/release/arm-linux" + make -j 8 + cd "../../../test/" + DIRECTORY="models" + if [ "`ls -A $DIRECTORY`" = "" ]; then + echo "$DIRECTORY is indeed empty pull images" + wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip + unzip paddle-mobile%2FmodelsAndImages.zip + mv modelsAndImages/images/ images + mv modelsAndImages/models/ models + rm -rf paddle-mobile%2FmodelsAndImages.zip + rm -rf __MACOS + else + echo "$DIRECTORY is indeed not empty, DONE!" + fi + +} + build_for_ios() { - rm -rf "../build" +# rm -rf "../build" PLATFORM="ios" MODE="Release" - BUILD_DIR=../build/release/"${PLATFORM}" + BUILD_DIR=../build/release/"${PLATFORM}"/ TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" - C_FLAGS="-fobjc-abi-version=2 -fobjc-arc -isysroot ${CMAKE_OSX_SYSROOT}" - CXX_FLAGS="-fobjc-abi-version=2 -fobjc-arc -std=gnu++14 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT}" mkdir -p "${BUILD_DIR}" - if [ $# -eq 1 ]; then - NET=$1 + if [ "${#NETS}" -gt 1 ]; then cmake .. \ -B"${BUILD_DIR}" \ -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ -DIOS_PLATFORM=OS \ - -DCMAKE_C_FLAGS="${C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ - -D"${NET}"=true \ + -DIOS_ARCH="${IOS_ARCH}" \ + -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ + -DNET="${NETS}" \ -DIS_IOS="true" else cmake .. \ -B"${BUILD_DIR}" \ -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ -DIOS_PLATFORM=OS \ - -DCMAKE_C_FLAGS="${C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ + -DIOS_ARCH="${IOS_ARCH}" \ + -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ -DIS_IOS="true" fi cd "${BUILD_DIR}" make -j 8 + cp ../../../src/ios_io/PaddleMobile.h ./build/PaddleMobile.h + cd ./build + # 生成符号表 + ranlib *.a } build_error() { - echo "unknown argument" + echo "unknown target : $1" } if [ $# -lt 1 ]; then echo "error: target missing!" 
- echo "available targets: mac|linux|ios|android" - echo "sample usage: ./build.sh mac" + echo "available targets: ios|android" + echo "sample usage: ./build.sh android" else - if [ $# -eq 2 ]; then - if [ $2 != "googlenet" -a $2 != "mobilenet" -a $2 != "yolo" -a $2 != "squeezenet" -a $2 != "resnet" ]; then - if [ $1 = "mac" ]; then - build_for_mac - elif [ $1 = "linux" ]; then - build_for_linux - elif [ $1 = "android" ]; then - build_for_android - elif [ $1 = "ios" ]; then - build_for_ios - else - build_error - fi - else - if [ $1 = "mac" ]; then - build_for_mac $2 - elif [ $1 = "linux" ]; then - build_for_linux $2 - elif [ $1 = "android" ]; then - build_for_android $2 - elif [ $1 = "ios" ]; then - build_for_ios $2 - else - build_error - fi + params=($@) + for(( i=1; i<$#; i++ )); do + if [ ${i} != 1 ]; then + NETS=$NETS$";" + fi + NETS=$NETS$"${params[i]}" + done + params=${@:2} + + supported=false + for name in ${params[@]}; do + for net in ${supportedNets[@]}; do + match=false + if [ "$name"x = "$net"x ];then + supported=true + match=true + break 1 + fi + done + if [ "$match" = false ];then + echo "${name} not supported!" + echo "supported nets are: ${supportedNets[@]}" + exit -1 fi + done + + if [ $1 = "android" ]; then + build_for_android + elif [ $1 = "arm_linux" ]; then + build_for_arm_linux + elif [ $1 = "ios" ]; then + build_for_ios else - if [ $1 = "mac" ]; then - build_for_mac - elif [ $1 = "linux" ]; then - build_for_linux - elif [ $1 = "android" ]; then - build_for_android - elif [ $1 = "ios" ]; then - build_for_ios - else - build_error - fi - fi -fi + build_error "$1" + fi +fi \ No newline at end of file diff --git a/tools/ios-cmake/ios.toolchain.cmake b/tools/ios-cmake/ios.toolchain.cmake index a8735adc8d853a5825a23f1ddf129d0a95199275..6000f7a8e5dffcd8693b56539f4519840ddd8be8 100644 --- a/tools/ios-cmake/ios.toolchain.cmake +++ b/tools/ios-cmake/ios.toolchain.cmake @@ -34,6 +34,7 @@ set (CMAKE_SYSTEM_VERSION 1) set (UNIX True) set (APPLE True) set (IOS True) +set (IOS_ARCH armv7 armv7s arm64) # Required as of cmake 2.8.10 set (CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE) @@ -49,8 +50,13 @@ endif (CMAKE_UNAME) #include (CMakeForceCompiler) #CMAKE_C_COMPILER (/usr/bin/gcc) #CMAKE_CXX_COMPILER (/usr/bin/g++) -set(CMAKE_C_COMPILER /usr/bin/gcc) -set(CMAKE_CXX_COMPILER /usr/bin/g++) +if(USE_OPENMP) + set(CMAKE_C_COMPILER /usr/local/opt/llvm/bin/clang) + set(CMAKE_CXX_COMPILER /usr/local/opt/llvm/bin/clang++) +else() + set(CMAKE_C_COMPILER /usr/bin/gcc) + set(CMAKE_CXX_COMPILER /usr/bin/g++) +endif() set(CMAKE_AR ar CACHE FILEPATH "" FORCE) # Skip the platform compiler checks for cross compiling @@ -159,7 +165,6 @@ set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS su # set the architecture for iOS if (${IOS_PLATFORM} STREQUAL "OS") - set (IOS_ARCH armv7 armv7s arm64) elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR") set (IOS_ARCH i386) elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64") diff --git a/tools/net-detail.awk b/tools/net-detail.awk new file mode 100644 index 0000000000000000000000000000000000000000..84d0166ac777b5b7fbd9801665031bb2d51fedbb --- /dev/null +++ b/tools/net-detail.awk @@ -0,0 +1,91 @@ +BEGIN { +print "digraph G {" +} +/op:/ { + id++ + opname[id] = $NF +} +/input/ { + type = "input" + para = $NF + if (input[id]) { + input[id] = input[id] "|" + } + input[id] = input[id] "<" para ">" para +} +/output/ { + type = "output" + para = $NF + if (output[id]) { + output[id] = output[id] "|" + } + 
output[id] = output[id] "<" para ">" para +} +/attr/ { + type = "attr" + aname = $NF + if (attr_key[id]) { + attr_key[id] = attr_key[id] "|" + attr_value[id] = attr_value[id] "|" + } + attr_key[id] = attr_key[id] $NF +} +/argument/ { + if (type == "attr") { + split($0, arr, " - ") + attr_value[id] = attr_value[id] arr[2] + } else if ((type == "input") || (type == "output")) { + if (!var2id[$NF]) { + var_id++ + var[var_id] = $NF + var2id[$NF] = var_id + } + varid = var2id[$NF] + lid++ + if (type == "input") { + line[lid] = "var_" varid " -> " "op_" id ":<" para ">" + if (xout[$NF]) { + xi++ + xline[xi] = "xop_" xout[$NF] " -> " "xop_" id + } + } else if (type == "output") { + line[lid] = "op_" id ":<" para ">" " -> " "var_" varid + xout[$NF] = id + } + } +} +/var name/ { + varname = $NF + vid = var2id[varname] +} +/var tensor desc dim / { + if (tensor[vid]) tensor[vid] = tensor[vid] " x " + tensor[vid] = tensor[vid] $NF +} +END { + +print "subgraph cluster_G0 {" +for (i = 1; i <= id; i++) { + print "xop_" i "[label=\"" i ". " opname[i] "\"]" +} +for (i = 1; i <= xi; i++) { + print xline[i] +} +print "}" + +for (i = 1; i <= id; i++) { +print "op_" i "[group=op;shape=record;label=\"{{" input[i] "}|" i ". " opname[i] "|{" output[i] "}}\"]" +} +for (i = 1; i <= var_id; i++) { +print "var_" i "[label=\"" var[i] " [" tensor[i] "]\"]" +} +for (i = 1; i <= lid; i++) { +print line[i] +} +for (i = 1; i <= id; i++) { +print "attr_" i "[shape=record;label=\"{" attr_key[i] "}|{" attr_value[i] "}\"]" +print "attr_" i " -> " "op_" i ":" +} +print "}" +} + diff --git a/tools/net.awk b/tools/net.awk new file mode 100644 index 0000000000000000000000000000000000000000..25689c90d871618fc445bba5044446fa7198b2c5 --- /dev/null +++ b/tools/net.awk @@ -0,0 +1,27 @@ +BEGIN { + print "digraph {" +} +/op:/ { + id++ + op = $NF + opname = op "_" id + print opname "[\"label\"=\"" op " [" id "]" "\"]" +} +/input/ { + type = "input" +} +/output/ { + type = "output" +} +/argument/ { + if (type == "output") { + output[$NF] = opname + } else if (type == "input") { + if (output[$NF]) { + print output[$NF] " -> " opname + } + } +} +END { + print "}" +} diff --git a/tools/op.cmake b/tools/op.cmake index 8ddf739b5fc112ba0fa588d4d5b354ceaeb8ebde..6158a318140cd4befebb68434dc8ef53d1b7cd07 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -1,7 +1,11 @@ -set(NET "googlenet" CACHE STRING "select net type") -set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet") +set(FOUND_MATCH OFF) +set(CON -1) -if (NET EQUAL "googlenet") +message(STATUS "nets :${NET}") + +list(FIND NET "googlenet" CON) +if (CON GREATER -1) + message("googlenet enabled") set(CONCAT_OP ON) set(CONV_OP ON) set(LRN_OP ON) @@ -11,24 +15,67 @@ if (NET EQUAL "googlenet") set(POOL_OP ON) set(RELU_OP ON) set(FUSION_CONVADD_OP ON) - set(FUSION_CONVADD_RELU_OP ON) -elseif (NET EQUAL "mobilenet") + set(FUSION_CONVADDRELU_OP ON) + + set(FOUND_MATCH ON) +endif() + +list(FIND NET "mobilenet" CON) +if (CON GREATER -1) + message("mobilenet enabled") set(CONV_OP ON) set(ELEMENTWISEADD_OP ON) set(RELU_OP ON) set(SOFTMAX_OP ON) - set(SOFTMAX_OP ON) + set(MUL_OP ON) set(DEPTHWISECONV_OP ON) set(BATCHNORM_OP ON) set(POOL_OP ON) set(RESHAPE_OP ON) - set(FUSION_CONVADDBNRELU_OP) -elseif (NET EQUAL "yolo") + set(FUSION_CONVADDBNRELU_OP ON) + set(FUSION_CONVADD_OP ON) + + set(FOUND_MATCH ON) +endif() + + +list(FIND NET "mobilenetssd" CON) +if (CON GREATER -1) + message("mobilenetssd enabled") + set(FUSION_CONVBNRELU_OP ON) + set(FUSION_CONVBNRELU_OP 
ON) + set(FUSION_DWCONVBNRELU_OP ON) + set(FUSION_CONVADD_OP ON) + set(MULTICLASSNMS_OP ON) + set(SOFTMAX_OP ON) + set(TRANSPOSE_OP ON) + #feed + set(PRIORBOX_OP ON) + set(CONCAT_OP ON) + set(BOXCODER_OP ON) + set(RESHAPE_OP ON) +#fetch + #total + + set(FOUND_MATCH ON) + +endif() + + +list(FIND NET "yolo" CON) +if (CON GREATER -1) + message("yolo enabled") set(BATCHNORM_OP ON) set(CONV_OP ON) set(RELU_OP ON) set(ELEMENTWISEADD_OP ON) -elseif (NET EQUAL "squeezenet") + + set(FOUND_MATCH ON) +endif() + +list(FIND NET "squeezenet" CON) +if (CON GREATER -1) + message("squeezenet enabled") set(CONCAT_OP ON) set(CONV_OP ON) set(RELU_OP ON) @@ -36,23 +83,113 @@ elseif (NET EQUAL "squeezenet") set(POOL_OP ON) set(RESHAPE_OP ON) set(SOFTMAX_OP ON) -elseif (NET EQUAL "resnet") + + set(FOUND_MATCH ON) +endif() + + +list(FIND NET "resnet" CON) +if (CON GREATER -1) + message("resnet enabled") + set(CONCAT_OP ON) set(CONV_OP ON) + set(RELU_OP ON) + set(ELEMENTWISEADD_OP ON) + set(POOL_OP ON) set(BATCHNORM_OP ON) + set(FUSION_CONVBNADDRELU_OP ON) + set(MUL_OP ON) + set(RESHAPE_OP ON) + set(SOFTMAX_OP ON) + + set(FOUND_MATCH ON) +endif() + +list(FIND NET "FPGAnets" CON) +if (CON GREATER -1) + message("FPGAnets enabled") + set(FUSION_CONVADDRELU_OP ON) + set(FUSION_CONVADDBNRELU_OP ON) + set(FUSION_CONVADDBN_OP ON) + set(FUSION_ELEMENTWISEADDRELU_OP ON) + set(FUSION_FC_OP ON) + set(FUSION_FCRELU_OP ON) + set(POOL_OP ON) + set(CONCAT_OP ON) + set(SOFTMAX_OP ON) + set(DROPOUT_OP ON) + set(FUSION_CONVBNRELU_OP ON) + set(FUSION_CONVBN_OP ON) + set(FUSION_CONVADD_OP ON) + + set(FOUND_MATCH ON) +endif() + +list(FIND NET "nlp" CON) +if (CON GREATER -1) + message("nlp enabled") + set(FUSION_FC_OP ON) + set(LOOKUP_OP ON) + set(GRU_OP ON) + set(CRF_OP ON) + set(CONCAT_OP ON) set(ELEMENTWISEADD_OP ON) + + + set(FOUND_MATCH ON) +endif() + +list(FIND NET "mobilenetfssd" CON) +if (CON GREATER -1) + message("mobilenetfssd enabled") + set(FUSION_CONVADDRELU_OP ON) + set(FUSION_CONVADDBNRELU_OP ON) + set(FUSION_CONVADD_OP ON) set(SOFTMAX_OP ON) - set(MUL_OP ON) + set(RESHAPE_OP ON) + set(BILINEAR_INTERP_OP ON) + set(TRANSPOSE_OP ON) + set(CONCAT_OP ON) + set(PRIORBOX_OP ON) + set(BATCHNORM_OP ON) + set(BOXCODER_OP ON) + set(MULTICLASSNMS_OP ON) + set(FLATTEN_OP ON) + set(SPLIT_OP ON) + set(SHAPE_OP ON) + + set(FOUND_MATCH ON) +endif() + +list(FIND NET "genet" CON) +if (CON GREATER -1) + message("genet enabled") + set(FUSION_CONVADDPRELU_OP ON) + set(FUSION_CONVADDADDPRELU_OP ON) + set(FUSION_CONVADD_OP ON) + set(CONV_TRANSPOSE_OP ON) + set(FUSION_CONVADDRELU_OP ON) + set(ELEMENTWISEADD_OP ON) + set(PRELU_OP ON) set(POOL_OP ON) - set(RELU_OP ON) -else () + set(CONCAT_OP ON) + + set(FOUND_MATCH ON) +endif() + + +if(NOT FOUND_MATCH) + message("--default--") set(BATCHNORM_OP ON) + set(CONV_TRANSPOSE_OP ON) set(BOXCODER_OP ON) set(CONCAT_OP ON) set(CONV_OP ON) set(DEPTHWISECONV_OP ON) set(ELEMENTWISEADD_OP ON) set(FUSION_CONVADD_OP ON) - set(CONVADDRELU_OP ON) + set(FUSION_CONVADDPRELU_OP ON) + set(FUSION_CONVADDRELU_OP ON) set(FUSION_FC_OP ON) set(LRN_OP ON) set(MUL_OP ON) @@ -64,10 +201,26 @@ else () set(SIGMOID_OP ON) set(SOFTMAX_OP ON) set(TRANSPOSE_OP ON) - set(FUSION_CONVADD_RELU_OP ON) set(FUSION_CONVADDBNRELU_OP ON) + set(FUSION_CONVADDADDPRELU_OP ON) + set(FUSION_DWCONVBNRELU_OP ON) + set(FUSION_CONVBNRELU_OP ON) + set(FUSION_CONVBNADDRELU_OP ON) + set(PRELU_OP ON) + set(RESIZE_OP ON) + set(SCALE_OP ON) + set(SLICE_OP ON) set(DROPOUT_OP ON) set(IM2SEQUENCE_OP ON) + set(LOOKUP_OP ON) + set(GRU_OP ON) + set(CRF_OP ON) + 
diff --git a/tools/pre-commit.hooks/clang-format.hook b/tools/pre-commit.hooks/clang-format.hook
index 4fa4253bad78fe287fb92863a684a5d7def71061..ece9ebc598e3fa63d1d76409dc0068854aaec851 100644
--- a/tools/pre-commit.hooks/clang-format.hook
+++ b/tools/pre-commit.hooks/clang-format.hook
@@ -14,6 +14,10 @@ fi
 # https://medicineyeh.wordpress.com/2017/07/13/clang-format-with-pragma/
 shift
-perl -i -pe 's|#pragma\s+omp|// #pragma omp|' "$@"
-clang-format -i $@
+perl -i -pe 's|^\s+#pragma\s+omp|// #pragma omp|' "$@"
+(
+# remove clang format ios_io folder
+flist=$(echo "$@" | perl -pe 's|src/ios_io/[^ ]*||')
+clang-format -i $flist
+)
 perl -i -pe 's|// ||' "$@"
diff --git a/tools/quantification/CMakeLists.txt b/tools/quantification/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5f1ca7fdc2b65638c7158b0933b924c71eadc4a0
--- /dev/null
+++ b/tools/quantification/CMakeLists.txt
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 3.6)
+project(quali)
+add_definitions(-DENABLE_EXCEPTION)
+
+set(CMAKE_CXX_STANDARD 11)
+file(GLOB_RECURSE QULIFICATON_CC src/*.cc src/*.cpp src/*.c src/*.mm)
+file(GLOB_RECURSE QULIFICATON_H src/*.h)
+include_directories(. src/)
+
+#add_library(paddle-mobile SHARED ${QULIFICATON_CC} ${QULIFICATON_H} convert.cpp)
+
+add_executable(quantify convert.cpp ${QULIFICATON_CC} ${QULIFICATON_H})
\ No newline at end of file
diff --git a/tools/quantification/README.md b/tools/quantification/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ac729af01e7e73328b884097009dad1d468e7997
--- /dev/null
+++ b/tools/quantification/README.md
@@ -0,0 +1,39 @@
+# Model Quantization Script
+
+#### Usage Guide
+1. Start from the PaddleMobile project directory (e.g. ~/PaddleProject/paddle-mobile).
+
+2. cd into the tools/quantification/ directory.
+
+3. Build with cmake:
+
+    ```sh
+    cmake .
+    make
+    ```
+
+4. Run the quantization script:
+    ```sh
+    ./quantify (0: separated, 1: combined) (input path) (output path)
+    # quantify a separated googlenet model from /Users/xiebaiyuan/PaddleProject/quali/models/googlenet to ./googlenet_min
+    ./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min
+
+    ```
+
+*Notes:*
+*In the quantization tool,*
+*1. the model file of a separated model is expected to be named "__model__";*
+*2. for a combined model, the model file is expected to be named "model" and the parameter file "params".*
+
+
+##### The whole flow:
+Taking a non-combined googlenet as an example:
+
+```sh
+cd tools/quantification/
+cmake .
+make
+./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min
+```
+
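The converter introduced below rewrites every persistable FP32 tensor as two floats (min and max) plus one `uint8` per element, copying the small per-tensor headers (version, LoD, tensor desc) through unchanged, so large models shrink roughly 4x. A back-of-envelope check of that ratio (the numbers are purely illustrative):

```cpp
#include <cstdio>

int main() {
  const long n = 1000000;              // elements in one FP32 tensor
  const long fp32_bytes = n * 4;       // original payload
  const long quant_bytes = n + 2 * 4;  // uint8 payload + min/max floats
  std::printf("fp32: %ld B, quantized: %ld B, ratio: %.2fx\n", fp32_bytes,
              quant_bytes, static_cast<double>(fp32_bytes) / quant_bytes);
  return 0;
}
```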
diff --git a/tools/quantification/convert.cpp b/tools/quantification/convert.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..282b22073fc96ddb2ed0d421f113604aadcc4afc
--- /dev/null
+++ b/tools/quantification/convert.cpp
@@ -0,0 +1,275 @@
+
+
+#include "src/enforce.h"
+#include "src/var_desc.h"
+#include "src/program_desc.h"
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include "src/framework.pb-c.h"
+#include "src/protobuf-c.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+const size_t kSize64 = sizeof(uint64_t);
+const size_t kSize32 = sizeof(uint32_t);
+
+char *Get_binary_data(const std::string &filename) {
+  FILE *file = fopen(filename.c_str(), "rb");
+  PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
+                        filename.c_str());
+  fseek(file, 0, SEEK_END);
+  int64_t size = ftell(file);
+  PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
+  rewind(file);
+  auto *data = new char[size];
+  size_t bytes_read = fread(data, 1, static_cast<size_t>(size), file);
+  PADDLE_MOBILE_ENFORCE(bytes_read == static_cast<size_t>(size),
+                        "read binary file bytes do not match with fseek");
+  fclose(file);
+  return data;
+}
+
+static size_t ReadBuffer(const char *file_name, uint8_t **out) {
+  FILE *fp;
+  fp = fopen(file_name, "rb");
+  PADDLE_MOBILE_ENFORCE(fp != nullptr, " %s open failed !", file_name);
+  fseek(fp, 0, SEEK_END);
+  auto size = static_cast<size_t>(ftell(fp));
+  rewind(fp);
+  *out = reinterpret_cast<uint8_t *>(malloc(size));
+  size_t cur_len = 0;
+  size_t nread;
+  while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
+    cur_len += nread;
+  }
+  fclose(fp);
+  return cur_len;
+}
+
+std::shared_ptr<paddle_mobile::framework::ProgramDesc> loadParams(
+    const std::string &model_path) {
+  PaddleMobile__Framework__Proto__ProgramDesc *c_program;
+  uint8_t *buf = nullptr;
+  size_t read_size = ReadBuffer(model_path.c_str(), &buf);
+  PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null");
+  c_program = paddle_mobile__framework__proto__program_desc__unpack(
+      nullptr, read_size, buf);
+  free(buf);
+  PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null");
+  auto originProgramDesc =
+      std::make_shared<paddle_mobile::framework::ProgramDesc>(c_program);
+  return originProgramDesc;
+}
+
+void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc,
+                  char **dataP, FILE *out_file) {
+  // 1. version
+  uint32_t version = *reinterpret_cast<uint32_t *>(*dataP);
+  // write version
+  fwrite(&version, kSize32, 1, out_file);
+  *dataP += kSize32;
+
+  // 2. LoD information
+  auto *lod_level_ptr = new uint64_t();
+  memcpy(lod_level_ptr, *dataP, kSize64);
+  uint64_t lod_level = *lod_level_ptr;
+  // write LoD information
+  fwrite(&lod_level, kSize64, 1, out_file);
+  delete lod_level_ptr;
+  *dataP += kSize64;
+
+  for (uint64_t i = 0; i < lod_level; ++i) {
+    uint64_t size = *reinterpret_cast<uint64_t *>(*dataP);
+    // write lod size
+    fwrite(&size, kSize64, 1, out_file);
+    (*dataP) += kSize64;
+
+    std::vector<size_t> tmp(size / sizeof(size_t));
+    for (size_t &k : tmp) {
+      k = *reinterpret_cast<size_t *>(*dataP);
+      (*dataP) += sizeof(size_t);
+    }
+    // write lod size vector
+    fwrite(tmp.data(), sizeof(size_t), tmp.size(), out_file);
+  }
+
+  // 3. tensor version
+  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*dataP);
+  // write tensor version
+  fwrite(&tensor_version, kSize32, 1, out_file);
+  (*dataP) += kSize32;
+
+  // 4. tensor desc
+  int32_t size = *reinterpret_cast<int32_t *>(*dataP);
+  // write tensor desc
+  fwrite(&size, sizeof(int32_t), 1, out_file);
+  (*dataP) += sizeof(int32_t);
+
+  std::unique_ptr<char[]> buf(new char[size]);
+  for (int m = 0; m < size; ++m) {
+    buf.get()[m] = (*dataP)[m];
+  }
+  fwrite(buf.get(), sizeof(char), static_cast<size_t>(size), out_file);
+  (*dataP) += (sizeof(char) * size);
+
+  const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
+  int memory_size = 1;
+  for (auto l : desc.Dims()) {
+    memory_size *= l;
+  }
+
+  void *memory = nullptr;
+  int type_size = 0;
+  switch (desc.DataType()) {
+    case paddle_mobile::framework::VARTYPE_TYPE_FP16:
+      type_size = 2;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_FP32:
+      type_size = 4;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_FP64:
+      type_size = 8;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_INT32:
+      type_size = 4;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_INT64:
+      type_size = 8;
+      break;
+    case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
+      type_size = 1;
+      break;
+    default:
+      break;
+  }
+  size_t tensorSize = sizeof(char) * memory_size * type_size;
+  memory = new char[tensorSize];
+  for (size_t n = 0; n < tensorSize; ++n) {
+    static_cast<char *>(memory)[n] = (*dataP)[n];
+  }
+  *dataP += tensorSize;
+
+  // 5. tensor data, quantized: store min/max as floats, then one uint8 per
+  // element (for float 32)
+  float min_value = std::numeric_limits<float>::max();
+  float max_value = std::numeric_limits<float>::lowest();
+
+  for (int k = 0; k < memory_size; ++k) {
+    min_value = std::min(min_value, static_cast<float *>(memory)[k]);
+    max_value = std::max(max_value, static_cast<float *>(memory)[k]);
+  }
+
+  fwrite(&min_value, sizeof(float), 1, out_file);
+  fwrite(&max_value, sizeof(float), 1, out_file);
+
+  for (int g = 0; g < memory_size; ++g) {
+    float value = static_cast<float *>(memory)[g];
+    auto factor = static_cast<uint8_t>(
+        round((value - min_value) / (max_value - min_value) * 255));
+    fwrite(&factor, sizeof(uint8_t), 1, out_file);
+  }
+  delete[] static_cast<char *>(memory);
+}
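`LoadWithDump` encodes each tensor with a per-tensor affine map onto [0, 255]: it records `min_value` and `max_value`, then writes `round((v - min) / (max - min) * 255)` for every element. A standalone round-trip sketch; the dequantization shown is the implied inverse a loader would apply, which is not part of this patch:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> w = {-0.5f, 0.0f, 0.25f, 1.0f};
  const float mn = *std::min_element(w.begin(), w.end());
  const float mx = *std::max_element(w.begin(), w.end());
  for (float v : w) {
    // Same mapping as LoadWithDump's quantization loop.
    auto q = static_cast<uint8_t>(std::round((v - mn) / (mx - mn) * 255));
    float back = mn + q / 255.0f * (mx - mn);  // assumed loader-side inverse
    std::printf("%+.3f -> %3u -> %+.3f\n", v, static_cast<unsigned>(q), back);
  }
  return 0;
}
```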
+void quantificate_combined(const std::string &model_path,
+                           const std::string &param_path,
+                           const std::string &param_min_path) {
+  auto program = loadParams(model_path);
+  char *origin_data = Get_binary_data(param_path);
+  char *data = origin_data;
+  FILE *out_file = fopen(param_min_path.c_str(), "wb");
+  for (const auto &block : program->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      if (var_desc->Persistable()) {
+        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+          continue;
+        }
+        LoadWithDump(*var_desc, &data, out_file);
+      }
+    }
+  }
+  fclose(out_file);
+  delete[] origin_data;
+}
+
+void quantificate_seperated(const std::string &model_dir,
+                            const std::string &param_min_path) {
+  auto program = loadParams(model_dir + "/__model__");
+
+  std::string shell_command = "mkdir " + param_min_path;
+  system(shell_command.c_str());
+
+  for (const auto &block : program->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      if (var_desc->Persistable()) {
+        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+          continue;
+        }
+        std::string file_name = param_min_path + "/" + var_desc->Name();
+        FILE *out_file = fopen(file_name.c_str(), "wb");
+        char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
+        char *data = origin_data;
+        LoadWithDump(*var_desc, &data, out_file);
+        delete[] origin_data;
+        fclose(out_file);
+      }
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  const std::string kNoteEg =
+      "( eg: ./quantify 1 your_combined_model_path output_path or "
+      "./quantify 0 your_seperated_model_path output_path)";
+
+  PADDLE_MOBILE_ENFORCE(argc > 1, "we need params.%s ", kNoteEg.c_str());
+
+  std::string action_type = argv[1];
+  PADDLE_MOBILE_ENFORCE(action_type == "1" || action_type == "0",
+                        "only 0 or 1 supported, current is %s %s ",
+                        action_type.c_str(), kNoteEg.c_str());
+
+  PADDLE_MOBILE_ENFORCE(argc > 2, "we need your model path. %s ",
+                        kNoteEg.c_str());
+  std::string base_path = argv[2];
+
+  PADDLE_MOBILE_ENFORCE(argc > 3, "we need your output path. %s ",
+                        kNoteEg.c_str());
+  std::string output_path = argv[3];
+
+  if (action_type == "0") {
+    // for seperated
+    const std::string &seperated_min_dir = output_path;
+    quantificate_seperated(base_path, seperated_min_dir);
+    return 0;
+  }
+
+  if (action_type == "1") {
+    // for combined
+    const std::string &combined_min_dir = output_path;
+    std::string model_path = base_path + "/model";
+    std::string param_path = base_path + "/params";
+    quantificate_combined(model_path, param_path, combined_min_dir);
+    return 0;
+  }
+
+  return -1;
+}
+
+
+
+
+
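`main` validates its arguments with `PADDLE_MOBILE_ENFORCE`, which throws `paddle_mobile::PaddleMobileException` because the tool's CMakeLists defines `ENABLE_EXCEPTION`. A standalone usage sketch; the include path and flag mirror the tool's own build, and without `-DENABLE_EXCEPTION` the macro expands to nothing and this example would not compile:

```cpp
// enforce_demo.cpp -- build from tools/quantification/ with -DENABLE_EXCEPTION.
#include <cstdio>
#include "src/enforce.h"  // the header added later in this patch

int main(int argc, char **argv) {
  (void)argv;
  try {
    // Fails whenever no extra argument is given, like the real tool's check.
    PADDLE_MOBILE_ENFORCE(argc > 1, "we need params, got argc = %d", argc);
  } catch (const paddle_mobile::PaddleMobileException &e) {
    std::fprintf(stderr, "%s", e.what());  // prefix, file, line, and detail
    return 1;
  }
  return 0;
}
```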
diff --git a/tools/quantification/src/block_desc_local.cpp b/tools/quantification/src/block_desc_local.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8ad1982c05ed0b1b7c7bec5ef26aa8151f941cf3
--- /dev/null
+++ b/tools/quantification/src/block_desc_local.cpp
@@ -0,0 +1,48 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+//
+// Created by 谢柏渊 on 2018/7/25.
+//
+#include "src/block_desc_local.h"
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "src/framework.pb-c.h"
+
+std::vector<std::shared_ptr<paddle_mobile::framework::VarDesc>>
+BlockDesc::Vars() const {
+  return vars_;
+}
+
+BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc)
+    : index_(desc->idx), parent_index_(desc->parent_idx) {
+  for (size_t i = 0; i < desc->n_vars; ++i) {
+    PaddleMobile__Framework__Proto__VarDesc *var_desc = desc->vars[i];
+    vars_.emplace_back(std::shared_ptr<paddle_mobile::framework::VarDesc>(
+        new paddle_mobile::framework::VarDesc(var_desc)));
+  }
+
+  std::sort(vars_.begin(), vars_.end(),
+            [](std::shared_ptr<paddle_mobile::framework::VarDesc> left,
+               std::shared_ptr<paddle_mobile::framework::VarDesc> right) {
+              return left->Name() < right->Name();
+            });
+
+  //  for (int j = 0; j < desc->n_ops; ++j) {
+  //    PaddleMobile__Framework__Proto__OpDesc *op_desc = desc->ops[j];
+  //    ops_.emplace_back(new OpDesc(op_desc));
+  //  }
+}
diff --git a/tools/quantification/src/block_desc_local.h b/tools/quantification/src/block_desc_local.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ee8132af7f21ed0e62678c8da510bfd7fba9dbd
--- /dev/null
+++ b/tools/quantification/src/block_desc_local.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+//
+// Created by 谢柏渊 on 2018/7/25.
+//
+
+#ifndef TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
+#define TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
+
+#include <memory>
+#include <vector>
+#include "src/var_desc.h"
+
+class BlockDesc {
+ public:
+  friend class Node;
+  friend class ProgramOptimize;
+  BlockDesc() {}
+  explicit BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc);
+
+  const int &ID() const { return index_; }
+
+  const bool &MultiThread() const { return multi_thread_; }
+
+  const int &Parent() const { return parent_index_; }
+
+  bool operator==(const BlockDesc &in_block) const {
+    return this->ID() == in_block.ID() && this->Parent() == in_block.Parent();
+  }
+
+  bool operator<(const BlockDesc &in_block) const {
+    return this->ID() < in_block.ID() && this->Parent() < in_block.Parent();
+  }
+
+  std::vector<std::shared_ptr<paddle_mobile::framework::VarDesc>> Vars() const;
+
+ private:
+  int index_;
+  bool multi_thread_;
+  int parent_index_;
+  std::vector<std::shared_ptr<paddle_mobile::framework::VarDesc>> vars_;
+};
+
+#endif  // TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
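Both converters walk `block->Vars()`, and the `BlockDesc` constructor above sorts `vars_` by name, so parameters are always visited in a deterministic lexicographic order; for the combined `params` stream this presumably keeps the writer aligned with whatever order the runtime loader uses (an inference from the code, not something the patch states). The ordering behavior in isolation:

```cpp
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Stand-ins for VarDesc names as BlockDesc would see them.
  std::vector<std::string> names = {"conv2_weights", "batch_norm_0.w_0",
                                    "conv1_weights"};
  std::sort(names.begin(), names.end());  // same comparator: Name() < Name()
  for (const auto &n : names) std::printf("%s\n", n.c_str());
  // Visits batch_norm_0.w_0, conv1_weights, conv2_weights: the same order
  // on every run and platform.
  return 0;
}
```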
diff --git a/tools/quantification/src/enforce.h b/tools/quantification/src/enforce.h
new file mode 100644
index 0000000000000000000000000000000000000000..51d2110e32433686d1b3353bc63b92a564a13e9d
--- /dev/null
+++ b/tools/quantification/src/enforce.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef ENABLE_EXCEPTION
+#include <stdio.h>
+#include <exception>
+#include <string>
+
+#endif
+
+namespace paddle_mobile {
+
+#ifdef ENABLE_EXCEPTION
+struct PaddleMobileException : public std::exception {
+  const std::string exception_prefix = "paddle mobile C++ Exception: \n";
+  std::string message;
+
+  PaddleMobileException(const char *header, const char *detail,
+                        const char *file, const int line) {
+    char buffer[1500];
+    snprintf(buffer, sizeof(buffer),
+             "%s| %s \n| [in file] : %s\n| [on line] : %d\n| [detail] : %s\n",
+             exception_prefix.c_str(), header, file, line, detail);
+    message = std::string(buffer);
+  }
+  const char *what() const noexcept { return message.c_str(); }
+};
+
+#define PADDLE_MOBILE_THROW_EXCEPTION(...)                                 \
+  {                                                                        \
+    char buffer[1000];                                                     \
+    snprintf(buffer, sizeof(buffer), __VA_ARGS__);                         \
+    std::string detail(buffer);                                            \
+    throw paddle_mobile::PaddleMobileException("Custom Exception", buffer, \
+                                               __FILE__, __LINE__);        \
+  }
+
+#define PADDLE_MOBILE_ENFORCE(stat, ...)                                   \
+  {                                                                        \
+    if (stat) {                                                            \
+    } else {                                                               \
+      char buffer[1000];                                                   \
+      snprintf(buffer, sizeof(buffer), __VA_ARGS__);                       \
+      std::string detail(buffer);                                          \
+      throw paddle_mobile::PaddleMobileException("paddle-mobile enforce",  \
+                                                 buffer, __FILE__,         \
+                                                 __LINE__);                \
+    }                                                                      \
+  }
+#else
+#define PADDLE_MOBILE_THROW_EXCEPTION(...)
+#define PADDLE_MOBILE_ENFORCE(stat, ...)
+#endif
+
+}  // namespace paddle_mobile
diff --git a/tools/quantification/src/framework.pb-c.c b/tools/quantification/src/framework.pb-c.c
new file mode 100644
index 0000000000000000000000000000000000000000..aed0a6c9c0614da74a82cea8c7aa705978dddafc
--- /dev/null
+++ b/tools/quantification/src/framework.pb-c.c
@@ -0,0 +1,1403 @@
+/* Generated by the protocol buffer compiler. DO NOT EDIT!
*/ +/* Generated from: framework.proto */ + +/* Do not generate deprecated warnings for self */ +#ifndef PROTOBUF_C__NO_DEPRECATED +#define PROTOBUF_C__NO_DEPRECATED +#endif + +#include "framework.pb-c.h" +void paddle_mobile__framework__proto__op_desc__attr__init( + PaddleMobile__Framework__Proto__OpDesc__Attr *message) { + static const PaddleMobile__Framework__Proto__OpDesc__Attr init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__op_desc__var__init( + PaddleMobile__Framework__Proto__OpDesc__Var *message) { + static const PaddleMobile__Framework__Proto__OpDesc__Var init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__VAR__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__op_desc__init( + PaddleMobile__Framework__Proto__OpDesc *message) { + static const PaddleMobile__Framework__Proto__OpDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__op_desc__get_packed_size( + const PaddleMobile__Framework__Proto__OpDesc *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__op_desc__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__OpDesc * +paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__OpDesc *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__op_desc__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__op_desc__free_unpacked( + PaddleMobile__Framework__Proto__OpDesc *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__op_desc__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__op_proto__var__init( + PaddleMobile__Framework__Proto__OpProto__Var *message) { + static const PaddleMobile__Framework__Proto__OpProto__Var init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__op_proto__attr__init( + PaddleMobile__Framework__Proto__OpProto__Attr *message) { + static const PaddleMobile__Framework__Proto__OpProto__Attr init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__ATTR__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__op_proto__init( + PaddleMobile__Framework__Proto__OpProto *message) { + static const PaddleMobile__Framework__Proto__OpProto init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__op_proto__get_packed_size( + const PaddleMobile__Framework__Proto__OpProto *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__op_proto__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__OpProto * +paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__OpProto *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__op_proto__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__op_proto__free_unpacked( + PaddleMobile__Framework__Proto__OpProto 
*message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__op_proto__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__var_type__tensor_desc__init( + PaddleMobile__Framework__Proto__VarType__TensorDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__TensorDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TENSOR_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init( + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__LoDTensorDesc + init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc + init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_ARRAY_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__reader_desc__init( + PaddleMobile__Framework__Proto__VarType__ReaderDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__ReaderDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__READER_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__channel_desc__init( + PaddleMobile__Framework__Proto__VarType__ChannelDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__ChannelDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__CHANNEL_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__tuple__init( + PaddleMobile__Framework__Proto__VarType__Tuple *message) { + static const PaddleMobile__Framework__Proto__VarType__Tuple init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TUPLE__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__init( + PaddleMobile__Framework__Proto__VarType *message) { + static const PaddleMobile__Framework__Proto__VarType init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__var_type__get_packed_size( + const PaddleMobile__Framework__Proto__VarType *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__var_type__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} +PaddleMobile__Framework__Proto__VarType * +paddle_mobile__framework__proto__var_type__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__VarType *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__var_type__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__var_type__free_unpacked( + PaddleMobile__Framework__Proto__VarType *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__var_type__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__var_desc__init( + PaddleMobile__Framework__Proto__VarDesc *message) { + static const PaddleMobile__Framework__Proto__VarDesc init_value = + 
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__var_desc__get_packed_size( + const PaddleMobile__Framework__Proto__VarDesc *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__var_desc__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__VarDesc * +paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__VarDesc *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__var_desc__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__var_desc__free_unpacked( + PaddleMobile__Framework__Proto__VarDesc *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__var_desc__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__block_desc__init( + PaddleMobile__Framework__Proto__BlockDesc *message) { + static const PaddleMobile__Framework__Proto__BlockDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__BLOCK_DESC__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__block_desc__get_packed_size( + const PaddleMobile__Framework__Proto__BlockDesc *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__block_desc__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__BlockDesc * +paddle_mobile__framework__proto__block_desc__unpack( + ProtobufCAllocator *allocator, size_t len, const uint8_t *data) { + return (PaddleMobile__Framework__Proto__BlockDesc *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__block_desc__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__block_desc__free_unpacked( + PaddleMobile__Framework__Proto__BlockDesc *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__block_desc__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__program_desc__init( + PaddleMobile__Framework__Proto__ProgramDesc *message) { + static const PaddleMobile__Framework__Proto__ProgramDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__program_desc__get_packed_size( + const PaddleMobile__Framework__Proto__ProgramDesc *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__program_desc__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__ProgramDesc * +paddle_mobile__framework__proto__program_desc__unpack( + ProtobufCAllocator *allocator, size_t len, const uint8_t *data) { + return (PaddleMobile__Framework__Proto__ProgramDesc *) + protobuf_c_message_unpack( + &paddle_mobile__framework__proto__program_desc__descriptor, allocator, + len, data); +} +void paddle_mobile__framework__proto__program_desc__free_unpacked( + PaddleMobile__Framework__Proto__ProgramDesc *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + 
&paddle_mobile__framework__proto__program_desc__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_desc__attr__field_descriptors[12] = { + { + "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, name), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, type), + &paddle_mobile__framework__proto__attr_type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "i", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_i), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, i), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "f", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_FLOAT, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_f), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, f), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "s", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, s), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "ints", 6, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_ints), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, ints), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "floats", 7, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_FLOAT, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_floats), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, floats), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "strings", 8, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_STRING, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_strings), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, strings), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "b", 10, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_b), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, b), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "bools", 11, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_bools), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, bools), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "block_idx", 12, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, + has_block_idx), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, block_idx), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "l", 13, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT64, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_l), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, l), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static 
const unsigned + paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name[] = { + 8, /* field[8] = b */ + 10, /* field[10] = block_idx */ + 9, /* field[9] = bools */ + 3, /* field[3] = f */ + 6, /* field[6] = floats */ + 2, /* field[2] = i */ + 5, /* field[5] = ints */ + 11, /* field[11] = l */ + 0, /* field[0] = name */ + 4, /* field[4] = s */ + 7, /* field[7] = strings */ + 1, /* field[1] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_desc__attr__number_ranges[2 + 1] = { + {1, 0}, {10, 8}, {0, 12}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__attr__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpDesc.Attr", + "Attr", + "PaddleMobile__Framework__Proto__OpDesc__Attr", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpDesc__Attr), + 12, + paddle_mobile__framework__proto__op_desc__attr__field_descriptors, + paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name, + 2, + paddle_mobile__framework__proto__op_desc__attr__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__op_desc__attr__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_desc__var__field_descriptors[2] = { + { + "parameter", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, parameter), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "arguments", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_STRING, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, n_arguments), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, arguments), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_desc__var__field_indices_by_name[] = { + 1, /* field[1] = arguments */ + 0, /* field[0] = parameter */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_desc__var__number_ranges[1 + 1] = { + {1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__var__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpDesc.Var", + "Var", + "PaddleMobile__Framework__Proto__OpDesc__Var", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpDesc__Var), + 2, + paddle_mobile__framework__proto__op_desc__var__field_descriptors, + paddle_mobile__framework__proto__op_desc__var__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_desc__var__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__op_desc__var__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_desc__is_target__default_value = 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_desc__field_descriptors[5] = { + { + "inputs", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpDesc, n_inputs), + offsetof(PaddleMobile__Framework__Proto__OpDesc, inputs), + &paddle_mobile__framework__proto__op_desc__var__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "outputs", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpDesc, n_outputs), + 
offsetof(PaddleMobile__Framework__Proto__OpDesc, outputs), + &paddle_mobile__framework__proto__op_desc__var__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "type", 3, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc, type), NULL, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "attrs", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpDesc, n_attrs), + offsetof(PaddleMobile__Framework__Proto__OpDesc, attrs), + &paddle_mobile__framework__proto__op_desc__attr__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "is_target", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpDesc, has_is_target), + offsetof(PaddleMobile__Framework__Proto__OpDesc, is_target), NULL, + &paddle_mobile__framework__proto__op_desc__is_target__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_desc__field_indices_by_name[] = { + 3, /* field[3] = attrs */ + 0, /* field[0] = inputs */ + 4, /* field[4] = is_target */ + 1, /* field[1] = outputs */ + 2, /* field[2] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_desc__number_ranges[1 + 1] = {{1, 0}, + {0, 5}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpDesc", + "OpDesc", + "PaddleMobile__Framework__Proto__OpDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpDesc), + 5, + paddle_mobile__framework__proto__op_desc__field_descriptors, + paddle_mobile__framework__proto__op_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_desc__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__op_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_proto__var__duplicable__default_value = + 0; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_proto__var__intermediate__default_value = + 0; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_proto__var__dispensable__default_value = + 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_proto__var__field_descriptors[5] = { + { + "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, name), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "comment", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, comment), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "duplicable", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, + has_duplicable), + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, duplicable), + NULL, + &paddle_mobile__framework__proto__op_proto__var__duplicable__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "intermediate", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + 
offsetof(PaddleMobile__Framework__Proto__OpProto__Var, + has_intermediate), + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, + intermediate), + NULL, + &paddle_mobile__framework__proto__op_proto__var__intermediate__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "dispensable", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, + has_dispensable), + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, dispensable), + NULL, + &paddle_mobile__framework__proto__op_proto__var__dispensable__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_proto__var__field_indices_by_name[] = { + 1, /* field[1] = comment */ + 4, /* field[4] = dispensable */ + 2, /* field[2] = duplicable */ + 3, /* field[3] = intermediate */ + 0, /* field[0] = name */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_proto__var__number_ranges[1 + 1] = { + {1, 0}, {0, 5}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__var__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpProto.Var", + "Var", + "PaddleMobile__Framework__Proto__OpProto__Var", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpProto__Var), + 5, + paddle_mobile__framework__proto__op_proto__var__field_descriptors, + paddle_mobile__framework__proto__op_proto__var__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_proto__var__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__op_proto__var__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_proto__attr__generated__default_value = + 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_proto__attr__field_descriptors[4] = { + { + "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, name), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, type), + &paddle_mobile__framework__proto__attr_type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "comment", 3, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, comment), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "generated", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, + has_generated), + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, generated), + NULL, + &paddle_mobile__framework__proto__op_proto__attr__generated__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_proto__attr__field_indices_by_name[] = { + 2, /* field[2] = comment */ + 3, /* field[3] = generated */ + 0, /* field[0] = name */ + 1, /* field[1] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_proto__attr__number_ranges[1 + 1] = { + {1, 0}, {0, 4}}; +const ProtobufCMessageDescriptor + 
paddle_mobile__framework__proto__op_proto__attr__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpProto.Attr", + "Attr", + "PaddleMobile__Framework__Proto__OpProto__Attr", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpProto__Attr), + 4, + paddle_mobile__framework__proto__op_proto__attr__field_descriptors, + paddle_mobile__framework__proto__op_proto__attr__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_proto__attr__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__op_proto__attr__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_proto__field_descriptors[5] = { + { + "type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto, type), NULL, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "inputs", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpProto, n_inputs), + offsetof(PaddleMobile__Framework__Proto__OpProto, inputs), + &paddle_mobile__framework__proto__op_proto__var__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "outputs", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpProto, n_outputs), + offsetof(PaddleMobile__Framework__Proto__OpProto, outputs), + &paddle_mobile__framework__proto__op_proto__var__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "attrs", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpProto, n_attrs), + offsetof(PaddleMobile__Framework__Proto__OpProto, attrs), + &paddle_mobile__framework__proto__op_proto__attr__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "comment", 5, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto, comment), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_proto__field_indices_by_name[] = { + 3, /* field[3] = attrs */ + 4, /* field[4] = comment */ + 1, /* field[1] = inputs */ + 2, /* field[2] = outputs */ + 0, /* field[0] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_proto__number_ranges[1 + 1] = {{1, 0}, + {0, 5}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpProto", + "OpProto", + "PaddleMobile__Framework__Proto__OpProto", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpProto), + 5, + paddle_mobile__framework__proto__op_proto__field_descriptors, + paddle_mobile__framework__proto__op_proto__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_proto__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__op_proto__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__tensor_desc__field_descriptors + [2] = { + { + "data_type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc, + 
data_type), + &paddle_mobile__framework__proto__var_type__type__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "dims", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT64, + offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc, + n_dims), + offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc, + dims), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__tensor_desc__field_indices_by_name + [] = { + 0, /* field[0] = data_type */ + 1, /* field[1] = dims */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__tensor_desc__number_ranges[1 + + 1] = { + {1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__tensor_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.TensorDesc", + "TensorDesc", + "PaddleMobile__Framework__Proto__VarType__TensorDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__TensorDesc), + 2, + paddle_mobile__framework__proto__var_type__tensor_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__tensor_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__tensor_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__tensor_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const int32_t + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__lod_level__default_value = + 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_descriptors + [2] = { + { + "tensor", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc, + tensor), + &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "lod_level", 2, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc, + has_lod_level), + offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc, + lod_level), + NULL, + &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__lod_level__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_indices_by_name + [] = { + 1, /* field[1] = lod_level */ + 0, /* field[0] = tensor */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__number_ranges + [1 + 1] = {{1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.LoDTensorDesc", + "LoDTensorDesc", + "PaddleMobile__Framework__Proto__VarType__LoDTensorDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc), + 2, + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init, + NULL, + 
NULL, + NULL /* reserved[123] */ +}; +static const int32_t + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__lod_level__default_value = + 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_descriptors + [2] = { + { + "tensor", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc, + tensor), + &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "lod_level", 2, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_INT32, + offsetof( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc, + has_lod_level), + offsetof( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc, + lod_level), + NULL, + &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__lod_level__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_indices_by_name + [] = { + 1, /* field[1] = lod_level */ + 0, /* field[0] = tensor */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__number_ranges + [1 + 1] = {{1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc", + "LoDTensorArrayDesc", + "PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc), + 2, + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__reader_desc__field_descriptors[1] = { + { + "lod_tensor", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__VarType__ReaderDesc, + n_lod_tensor), + offsetof(PaddleMobile__Framework__Proto__VarType__ReaderDesc, + lod_tensor), + &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__reader_desc__field_indices_by_name + [] = { + 0, /* field[0] = lod_tensor */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__reader_desc__number_ranges[1 + + 1] = { + {1, 0}, {0, 1}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__reader_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.ReaderDesc", + "ReaderDesc", + "PaddleMobile__Framework__Proto__VarType__ReaderDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__ReaderDesc), + 1, + paddle_mobile__framework__proto__var_type__reader_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__reader_desc__field_indices_by_name, + 
1, + paddle_mobile__framework__proto__var_type__reader_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__reader_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__channel_desc__field_descriptors + [2] = { + { + "data_type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType__ChannelDesc, + data_type), + &paddle_mobile__framework__proto__var_type__type__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "capacity", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT64, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType__ChannelDesc, + capacity), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__channel_desc__field_indices_by_name + [] = { + 1, /* field[1] = capacity */ + 0, /* field[0] = data_type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__channel_desc__number_ranges[1 + + 1] = + {{1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__channel_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.ChannelDesc", + "ChannelDesc", + "PaddleMobile__Framework__Proto__VarType__ChannelDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__ChannelDesc), + 2, + paddle_mobile__framework__proto__var_type__channel_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__channel_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__channel_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__channel_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__tuple__field_descriptors[1] = { + { + "element_type", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_ENUM, + offsetof(PaddleMobile__Framework__Proto__VarType__Tuple, + n_element_type), + offsetof(PaddleMobile__Framework__Proto__VarType__Tuple, + element_type), + &paddle_mobile__framework__proto__var_type__type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__tuple__field_indices_by_name[] = + { + 0, /* field[0] = element_type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__tuple__number_ranges[1 + 1] = { + {1, 0}, {0, 1}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__tuple__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.Tuple", + "Tuple", + "PaddleMobile__Framework__Proto__VarType__Tuple", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__Tuple), + 1, + paddle_mobile__framework__proto__var_type__tuple__field_descriptors, + paddle_mobile__framework__proto__var_type__tuple__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__tuple__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__tuple__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCEnumValue + 
paddle_mobile__framework__proto__var_type__type__enum_values_by_number[19] = + { + {"BOOL", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL", + 0}, + {"INT16", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16", + 1}, + {"INT32", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32", + 2}, + {"INT64", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64", + 3}, + {"FP16", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16", + 4}, + {"FP32", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32", + 5}, + {"FP64", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64", + 6}, + {"LOD_TENSOR", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR", 7}, + {"SELECTED_ROWS", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS", + 8}, + {"FEED_MINIBATCH", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH", + 9}, + {"FETCH_LIST", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST", 10}, + {"STEP_SCOPES", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES", + 11}, + {"LOD_RANK_TABLE", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE", + 12}, + {"LOD_TENSOR_ARRAY", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_" + "ARRAY", + 13}, + {"PLACE_LIST", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST", 14}, + {"READER", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER", 15}, + {"CHANNEL", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL", 16}, + {"RAW", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW", 17}, + {"TUPLE", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE", + 18}, +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__type__value_ranges[] = {{0, 0}, + {0, 19}}; +static const ProtobufCEnumValueIndex + paddle_mobile__framework__proto__var_type__type__enum_values_by_name[19] = { + {"BOOL", 0}, + {"CHANNEL", 16}, + {"FEED_MINIBATCH", 9}, + {"FETCH_LIST", 10}, + {"FP16", 4}, + {"FP32", 5}, + {"FP64", 6}, + {"INT16", 1}, + {"INT32", 2}, + {"INT64", 3}, + {"LOD_RANK_TABLE", 12}, + {"LOD_TENSOR", 7}, + {"LOD_TENSOR_ARRAY", 13}, + {"PLACE_LIST", 14}, + {"RAW", 17}, + {"READER", 15}, + {"SELECTED_ROWS", 8}, + {"STEP_SCOPES", 11}, + {"TUPLE", 18}, +}; +const ProtobufCEnumDescriptor + paddle_mobile__framework__proto__var_type__type__descriptor = { + PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.Type", + "Type", + "PaddleMobile__Framework__Proto__VarType__Type", + "paddle_mobile.framework.proto", + 19, + paddle_mobile__framework__proto__var_type__type__enum_values_by_number, + 19, + paddle_mobile__framework__proto__var_type__type__enum_values_by_name, + 1, + paddle_mobile__framework__proto__var_type__type__value_ranges, + NULL, + NULL, + NULL, + NULL /* reserved[1234] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__field_descriptors[7] = { + { + "type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, type), + &paddle_mobile__framework__proto__var_type__type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "selected_rows", 2, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_MESSAGE, 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, selected_rows), + &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + 
}, + { + "lod_tensor", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, lod_tensor), + &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "tensor_array", 4, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_MESSAGE, 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, tensor_array), + &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "reader", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, reader), + &paddle_mobile__framework__proto__var_type__reader_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "channel", 6, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, channel), + &paddle_mobile__framework__proto__var_type__channel_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "tuple", 7, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, tuple), + &paddle_mobile__framework__proto__var_type__tuple__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__field_indices_by_name[] = { + 5, /* field[5] = channel */ + 2, /* field[2] = lod_tensor */ + 4, /* field[4] = reader */ + 1, /* field[1] = selected_rows */ + 3, /* field[3] = tensor_array */ + 6, /* field[6] = tuple */ + 0, /* field[0] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__number_ranges[1 + 1] = {{1, 0}, + {0, 7}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType", + "VarType", + "PaddleMobile__Framework__Proto__VarType", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType), + 7, + paddle_mobile__framework__proto__var_type__field_descriptors, + paddle_mobile__framework__proto__var_type__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__var_type__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const protobuf_c_boolean + paddle_mobile__framework__proto__var_desc__persistable__default_value = 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_desc__field_descriptors[3] = { + { + "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarDesc, name), NULL, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarDesc, type), + &paddle_mobile__framework__proto__var_type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "persistable", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__VarDesc, has_persistable), + 
offsetof(PaddleMobile__Framework__Proto__VarDesc, persistable), + NULL, + &paddle_mobile__framework__proto__var_desc__persistable__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_desc__field_indices_by_name[] = { + 0, /* field[0] = name */ + 2, /* field[2] = persistable */ + 1, /* field[1] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_desc__number_ranges[1 + 1] = {{1, 0}, + {0, 3}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarDesc", + "VarDesc", + "PaddleMobile__Framework__Proto__VarDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarDesc), + 3, + paddle_mobile__framework__proto__var_desc__field_descriptors, + paddle_mobile__framework__proto__var_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_desc__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__var_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const int32_t + paddle_mobile__framework__proto__block_desc__forward_block_idx__default_value = + -1; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__block_desc__field_descriptors[5] = { + { + "idx", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT32, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__BlockDesc, idx), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "parent_idx", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT32, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__BlockDesc, parent_idx), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "vars", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__BlockDesc, n_vars), + offsetof(PaddleMobile__Framework__Proto__BlockDesc, vars), + &paddle_mobile__framework__proto__var_desc__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "ops", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__BlockDesc, n_ops), + offsetof(PaddleMobile__Framework__Proto__BlockDesc, ops), + &paddle_mobile__framework__proto__op_desc__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "forward_block_idx", 5, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__BlockDesc, + has_forward_block_idx), + offsetof(PaddleMobile__Framework__Proto__BlockDesc, + forward_block_idx), + NULL, + &paddle_mobile__framework__proto__block_desc__forward_block_idx__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__block_desc__field_indices_by_name[] = { + 4, /* field[4] = forward_block_idx */ + 0, /* field[0] = idx */ + 3, /* field[3] = ops */ + 1, /* field[1] = parent_idx */ + 2, /* field[2] = vars */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__block_desc__number_ranges[1 + 1] = { + {1, 0}, {0, 5}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__block_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.BlockDesc", + "BlockDesc", + "PaddleMobile__Framework__Proto__BlockDesc", + 
"paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__BlockDesc), + 5, + paddle_mobile__framework__proto__block_desc__field_descriptors, + paddle_mobile__framework__proto__block_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__block_desc__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__block_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__program_desc__field_descriptors[1] = { + { + "blocks", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__ProgramDesc, n_blocks), + offsetof(PaddleMobile__Framework__Proto__ProgramDesc, blocks), + &paddle_mobile__framework__proto__block_desc__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__program_desc__field_indices_by_name[] = { + 0, /* field[0] = blocks */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__program_desc__number_ranges[1 + 1] = { + {1, 0}, {0, 1}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__program_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.ProgramDesc", + "ProgramDesc", + "PaddleMobile__Framework__Proto__ProgramDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__ProgramDesc), + 1, + paddle_mobile__framework__proto__program_desc__field_descriptors, + paddle_mobile__framework__proto__program_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__program_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__program_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCEnumValue + paddle_mobile__framework__proto__attr_type__enum_values_by_number[10] = { + {"INT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT", 0}, + {"FLOAT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT", 1}, + {"STRING", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING", 2}, + {"INTS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS", 3}, + {"FLOATS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS", 4}, + {"STRINGS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS", 5}, + {"BOOLEAN", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN", 6}, + {"BOOLEANS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS", 7}, + {"BLOCK", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK", 8}, + {"LONG", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG", 9}, +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__attr_type__value_ranges[] = {{0, 0}, + {0, 10}}; +static const ProtobufCEnumValueIndex + paddle_mobile__framework__proto__attr_type__enum_values_by_name[10] = { + {"BLOCK", 8}, {"BOOLEAN", 6}, {"BOOLEANS", 7}, {"FLOAT", 1}, + {"FLOATS", 4}, {"INT", 0}, {"INTS", 3}, {"LONG", 9}, + {"STRING", 2}, {"STRINGS", 5}, +}; +const ProtobufCEnumDescriptor + paddle_mobile__framework__proto__attr_type__descriptor = { + PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.AttrType", + "AttrType", + "PaddleMobile__Framework__Proto__AttrType", + "paddle_mobile.framework.proto", + 10, + paddle_mobile__framework__proto__attr_type__enum_values_by_number, + 10, + paddle_mobile__framework__proto__attr_type__enum_values_by_name, + 1, + paddle_mobile__framework__proto__attr_type__value_ranges, + NULL, + NULL, + NULL, + NULL /* reserved[1234] */ +}; diff --git 
a/tools/quantification/src/framework.pb-c.h b/tools/quantification/src/framework.pb-c.h new file mode 100644 index 0000000000000000000000000000000000000000..3d63bad76ad188d02986971bd911d8f30cf0af81 --- /dev/null +++ b/tools/quantification/src/framework.pb-c.h @@ -0,0 +1,579 @@ +/* Generated by the protocol buffer compiler. DO NOT EDIT! */ +/* Generated from: framework.proto */ + +#ifndef PROTOBUF_C_framework_2eproto__INCLUDED +#define PROTOBUF_C_framework_2eproto__INCLUDED + +#include "protobuf-c.h" + +PROTOBUF_C__BEGIN_DECLS + +#if PROTOBUF_C_VERSION_NUMBER < 1000000 +# error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers. +#elif 1003000 < PROTOBUF_C_MIN_COMPILER_VERSION +# error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c. +#endif + +typedef struct _PaddleMobile__Framework__Proto__OpDesc + PaddleMobile__Framework__Proto__OpDesc; +typedef struct _PaddleMobile__Framework__Proto__OpDesc__Attr + PaddleMobile__Framework__Proto__OpDesc__Attr; +typedef struct _PaddleMobile__Framework__Proto__OpDesc__Var + PaddleMobile__Framework__Proto__OpDesc__Var; +typedef struct _PaddleMobile__Framework__Proto__OpProto + PaddleMobile__Framework__Proto__OpProto; +typedef struct _PaddleMobile__Framework__Proto__OpProto__Var + PaddleMobile__Framework__Proto__OpProto__Var; +typedef struct _PaddleMobile__Framework__Proto__OpProto__Attr + PaddleMobile__Framework__Proto__OpProto__Attr; +typedef struct _PaddleMobile__Framework__Proto__VarType + PaddleMobile__Framework__Proto__VarType; +typedef struct _PaddleMobile__Framework__Proto__VarType__TensorDesc + PaddleMobile__Framework__Proto__VarType__TensorDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__LoDTensorDesc + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__ReaderDesc + PaddleMobile__Framework__Proto__VarType__ReaderDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__ChannelDesc + PaddleMobile__Framework__Proto__VarType__ChannelDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__Tuple + PaddleMobile__Framework__Proto__VarType__Tuple; +typedef struct _PaddleMobile__Framework__Proto__VarDesc + PaddleMobile__Framework__Proto__VarDesc; +typedef struct _PaddleMobile__Framework__Proto__BlockDesc + PaddleMobile__Framework__Proto__BlockDesc; +typedef struct _PaddleMobile__Framework__Proto__ProgramDesc + PaddleMobile__Framework__Proto__ProgramDesc; + +/* --- enums --- */ + +typedef enum _PaddleMobile__Framework__Proto__VarType__Type { + /* + * Pod Types + */ + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL = 0, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16 = 1, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32 = 2, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64 = 3, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6, + /* + * Other types that may need additional descriptions + */ + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR = 7, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS = 8, + 
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH = 9, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST = 10, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES = 11, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE = 12, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_ARRAY = 13, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST = 14, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER = 15, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL = 16, + /* + * Any runtime decided variable type is raw + * raw variables should manage their own allocations + * in operators like nccl_op + */ + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW = 17, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE = + 18 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE( + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE) +} PaddleMobile__Framework__Proto__VarType__Type; +typedef enum _PaddleMobile__Framework__Proto__AttrType { + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT = 0, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT = 1, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING = 2, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS = 3, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS = 4, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS = 5, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN = 6, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = + 9 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE( + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE) +} PaddleMobile__Framework__Proto__AttrType; + +/* --- messages --- */ + +struct _PaddleMobile__Framework__Proto__OpDesc__Attr { + ProtobufCMessage base; + char *name; + PaddleMobile__Framework__Proto__AttrType type; + protobuf_c_boolean has_i; + int32_t i; + protobuf_c_boolean has_f; + float f; + char *s; + size_t n_ints; + int32_t *ints; + size_t n_floats; + float *floats; + size_t n_strings; + char **strings; + protobuf_c_boolean has_b; + protobuf_c_boolean b; + size_t n_bools; + protobuf_c_boolean *bools; + protobuf_c_boolean has_block_idx; + int32_t block_idx; + protobuf_c_boolean has_l; + int64_t l; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_desc__attr__descriptor) \ + , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, 0, 0, 0, 0, NULL, \ + 0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0 \ + } + +struct _PaddleMobile__Framework__Proto__OpDesc__Var { + ProtobufCMessage base; + char *parameter; + size_t n_arguments; + char **arguments; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__VAR__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_desc__var__descriptor) \ + , NULL, 0, NULL \ + } + +/* + * OpDesc describes an instance of a C++ framework::OperatorBase + * derived class type. 
+ */ +struct _PaddleMobile__Framework__Proto__OpDesc { + ProtobufCMessage base; + char *type; + size_t n_inputs; + PaddleMobile__Framework__Proto__OpDesc__Var **inputs; + size_t n_outputs; + PaddleMobile__Framework__Proto__OpDesc__Var **outputs; + size_t n_attrs; + PaddleMobile__Framework__Proto__OpDesc__Attr **attrs; + protobuf_c_boolean has_is_target; + protobuf_c_boolean is_target; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_desc__descriptor) \ + , NULL, 0, NULL, 0, NULL, 0, NULL, 0, 0 \ + } + +/* + * VarProto describes the C++ type framework::Variable. + */ +struct _PaddleMobile__Framework__Proto__OpProto__Var { + ProtobufCMessage base; + char *name; + char *comment; + protobuf_c_boolean has_duplicable; + protobuf_c_boolean duplicable; + protobuf_c_boolean has_intermediate; + protobuf_c_boolean intermediate; + protobuf_c_boolean has_dispensable; + protobuf_c_boolean dispensable; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_proto__var__descriptor) \ + , NULL, NULL, 0, 0, 0, 0, 0, 0 \ + } + +/* + * AttrProto describes the C++ type Attribute. + */ +struct _PaddleMobile__Framework__Proto__OpProto__Attr { + ProtobufCMessage base; + char *name; + PaddleMobile__Framework__Proto__AttrType type; + char *comment; + /* + * If that attribute is generated, it means the Paddle third + * language binding has responsibility to fill that + * attribute. End-User should not set that attribute. + */ + protobuf_c_boolean has_generated; + protobuf_c_boolean generated; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__ATTR__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_proto__attr__descriptor) \ + , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, NULL, 0, 0 \ + } + +/* + * OpProto describes a C++ framework::OperatorBase derived class. + */ +struct _PaddleMobile__Framework__Proto__OpProto { + ProtobufCMessage base; + char *type; + size_t n_inputs; + PaddleMobile__Framework__Proto__OpProto__Var **inputs; + size_t n_outputs; + PaddleMobile__Framework__Proto__OpProto__Var **outputs; + size_t n_attrs; + PaddleMobile__Framework__Proto__OpProto__Attr **attrs; + char *comment; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_proto__descriptor) \ + , NULL, 0, NULL, 0, NULL, 0, NULL, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarType__TensorDesc { + ProtobufCMessage base; + /* + * Should only be PODType. 
Is enforced in C++ + */ + PaddleMobile__Framework__Proto__VarType__Type data_type; + /* + * [UNK, 640, 480] is saved as [-1, 640, 480] + */ + size_t n_dims; + int64_t *dims; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TENSOR_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor) \ + , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, 0, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarType__LoDTensorDesc { + ProtobufCMessage base; + PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor; + protobuf_c_boolean has_lod_level; + int32_t lod_level; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor) \ + , NULL, 0, 0 \ + } + +struct _PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc { + ProtobufCMessage base; + PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor; + protobuf_c_boolean has_lod_level; + int32_t lod_level; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_ARRAY_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor) \ + , NULL, 0, 0 \ + } + +struct _PaddleMobile__Framework__Proto__VarType__ReaderDesc { + ProtobufCMessage base; + size_t n_lod_tensor; + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc **lod_tensor; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__READER_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__reader_desc__descriptor) \ + , 0, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarType__ChannelDesc { + ProtobufCMessage base; + PaddleMobile__Framework__Proto__VarType__Type data_type; + int64_t capacity; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__CHANNEL_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__channel_desc__descriptor) \ + , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, 0 \ + } + +struct _PaddleMobile__Framework__Proto__VarType__Tuple { + ProtobufCMessage base; + size_t n_element_type; + PaddleMobile__Framework__Proto__VarType__Type *element_type; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TUPLE__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__tuple__descriptor) \ + , 0, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarType { + ProtobufCMessage base; + PaddleMobile__Framework__Proto__VarType__Type type; + PaddleMobile__Framework__Proto__VarType__TensorDesc *selected_rows; + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *lod_tensor; + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *tensor_array; + PaddleMobile__Framework__Proto__VarType__ReaderDesc *reader; + PaddleMobile__Framework__Proto__VarType__ChannelDesc *channel; + PaddleMobile__Framework__Proto__VarType__Tuple *tuple; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__descriptor) \ + , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, NULL, NULL, NULL, \ + NULL, NULL, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarDesc { + ProtobufCMessage base; + char *name; + PaddleMobile__Framework__Proto__VarType *type; + protobuf_c_boolean has_persistable; + protobuf_c_boolean persistable; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT \ + { \ + 
PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_desc__descriptor) \ + , NULL, NULL, 0, 0 \ + } + +struct _PaddleMobile__Framework__Proto__BlockDesc { + ProtobufCMessage base; + int32_t idx; + int32_t parent_idx; + size_t n_vars; + PaddleMobile__Framework__Proto__VarDesc **vars; + size_t n_ops; + PaddleMobile__Framework__Proto__OpDesc **ops; + protobuf_c_boolean has_forward_block_idx; + int32_t forward_block_idx; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__BLOCK_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__block_desc__descriptor) \ + , 0, 0, 0, NULL, 0, NULL, 0, -1 \ + } + +/* + * Please refer to + * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md + * for more details. + * TODO(panyx0718): A model can have multiple programs. Need a + * way to distinguish them. Maybe ID or name? + */ +struct _PaddleMobile__Framework__Proto__ProgramDesc { + ProtobufCMessage base; + size_t n_blocks; + PaddleMobile__Framework__Proto__BlockDesc **blocks; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__program_desc__descriptor) \ + , 0, NULL \ + } + +/* PaddleMobile__Framework__Proto__OpDesc__Attr methods */ +void paddle_mobile__framework__proto__op_desc__attr__init( + PaddleMobile__Framework__Proto__OpDesc__Attr *message); +/* PaddleMobile__Framework__Proto__OpDesc__Var methods */ +void paddle_mobile__framework__proto__op_desc__var__init( + PaddleMobile__Framework__Proto__OpDesc__Var *message); +/* PaddleMobile__Framework__Proto__OpDesc methods */ +void paddle_mobile__framework__proto__op_desc__init( + PaddleMobile__Framework__Proto__OpDesc *message); + +size_t paddle_mobile__framework__proto__op_desc__get_packed_size( + const PaddleMobile__Framework__Proto__OpDesc *message); + +PaddleMobile__Framework__Proto__OpDesc * +paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__op_desc__free_unpacked( + PaddleMobile__Framework__Proto__OpDesc *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__OpProto__Var methods */ +void paddle_mobile__framework__proto__op_proto__var__init( + PaddleMobile__Framework__Proto__OpProto__Var *message); +/* PaddleMobile__Framework__Proto__OpProto__Attr methods */ +void paddle_mobile__framework__proto__op_proto__attr__init( + PaddleMobile__Framework__Proto__OpProto__Attr *message); +/* PaddleMobile__Framework__Proto__OpProto methods */ +void paddle_mobile__framework__proto__op_proto__init( + PaddleMobile__Framework__Proto__OpProto *message); +size_t paddle_mobile__framework__proto__op_proto__get_packed_size( + const PaddleMobile__Framework__Proto__OpProto *message); +PaddleMobile__Framework__Proto__OpProto * +paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__op_proto__free_unpacked( + PaddleMobile__Framework__Proto__OpProto *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__VarType__TensorDesc methods */ +void paddle_mobile__framework__proto__var_type__tensor_desc__init( + PaddleMobile__Framework__Proto__VarType__TensorDesc *message); +/* PaddleMobile__Framework__Proto__VarType__LoDTensorDesc methods */ +void paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init( + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message); +/* 
PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc methods */ +void paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message); +/* PaddleMobile__Framework__Proto__VarType__ReaderDesc methods */ +void paddle_mobile__framework__proto__var_type__reader_desc__init( + PaddleMobile__Framework__Proto__VarType__ReaderDesc *message); +/* PaddleMobile__Framework__Proto__VarType__ChannelDesc methods */ +void paddle_mobile__framework__proto__var_type__channel_desc__init( + PaddleMobile__Framework__Proto__VarType__ChannelDesc *message); +/* PaddleMobile__Framework__Proto__VarType__Tuple methods */ +void paddle_mobile__framework__proto__var_type__tuple__init( + PaddleMobile__Framework__Proto__VarType__Tuple *message); +/* PaddleMobile__Framework__Proto__VarType methods */ +void paddle_mobile__framework__proto__var_type__init( + PaddleMobile__Framework__Proto__VarType *message); +size_t paddle_mobile__framework__proto__var_type__get_packed_size( + const PaddleMobile__Framework__Proto__VarType *message); +PaddleMobile__Framework__Proto__VarType * +paddle_mobile__framework__proto__var_type__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__var_type__free_unpacked( + PaddleMobile__Framework__Proto__VarType *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__VarDesc methods */ +void paddle_mobile__framework__proto__var_desc__init( + PaddleMobile__Framework__Proto__VarDesc *message); +size_t paddle_mobile__framework__proto__var_desc__get_packed_size( + const PaddleMobile__Framework__Proto__VarDesc *message); +PaddleMobile__Framework__Proto__VarDesc * +paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__var_desc__free_unpacked( + PaddleMobile__Framework__Proto__VarDesc *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__BlockDesc methods */ +void paddle_mobile__framework__proto__block_desc__init( + PaddleMobile__Framework__Proto__BlockDesc *message); +size_t paddle_mobile__framework__proto__block_desc__get_packed_size( + const PaddleMobile__Framework__Proto__BlockDesc *message); +PaddleMobile__Framework__Proto__BlockDesc * +paddle_mobile__framework__proto__block_desc__unpack( + ProtobufCAllocator *allocator, size_t len, const uint8_t *data); +void paddle_mobile__framework__proto__block_desc__free_unpacked( + PaddleMobile__Framework__Proto__BlockDesc *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__ProgramDesc methods */ +void paddle_mobile__framework__proto__program_desc__init( + PaddleMobile__Framework__Proto__ProgramDesc *message); +size_t paddle_mobile__framework__proto__program_desc__get_packed_size( + const PaddleMobile__Framework__Proto__ProgramDesc *message); +PaddleMobile__Framework__Proto__ProgramDesc * +paddle_mobile__framework__proto__program_desc__unpack( + ProtobufCAllocator *allocator, size_t len, const uint8_t *data); +void paddle_mobile__framework__proto__program_desc__free_unpacked( + PaddleMobile__Framework__Proto__ProgramDesc *message, + ProtobufCAllocator *allocator); +/* --- per-message closures --- */ + +typedef void (*PaddleMobile__Framework__Proto__OpDesc__Attr_Closure)( + const PaddleMobile__Framework__Proto__OpDesc__Attr *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpDesc__Var_Closure)( + const 
PaddleMobile__Framework__Proto__OpDesc__Var *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpDesc_Closure)( + const PaddleMobile__Framework__Proto__OpDesc *message, void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpProto__Var_Closure)( + const PaddleMobile__Framework__Proto__OpProto__Var *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpProto__Attr_Closure)( + const PaddleMobile__Framework__Proto__OpProto__Attr *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpProto_Closure)( + const PaddleMobile__Framework__Proto__OpProto *message, void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__TensorDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__LoDTensorDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message, + void *closure_data); +typedef void ( + *PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__ReaderDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__ReaderDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__ChannelDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__ChannelDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__Tuple_Closure)( + const PaddleMobile__Framework__Proto__VarType__Tuple *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType_Closure)( + const PaddleMobile__Framework__Proto__VarType *message, void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarDesc_Closure)( + const PaddleMobile__Framework__Proto__VarDesc *message, void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__BlockDesc_Closure)( + const PaddleMobile__Framework__Proto__BlockDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__ProgramDesc_Closure)( + const PaddleMobile__Framework__Proto__ProgramDesc *message, + void *closure_data); + +/* --- services --- */ + +/* --- descriptors --- */ + +extern const ProtobufCEnumDescriptor + paddle_mobile__framework__proto__attr_type__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__attr__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__var__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__var__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__attr__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__tensor_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor; +extern const 
ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__reader_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__channel_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__tuple__descriptor; +extern const ProtobufCEnumDescriptor + paddle_mobile__framework__proto__var_type__type__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__block_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__program_desc__descriptor; + +PROTOBUF_C__END_DECLS + +#endif /* PROTOBUF_C_framework_2eproto__INCLUDED */
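For orientation, here is a usage sketch of the generated API declared above (not part of this patch; the buffer and the dump_program helper are hypothetical names): unpack a serialized ProgramDesc from memory, walk its blocks, and release it.

#include <stdio.h>
#include "framework.pb-c.h"

/* Illustrative helper only; assumes buf/len hold a serialized ProgramDesc. */
static void dump_program(const uint8_t *buf, size_t len) {
  /* A NULL allocator selects the built-in malloc/free allocator. */
  PaddleMobile__Framework__Proto__ProgramDesc *prog =
      paddle_mobile__framework__proto__program_desc__unpack(NULL, len, buf);
  if (prog == NULL) return; /* malformed or truncated input */
  for (size_t i = 0; i < prog->n_blocks; ++i)
    printf("block %d: %zu ops, %zu vars\n", prog->blocks[i]->idx,
           prog->blocks[i]->n_ops, prog->blocks[i]->n_vars);
  paddle_mobile__framework__proto__program_desc__free_unpacked(prog, NULL);
}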
diff --git a/tools/quantification/src/program_desc.cpp b/tools/quantification/src/program_desc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4f9984832ada5061c7691aeb7fadba86cb5b8c0c --- /dev/null +++ b/tools/quantification/src/program_desc.cpp @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// +// Created by 谢柏渊 on 2018/7/25. +// + +#include "src/program_desc.h" +#include <memory> + +ProgramDesc::ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc) { + for (int i = 0; i < desc->n_blocks; ++i) { + blocks_.emplace_back(std::make_shared<BlockDesc>(desc->blocks[i])); + } +} + +const std::vector<std::shared_ptr<BlockDesc>> ProgramDesc::Blocks() { + return blocks_; +} diff --git a/tools/quantification/src/program_desc.h b/tools/quantification/src/program_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..60a0f757b0c907165d7639a41e35a407ef083b59 --- /dev/null +++ b/tools/quantification/src/program_desc.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// +// Created by 谢柏渊 on 2018/7/25. +// + +#ifndef TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_ +#define TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_ + +#include <memory> +#include <vector> +#include "src/block_desc_local.h" +#include "src/framework.pb-c.h" + +class ProgramDesc { + public: + // friend class Node; + // + // friend class ProgramOptimize; + + explicit ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc); + + const std::vector<std::shared_ptr<BlockDesc>> Blocks(); + + private: + std::vector<std::shared_ptr<BlockDesc>> blocks_; +}; + +#endif  // TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_ diff --git a/tools/quantification/src/protobuf-c.c b/tools/quantification/src/protobuf-c.c new file mode 100644 index 0000000000000000000000000000000000000000..1092e3f78b02a343d8c8965ea7b2d777a6fac9ae --- /dev/null +++ b/tools/quantification/src/protobuf-c.c @@ -0,0 +1,2098 @@ +/* + * Copyright (c) 2008-2015, Dave Benson and the protobuf-c authors. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file + * Support library for `protoc-c` generated code. + * + * This file implements the public API used by the code generated + * by `protoc-c`. + * + * \authors Dave Benson and the protobuf-c authors + * + * \copyright 2008-2014. Licensed under the terms of the [BSD-2-Clause] license. + */ + +/** + * \todo 64-BIT OPTIMIZATION: certain implementations use 32-bit math + * even on 64-bit platforms (uint64_size, uint64_pack, parse_uint64). + * + * \todo Use size_t consistently. + */ + +#include <stdlib.h> /* for malloc, free */ +#include <string.h> /* for strcmp, strlen, memcpy, memmove, memset */ + +#include "protobuf-c.h" + +#define TRUE 1 +#define FALSE 0 + +#define PROTOBUF_C__ASSERT_NOT_REACHED() assert(0) + +/* Workaround for Microsoft compilers. */ +#ifdef _MSC_VER +#define inline __inline +#endif + +/** + * \defgroup internal Internal functions and macros + * + * These are not exported by the library but are useful to developers working + * on `libprotobuf-c` itself. + */ + +/** + * \defgroup macros Utility macros for manipulating structures + * + * Macros and constants used to manipulate the base "classes" generated by + * `protobuf-c`. They also define limits and check correctness. + * + * \ingroup internal + * @{ + */ + +/** The maximum length of a 64-bit integer in varint encoding.
*/ +#define MAX_UINT64_ENCODED_SIZE 10 + +#ifndef PROTOBUF_C_UNPACK_ERROR +#define PROTOBUF_C_UNPACK_ERROR(...) +#endif + +const char protobuf_c_empty_string[] = ""; + +/** + * Internal `ProtobufCMessage` manipulation macro. + * + * Base macro for manipulating a `ProtobufCMessage`. Used by STRUCT_MEMBER() and + * STRUCT_MEMBER_PTR(). + */ +#define STRUCT_MEMBER_P(struct_p, struct_offset) \ + ((void *)((uint8_t *)(struct_p) + (struct_offset))) + +/** + * Return field in a `ProtobufCMessage` based on offset. + * + * Take a pointer to a `ProtobufCMessage` and find the field at the offset. + * Cast it to the passed type. + */ +#define STRUCT_MEMBER(member_type, struct_p, struct_offset) \ + (*(member_type *)STRUCT_MEMBER_P((struct_p), (struct_offset))) + +/** + * Return field in a `ProtobufCMessage` based on offset. + * + * Take a pointer to a `ProtobufCMessage` and find the field at the offset. Cast + * it to a pointer to the passed type. + */ +#define STRUCT_MEMBER_PTR(member_type, struct_p, struct_offset) \ + ((member_type *)STRUCT_MEMBER_P((struct_p), (struct_offset))) + +/* Assertions for magic numbers. */ + +#define ASSERT_IS_ENUM_DESCRIPTOR(desc) \ + assert((desc)->magic == PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC) + +#define ASSERT_IS_MESSAGE_DESCRIPTOR(desc) \ + assert((desc)->magic == PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC) + +#define ASSERT_IS_MESSAGE(message) \ + ASSERT_IS_MESSAGE_DESCRIPTOR((message)->descriptor) + +#define ASSERT_IS_SERVICE_DESCRIPTOR(desc) \ + assert((desc)->magic == PROTOBUF_C__SERVICE_DESCRIPTOR_MAGIC) + +/**@}*/ + +/* --- version --- */ + +const char *protobuf_c_version(void) { return PROTOBUF_C_VERSION; } + +uint32_t protobuf_c_version_number(void) { return PROTOBUF_C_VERSION_NUMBER; } + +/* --- allocator --- */ + +static void *system_alloc(void *allocator_data, size_t size) { + return malloc(size); +} + +static void system_free(void *allocator_data, void *data) { free(data); } + +static inline void *do_alloc(ProtobufCAllocator *allocator, size_t size) { + return allocator->alloc(allocator->allocator_data, size); +} + +static inline void do_free(ProtobufCAllocator *allocator, void *data) { + if (data != NULL) allocator->free(allocator->allocator_data, data); +} + +/* + * This allocator uses the system's malloc() and free(). It is the default + * allocator used if NULL is passed as the ProtobufCAllocator to an exported + * function. + */ +static ProtobufCAllocator protobuf_c__allocator = { + .alloc = &system_alloc, + .free = &system_free, + .allocator_data = NULL, +}; + +/* === buffer-simple === */ + +void protobuf_c_buffer_simple_append(ProtobufCBuffer *buffer, size_t len, + const uint8_t *data) { + ProtobufCBufferSimple *simp = (ProtobufCBufferSimple *)buffer; + size_t new_len = simp->len + len; + + if (new_len > simp->alloced) { + ProtobufCAllocator *allocator = simp->allocator; + size_t new_alloced = simp->alloced * 2; + uint8_t *new_data; + + if (allocator == NULL) allocator = &protobuf_c__allocator; + while (new_alloced < new_len) new_alloced += new_alloced; + new_data = do_alloc(allocator, new_alloced); + if (!new_data) return; + memcpy(new_data, simp->data, simp->len); + if (simp->must_free_data) + do_free(allocator, simp->data); + else + simp->must_free_data = TRUE; + simp->data = new_data; + simp->alloced = new_alloced; + } + memcpy(simp->data + simp->len, data, len); + simp->len = new_len; +} + +/** + * \defgroup packedsz protobuf_c_message_get_packed_size() implementation + * + * Routines mainly used by protobuf_c_message_get_packed_size(). 
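A minimal sketch of the offset-based field access the STRUCT_MEMBER() macros above implement, using the generated VarDesc type (illustrative only; this code is not part of the patch):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include "framework.pb-c.h"

int main(void) {
  PaddleMobile__Framework__Proto__VarDesc v =
      PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT;
  v.has_persistable = 1;
  v.persistable = 1;
  /* What STRUCT_MEMBER(protobuf_c_boolean, &v, field->offset) expands to:
   * the descriptor table stores offsetof(VarDesc, persistable), so the
   * runtime can read any field without knowing the struct at compile time. */
  size_t off = offsetof(PaddleMobile__Framework__Proto__VarDesc, persistable);
  protobuf_c_boolean p = *(protobuf_c_boolean *)((uint8_t *)&v + off);
  assert(p == 1);
  return 0;
}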
+ * + * \ingroup internal + * @{ + */ + +/** + * Return the number of bytes required to store the tag for the field. Includes + * 3 bits for the wire-type, and a single bit that denotes the end-of-tag. + * + * \param number + * Field tag to encode. + * \return + * Number of bytes required. + */ +static inline size_t get_tag_size(uint32_t number) { + if (number < (1UL << 4)) { + return 1; + } else if (number < (1UL << 11)) { + return 2; + } else if (number < (1UL << 18)) { + return 3; + } else if (number < (1UL << 25)) { + return 4; + } else { + return 5; + } +} + +/** + * Return the number of bytes required to store a variable-length unsigned + * 32-bit integer in base-128 varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. + */ +static inline size_t uint32_size(uint32_t v) { + if (v < (1UL << 7)) { + return 1; + } else if (v < (1UL << 14)) { + return 2; + } else if (v < (1UL << 21)) { + return 3; + } else if (v < (1UL << 28)) { + return 4; + } else { + return 5; + } +} + +/** + * Return the number of bytes required to store a variable-length signed 32-bit + * integer in base-128 varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. + */ +static inline size_t int32_size(int32_t v) { + if (v < 0) { + return 10; + } else if (v < (1L << 7)) { + return 1; + } else if (v < (1L << 14)) { + return 2; + } else if (v < (1L << 21)) { + return 3; + } else if (v < (1L << 28)) { + return 4; + } else { + return 5; + } +} + +/** + * Return the ZigZag-encoded 32-bit unsigned integer form of a 32-bit signed + * integer. + * + * \param v + * Value to encode. + * \return + * ZigZag encoded integer. + */ +static inline uint32_t zigzag32(int32_t v) { + if (v < 0) + return (-(uint32_t)v) * 2 - 1; + else + return (uint32_t)(v)*2; +} + +/** + * Return the number of bytes required to store a signed 32-bit integer, + * converted to an unsigned 32-bit integer with ZigZag encoding, using base-128 + * varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. + */ +static inline size_t sint32_size(int32_t v) { return uint32_size(zigzag32(v)); } + +/** + * Return the number of bytes required to store a 64-bit unsigned integer in + * base-128 varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. + */ +static inline size_t uint64_size(uint64_t v) { + uint32_t upper_v = (uint32_t)(v >> 32); + + if (upper_v == 0) { + return uint32_size((uint32_t)v); + } else if (upper_v < (1UL << 3)) { + return 5; + } else if (upper_v < (1UL << 10)) { + return 6; + } else if (upper_v < (1UL << 17)) { + return 7; + } else if (upper_v < (1UL << 24)) { + return 8; + } else if (upper_v < (1UL << 31)) { + return 9; + } else { + return 10; + } +} + +/** + * Return the ZigZag-encoded 64-bit unsigned integer form of a 64-bit signed + * integer. + * + * \param v + * Value to encode. + * \return + * ZigZag encoded integer. + */ +static inline uint64_t zigzag64(int64_t v) { + if (v < 0) + return (-(uint64_t)v) * 2 - 1; + else + return (uint64_t)(v)*2; +} + +/** + * Return the number of bytes required to store a signed 64-bit integer, + * converted to an unsigned 64-bit integer with ZigZag encoding, using base-128 + * varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. 
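A standalone check of the size rules above (a sketch: zz32 re-derives zigzag32 for illustration, since the helpers in this file are static):

#include <assert.h>
#include <stdint.h>

static uint32_t zz32(int32_t v) { /* same mapping as zigzag32() above */
  return v < 0 ? (-(uint32_t)v) * 2 - 1 : (uint32_t)v * 2;
}

int main(void) {
  /* 7 payload bits per byte: 127 fits in one byte, 300 needs two. */
  assert(127 < (1UL << 7) && 300 >= (1UL << 7) && 300 < (1UL << 14));
  /* ZigZag interleaves signs: 0,-1,1,-2,... -> 0,1,2,3,... */
  assert(zz32(0) == 0 && zz32(-1) == 1 && zz32(1) == 2 && zz32(-2) == 3);
  /* So sint32(-1) costs 1 byte, while plain int32(-1) costs 10 bytes
   * (negative int32 varints are sign-extended to 64 bits, per int32_size). */
  return 0;
}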
+ */ +static inline size_t sint64_size(int64_t v) { return uint64_size(zigzag64(v)); } + +/** + * Calculate the serialized size of a single required message field, including + * the space needed by the preceding tag. + * + * \param field + * Field descriptor for member. + * \param member + * Field to encode. + * \return + * Number of bytes required. + */ +static size_t required_field_get_packed_size( + const ProtobufCFieldDescriptor *field, const void *member) { + size_t rv = get_tag_size(field->id); + + switch (field->type) { + case PROTOBUF_C_TYPE_SINT32: + return rv + sint32_size(*(const int32_t *)member); + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + return rv + int32_size(*(const int32_t *)member); + case PROTOBUF_C_TYPE_UINT32: + return rv + uint32_size(*(const uint32_t *)member); + case PROTOBUF_C_TYPE_SINT64: + return rv + sint64_size(*(const int64_t *)member); + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + return rv + uint64_size(*(const uint64_t *)member); + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + return rv + 4; + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + return rv + 8; + case PROTOBUF_C_TYPE_BOOL: + return rv + 1; + case PROTOBUF_C_TYPE_FLOAT: + return rv + 4; + case PROTOBUF_C_TYPE_DOUBLE: + return rv + 8; + case PROTOBUF_C_TYPE_STRING: { + const char *str = *(char *const *)member; + size_t len = str ? strlen(str) : 0; + return rv + uint32_size(len) + len; + } + case PROTOBUF_C_TYPE_BYTES: { + size_t len = ((const ProtobufCBinaryData *)member)->len; + return rv + uint32_size(len) + len; + } + case PROTOBUF_C_TYPE_MESSAGE: { + const ProtobufCMessage *msg = *(ProtobufCMessage *const *)member; + size_t subrv = msg ? protobuf_c_message_get_packed_size(msg) : 0; + return rv + uint32_size(subrv) + subrv; + } + } + PROTOBUF_C__ASSERT_NOT_REACHED(); + return 0; +} + +/** + * Calculate the serialized size of a single oneof message field, including + * the space needed by the preceding tag. Returns 0 if the oneof field isn't + * selected or is not set. + * + * \param field + * Field descriptor for member. + * \param oneof_case + * Enum value that selects the field in the oneof. + * \param member + * Field to encode. + * \return + * Number of bytes required. + */ +static size_t oneof_field_get_packed_size(const ProtobufCFieldDescriptor *field, + uint32_t oneof_case, + const void *member) { + if (oneof_case != field->id) { + return 0; + } + if (field->type == PROTOBUF_C_TYPE_MESSAGE || + field->type == PROTOBUF_C_TYPE_STRING) { + const void *ptr = *(const void *const *)member; + if (ptr == NULL || ptr == field->default_value) return 0; + } + return required_field_get_packed_size(field, member); +} + +/** + * Calculate the serialized size of a single optional message field, including + * the space needed by the preceding tag. Returns 0 if the optional field isn't + * set. + * + * \param field + * Field descriptor for member. + * \param has + * True if the field exists, false if not. + * \param member + * Field to encode. + * \return + * Number of bytes required. 
+ */ +static size_t optional_field_get_packed_size( + const ProtobufCFieldDescriptor *field, const protobuf_c_boolean has, + const void *member) { + if (field->type == PROTOBUF_C_TYPE_MESSAGE || + field->type == PROTOBUF_C_TYPE_STRING) { + const void *ptr = *(const void *const *)member; + if (ptr == NULL || ptr == field->default_value) return 0; + } else { + if (!has) return 0; + } + return required_field_get_packed_size(field, member); +} + +static protobuf_c_boolean field_is_zeroish( + const ProtobufCFieldDescriptor *field, const void *member) { + protobuf_c_boolean ret = FALSE; + + switch (field->type) { + case PROTOBUF_C_TYPE_BOOL: + ret = (0 == *(const protobuf_c_boolean *)member); + break; + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_SINT32: + case PROTOBUF_C_TYPE_INT32: + case PROTOBUF_C_TYPE_UINT32: + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + ret = (0 == *(const uint32_t *)member); + break; + case PROTOBUF_C_TYPE_SINT64: + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + ret = (0 == *(const uint64_t *)member); + break; + case PROTOBUF_C_TYPE_FLOAT: + ret = (0 == *(const float *)member); + break; + case PROTOBUF_C_TYPE_DOUBLE: + ret = (0 == *(const double *)member); + break; + case PROTOBUF_C_TYPE_STRING: + ret = (NULL == *(const char *const *)member) || + ('\0' == **(const char *const *)member); + break; + case PROTOBUF_C_TYPE_BYTES: + case PROTOBUF_C_TYPE_MESSAGE: + ret = (NULL == *(const void *const *)member); + break; + default: + ret = TRUE; + break; + } + + return ret; +} + +/** + * Calculate the serialized size of a single unlabeled message field, including + * the space needed by the preceding tag. Returns 0 if the field isn't set or + * if it is set to a "zeroish" value (null pointer or 0 for numerical values). + * Unlabeled fields are supported only in proto3. + * + * \param field + * Field descriptor for member. + * \param member + * Field to encode. + * \return + * Number of bytes required. + */ +static size_t unlabeled_field_get_packed_size( + const ProtobufCFieldDescriptor *field, const void *member) { + if (field_is_zeroish(field, member)) return 0; + return required_field_get_packed_size(field, member); +} + +/** + * Calculate the serialized size of repeated message fields, which may consist + * of any number of values (including 0). Includes the space needed by the + * preceding tags (as needed). + * + * \param field + * Field descriptor for member. + * \param count + * Number of repeated field members. + * \param member + * Field to encode. + * \return + * Number of bytes required. 
+ */ +static size_t repeated_field_get_packed_size( + const ProtobufCFieldDescriptor *field, size_t count, const void *member) { + size_t header_size; + size_t rv = 0; + unsigned i; + void *array = *(void *const *)member; + + if (count == 0) return 0; + header_size = get_tag_size(field->id); + if (0 == (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED)) header_size *= count; + + switch (field->type) { + case PROTOBUF_C_TYPE_SINT32: + for (i = 0; i < count; i++) rv += sint32_size(((int32_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + for (i = 0; i < count; i++) rv += int32_size(((int32_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_UINT32: + for (i = 0; i < count; i++) rv += uint32_size(((uint32_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_SINT64: + for (i = 0; i < count; i++) rv += sint64_size(((int64_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + for (i = 0; i < count; i++) rv += uint64_size(((uint64_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + rv += 4 * count; + break; + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + rv += 8 * count; + break; + case PROTOBUF_C_TYPE_BOOL: + rv += count; + break; + case PROTOBUF_C_TYPE_STRING: + for (i = 0; i < count; i++) { + size_t len = strlen(((char **)array)[i]); + rv += uint32_size(len) + len; + } + break; + case PROTOBUF_C_TYPE_BYTES: + for (i = 0; i < count; i++) { + size_t len = ((ProtobufCBinaryData *)array)[i].len; + rv += uint32_size(len) + len; + } + break; + case PROTOBUF_C_TYPE_MESSAGE: + for (i = 0; i < count; i++) { + size_t len = + protobuf_c_message_get_packed_size(((ProtobufCMessage **)array)[i]); + rv += uint32_size(len) + len; + } + break; + } + + if (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED)) + header_size += uint32_size(rv); + return header_size + rv; +} + +/** + * Calculate the serialized size of an unknown field, i.e. one that is passed + * through mostly uninterpreted. This is required for forward compatibility if + * new fields are added to the message descriptor. + * + * \param field + * Unknown field type. + * \return + * Number of bytes required. + */ +static inline size_t unknown_field_get_packed_size( + const ProtobufCMessageUnknownField *field) { + return get_tag_size(field->tag) + field->len; +} + +/**@}*/ + +/* + * Calculate the serialized size of the message. 
+ */ +size_t protobuf_c_message_get_packed_size(const ProtobufCMessage *message) { + unsigned i; + size_t rv = 0; + + ASSERT_IS_MESSAGE(message); + for (i = 0; i < message->descriptor->n_fields; i++) { + const ProtobufCFieldDescriptor *field = message->descriptor->fields + i; + const void *member = ((const char *)message) + field->offset; + const void *qmember = ((const char *)message) + field->quantifier_offset; + + if (field->label == PROTOBUF_C_LABEL_REQUIRED) { + rv += required_field_get_packed_size(field, member); + } else if ((field->label == PROTOBUF_C_LABEL_OPTIONAL || + field->label == PROTOBUF_C_LABEL_NONE) && + (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_ONEOF))) { + rv += oneof_field_get_packed_size(field, *(const uint32_t *)qmember, + member); + } else if (field->label == PROTOBUF_C_LABEL_OPTIONAL) { + rv += optional_field_get_packed_size( + field, *(protobuf_c_boolean *)qmember, member); + } else if (field->label == PROTOBUF_C_LABEL_NONE) { + rv += unlabeled_field_get_packed_size(field, member); + } else { + rv += repeated_field_get_packed_size(field, *(const size_t *)qmember, + member); + } + } + for (i = 0; i < message->n_unknown_fields; i++) + rv += unknown_field_get_packed_size(&message->unknown_fields[i]); + return rv; +} + +/** + * \defgroup pack protobuf_c_message_pack() implementation + * + * Routines mainly used by protobuf_c_message_pack(). + * + * \ingroup internal + * @{ + */ + +/** + * Pack an unsigned 32-bit integer in base-128 varint encoding and return the + * number of bytes written, which must be 5 or less. + * + * \param value + * Value to encode. + * \param[out] out + * Packed value. + * \return + * Number of bytes written to `out`. + */ +static inline size_t uint32_pack(uint32_t value, uint8_t *out) { + unsigned rv = 0; + + if (value >= 0x80) { + out[rv++] = value | 0x80; + value >>= 7; + if (value >= 0x80) { + out[rv++] = value | 0x80; + value >>= 7; + if (value >= 0x80) { + out[rv++] = value | 0x80; + value >>= 7; + if (value >= 0x80) { + out[rv++] = value | 0x80; + value >>= 7; + } + } + } + } + /* assert: value<128 */ + out[rv++] = value; + return rv; +} + +/** + * Pack a 64-bit unsigned integer using base-128 varint encoding and return the + * number of bytes written. + * + * \param value + * Value to encode. + * \param[out] out + * Packed value. + * \return + * Number of bytes written to `out`. + */ +static size_t uint64_pack(uint64_t value, uint8_t *out) { + uint32_t hi = (uint32_t)(value >> 32); + uint32_t lo = (uint32_t)value; + unsigned rv; + + if (hi == 0) return uint32_pack((uint32_t)lo, out); + out[0] = (lo) | 0x80; + out[1] = (lo >> 7) | 0x80; + out[2] = (lo >> 14) | 0x80; + out[3] = (lo >> 21) | 0x80; + if (hi < 8) { + out[4] = (hi << 4) | (lo >> 28); + return 5; + } else { + out[4] = ((hi & 7) << 4) | (lo >> 28) | 0x80; + hi >>= 3; + } + rv = 5; + while (hi >= 128) { + out[rv++] = hi | 0x80; + hi >>= 7; + } + out[rv++] = hi; + return rv; +} + +/** + * Pack a ProtobufCBinaryData and return the number of bytes written. The output + * includes a length delimiter. + * + * \param bd + * ProtobufCBinaryData to encode. + * \param[out] out + * Packed value. + * \return + * Number of bytes written to `out`. + */ +static inline size_t binary_data_pack(const ProtobufCBinaryData *bd, + uint8_t *out) { + size_t len = bd->len; + size_t rv = uint32_pack(len, out); + memcpy(out + rv, bd->data, len); + return rv + len; +} + +/** + * Pack a field tag. + * + * Wire-type will be added in required_field_pack(). 
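+ *
+ * On the wire, a field key is the varint `(id << 3) | wire_type`; this
+ * helper therefore packs `id << 3` and leaves the low three bits clear for
+ * the caller to fill in.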
+ *
+ * \todo Just call uint64_pack on 64-bit platforms.
+ *
+ * \param id
+ *      Tag value to encode.
+ * \param[out] out
+ *      Packed value.
+ * \return
+ *      Number of bytes written to `out`.
+ */
+static size_t tag_pack(uint32_t id, uint8_t *out) {
+  if (id < (1UL << (32 - 3)))
+    return uint32_pack(id << 3, out);
+  else
+    return uint64_pack(((uint64_t)id) << 3, out);
+}
+
+/**
+ * Given a field type, return the in-memory size.
+ *
+ * \todo Implement as a table lookup.
+ *
+ * \param type
+ *      Field type.
+ * \return
+ *      Size of the field.
+ */
+static inline size_t sizeof_elt_in_repeated_array(ProtobufCType type) {
+  switch (type) {
+    case PROTOBUF_C_TYPE_SINT32:
+    case PROTOBUF_C_TYPE_INT32:
+    case PROTOBUF_C_TYPE_UINT32:
+    case PROTOBUF_C_TYPE_SFIXED32:
+    case PROTOBUF_C_TYPE_FIXED32:
+    case PROTOBUF_C_TYPE_FLOAT:
+    case PROTOBUF_C_TYPE_ENUM:
+      return 4;
+    case PROTOBUF_C_TYPE_SINT64:
+    case PROTOBUF_C_TYPE_INT64:
+    case PROTOBUF_C_TYPE_UINT64:
+    case PROTOBUF_C_TYPE_SFIXED64:
+    case PROTOBUF_C_TYPE_FIXED64:
+    case PROTOBUF_C_TYPE_DOUBLE:
+      return 8;
+    case PROTOBUF_C_TYPE_BOOL:
+      return sizeof(protobuf_c_boolean);
+    case PROTOBUF_C_TYPE_STRING:
+    case PROTOBUF_C_TYPE_MESSAGE:
+      return sizeof(void *);
+    case PROTOBUF_C_TYPE_BYTES:
+      return sizeof(ProtobufCBinaryData);
+  }
+  PROTOBUF_C__ASSERT_NOT_REACHED();
+  return 0;
+}
+
+static inline int int_range_lookup(unsigned n_ranges,
+                                   const ProtobufCIntRange *ranges, int value) {
+  unsigned n;
+  unsigned start;
+
+  if (n_ranges == 0) return -1;
+  start = 0;
+  n = n_ranges;
+  while (n > 1) {
+    unsigned mid = start + n / 2;
+
+    if (value < ranges[mid].start_value) {
+      n = mid - start;
+    } else if (value >=
+               ranges[mid].start_value +
+                   (int)(ranges[mid + 1].orig_index - ranges[mid].orig_index)) {
+      unsigned new_start = mid + 1;
+      n = start + n - new_start;
+      start = new_start;
+    } else
+      return (value - ranges[mid].start_value) + ranges[mid].orig_index;
+  }
+  if (n > 0) {
+    unsigned start_orig_index = ranges[start].orig_index;
+    unsigned range_size = ranges[start + 1].orig_index - start_orig_index;
+
+    if (ranges[start].start_value <= value &&
+        value < (int)(ranges[start].start_value + range_size)) {
+      return (value - ranges[start].start_value) + start_orig_index;
+    }
+  }
+  return -1;
+}
+
+static size_t parse_tag_and_wiretype(size_t len, const uint8_t *data,
+                                     uint32_t *tag_out,
+                                     ProtobufCWireType *wiretype_out) {
+  unsigned max_rv = len > 5 ? 5 : len;
+  uint32_t tag = (data[0] & 0x7f) >> 3;
+  unsigned shift = 4;
+  unsigned rv;
+
+  *wiretype_out = data[0] & 7;
+  if ((data[0] & 0x80) == 0) {
+    *tag_out = tag;
+    return 1;
+  }
+  for (rv = 1; rv < max_rv; rv++) {
+    if (data[rv] & 0x80) {
+      tag |= (data[rv] & 0x7f) << shift;
+      shift += 7;
+    } else {
+      tag |= data[rv] << shift;
+      *tag_out = tag;
+      return rv + 1;
+    }
+  }
+  return 0; /* error: bad header */
+}
+
+/* sizeof(ScannedMember) must be <= (1UL << BOUND_SIZEOF_SCANNED_MEMBER_LOG2) */
+#define BOUND_SIZEOF_SCANNED_MEMBER_LOG2 5
+typedef struct ScannedMember ScannedMember;
+/** Field as it's being scanned. */
+struct ScannedMember {
+  uint32_t tag;              /**< Field tag. */
+  uint8_t wire_type;         /**< Field wire type. */
+  uint8_t length_prefix_len; /**< Prefix length. */
+  const ProtobufCFieldDescriptor *field; /**< Field descriptor. */
+  size_t len;                /**< Field length. */
+  const uint8_t *data;       /**< Field data. */
+};
+
+/*
+ * Parse the varint length prefix of a length-delimited datum and check that
+ * the full datum fits in the remaining input. Returns the total number of
+ * bytes consumed (prefix plus payload), or 0 on error.
+ */
+static size_t scan_length_prefixed_data(size_t len, const uint8_t *data,
+                                        size_t *prefix_len_out) {
+  unsigned hdr_max = len < 5 ? len : 5;
+  unsigned hdr_len;
+  uint32_t val = 0;
+  unsigned i;
+  unsigned shift = 0;
+
+  for (i = 0; i < hdr_max; i++) {
+    val |= ((uint32_t)(data[i] & 0x7f)) << shift;
+    shift += 7;
+    if ((data[i] & 0x80) == 0) break;
+  }
+  if (i == hdr_max) {
+    PROTOBUF_C_UNPACK_ERROR("error parsing length for length-prefixed data");
+    return 0;
+  }
+  hdr_len = i + 1;
+  *prefix_len_out = hdr_len;
+  if (hdr_len + val > len) {
+    PROTOBUF_C_UNPACK_ERROR("data too short after length-prefix of %u", val);
+    return 0;
+  }
+  return hdr_len + val;
+}
+
+static size_t max_b128_numbers(size_t len, const uint8_t *data) {
+  size_t rv = 0;
+  while (len--)
+    if ((*data++ & 0x80) == 0) ++rv;
+  return rv;
+}
+
+/**@}*/
+
+/**
+ * Merge earlier message into a latter message.
+ *
+ * For numeric types and strings, if the same value appears multiple
+ * times, the parser accepts the last value it sees. For embedded
+ * message fields, the parser merges multiple instances of the same
+ * field.
That is, all singular scalar fields in the latter instance + * replace those in the former, singular embedded messages are merged, + * and repeated fields are concatenated. + * + * The earlier message should be freed after calling this function, as + * some of its fields may have been reused and changed to their default + * values during the merge. + */ +static protobuf_c_boolean merge_messages(ProtobufCMessage *earlier_msg, + ProtobufCMessage *latter_msg, + ProtobufCAllocator *allocator) { + unsigned i; + const ProtobufCFieldDescriptor *fields = latter_msg->descriptor->fields; + for (i = 0; i < latter_msg->descriptor->n_fields; i++) { + if (fields[i].label == PROTOBUF_C_LABEL_REPEATED) { + size_t *n_earlier = + STRUCT_MEMBER_PTR(size_t, earlier_msg, fields[i].quantifier_offset); + uint8_t **p_earlier = + STRUCT_MEMBER_PTR(uint8_t *, earlier_msg, fields[i].offset); + size_t *n_latter = + STRUCT_MEMBER_PTR(size_t, latter_msg, fields[i].quantifier_offset); + uint8_t **p_latter = + STRUCT_MEMBER_PTR(uint8_t *, latter_msg, fields[i].offset); + + if (*n_earlier > 0) { + if (*n_latter > 0) { + /* Concatenate the repeated field */ + size_t el_size = sizeof_elt_in_repeated_array(fields[i].type); + uint8_t *new_field; + + new_field = do_alloc(allocator, (*n_earlier + *n_latter) * el_size); + if (!new_field) return FALSE; + + memcpy(new_field, *p_earlier, *n_earlier * el_size); + memcpy(new_field + *n_earlier * el_size, *p_latter, + *n_latter * el_size); + + do_free(allocator, *p_latter); + do_free(allocator, *p_earlier); + *p_latter = new_field; + *n_latter = *n_earlier + *n_latter; + } else { + /* Zero copy the repeated field from the earlier message */ + *n_latter = *n_earlier; + *p_latter = *p_earlier; + } + /* Make sure the field does not get double freed */ + *n_earlier = 0; + *p_earlier = 0; + } + } else if (fields[i].label == PROTOBUF_C_LABEL_OPTIONAL || + fields[i].label == PROTOBUF_C_LABEL_NONE) { + const ProtobufCFieldDescriptor *field; + uint32_t *earlier_case_p = + STRUCT_MEMBER_PTR(uint32_t, earlier_msg, fields[i].quantifier_offset); + uint32_t *latter_case_p = + STRUCT_MEMBER_PTR(uint32_t, latter_msg, fields[i].quantifier_offset); + protobuf_c_boolean need_to_merge = FALSE; + void *earlier_elem; + void *latter_elem; + const void *def_val; + + if (fields[i].flags & PROTOBUF_C_FIELD_FLAG_ONEOF) { + if (*latter_case_p == 0) { + /* lookup correct oneof field */ + int field_index = int_range_lookup( + latter_msg->descriptor->n_field_ranges, + latter_msg->descriptor->field_ranges, *earlier_case_p); + field = latter_msg->descriptor->fields + field_index; + } else { + /* Oneof is present in the latter message, move on */ + continue; + } + } else { + field = &fields[i]; + } + + earlier_elem = STRUCT_MEMBER_P(earlier_msg, field->offset); + latter_elem = STRUCT_MEMBER_P(latter_msg, field->offset); + def_val = field->default_value; + + switch (field->type) { + case PROTOBUF_C_TYPE_MESSAGE: { + ProtobufCMessage *em = *(ProtobufCMessage **)earlier_elem; + ProtobufCMessage *lm = *(ProtobufCMessage **)latter_elem; + if (em != NULL) { + if (lm != NULL) { + if (!merge_messages(em, lm, allocator)) return FALSE; + /* Already merged */ + need_to_merge = FALSE; + } else { + /* Zero copy the message */ + need_to_merge = TRUE; + } + } + break; + } + case PROTOBUF_C_TYPE_BYTES: { + uint8_t *e_data = ((ProtobufCBinaryData *)earlier_elem)->data; + uint8_t *l_data = ((ProtobufCBinaryData *)latter_elem)->data; + const ProtobufCBinaryData *d_bd = (ProtobufCBinaryData *)def_val; + + need_to_merge = + (e_data != 
NULL && (d_bd == NULL || e_data != d_bd->data)) && + (l_data == NULL || (d_bd != NULL && l_data == d_bd->data)); + break; + } + case PROTOBUF_C_TYPE_STRING: { + char *e_str = *(char **)earlier_elem; + char *l_str = *(char **)latter_elem; + const char *d_str = def_val; + + need_to_merge = e_str != d_str && l_str == d_str; + break; + } + default: { + /* Could be has field or case enum, the logic is + * equivalent, since 0 (FALSE) means not set for + * oneof */ + need_to_merge = (*earlier_case_p != 0) && (*latter_case_p == 0); + break; + } + } + + if (need_to_merge) { + size_t el_size = sizeof_elt_in_repeated_array(field->type); + memcpy(latter_elem, earlier_elem, el_size); + /* + * Reset the element from the old message to 0 + * to make sure earlier message deallocation + * doesn't corrupt zero-copied data in the new + * message, earlier message will be freed after + * this function is called anyway + */ + memset(earlier_elem, 0, el_size); + + if (field->quantifier_offset != 0) { + /* Set the has field or the case enum, + * if applicable */ + *latter_case_p = *earlier_case_p; + *earlier_case_p = 0; + } + } + } + } + return TRUE; +} + +/** + * Count packed elements. + * + * Given a raw slab of packed-repeated values, determine the number of + * elements. This function detects certain kinds of errors but not + * others; the remaining error checking is done by + * parse_packed_repeated_member(). + */ +static protobuf_c_boolean count_packed_elements(ProtobufCType type, size_t len, + const uint8_t *data, + size_t *count_out) { + switch (type) { + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + if (len % 4 != 0) { + PROTOBUF_C_UNPACK_ERROR( + "length must be a multiple of 4 for fixed-length 32-bit types"); + return FALSE; + } + *count_out = len / 4; + return TRUE; + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + if (len % 8 != 0) { + PROTOBUF_C_UNPACK_ERROR( + "length must be a multiple of 8 for fixed-length 64-bit types"); + return FALSE; + } + *count_out = len / 8; + return TRUE; + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + case PROTOBUF_C_TYPE_SINT32: + case PROTOBUF_C_TYPE_UINT32: + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_SINT64: + case PROTOBUF_C_TYPE_UINT64: + *count_out = max_b128_numbers(len, data); + return TRUE; + case PROTOBUF_C_TYPE_BOOL: + *count_out = len; + return TRUE; + case PROTOBUF_C_TYPE_STRING: + case PROTOBUF_C_TYPE_BYTES: + case PROTOBUF_C_TYPE_MESSAGE: + default: + PROTOBUF_C_UNPACK_ERROR("bad protobuf-c type %u for packed-repeated", + type); + return FALSE; + } +} + +static inline uint32_t parse_uint32(unsigned len, const uint8_t *data) { + uint32_t rv = data[0] & 0x7f; + if (len > 1) { + rv |= ((uint32_t)(data[1] & 0x7f) << 7); + if (len > 2) { + rv |= ((uint32_t)(data[2] & 0x7f) << 14); + if (len > 3) { + rv |= ((uint32_t)(data[3] & 0x7f) << 21); + if (len > 4) rv |= ((uint32_t)(data[4]) << 28); + } + } + } + return rv; +} + +static inline uint32_t parse_int32(unsigned len, const uint8_t *data) { + return parse_uint32(len, data); +} + +static inline int32_t unzigzag32(uint32_t v) { + if (v & 1) + return -(v >> 1) - 1; + else + return v >> 1; +} + +static inline uint32_t parse_fixed_uint32(const uint8_t *data) { +#if !defined(WORDS_BIGENDIAN) + uint32_t t; + memcpy(&t, data, 4); + return t; +#else + return data[0] | ((uint32_t)(data[1]) << 8) | ((uint32_t)(data[2]) << 16) | + ((uint32_t)(data[3]) << 24); +#endif +} + +static uint64_t 
parse_uint64(unsigned len, const uint8_t *data) { + unsigned shift, i; + uint64_t rv; + + if (len < 5) return parse_uint32(len, data); + rv = ((uint64_t)(data[0] & 0x7f)) | ((uint64_t)(data[1] & 0x7f) << 7) | + ((uint64_t)(data[2] & 0x7f) << 14) | ((uint64_t)(data[3] & 0x7f) << 21); + shift = 28; + for (i = 4; i < len; i++) { + rv |= (((uint64_t)(data[i] & 0x7f)) << shift); + shift += 7; + } + return rv; +} + +static inline int64_t unzigzag64(uint64_t v) { + if (v & 1) + return -(v >> 1) - 1; + else + return v >> 1; +} + +static inline uint64_t parse_fixed_uint64(const uint8_t *data) { +#if !defined(WORDS_BIGENDIAN) + uint64_t t; + memcpy(&t, data, 8); + return t; +#else + return (uint64_t)parse_fixed_uint32(data) | + (((uint64_t)parse_fixed_uint32(data + 4)) << 32); +#endif +} + +static protobuf_c_boolean parse_boolean(unsigned len, const uint8_t *data) { + unsigned i; + for (i = 0; i < len; i++) + if (data[i] & 0x7f) return TRUE; + return FALSE; +} + +static protobuf_c_boolean parse_required_member( + ScannedMember *scanned_member, void *member, ProtobufCAllocator *allocator, + protobuf_c_boolean maybe_clear) { + unsigned len = scanned_member->len; + const uint8_t *data = scanned_member->data; + ProtobufCWireType wire_type = scanned_member->wire_type; + + switch (scanned_member->field->type) { + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(int32_t *)member = parse_int32(len, data); + return TRUE; + case PROTOBUF_C_TYPE_UINT32: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(uint32_t *)member = parse_uint32(len, data); + return TRUE; + case PROTOBUF_C_TYPE_SINT32: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(int32_t *)member = unzigzag32(parse_uint32(len, data)); + return TRUE; + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + if (wire_type != PROTOBUF_C_WIRE_TYPE_32BIT) return FALSE; + *(uint32_t *)member = parse_fixed_uint32(data); + return TRUE; + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(uint64_t *)member = parse_uint64(len, data); + return TRUE; + case PROTOBUF_C_TYPE_SINT64: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(int64_t *)member = unzigzag64(parse_uint64(len, data)); + return TRUE; + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + if (wire_type != PROTOBUF_C_WIRE_TYPE_64BIT) return FALSE; + *(uint64_t *)member = parse_fixed_uint64(data); + return TRUE; + case PROTOBUF_C_TYPE_BOOL: + *(protobuf_c_boolean *)member = parse_boolean(len, data); + return TRUE; + case PROTOBUF_C_TYPE_STRING: { + char **pstr = member; + unsigned pref_len = scanned_member->length_prefix_len; + + if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE; + + if (maybe_clear && *pstr != NULL) { + const char *def = scanned_member->field->default_value; + if (*pstr != NULL && *pstr != def) do_free(allocator, *pstr); + } + *pstr = do_alloc(allocator, len - pref_len + 1); + if (*pstr == NULL) return FALSE; + memcpy(*pstr, data + pref_len, len - pref_len); + (*pstr)[len - pref_len] = 0; + return TRUE; + } + case PROTOBUF_C_TYPE_BYTES: { + ProtobufCBinaryData *bd = member; + const ProtobufCBinaryData *def_bd; + unsigned pref_len = scanned_member->length_prefix_len; + + if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE; + + def_bd = scanned_member->field->default_value; + 
if (maybe_clear && bd->data != NULL && + (def_bd == NULL || bd->data != def_bd->data)) { + do_free(allocator, bd->data); + } + if (len - pref_len > 0) { + bd->data = do_alloc(allocator, len - pref_len); + if (bd->data == NULL) return FALSE; + memcpy(bd->data, data + pref_len, len - pref_len); + } else { + bd->data = NULL; + } + bd->len = len - pref_len; + return TRUE; + } + case PROTOBUF_C_TYPE_MESSAGE: { + ProtobufCMessage **pmessage = member; + ProtobufCMessage *subm; + const ProtobufCMessage *def_mess; + protobuf_c_boolean merge_successful = TRUE; + unsigned pref_len = scanned_member->length_prefix_len; + + if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE; + + def_mess = scanned_member->field->default_value; + subm = + protobuf_c_message_unpack(scanned_member->field->descriptor, + allocator, len - pref_len, data + pref_len); + + if (maybe_clear && *pmessage != NULL && *pmessage != def_mess) { + if (subm != NULL) + merge_successful = merge_messages(*pmessage, subm, allocator); + /* Delete the previous message */ + protobuf_c_message_free_unpacked(*pmessage, allocator); + } + *pmessage = subm; + if (subm == NULL || !merge_successful) return FALSE; + return TRUE; + } + } + return FALSE; +} + +static protobuf_c_boolean parse_oneof_member(ScannedMember *scanned_member, + void *member, + ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + uint32_t *oneof_case = STRUCT_MEMBER_PTR( + uint32_t, message, scanned_member->field->quantifier_offset); + + /* If we have already parsed a member of this oneof, free it. */ + if (*oneof_case != 0) { + /* lookup field */ + int field_index = + int_range_lookup(message->descriptor->n_field_ranges, + message->descriptor->field_ranges, *oneof_case); + const ProtobufCFieldDescriptor *old_field = + message->descriptor->fields + field_index; + size_t el_size = sizeof_elt_in_repeated_array(old_field->type); + + switch (old_field->type) { + case PROTOBUF_C_TYPE_STRING: { + char **pstr = member; + const char *def = old_field->default_value; + if (*pstr != NULL && *pstr != def) do_free(allocator, *pstr); + break; + } + case PROTOBUF_C_TYPE_BYTES: { + ProtobufCBinaryData *bd = member; + const ProtobufCBinaryData *def_bd = old_field->default_value; + if (bd->data != NULL && (def_bd == NULL || bd->data != def_bd->data)) { + do_free(allocator, bd->data); + } + break; + } + case PROTOBUF_C_TYPE_MESSAGE: { + ProtobufCMessage **pmessage = member; + const ProtobufCMessage *def_mess = old_field->default_value; + if (*pmessage != NULL && *pmessage != def_mess) + protobuf_c_message_free_unpacked(*pmessage, allocator); + break; + } + default: + break; + } + + memset(member, 0, el_size); + } + if (!parse_required_member(scanned_member, member, allocator, TRUE)) + return FALSE; + + *oneof_case = scanned_member->tag; + return TRUE; +} + +static protobuf_c_boolean parse_optional_member(ScannedMember *scanned_member, + void *member, + ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + if (!parse_required_member(scanned_member, member, allocator, TRUE)) + return FALSE; + if (scanned_member->field->quantifier_offset != 0) + STRUCT_MEMBER(protobuf_c_boolean, message, + scanned_member->field->quantifier_offset) = TRUE; + return TRUE; +} + +static protobuf_c_boolean parse_repeated_member(ScannedMember *scanned_member, + void *member, + ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + const ProtobufCFieldDescriptor *field = scanned_member->field; + size_t *p_n = STRUCT_MEMBER_PTR(size_t, message, field->quantifier_offset); + size_t 
siz = sizeof_elt_in_repeated_array(field->type); + char *array = *(char **)member; + + if (!parse_required_member(scanned_member, array + siz * (*p_n), allocator, + FALSE)) { + return FALSE; + } + *p_n += 1; + return TRUE; +} + +static unsigned scan_varint(unsigned len, const uint8_t *data) { + unsigned i; + if (len > 10) len = 10; + for (i = 0; i < len; i++) + if ((data[i] & 0x80) == 0) break; + if (i == len) return 0; + return i + 1; +} + +static protobuf_c_boolean parse_packed_repeated_member( + ScannedMember *scanned_member, void *member, ProtobufCMessage *message) { + const ProtobufCFieldDescriptor *field = scanned_member->field; + size_t *p_n = STRUCT_MEMBER_PTR(size_t, message, field->quantifier_offset); + size_t siz = sizeof_elt_in_repeated_array(field->type); + void *array = *(char **)member + siz * (*p_n); + const uint8_t *at = scanned_member->data + scanned_member->length_prefix_len; + size_t rem = scanned_member->len - scanned_member->length_prefix_len; + size_t count = 0; + unsigned i; + + switch (field->type) { + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + count = (scanned_member->len - scanned_member->length_prefix_len) / 4; +#if !defined(WORDS_BIGENDIAN) + goto no_unpacking_needed; +#else + for (i = 0; i < count; i++) { + ((uint32_t *)array)[i] = parse_fixed_uint32(at); + at += 4; + } + break; +#endif + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + count = (scanned_member->len - scanned_member->length_prefix_len) / 8; +#if !defined(WORDS_BIGENDIAN) + goto no_unpacking_needed; +#else + for (i = 0; i < count; i++) { + ((uint64_t *)array)[i] = parse_fixed_uint64(at); + at += 8; + } + break; +#endif + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated int32 value"); + return FALSE; + } + ((int32_t *)array)[count++] = parse_int32(s, at); + at += s; + rem -= s; + } + break; + case PROTOBUF_C_TYPE_SINT32: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated sint32 value"); + return FALSE; + } + ((int32_t *)array)[count++] = unzigzag32(parse_uint32(s, at)); + at += s; + rem -= s; + } + break; + case PROTOBUF_C_TYPE_UINT32: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated enum or uint32 value"); + return FALSE; + } + ((uint32_t *)array)[count++] = parse_uint32(s, at); + at += s; + rem -= s; + } + break; + + case PROTOBUF_C_TYPE_SINT64: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated sint64 value"); + return FALSE; + } + ((int64_t *)array)[count++] = unzigzag64(parse_uint64(s, at)); + at += s; + rem -= s; + } + break; + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated int64/uint64 value"); + return FALSE; + } + ((int64_t *)array)[count++] = parse_uint64(s, at); + at += s; + rem -= s; + } + break; + case PROTOBUF_C_TYPE_BOOL: + count = rem; + for (i = 0; i < count; i++) { + if (at[i] > 1) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated boolean value"); + return FALSE; + } + ((protobuf_c_boolean *)array)[i] = at[i]; + } + break; + default: + PROTOBUF_C__ASSERT_NOT_REACHED(); + } + *p_n += count; + return TRUE; + +#if 
!defined(WORDS_BIGENDIAN) +no_unpacking_needed: + memcpy(array, at, count * siz); + *p_n += count; + return TRUE; +#endif +} + +static protobuf_c_boolean is_packable_type(ProtobufCType type) { + return type != PROTOBUF_C_TYPE_STRING && type != PROTOBUF_C_TYPE_BYTES && + type != PROTOBUF_C_TYPE_MESSAGE; +} + +static protobuf_c_boolean parse_member(ScannedMember *scanned_member, + ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + const ProtobufCFieldDescriptor *field = scanned_member->field; + void *member; + + if (field == NULL) { + ProtobufCMessageUnknownField *ufield = + message->unknown_fields + (message->n_unknown_fields++); + ufield->tag = scanned_member->tag; + ufield->wire_type = scanned_member->wire_type; + ufield->len = scanned_member->len; + ufield->data = do_alloc(allocator, scanned_member->len); + if (ufield->data == NULL) return FALSE; + memcpy(ufield->data, scanned_member->data, ufield->len); + return TRUE; + } + member = (char *)message + field->offset; + switch (field->label) { + case PROTOBUF_C_LABEL_REQUIRED: + return parse_required_member(scanned_member, member, allocator, TRUE); + case PROTOBUF_C_LABEL_OPTIONAL: + case PROTOBUF_C_LABEL_NONE: + if (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_ONEOF)) { + return parse_oneof_member(scanned_member, member, message, allocator); + } else { + return parse_optional_member(scanned_member, member, message, + allocator); + } + case PROTOBUF_C_LABEL_REPEATED: + if (scanned_member->wire_type == PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED && + (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED) || + is_packable_type(field->type))) { + return parse_packed_repeated_member(scanned_member, member, message); + } else { + return parse_repeated_member(scanned_member, member, message, + allocator); + } + } + PROTOBUF_C__ASSERT_NOT_REACHED(); + return 0; +} + +/** + * Initialise messages generated by old code. + * + * This function is used if desc->message_init == NULL (which occurs + * for old code, and which would be useful to support allocating + * descriptors dynamically). + */ +static void message_init_generic(const ProtobufCMessageDescriptor *desc, + ProtobufCMessage *message) { + unsigned i; + + memset(message, 0, desc->sizeof_message); + message->descriptor = desc; + for (i = 0; i < desc->n_fields; i++) { + if (desc->fields[i].default_value != NULL && + desc->fields[i].label != PROTOBUF_C_LABEL_REPEATED) { + void *field = STRUCT_MEMBER_P(message, desc->fields[i].offset); + const void *dv = desc->fields[i].default_value; + + switch (desc->fields[i].type) { + case PROTOBUF_C_TYPE_INT32: + case PROTOBUF_C_TYPE_SINT32: + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_UINT32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + case PROTOBUF_C_TYPE_ENUM: + memcpy(field, dv, 4); + break; + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_SINT64: + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_UINT64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + memcpy(field, dv, 8); + break; + case PROTOBUF_C_TYPE_BOOL: + memcpy(field, dv, sizeof(protobuf_c_boolean)); + break; + case PROTOBUF_C_TYPE_BYTES: + memcpy(field, dv, sizeof(ProtobufCBinaryData)); + break; + + case PROTOBUF_C_TYPE_STRING: + case PROTOBUF_C_TYPE_MESSAGE: + /* + * The next line essentially implements a cast + * from const, which is totally unavoidable. + */ + *(const void **)field = dv; + break; + } + } + } +} + +/**@}*/ + +/* + * ScannedMember slabs (an unpacking implementation detail). 
Before doing real + * unpacking, we first scan through the elements to see how many there are (for + * repeated fields), and which field to use (for non-repeated fields given + * twice). + * + * In order to avoid allocations for small messages, we keep a stack-allocated + * slab of ScannedMembers of size FIRST_SCANNED_MEMBER_SLAB_SIZE (16). After we + * fill that up, we allocate each slab twice as large as the previous one. + */ +#define FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2 4 + +/* + * The number of slabs, including the stack-allocated ones; choose the number so + * that we would overflow if we needed a slab larger than provided. + */ +#define MAX_SCANNED_MEMBER_SLAB \ + (sizeof(unsigned int) * 8 - 1 - BOUND_SIZEOF_SCANNED_MEMBER_LOG2 - \ + FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2) + +#define REQUIRED_FIELD_BITMAP_SET(index) \ + (required_fields_bitmap[(index) / 8] |= (1UL << ((index) % 8))) + +#define REQUIRED_FIELD_BITMAP_IS_SET(index) \ + (required_fields_bitmap[(index) / 8] & (1UL << ((index) % 8))) + +ProtobufCMessage *protobuf_c_message_unpack( + const ProtobufCMessageDescriptor *desc, ProtobufCAllocator *allocator, + size_t len, const uint8_t *data) { + ProtobufCMessage *rv; + size_t rem = len; + const uint8_t *at = data; + const ProtobufCFieldDescriptor *last_field = desc->fields + 0; + ScannedMember first_member_slab[1UL << FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2]; + + /* + * scanned_member_slabs[i] is an array of arrays of ScannedMember. + * The first slab (scanned_member_slabs[0] is just a pointer to + * first_member_slab), above. All subsequent slabs will be allocated + * using the allocator. + */ + ScannedMember *scanned_member_slabs[MAX_SCANNED_MEMBER_SLAB + 1]; + unsigned which_slab = 0; /* the slab we are currently populating */ + unsigned in_slab_index = 0; /* number of members in the slab */ + size_t n_unknown = 0; + unsigned f; + unsigned j; + unsigned i_slab; + unsigned last_field_index = 0; + unsigned required_fields_bitmap_len; + unsigned char required_fields_bitmap_stack[16]; + unsigned char *required_fields_bitmap = required_fields_bitmap_stack; + protobuf_c_boolean required_fields_bitmap_alloced = FALSE; + + ASSERT_IS_MESSAGE_DESCRIPTOR(desc); + + if (allocator == NULL) allocator = &protobuf_c__allocator; + + rv = do_alloc(allocator, desc->sizeof_message); + if (!rv) return (NULL); + scanned_member_slabs[0] = first_member_slab; + + required_fields_bitmap_len = (desc->n_fields + 7) / 8; + if (required_fields_bitmap_len > sizeof(required_fields_bitmap_stack)) { + required_fields_bitmap = do_alloc(allocator, required_fields_bitmap_len); + if (!required_fields_bitmap) { + do_free(allocator, rv); + return (NULL); + } + required_fields_bitmap_alloced = TRUE; + } + memset(required_fields_bitmap, 0, required_fields_bitmap_len); + + /* + * Generated code always defines "message_init". However, we provide a + * fallback for (1) users of old protobuf-c generated-code that do not + * provide the function, and (2) descriptors constructed from some other + * source (most likely, direct construction from the .proto file). 
+ */ + if (desc->message_init != NULL) + protobuf_c_message_init(desc, rv); + else + message_init_generic(desc, rv); + + while (rem > 0) { + uint32_t tag; + ProtobufCWireType wire_type; + size_t used = parse_tag_and_wiretype(rem, at, &tag, &wire_type); + const ProtobufCFieldDescriptor *field; + ScannedMember tmp; + + if (used == 0) { + PROTOBUF_C_UNPACK_ERROR("error parsing tag/wiretype at offset %u", + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + /* + * \todo Consider optimizing for field[1].id == tag, if field[1] + * exists! + */ + if (last_field == NULL || last_field->id != tag) { + /* lookup field */ + int field_index = + int_range_lookup(desc->n_field_ranges, desc->field_ranges, tag); + if (field_index < 0) { + field = NULL; + n_unknown++; + } else { + field = desc->fields + field_index; + last_field = field; + last_field_index = field_index; + } + } else { + field = last_field; + } + + if (field != NULL && field->label == PROTOBUF_C_LABEL_REQUIRED) + REQUIRED_FIELD_BITMAP_SET(last_field_index); + + at += used; + rem -= used; + tmp.tag = tag; + tmp.wire_type = wire_type; + tmp.field = field; + tmp.data = at; + tmp.length_prefix_len = 0; + + switch (wire_type) { + case PROTOBUF_C_WIRE_TYPE_VARINT: { + unsigned max_len = rem < 10 ? rem : 10; + unsigned i; + + for (i = 0; i < max_len; i++) + if ((at[i] & 0x80) == 0) break; + if (i == max_len) { + PROTOBUF_C_UNPACK_ERROR("unterminated varint at offset %u", + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + tmp.len = i + 1; + break; + } + case PROTOBUF_C_WIRE_TYPE_64BIT: + if (rem < 8) { + PROTOBUF_C_UNPACK_ERROR("too short after 64bit wiretype at offset %u", + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + tmp.len = 8; + break; + case PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED: { + size_t pref_len; + + tmp.len = scan_length_prefixed_data(rem, at, &pref_len); + if (tmp.len == 0) { + /* NOTE: scan_length_prefixed_data calls UNPACK_ERROR */ + goto error_cleanup_during_scan; + } + tmp.length_prefix_len = pref_len; + break; + } + case PROTOBUF_C_WIRE_TYPE_32BIT: + if (rem < 4) { + PROTOBUF_C_UNPACK_ERROR("too short after 32bit wiretype at offset %u", + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + tmp.len = 4; + break; + default: + PROTOBUF_C_UNPACK_ERROR("unsupported tag %u at offset %u", wire_type, + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + + if (in_slab_index == + (1UL << (which_slab + FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2))) { + size_t size; + + in_slab_index = 0; + if (which_slab == MAX_SCANNED_MEMBER_SLAB) { + PROTOBUF_C_UNPACK_ERROR("too many fields"); + goto error_cleanup_during_scan; + } + which_slab++; + size = sizeof(ScannedMember) + << (which_slab + FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2); + scanned_member_slabs[which_slab] = do_alloc(allocator, size); + if (scanned_member_slabs[which_slab] == NULL) + goto error_cleanup_during_scan; + } + scanned_member_slabs[which_slab][in_slab_index++] = tmp; + + if (field != NULL && field->label == PROTOBUF_C_LABEL_REPEATED) { + size_t *n = STRUCT_MEMBER_PTR(size_t, rv, field->quantifier_offset); + if (wire_type == PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED && + (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED) || + is_packable_type(field->type))) { + size_t count; + if (!count_packed_elements(field->type, tmp.len - tmp.length_prefix_len, + tmp.data + tmp.length_prefix_len, &count)) { + PROTOBUF_C_UNPACK_ERROR("counting packed elements"); + goto error_cleanup_during_scan; + } + *n += count; + } else { + *n += 1; + } + } 
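+    /* Advance past this field's payload before scanning the next field. */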
+ + at += tmp.len; + rem -= tmp.len; + } + + /* allocate space for repeated fields, also check that all required fields + * have been set */ + for (f = 0; f < desc->n_fields; f++) { + const ProtobufCFieldDescriptor *field = desc->fields + f; + if (field->label == PROTOBUF_C_LABEL_REPEATED) { + size_t siz = sizeof_elt_in_repeated_array(field->type); + size_t *n_ptr = STRUCT_MEMBER_PTR(size_t, rv, field->quantifier_offset); + if (*n_ptr != 0) { + unsigned n = *n_ptr; + void *a; + *n_ptr = 0; + assert(rv->descriptor != NULL); +#define CLEAR_REMAINING_N_PTRS() \ + for (f++; f < desc->n_fields; f++) { \ + field = desc->fields + f; \ + if (field->label == PROTOBUF_C_LABEL_REPEATED) \ + STRUCT_MEMBER(size_t, rv, field->quantifier_offset) = 0; \ + } + a = do_alloc(allocator, siz * n); + if (!a) { + CLEAR_REMAINING_N_PTRS(); + goto error_cleanup; + } + STRUCT_MEMBER(void *, rv, field->offset) = a; + } + } else if (field->label == PROTOBUF_C_LABEL_REQUIRED) { + if (field->default_value == NULL && !REQUIRED_FIELD_BITMAP_IS_SET(f)) { + CLEAR_REMAINING_N_PTRS(); + PROTOBUF_C_UNPACK_ERROR("message '%s': missing required field '%s'", + desc->name, field->name); + goto error_cleanup; + } + } + } +#undef CLEAR_REMAINING_N_PTRS + + /* allocate space for unknown fields */ + if (n_unknown) { + rv->unknown_fields = + do_alloc(allocator, n_unknown * sizeof(ProtobufCMessageUnknownField)); + if (rv->unknown_fields == NULL) goto error_cleanup; + } + + /* do real parsing */ + for (i_slab = 0; i_slab <= which_slab; i_slab++) { + unsigned max = + (i_slab == which_slab) ? in_slab_index : (1UL << (i_slab + 4)); + ScannedMember *slab = scanned_member_slabs[i_slab]; + + for (j = 0; j < max; j++) { + if (!parse_member(slab + j, rv, allocator)) { + PROTOBUF_C_UNPACK_ERROR( + "error parsing member %s of %s", + slab->field ? 
slab->field->name : "*unknown-field*", desc->name); + goto error_cleanup; + } + } + } + + /* cleanup */ + for (j = 1; j <= which_slab; j++) do_free(allocator, scanned_member_slabs[j]); + if (required_fields_bitmap_alloced) + do_free(allocator, required_fields_bitmap); + return rv; + +error_cleanup: + protobuf_c_message_free_unpacked(rv, allocator); + for (j = 1; j <= which_slab; j++) do_free(allocator, scanned_member_slabs[j]); + if (required_fields_bitmap_alloced) + do_free(allocator, required_fields_bitmap); + return NULL; + +error_cleanup_during_scan: + do_free(allocator, rv); + for (j = 1; j <= which_slab; j++) do_free(allocator, scanned_member_slabs[j]); + if (required_fields_bitmap_alloced) + do_free(allocator, required_fields_bitmap); + return NULL; +} + +void protobuf_c_message_free_unpacked(ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + const ProtobufCMessageDescriptor *desc; + unsigned f; + + if (message == NULL) return; + + desc = message->descriptor; + + ASSERT_IS_MESSAGE(message); + + if (allocator == NULL) allocator = &protobuf_c__allocator; + message->descriptor = NULL; + for (f = 0; f < desc->n_fields; f++) { + if (0 != (desc->fields[f].flags & PROTOBUF_C_FIELD_FLAG_ONEOF) && + desc->fields[f].id != + STRUCT_MEMBER(uint32_t, message, + desc->fields[f].quantifier_offset)) { + /* This is not the selected oneof, skip it */ + continue; + } + + if (desc->fields[f].label == PROTOBUF_C_LABEL_REPEATED) { + size_t n = + STRUCT_MEMBER(size_t, message, desc->fields[f].quantifier_offset); + void *arr = STRUCT_MEMBER(void *, message, desc->fields[f].offset); + + if (arr != NULL) { + if (desc->fields[f].type == PROTOBUF_C_TYPE_STRING) { + unsigned i; + for (i = 0; i < n; i++) do_free(allocator, ((char **)arr)[i]); + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_BYTES) { + unsigned i; + for (i = 0; i < n; i++) + do_free(allocator, ((ProtobufCBinaryData *)arr)[i].data); + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_MESSAGE) { + unsigned i; + for (i = 0; i < n; i++) + protobuf_c_message_free_unpacked(((ProtobufCMessage **)arr)[i], + allocator); + } + do_free(allocator, arr); + } + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_STRING) { + char *str = STRUCT_MEMBER(char *, message, desc->fields[f].offset); + + if (str && str != desc->fields[f].default_value) do_free(allocator, str); + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_BYTES) { + void *data = + STRUCT_MEMBER(ProtobufCBinaryData, message, desc->fields[f].offset) + .data; + const ProtobufCBinaryData *default_bd; + + default_bd = desc->fields[f].default_value; + if (data != NULL && (default_bd == NULL || default_bd->data != data)) { + do_free(allocator, data); + } + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_MESSAGE) { + ProtobufCMessage *sm; + + sm = STRUCT_MEMBER(ProtobufCMessage *, message, desc->fields[f].offset); + if (sm && sm != desc->fields[f].default_value) + protobuf_c_message_free_unpacked(sm, allocator); + } + } + + for (f = 0; f < message->n_unknown_fields; f++) + do_free(allocator, message->unknown_fields[f].data); + if (message->unknown_fields != NULL) + do_free(allocator, message->unknown_fields); + + do_free(allocator, message); +} + +void protobuf_c_message_init(const ProtobufCMessageDescriptor *descriptor, + void *message) { + descriptor->message_init((ProtobufCMessage *)(message)); +} + +protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *message) { + unsigned i; + + if (!message || !message->descriptor || + message->descriptor->magic != 
PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC) { + return FALSE; + } + + for (i = 0; i < message->descriptor->n_fields; i++) { + const ProtobufCFieldDescriptor *f = message->descriptor->fields + i; + ProtobufCType type = f->type; + ProtobufCLabel label = f->label; + void *field = STRUCT_MEMBER_P(message, f->offset); + + if (label == PROTOBUF_C_LABEL_REPEATED) { + size_t *quantity = STRUCT_MEMBER_P(message, f->quantifier_offset); + + if (*quantity > 0 && *(void **)field == NULL) { + return FALSE; + } + + if (type == PROTOBUF_C_TYPE_MESSAGE) { + ProtobufCMessage **submessage = *(ProtobufCMessage ***)field; + unsigned j; + for (j = 0; j < *quantity; j++) { + if (!protobuf_c_message_check(submessage[j])) return FALSE; + } + } else if (type == PROTOBUF_C_TYPE_STRING) { + char **string = *(char ***)field; + unsigned j; + for (j = 0; j < *quantity; j++) { + if (!string[j]) return FALSE; + } + } else if (type == PROTOBUF_C_TYPE_BYTES) { + ProtobufCBinaryData *bd = *(ProtobufCBinaryData **)field; + unsigned j; + for (j = 0; j < *quantity; j++) { + if (bd[j].len > 0 && bd[j].data == NULL) return FALSE; + } + } + + } else { /* PROTOBUF_C_LABEL_REQUIRED or PROTOBUF_C_LABEL_OPTIONAL */ + + if (type == PROTOBUF_C_TYPE_MESSAGE) { + ProtobufCMessage *submessage = *(ProtobufCMessage **)field; + if (label == PROTOBUF_C_LABEL_REQUIRED || submessage != NULL) { + if (!protobuf_c_message_check(submessage)) return FALSE; + } + } else if (type == PROTOBUF_C_TYPE_STRING) { + char *string = *(char **)field; + if (label == PROTOBUF_C_LABEL_REQUIRED && string == NULL) return FALSE; + } else if (type == PROTOBUF_C_TYPE_BYTES) { + protobuf_c_boolean *has = + STRUCT_MEMBER_P(message, f->quantifier_offset); + ProtobufCBinaryData *bd = field; + if (label == PROTOBUF_C_LABEL_REQUIRED || *has == TRUE) { + if (bd->len > 0 && bd->data == NULL) return FALSE; + } + } + } + } + + return TRUE; +} + +/* === services === */ + +typedef void (*GenericHandler)(void *service, const ProtobufCMessage *input, + ProtobufCClosure closure, void *closure_data); diff --git a/tools/quantification/src/protobuf-c.h b/tools/quantification/src/protobuf-c.h new file mode 100644 index 0000000000000000000000000000000000000000..bd85695b868af6c7b91590196339bc4f7826a256 --- /dev/null +++ b/tools/quantification/src/protobuf-c.h @@ -0,0 +1,921 @@ +/* + * Copyright (c) 2008-2017, Dave Benson and the protobuf-c authors. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file + * \mainpage Introduction + * + * This is [protobuf-c], a C implementation of [Protocol Buffers]. + * + * This file defines the public API for the `libprotobuf-c` support library. + * This API includes interfaces that can be used directly by client code as well + * as the interfaces used by the code generated by the `protoc-c` compiler. + * + * The `libprotobuf-c` support library performs the actual serialization and + * deserialization of Protocol Buffers messages. It interacts with structures, + * definitions, and metadata generated by the `protoc-c` compiler from .proto + * files. + * + * \authors Dave Benson and the `protobuf-c` authors. + * + * \copyright 2008-2014. Licensed under the terms of the [BSD-2-Clause] license. + * + * [protobuf-c]: https://github.com/protobuf-c/protobuf-c + * [Protocol Buffers]: https://developers.google.com/protocol-buffers/ + * [BSD-2-Clause]: http://opensource.org/licenses/BSD-2-Clause + * + * \page gencode Generated Code + * + * For each enum, we generate a C enum. For each message, we generate a C + * structure which can be cast to a `ProtobufCMessage`. + * + * For each enum and message, we generate a descriptor object that allows us to + * implement a kind of reflection on the structures. + * + * First, some naming conventions: + * + * - The name of the type for enums and messages and services is camel case + * (meaning WordsAreCrammedTogether) except that double underscores are used + * to delimit scopes. For example, the following `.proto` file: + * +~~~{.proto} + package foo.bar; + message BazBah { + optional int32 val = 1; + } +~~~ + * + * would generate a C type `Foo__Bar__BazBah`. + * + * - Identifiers for functions and globals are all lowercase, with camel case + * words separated by single underscores. For example, one of the function + * prototypes generated by `protoc-c` for the above example: + * +~~~{.c} +Foo__Bar__BazBah * + foo__bar__baz_bah__unpack + (ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +~~~ + * + * - Identifiers for enum values contain an uppercase prefix which embeds the + * package name and the enum type name. + * + * - A double underscore is used to separate further components of identifier + * names. + * + * For example, in the name of the unpack function above, the package name + * `foo.bar` has become `foo__bar`, the message name BazBah has become + * `baz_bah`, and the method name is `unpack`. These are all joined with double + * underscores to form the C identifier `foo__bar__baz_bah__unpack`. + * + * We also generate descriptor objects for messages and enums. These are + * declared in the `.pb-c.h` files: + * +~~~{.c} +extern const ProtobufCMessageDescriptor foo__bar__baz_bah__descriptor; +~~~ + * + * The message structures all begin with `ProtobufCMessageDescriptor *` which is + * sufficient to allow them to be cast to `ProtobufCMessage`. + * + * For each message defined in a `.proto` file, we generate a number of + * functions and macros. 
Each function name contains a prefix based on the + * package name and message name in order to make it a unique C identifier. + * + * - `INIT`. Statically initializes a message object, initializing its + * descriptor and setting its fields to default values. Uninitialized + * messages cannot be processed by the protobuf-c library. + * +~~~{.c} +#define FOO__BAR__BAZ_BAH__INIT \ + { PROTOBUF_C_MESSAGE_INIT (&foo__bar__baz_bah__descriptor), 0 } +~~~ + * - `init()`. Initializes a message object, initializing its descriptor and + * setting its fields to default values. Uninitialized messages cannot be + * processed by the protobuf-c library. + * +~~~{.c} +void foo__bar__baz_bah__init + (Foo__Bar__BazBah *message); +~~~ + * - `unpack()`. Unpacks data for a particular message format. Note that the + * `allocator` parameter is usually `NULL` to indicate that the system's + * `malloc()` and `free()` functions should be used for dynamically allocating + * memory. + * +~~~{.c} +Foo__Bar__BazBah * + foo__bar__baz_bah__unpack + (ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +~~~ + * + * - `free_unpacked()`. Frees a message object obtained with the `unpack()` + * method. Freeing `NULL` is allowed (the same as with `free()`). + * +~~~{.c} +void foo__bar__baz_bah__free_unpacked + (Foo__Bar__BazBah *message, + ProtobufCAllocator *allocator); +~~~ + * + * - `get_packed_size()`. Calculates the length in bytes of the serialized + * representation of the message object. + * +~~~{.c} +size_t foo__bar__baz_bah__get_packed_size + (const Foo__Bar__BazBah *message); +~~~ + * + * - `pack()`. Pack a message object into a preallocated buffer. Assumes that + * the buffer is large enough. (Use `get_packed_size()` first.) + * +~~~{.c} +size_t foo__bar__baz_bah__pack + (const Foo__Bar__BazBah *message, + uint8_t *out); +~~~ + * + * - `pack_to_buffer()`. Packs a message into a "virtual buffer". This is an + * object which defines an "append bytes" callback to consume data as it is + * serialized. + * +~~~{.c} +size_t foo__bar__baz_bah__pack_to_buffer + (const Foo__Bar__BazBah *message, + ProtobufCBuffer *buffer); +~~~ + * + * \page pack Packing and unpacking messages + * + * To pack a message, first compute the packed size of the message with + * protobuf_c_message_get_packed_size(), then allocate a buffer of at least + * that size, then call protobuf_c_message_pack(). + * + * Alternatively, a message can be serialized without calculating the final size + * first. Use the protobuf_c_message_pack_to_buffer() function and provide a + * ProtobufCBuffer object which implements an "append" method that consumes + * data. + * + * To unpack a message, call the protobuf_c_message_unpack() function. The + * result can be cast to an object of the type that matches the descriptor for + * the message. + * + * The result of unpacking a message should be freed with + * protobuf_c_message_free_unpacked(). 
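+ *
+ * As a sketch, a complete round trip with the `Foo__Bar__BazBah` example
+ * type from the Generated Code page might look like this (error handling
+ * omitted for brevity):
+ *
+~~~{.c}
+Foo__Bar__BazBah msg = FOO__BAR__BAZ_BAH__INIT;
+msg.has_val = 1;   /* proto2 optional scalars carry a has_ flag */
+msg.val = 42;
+
+size_t len = foo__bar__baz_bah__get_packed_size(&msg);
+uint8_t *buf = malloc(len);
+foo__bar__baz_bah__pack(&msg, buf);
+
+Foo__Bar__BazBah *copy = foo__bar__baz_bah__unpack(NULL, len, buf);
+/* ... read copy->val ... */
+foo__bar__baz_bah__free_unpacked(copy, NULL);
+free(buf);
+~~~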
+ */
+
+#ifndef PROTOBUF_C_H
+#define PROTOBUF_C_H
+
+#include <assert.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+#define PROTOBUF_C__BEGIN_DECLS extern "C" {
+#define PROTOBUF_C__END_DECLS }
+#else
+#define PROTOBUF_C__BEGIN_DECLS
+#define PROTOBUF_C__END_DECLS
+#endif
+
+PROTOBUF_C__BEGIN_DECLS
+
+#if defined(_WIN32) && defined(PROTOBUF_C_USE_SHARED_LIB)
+#ifdef PROTOBUF_C_EXPORT
+#define PROTOBUF_C__API __declspec(dllexport)
+#else
+#define PROTOBUF_C__API __declspec(dllimport)
+#endif
+#else
+#define PROTOBUF_C__API
+#endif
+
+#if !defined(PROTOBUF_C__NO_DEPRECATED) && \
+    ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+#define PROTOBUF_C__DEPRECATED __attribute__((__deprecated__))
+#else
+#define PROTOBUF_C__DEPRECATED
+#endif
+
+#ifndef PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE
+#define PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(enum_name) \
+  , _##enum_name##_IS_INT_SIZE = INT_MAX
+#endif
+
+#define PROTOBUF_C__SERVICE_DESCRIPTOR_MAGIC 0x14159bc3
+#define PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC 0x28aaeef9
+#define PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC 0x114315af
+
+/* Empty string used for initializers */
+extern const char protobuf_c_empty_string[];
+
+/**
+ * \defgroup api Public API
+ *
+ * This is the public API for `libprotobuf-c`. These interfaces are stable and
+ * subject to Semantic Versioning guarantees.
+ *
+ * @{
+ */
+
+/**
+ * Values for the `flags` word in `ProtobufCFieldDescriptor`.
+ */
+typedef enum {
+  /** Set if the field is repeated and marked with the `packed` option. */
+  PROTOBUF_C_FIELD_FLAG_PACKED = (1 << 0),
+
+  /** Set if the field is marked with the `deprecated` option. */
+  PROTOBUF_C_FIELD_FLAG_DEPRECATED = (1 << 1),
+
+  /** Set if the field is a member of a oneof (union). */
+  PROTOBUF_C_FIELD_FLAG_ONEOF = (1 << 2),
+} ProtobufCFieldFlag;
+
+/**
+ * Message field rules.
+ *
+ * \see [Defining A Message Type] in the Protocol Buffers documentation.
+ *
+ * [Defining A Message Type]:
+ *      https://developers.google.com/protocol-buffers/docs/proto#simple
+ */
+typedef enum {
+  /** A well-formed message must have exactly one of this field. */
+  PROTOBUF_C_LABEL_REQUIRED,
+
+  /**
+   * A well-formed message can have zero or one of this field (but not
+   * more than one).
+   */
+  PROTOBUF_C_LABEL_OPTIONAL,
+
+  /**
+   * This field can be repeated any number of times (including zero) in a
+   * well-formed message. The order of the repeated values will be
+   * preserved.
+   */
+  PROTOBUF_C_LABEL_REPEATED,
+
+  /**
+   * This field has no label. This is valid only in proto3 and is
+   * equivalent to OPTIONAL but no "has" quantifier will be consulted.
+   */
+  PROTOBUF_C_LABEL_NONE,
+} ProtobufCLabel;
+
+/**
+ * Field value types.
+ *
+ * \see [Scalar Value Types] in the Protocol Buffers documentation.
+ * + * [Scalar Value Types]: + * https://developers.google.com/protocol-buffers/docs/proto#scalar + */ +typedef enum { + PROTOBUF_C_TYPE_INT32, /**< int32 */ + PROTOBUF_C_TYPE_SINT32, /**< signed int32 */ + PROTOBUF_C_TYPE_SFIXED32, /**< signed int32 (4 bytes) */ + PROTOBUF_C_TYPE_INT64, /**< int64 */ + PROTOBUF_C_TYPE_SINT64, /**< signed int64 */ + PROTOBUF_C_TYPE_SFIXED64, /**< signed int64 (8 bytes) */ + PROTOBUF_C_TYPE_UINT32, /**< unsigned int32 */ + PROTOBUF_C_TYPE_FIXED32, /**< unsigned int32 (4 bytes) */ + PROTOBUF_C_TYPE_UINT64, /**< unsigned int64 */ + PROTOBUF_C_TYPE_FIXED64, /**< unsigned int64 (8 bytes) */ + PROTOBUF_C_TYPE_FLOAT, /**< float */ + PROTOBUF_C_TYPE_DOUBLE, /**< double */ + PROTOBUF_C_TYPE_BOOL, /**< boolean */ + PROTOBUF_C_TYPE_ENUM, /**< enumerated type */ + PROTOBUF_C_TYPE_STRING, /**< UTF-8 or ASCII string */ + PROTOBUF_C_TYPE_BYTES, /**< arbitrary byte sequence */ + PROTOBUF_C_TYPE_MESSAGE, /**< nested message */ +} ProtobufCType; + +/** + * Field wire types. + * + * \see [Message Structure] in the Protocol Buffers documentation. + * + * [Message Structure]: + * https://developers.google.com/protocol-buffers/docs/encoding#structure + */ +typedef enum { + PROTOBUF_C_WIRE_TYPE_VARINT = 0, + PROTOBUF_C_WIRE_TYPE_64BIT = 1, + PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED = 2, + /* "Start group" and "end group" wire types are unsupported. */ + PROTOBUF_C_WIRE_TYPE_32BIT = 5, +} ProtobufCWireType; + +struct ProtobufCAllocator; +struct ProtobufCBinaryData; +struct ProtobufCBuffer; +struct ProtobufCBufferSimple; +struct ProtobufCEnumDescriptor; +struct ProtobufCEnumValue; +struct ProtobufCEnumValueIndex; +struct ProtobufCFieldDescriptor; +struct ProtobufCIntRange; +struct ProtobufCMessage; +struct ProtobufCMessageDescriptor; +struct ProtobufCMessageUnknownField; +struct ProtobufCMethodDescriptor; +struct ProtobufCService; +struct ProtobufCServiceDescriptor; + +typedef struct ProtobufCAllocator ProtobufCAllocator; +typedef struct ProtobufCBinaryData ProtobufCBinaryData; +typedef struct ProtobufCBuffer ProtobufCBuffer; +typedef struct ProtobufCBufferSimple ProtobufCBufferSimple; +typedef struct ProtobufCEnumDescriptor ProtobufCEnumDescriptor; +typedef struct ProtobufCEnumValue ProtobufCEnumValue; +typedef struct ProtobufCEnumValueIndex ProtobufCEnumValueIndex; +typedef struct ProtobufCFieldDescriptor ProtobufCFieldDescriptor; +typedef struct ProtobufCIntRange ProtobufCIntRange; +typedef struct ProtobufCMessage ProtobufCMessage; +typedef struct ProtobufCMessageDescriptor ProtobufCMessageDescriptor; +typedef struct ProtobufCMessageUnknownField ProtobufCMessageUnknownField; +typedef struct ProtobufCMethodDescriptor ProtobufCMethodDescriptor; +typedef struct ProtobufCService ProtobufCService; +typedef struct ProtobufCServiceDescriptor ProtobufCServiceDescriptor; + +/** Boolean type. */ +typedef int protobuf_c_boolean; + +typedef void (*ProtobufCClosure)(const ProtobufCMessage *, void *closure_data); +typedef void (*ProtobufCMessageInit)(ProtobufCMessage *); +typedef void (*ProtobufCServiceDestroy)(ProtobufCService *); + +/** + * Structure for defining a custom memory allocator. + */ +struct ProtobufCAllocator { + /** Function to allocate memory. */ + void *(*alloc)(void *allocator_data, size_t size); + + /** Function to free memory. */ + void (*free)(void *allocator_data, void *pointer); + + /** Opaque pointer passed to `alloc` and `free` functions. */ + void *allocator_data; +}; + +/** + * Structure for the protobuf `bytes` scalar type. 
+ * + * The data contained in a `ProtobufCBinaryData` is an arbitrary sequence of + * bytes. It may contain embedded `NUL` characters and is not required to be + * `NUL`-terminated. + */ +struct ProtobufCBinaryData { + size_t len; /**< Number of bytes in the `data` field. */ + uint8_t *data; /**< Data bytes. */ +}; + +/** + * Structure for defining a virtual append-only buffer. Used by + * protobuf_c_message_pack_to_buffer() to abstract the consumption of serialized + * bytes. + * + * `ProtobufCBuffer` "subclasses" may be defined on the stack. For example, to + * write to a `FILE` object: + * +~~~{.c} +typedef struct { + ProtobufCBuffer base; + FILE *fp; +} BufferAppendToFile; + +static void +my_buffer_file_append(ProtobufCBuffer *buffer, + size_t len, + const uint8_t *data) +{ + BufferAppendToFile *file_buf = (BufferAppendToFile *) buffer; + fwrite(data, len, 1, file_buf->fp); // XXX: No error handling! +} +~~~ + * + * To use this new type of ProtobufCBuffer, it could be called as follows: + * +~~~{.c} +... +BufferAppendToFile tmp = {0}; +tmp.base.append = my_buffer_file_append; +tmp.fp = fp; +protobuf_c_message_pack_to_buffer(&message, &tmp); +... +~~~ + */ +struct ProtobufCBuffer { + /** Append function. Consumes the `len` bytes stored at `data`. */ + void (*append)(ProtobufCBuffer *buffer, size_t len, const uint8_t *data); +}; + +/** + * Simple buffer "subclass" of `ProtobufCBuffer`. + * + * A `ProtobufCBufferSimple` object is declared on the stack and uses a + * scratch buffer provided by the user for the initial allocation. It performs + * exponential resizing, using dynamically allocated memory. A + * `ProtobufCBufferSimple` object can be created and used as follows: + * +~~~{.c} +uint8_t pad[128]; +ProtobufCBufferSimple simple = PROTOBUF_C_BUFFER_SIMPLE_INIT(pad); +ProtobufCBuffer *buffer = (ProtobufCBuffer *) &simple; +~~~ + * + * `buffer` can now be used with `protobuf_c_message_pack_to_buffer()`. Once a + * message has been serialized to a `ProtobufCBufferSimple` object, the + * serialized data bytes can be accessed from the `.data` field. + * + * To free the memory allocated by a `ProtobufCBufferSimple` object, if any, + * call PROTOBUF_C_BUFFER_SIMPLE_CLEAR() on the object, for example: + * +~~~{.c} +PROTOBUF_C_BUFFER_SIMPLE_CLEAR(&simple); +~~~ + * + * \see PROTOBUF_C_BUFFER_SIMPLE_INIT + * \see PROTOBUF_C_BUFFER_SIMPLE_CLEAR + */ +struct ProtobufCBufferSimple { + /** "Base class". */ + ProtobufCBuffer base; + /** Number of bytes allocated in `data`. */ + size_t alloced; + /** Number of bytes currently stored in `data`. */ + size_t len; + /** Data bytes. */ + uint8_t *data; + /** Whether `data` must be freed. */ + protobuf_c_boolean must_free_data; + /** Allocator to use. May be NULL to indicate the system allocator. */ + ProtobufCAllocator *allocator; +}; + +/** + * Describes an enumeration as a whole, with all of its values. + */ +struct ProtobufCEnumDescriptor { + /** Magic value checked to ensure that the API is used correctly. */ + uint32_t magic; + + /** The qualified name (e.g., "namespace.Type"). */ + const char *name; + /** The unqualified name as given in the .proto file (e.g., "Type"). */ + const char *short_name; + /** Identifier used in generated C code. */ + const char *c_name; + /** The dot-separated namespace. */ + const char *package_name; + + /** Number elements in `values`. */ + unsigned n_values; + /** Array of distinct values, sorted by numeric value. */ + const ProtobufCEnumValue *values; + + /** Number of elements in `values_by_name`. 
+  unsigned n_value_names;
+  /** Array of named values, including aliases, sorted by name. */
+  const ProtobufCEnumValueIndex *values_by_name;
+
+  /** Number of elements in `value_ranges`. */
+  unsigned n_value_ranges;
+  /** Value ranges, for faster lookups by numeric value. */
+  const ProtobufCIntRange *value_ranges;
+
+  /** Reserved for future use. */
+  void *reserved1;
+  /** Reserved for future use. */
+  void *reserved2;
+  /** Reserved for future use. */
+  void *reserved3;
+  /** Reserved for future use. */
+  void *reserved4;
+};
+
+/**
+ * Represents a single value of an enumeration.
+ */
+struct ProtobufCEnumValue {
+  /** The string identifying this value in the .proto file. */
+  const char *name;
+
+  /** The string identifying this value in generated C code. */
+  const char *c_name;
+
+  /** The numeric value assigned in the .proto file. */
+  int value;
+};
+
+/**
+ * Used by `ProtobufCEnumDescriptor` to look up enum values.
+ */
+struct ProtobufCEnumValueIndex {
+  /** Name of the enum value. */
+  const char *name;
+  /** Index into values[] array. */
+  unsigned index;
+};
+
+/**
+ * Describes a single field in a message.
+ */
+struct ProtobufCFieldDescriptor {
+  /** Name of the field as given in the .proto file. */
+  const char *name;
+
+  /** Tag value of the field as given in the .proto file. */
+  uint32_t id;
+
+  /** Whether the field is `REQUIRED`, `OPTIONAL`, or `REPEATED`. */
+  ProtobufCLabel label;
+
+  /** The type of the field. */
+  ProtobufCType type;
+
+  /**
+   * The offset in bytes of the message's C structure's quantifier field
+   * (the `has_MEMBER` field for optional members or the `n_MEMBER` field
+   * for repeated members or the case enum for oneofs).
+   */
+  unsigned quantifier_offset;
+
+  /**
+   * The offset in bytes into the message's C structure for the member
+   * itself.
+   */
+  unsigned offset;
+
+  /**
+   * A type-specific descriptor.
+   *
+   * If `type` is `PROTOBUF_C_TYPE_ENUM`, then `descriptor` points to the
+   * corresponding `ProtobufCEnumDescriptor`.
+   *
+   * If `type` is `PROTOBUF_C_TYPE_MESSAGE`, then `descriptor` points to
+   * the corresponding `ProtobufCMessageDescriptor`.
+   *
+   * Otherwise this field is NULL.
+   */
+  const void *descriptor; /* for MESSAGE and ENUM types */
+
+  /** The default value for this field, if defined. May be NULL. */
+  const void *default_value;
+
+  /**
+   * A flag word. Zero or more of the bits defined in the
+   * `ProtobufCFieldFlag` enum may be set.
+   */
+  uint32_t flags;
+
+  /** Reserved for future use. */
+  unsigned reserved_flags;
+  /** Reserved for future use. */
+  void *reserved2;
+  /** Reserved for future use. */
+  void *reserved3;
+};
+
+/**
+ * Helper structure for optimizing int => index lookups in the case
+ * where the keys are mostly consecutive values, as they presumably are for
+ * enums and fields.
+ *
+ * The data structure requires that the values in the original array are
+ * sorted.
+ */
+struct ProtobufCIntRange {
+  int start_value;
+  unsigned orig_index;
+  /*
+   * NOTE: the number of values in the range can be inferred by looking
+   * at the next element's orig_index. A dummy element is added to make
+   * this simple.
+   */
+};
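
The descriptor arrays above are plain C data, so they can be inspected directly. A minimal sketch of locating a field by its tag number (not part of the patch; `find_field_by_tag` is a hypothetical helper):

~~~{.cpp}
#include <stddef.h>
#include <stdint.h>

#include "protobuf-c.h"  /* assumed vendored header path */

/* Linear scan over a descriptor's field array. Generated descriptors keep
 * `fields` sorted by tag, so real lookups can binary-search or use the
 * `ProtobufCIntRange` index; the linear form just shows the data layout. */
static const ProtobufCFieldDescriptor *
find_field_by_tag(const ProtobufCFieldDescriptor *fields, unsigned n_fields,
                  uint32_t tag) {
  for (unsigned i = 0; i < n_fields; i++) {
    if (fields[i].id == tag) {
      return &fields[i];
    }
  }
  return NULL; /* no field with this tag */
}
~~~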
+
+/**
+ * An instance of a message.
+ *
+ * `ProtobufCMessage` is a light-weight "base class" for all messages.
+ *
+ * In particular, `ProtobufCMessage` doesn't have any allocation policy
+ * associated with it. That's because it's common to create `ProtobufCMessage`
+ * objects on the stack. In fact, that's what we recommend for sending
+ * messages. If the object is allocated from the stack, you can't really have
+ * a memory leak.
+ *
+ * This means that calls to functions like protobuf_c_message_unpack() which
+ * return a `ProtobufCMessage` must be paired with a call to a free function,
+ * like protobuf_c_message_free_unpacked().
+ */
+struct ProtobufCMessage {
+  /** The descriptor for this message type. */
+  const ProtobufCMessageDescriptor *descriptor;
+  /** The number of elements in `unknown_fields`. */
+  unsigned n_unknown_fields;
+  /** The fields that weren't recognized by the parser. */
+  ProtobufCMessageUnknownField *unknown_fields;
+};
+
+/**
+ * Describes a message.
+ */
+struct ProtobufCMessageDescriptor {
+  /** Magic value checked to ensure that the API is used correctly. */
+  uint32_t magic;
+
+  /** The qualified name (e.g., "namespace.Type"). */
+  const char *name;
+  /** The unqualified name as given in the .proto file (e.g., "Type"). */
+  const char *short_name;
+  /** Identifier used in generated C code. */
+  const char *c_name;
+  /** The dot-separated namespace. */
+  const char *package_name;
+
+  /**
+   * Size in bytes of the C structure representing an instance of this
+   * type of message.
+   */
+  size_t sizeof_message;
+
+  /** Number of elements in `fields`. */
+  unsigned n_fields;
+  /** Field descriptors, sorted by tag number. */
+  const ProtobufCFieldDescriptor *fields;
+  /** Used for looking up fields by name. */
+  const unsigned *fields_sorted_by_name;
+
+  /** Number of elements in `field_ranges`. */
+  unsigned n_field_ranges;
+  /** Used for looking up fields by id. */
+  const ProtobufCIntRange *field_ranges;
+
+  /** Message initialisation function. */
+  ProtobufCMessageInit message_init;
+
+  /** Reserved for future use. */
+  void *reserved1;
+  /** Reserved for future use. */
+  void *reserved2;
+  /** Reserved for future use. */
+  void *reserved3;
+};
+
+/**
+ * An unknown message field.
+ */
+struct ProtobufCMessageUnknownField {
+  /** The tag number. */
+  uint32_t tag;
+  /** The wire type of the field. */
+  ProtobufCWireType wire_type;
+  /** Number of bytes in `data`. */
+  size_t len;
+  /** Field data. */
+  uint8_t *data;
+};
+
+/**
+ * Method descriptor.
+ */
+struct ProtobufCMethodDescriptor {
+  /** Method name. */
+  const char *name;
+  /** Input message descriptor. */
+  const ProtobufCMessageDescriptor *input;
+  /** Output message descriptor. */
+  const ProtobufCMessageDescriptor *output;
+};
+
+/**
+ * Service.
+ */
+struct ProtobufCService {
+  /** Service descriptor. */
+  const ProtobufCServiceDescriptor *descriptor;
+  /** Function to invoke the service. */
+  void (*invoke)(ProtobufCService *service, unsigned method_index,
+                 const ProtobufCMessage *input, ProtobufCClosure closure,
+                 void *closure_data);
+  /** Function to destroy the service. */
+  void (*destroy)(ProtobufCService *service);
+};
+
+/**
+ * Service descriptor.
+ */
+struct ProtobufCServiceDescriptor {
+  /** Magic value checked to ensure that the API is used correctly. */
+  uint32_t magic;
+
+  /** Service name. */
+  const char *name;
+  /** Short version of service name. */
+  const char *short_name;
+  /** C identifier for the service name. */
+  const char *c_name;
+  /** Package name. */
+  const char *package;
+  /** Number of elements in `methods`. */
+  unsigned n_methods;
+  /** Method descriptors, in the order defined in the .proto file. */
+  const ProtobufCMethodDescriptor *methods;
+  /** Sort index of methods. */
+  const unsigned *method_indices_by_name;
+};
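
Because `invoke` delivers its result through the `ProtobufCClosure` callback rather than a return value, callers thread their own state through `closure_data`. A hedged sketch (hypothetical `handle_reply` and `call_method`, not part of the patch):

~~~{.cpp}
#include "protobuf-c.h"  /* assumed vendored header path */

/* Receives the reply message; `closure_data` carries caller state. */
static void handle_reply(const ProtobufCMessage *reply, void *closure_data) {
  int *done = (int *) closure_data;
  /* Inspect or copy `reply` here; it may be NULL if the call failed. */
  *done = 1;
}

static void call_method(ProtobufCService *service, unsigned method_index,
                        const ProtobufCMessage *request) {
  int done = 0;
  service->invoke(service, method_index, request, handle_reply, &done);
  /* A synchronous service implementation will have set `done` by now;
   * an asynchronous one invokes handle_reply later. */
}
~~~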
+
+/**
+ * Get the version of the protobuf-c library. Note that this is the version of
+ * the library linked against, not the version of the headers compiled against.
+ *
+ * \return A string containing the version number of protobuf-c.
+ */
+PROTOBUF_C__API
+const char *protobuf_c_version(void);
+
+/**
+ * Get the version of the protobuf-c library. Note that this is the version of
+ * the library linked against, not the version of the headers compiled against.
+ *
+ * \return A 32 bit unsigned integer containing the version number of
+ * protobuf-c, represented in base-10 as (MAJOR*1E6) + (MINOR*1E3) + PATCH.
+ */
+PROTOBUF_C__API
+uint32_t protobuf_c_version_number(void);
+
+/**
+ * The version of the protobuf-c headers, represented as a string using the
+ * same format as protobuf_c_version().
+ */
+#define PROTOBUF_C_VERSION "1.3.0"
+
+/**
+ * The version of the protobuf-c headers, represented as an integer using the
+ * same format as protobuf_c_version_number().
+ */
+#define PROTOBUF_C_VERSION_NUMBER 1003000
+
+/**
+ * The minimum protoc-c version which works with the current version of the
+ * protobuf-c headers.
+ */
+#define PROTOBUF_C_MIN_COMPILER_VERSION 1000000
+
+/**
+ * Determine the number of bytes required to store the serialised message.
+ *
+ * \param message
+ *      The message object to serialise.
+ * \return
+ *      Number of bytes.
+ */
+PROTOBUF_C__API
+size_t protobuf_c_message_get_packed_size(const ProtobufCMessage *message);
+
+/**
+ * Unpack a serialised message into an in-memory representation.
+ *
+ * \param descriptor
+ *      The message descriptor.
+ * \param allocator
+ *      `ProtobufCAllocator` to use for memory allocation. May be NULL to
+ *      specify the default allocator.
+ * \param len
+ *      Length in bytes of the serialised message.
+ * \param data
+ *      Pointer to the serialised message.
+ * \return
+ *      An unpacked message object.
+ * \retval NULL
+ *      If an error occurred during unpacking.
+ */
+PROTOBUF_C__API
+ProtobufCMessage *protobuf_c_message_unpack(
+    const ProtobufCMessageDescriptor *descriptor, ProtobufCAllocator *allocator,
+    size_t len, const uint8_t *data);
+
+/**
+ * Free an unpacked message object.
+ *
+ * This function should be used to deallocate the memory used by a call to
+ * protobuf_c_message_unpack().
+ *
+ * \param message
+ *      The message object to free. May be NULL.
+ * \param allocator
+ *      `ProtobufCAllocator` to use for memory deallocation. May be NULL to
+ *      specify the default allocator.
+ */
+PROTOBUF_C__API
+void protobuf_c_message_free_unpacked(ProtobufCMessage *message,
+                                      ProtobufCAllocator *allocator);
+
+/**
+ * Check the validity of a message object.
+ *
+ * Makes sure all required fields (`PROTOBUF_C_LABEL_REQUIRED`) are present.
+ * Recursively checks nested messages.
+ *
+ * \retval TRUE
+ *      Message is valid.
+ * \retval FALSE
+ *      Message is invalid.
+ */
+PROTOBUF_C__API
+protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *);
+
+/** Message initialiser. */
+#define PROTOBUF_C_MESSAGE_INIT(descriptor) \
+  { descriptor, 0, NULL }
+
+/**
+ * Initialise a message object from a message descriptor.
+ *
+ * \param descriptor
+ *      Message descriptor.
+ * \param message
+ *      Allocated block of memory of size `descriptor->sizeof_message`.
+ */
+PROTOBUF_C__API
+void protobuf_c_message_init(const ProtobufCMessageDescriptor *descriptor,
+                             void *message);
+
+/**
+ * Initialise a `ProtobufCBufferSimple` object.
+ */
+#define PROTOBUF_C_BUFFER_SIMPLE_INIT(array_of_bytes)             \
+  {                                                               \
+    {protobuf_c_buffer_simple_append}, sizeof(array_of_bytes), 0, \
+        (array_of_bytes), 0, NULL                                 \
+  }
+
+/**
+ * Clear a `ProtobufCBufferSimple` object, freeing any allocated memory.
+ */
+#define PROTOBUF_C_BUFFER_SIMPLE_CLEAR(simp_buf)                               \
+  do {                                                                         \
+    if ((simp_buf)->must_free_data) {                                          \
+      if ((simp_buf)->allocator != NULL)                                       \
+        (simp_buf)->allocator->free((simp_buf)->allocator, (simp_buf)->data);  \
+      else                                                                     \
+        free((simp_buf)->data);                                                \
+    }                                                                          \
+  } while (0)
+
+/**
+ * The `append` method for `ProtobufCBufferSimple`.
+ *
+ * \param buffer
+ *      The buffer object to append to. Must actually be a
+ *      `ProtobufCBufferSimple` object.
+ * \param len
+ *      Number of bytes in `data`.
+ * \param data
+ *      Data to append.
+ */
+PROTOBUF_C__API
+void protobuf_c_buffer_simple_append(ProtobufCBuffer *buffer, size_t len,
+                                     const unsigned char *data);
+
+/**@}*/
+
+PROTOBUF_C__END_DECLS
+
+#endif /* PROTOBUF_C_H */
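
Taken together, unpack, check, and free pair up as follows, and the counting allocator illustrates the `ProtobufCAllocator` contract from earlier in the header. A minimal sketch (hypothetical `CountingState` and `roundtrip`, not part of the patch); any generated descriptor, such as the ones the quantification tool below reads via `framework.pb-c.h`, could be passed as `desc`:

~~~{.cpp}
#include <stdint.h>
#include <stdlib.h>

#include "protobuf-c.h"  /* assumed vendored header path */

/* Pass-through allocator that counts live blocks, illustrating the
 * ProtobufCAllocator contract. */
typedef struct {
  size_t live; /* blocks allocated but not yet freed */
} CountingState;

static void *counting_alloc(void *allocator_data, size_t size) {
  CountingState *st = (CountingState *) allocator_data;
  st->live++;
  return malloc(size);
}

static void counting_free(void *allocator_data, void *pointer) {
  CountingState *st = (CountingState *) allocator_data;
  st->live--;
  free(pointer);
}

/* Unpack `len` bytes at `data` as `desc`, verify required fields, then
 * free the message again. Returns 1 on success, 0 otherwise. */
static int roundtrip(const ProtobufCMessageDescriptor *desc, size_t len,
                     const uint8_t *data) {
  CountingState st = {0};
  ProtobufCAllocator alloc = {counting_alloc, counting_free, &st};

  ProtobufCMessage *msg = protobuf_c_message_unpack(desc, &alloc, len, data);
  if (msg == NULL) {
    return 0; /* malformed input */
  }
  int ok = protobuf_c_message_check(msg); /* required fields present? */
  protobuf_c_message_free_unpacked(msg, &alloc);
  return ok && st.live == 0; /* every allocation was returned */
}
~~~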
diff --git a/tools/quantification/src/tensor_desc.h b/tools/quantification/src/tensor_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..4eadf341db998ae12939d252d585051ba54c3bf0
--- /dev/null
+++ b/tools/quantification/src/tensor_desc.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "src/framework.pb-c.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+enum VarType_Type {
+  VARTYPE_TYPE_BOOL = 0,
+  VARTYPE_TYPE_INT16 = 1,
+  VARTYPE_TYPE_INT32 = 2,
+  VARTYPE_TYPE_INT64 = 3,
+  VARTYPE_TYPE_FP16 = 4,
+  VARTYPE_TYPE_FP32 = 5,
+  VARTYPE_TYPE_FP64 = 6,
+  VARTYPE_TYPE_LOD_TENSOR = 7,
+  VARTYPE_TYPE_SELECTED_ROWS = 8,
+  VARTYPE_TYPE_FEED_MINIBATCH = 9,
+  VARTYPE_TYPE_FETCH_LIST = 10,
+  VARTYPE_TYPE_STEP_SCOPES = 11,
+  VARTYPE_TYPE_STEP_LOD_RANK_TABLE = 12,
+  VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY = 13,
+  VARTYPE_TYPE_STEP_PLACE_LIST = 14,
+  VARTYPE_TYPE_READER = 15,
+  VARTYPE_TYPE_CHANNEL = 16,
+  VARTYPE_TYPE_RAW = 17,
+  VARTYPE_TYPE_TUPLE = 18
+};
+
+class TensorDesc {
+ public:
+  TensorDesc() = default;
+  TensorDesc(const TensorDesc &desc) {
+    this->dims_ = desc.dims_;
+    this->data_type_ = desc.data_type_;
+  }
+
+  explicit TensorDesc(
+      PaddleMobile__Framework__Proto__VarType__TensorDesc *desc) {
+    for (int i = 0; i < desc->n_dims; ++i) {
+      int64_t d = desc->dims[i];
+      dims_.emplace_back(d);
+    }
+    data_type_ = (VarType_Type)desc->data_type;
+  }
+
+  std::vector<int64_t> Dims() const { return dims_; }
+  VarType_Type DataType() const { return data_type_; }
+
+ private:
+  std::vector<int64_t> dims_;
+  VarType_Type data_type_;
+};
+
+}  // namespace framework
+}  // namespace paddle_mobile
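
As a usage sketch for `TensorDesc` (hypothetical helpers `NumElements` and `Fp32Bytes`, not part of the patch), sizing a weight buffer from its dims and data type:

~~~{.cpp}
#include <cstddef>
#include <cstdint>

#include "src/tensor_desc.h"

using paddle_mobile::framework::TensorDesc;
using paddle_mobile::framework::VARTYPE_TYPE_FP32;

// Number of elements implied by the dims, treating an unknown (-1)
// batch dimension as 1 for sizing purposes.
int64_t NumElements(const TensorDesc &desc) {
  int64_t count = 1;
  for (int64_t d : desc.Dims()) {
    count *= (d > 0) ? d : 1;
  }
  return count;
}

// Byte size of the tensor if it holds FP32 data, 0 otherwise.
size_t Fp32Bytes(const TensorDesc &desc) {
  if (desc.DataType() != VARTYPE_TYPE_FP32) {
    return 0;
  }
  return static_cast<size_t>(NumElements(desc)) * sizeof(float);
}
~~~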
diff --git a/tools/quantification/src/var_desc.h b/tools/quantification/src/var_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b9c5ac4d672be2dd8a8a2a2695c2816f9cae05a
--- /dev/null
+++ b/tools/quantification/src/var_desc.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+#include "src/framework.pb-c.h"
+#include "src/tensor_desc.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+class VarDesc {
+ public:
+  VarDesc(const VarDesc &var_desc) {
+    this->data_type_ = var_desc.data_type_;
+    this->name_ = var_desc.name_;
+    this->persistable_ = var_desc.persistable_;
+    this->tensor_desc_ = var_desc.tensor_desc_;
+    this->type_ = var_desc.type_;
+  }
+  explicit VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) {
+    type_ = (VarType_Type)desc->type->type;
+    name_ = std::string(desc->name);
+    persistable_ = static_cast<bool>(desc->persistable);
+
+    switch (type_) {
+      case VARTYPE_TYPE_SELECTED_ROWS:
+        tensor_desc_ = TensorDesc(desc->type->selected_rows);
+        break;
+      case VARTYPE_TYPE_LOD_TENSOR:
+        tensor_desc_ = TensorDesc(desc->type->lod_tensor->tensor);
+        break;
+      case VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY:
+        tensor_desc_ = TensorDesc(desc->type->tensor_array->tensor);
+        break;
+      default:
+        break;
+    }
+    switch (type_) {
+      case VARTYPE_TYPE_CHANNEL:
+        data_type_ = (VarType_Type)desc->type->channel->data_type;
+        break;
+      default:
+        data_type_ = tensor_desc_.DataType();
+        break;
+    }
+  }
+  std::string Name() const { return name_; }
+
+  VarType_Type Type() const { return type_; }
+
+  bool Persistable() const { return persistable_; }
+
+  const TensorDesc &Tensor_desc() const { return tensor_desc_; }
+
+ private:
+  std::string name_;
+  bool persistable_;
+  TensorDesc tensor_desc_;
+  VarType_Type type_;
+  VarType_Type data_type_;
+};
+
+}  // namespace framework
+}  // namespace paddle_mobile
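
`VarDesc` mirrors the proto structure, and a weight-quantification pass mostly wants the persistable LoD tensors. A sketch under that assumption (hypothetical `CollectWeights`, not part of the patch):

~~~{.cpp}
#include <cstddef>
#include <vector>

#include "src/var_desc.h"

using paddle_mobile::framework::VarDesc;
using paddle_mobile::framework::VARTYPE_TYPE_LOD_TENSOR;

// Select the variables a weight-quantisation pass would rewrite.
std::vector<VarDesc> CollectWeights(
    PaddleMobile__Framework__Proto__VarDesc **vars, size_t n_vars) {
  std::vector<VarDesc> weights;
  for (size_t i = 0; i < n_vars; ++i) {
    VarDesc var(vars[i]);
    if (var.Persistable() && var.Type() == VARTYPE_TYPE_LOD_TENSOR) {
      weights.push_back(var);
    }
  }
  return weights;
}
~~~

A real pass may also need to exclude framework-managed variables such as `feed` and `fetch` by name, since they can be marked persistable as well.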
diff --git a/tools/toolchains/arm-android-neon.cmake b/tools/toolchains/arm-android-neon.cmake
index f2fa600b90fb54886838e953e61c1e940569dee6..5e431059a974810b2fd0481e0942447f57bf1286 100644
--- a/tools/toolchains/arm-android-neon.cmake
+++ b/tools/toolchains/arm-android-neon.cmake
@@ -1,2 +1,5 @@
 set(ANDROID_ARM_NEON ON)
-include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
\ No newline at end of file
+set(ANDROID_PIE TRUE)
+set(ANDROID_STL "c++_static")
+set(ANDROID_PLATFORM "android-22")
+include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
diff --git a/tools/toolchains/arm-linux-gnueabi.cmake b/tools/toolchains/arm-linux-gnueabi.cmake
index ee3cb50796d184f9f4577e8aabb4cf0ca98c955f..c2b1b853def5f470565e670751708f76c59e16c4 100644
--- a/tools/toolchains/arm-linux-gnueabi.cmake
+++ b/tools/toolchains/arm-linux-gnueabi.cmake
@@ -13,4 +13,4 @@
 set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
-include("${CMAKE_CURRENT_LIST_DIR}/../arm-platform.cmake")
+set(ARM_LINUX 1)
diff --git a/tools/toolchains/arm-linux-gnueabihf.cmake b/tools/toolchains/arm-linux-gnueabihf.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..7db42c7e73f4cfabce670bb2bc691e4b5bd314a2
--- /dev/null
+++ b/tools/toolchains/arm-linux-gnueabihf.cmake
@@ -0,0 +1,11 @@
+# CMake toolchain file for building ARM software on a Linux host
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR arm)
+set(CMAKE_SYSTEM_VERSION 1)
+
+message(STATUS "Cross-compiling with the arm-linux-gnueabihf toolchain.")
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
\ No newline at end of file