Merge pull request #772 from codeWorm2015/metal

Metal

Merge pull request #772 from codeWorm2015/metal
Metal
858e4fa6 · Ruilong Liu · GitHub · 3c5bc1c8 · 4f154151 · 858e4fa6
337 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -70,6 +70,12 @@ build
 cmake-build-debug
 cmake-build-release

+#ios demo
+demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/
+demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg
+demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a
+*.xcuserstate
+/tools/quantification/quantify

 # metal
 Podfile.lock
@@ -78,12 +84,3 @@ SwiftProtobuf.framework
 paddle-mobile.xcworkspace
 metal/models/
 metal/images/
-
-
-
-
-
-
-
-
-
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "src/operators/kernel/mali/ACL_Android"]
+	path = src/operators/kernel/mali/ACL_Android
+	url = https://github.com/halsay/ACL_Android.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.6)
 project(paddle-mobile)

 option(DEBUGING "enable debug mode" ON)
@@ -6,41 +6,30 @@ option(USE_OPENMP "openmp support" OFF)
 option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build
-option(CPU "cpu" ON)
-option(MALI_GPU "mali gpu" ON)
+option(CPU "armv7 with neon" ON)
+option(MALI_GPU "mali gpu" OFF)
 option(FPGA "fpga" OFF)

-if (CPU)
-    add_definitions(-DPADDLE_MOBILE_CPU)
-endif()
-
-if (MALI_GPU)
-    add_definitions(-DPADDLE_MOBILE_MALI_GPU)
-endif()
+file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
+file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
+include_directories(src/)

-if(FPGA)
-    add_definitions(-DPADDLE_MOBILE_FPGA)
+if(IS_IOS)
+    set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
+else()
+    set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
 endif()

-set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
 if (DEBUGING)
-    set(CMAKE_BUILD_TYPE Debug)
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS}")
-else()
+    message(STATUS "debug")
    set(CMAKE_BUILD_TYPE Release)
-endif ()
-
-if(DEBUGING)
-    message(STATUS "debuging")
+    set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
    add_definitions(-DPADDLE_MOBILE_DEBUG)
-    if(ANDROID)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
-    endif()
-
-else()
-    message(STATUS "releasing")
+else ()
+    set(CMAKE_BUILD_TYPE Release)
+    set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
    add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
-endif()
+endif ()

 if (USE_EXCEPTION)
    message(STATUS "use exception")
@@ -54,115 +43,123 @@ if (LOG_PROFILE)
    add_definitions(-DPADDLE_MOBILE_PROFILE)
 endif()

-if(IS_MAC)
-    add_definitions(-DX86)
-elseif(IS_IOS)
-    add_definitions(-DIOS)
-elseif(V7)
-    add_definitions(-DARMV7)
-elseif(V8)
-    add_definitions(-DARMV8)
-else ()
-    add_definitions(-DX86)
+if(USE_OPENMP AND NOT IS_IOS)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+    add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
 endif()

-set(CMAKE_VERBOSE_MAKEFILE ON)
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
-
-file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c)
-file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
-
-if (NOT ANDROID)
-list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.cpp)
-list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.h)
-list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
+# platform control
+if (ARM_LINUX)
+    include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake")
 endif ()

-include_directories(src/)
+if (CPU)
+  add_definitions(-DPADDLE_MOBILE_CPU)
+else()
+    file(GLOB_RECURSE _tmp_list src/operators/kernel/arm/*.cpp src/operators/kernel/arm/*.cc)
+    foreach(f ${_tmp_list})
+        list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+    endforeach()
+
+    file(GLOB_RECURSE _tmp_list_h src/operators/kernel/arm/*.h)
+    foreach(f ${_tmp_list_h})
+        list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+    endforeach()
+endif()

-if(USE_OPENMP)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
-    add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
+if (MALI_GPU)
+    add_definitions(-DPADDLE_MOBILE_MALI_GPU)
+    add_definitions(-DUSE_ACL=1)
+    add_definitions(-DUSE_OPENCL)
+    set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android)
+    include_directories(${ACL_ROOT} ${ACL_ROOT}/include)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
+else()
+    file(GLOB_RECURSE _tmp_list src/operators/kernel/mali/*.cpp src/operators/kernel/mali/*.cc)
+    foreach(f ${_tmp_list})
+        list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+    endforeach()
+
+    file(GLOB_RECURSE _tmp_list_h src/operators/kernel/mali/*.h)
+    foreach(f ${_tmp_list_h})
+        list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+    endforeach()
 endif()

-if (googlenet)
-    add_definitions(-DCONCAT_OP)
-    add_definitions(-DCONV_OP)
-    add_definitions(-DLRN_OP)
-    add_definitions(-DMUL_OP)
-    add_definitions(-DELEMENTWISEADD_OP)
-    add_definitions(-DFUSION_FC_OP)
-    add_definitions(-DPOOL_OP)
-    add_definitions(-DRELU_OP)
-    add_definitions(-DFUSION_CONVADD_OP)
-    add_definitions(-DFUSION_CONVADD_RELU_OP)
-elseif (mobilenet)
-    add_definitions(-DCONV_OP)
-    add_definitions(-DELEMENTWISEADD_OP)
-    add_definitions(-DRELU_OP)
-    add_definitions(-DSOFTMAX_OP)
-    add_definitions(-DSOFTMAX_OP)
-    add_definitions(-DDEPTHWISECONV_OP)
-    add_definitions(-DBATCHNORM_OP)
-    add_definitions(-DPOOL_OP)
-    add_definitions(-DRESHAPE_OP)
-elseif (yolo)
-    add_definitions(-DBATCHNORM_OP)
-    add_definitions(-DCONV_OP)
-    add_definitions(-DRELU_OP)
-    add_definitions(-DELEMENTWISEADD_OP)
-elseif (squeezenet)
-    add_definitions(-DCONCAT_OP)
-    add_definitions(-DCONV_OP)
-    add_definitions(-DRELU_OP)
-    add_definitions(-DELEMENTWISEADD_OP)
-    add_definitions(-DPOOL_OP)
-    add_definitions(-DRESHAPE_OP)
-    add_definitions(-DSOFTMAX_OP)
-elseif(resnet)
-    add_definitions(-DCONV_OP)
-    add_definitions(-DBATCHNORM_OP)
-    add_definitions(-DELEMENTWISEADD_OP)
-    add_definitions(-DSOFTMAX_OP)
-    add_definitions(-DMUL_OP)
-    add_definitions(-DPOOL_OP)
-    add_definitions(-DRELU_OP)
-else ()
-    add_definitions(-DBATCHNORM_OP)
-    add_definitions(-DBOXCODER_OP)
-    add_definitions(-DCONCAT_OP)
-    add_definitions(-DCONV_OP)
-    add_definitions(-DDEPTHWISECONV_OP)
-    add_definitions(-DELEMENTWISEADD_OP)
-    add_definitions(-DFUSION_CONVADD_OP)
-    add_definitions(-DCONVADDRELU_OP)
-    add_definitions(-DFUSION_FC_OP)
-    add_definitions(-DLRN_OP)
-    add_definitions(-DMUL_OP)
-    add_definitions(-DMULTICLASSNMS_OP)
-    add_definitions(-DPOOL_OP)
-    add_definitions(-DPRIORBOX_OP)
-    add_definitions(-DRELU_OP)
-    add_definitions(-DRESHAPE_OP)
-    add_definitions(-DSIGMOID_OP)
-    add_definitions(-DSOFTMAX_OP)
-    add_definitions(-DTRANSPOSE_OP)
-    add_definitions(-DFUSION_CONVADD_RELU_OP)
+if(FPGA)
+    add_definitions(-DPADDLE_MOBILE_FPGA)
+else()
+    file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
+    foreach(f ${_tmp_list})
+        list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+    endforeach()
+
+    file(GLOB_RECURSE _tmp_list_h src/operators/kernel/fpga/*.h)
+    foreach(f ${_tmp_list_h})
+        list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+    endforeach()
+
+
+    file(GLOB_RECURSE _tmp_list src/fpga/*.cpp src/fpga/*.cc)
+    foreach(f ${_tmp_list})
+        list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+    endforeach()
+
+    file(GLOB_RECURSE _tmp_list_h src/fpga/*.h)
+    foreach(f ${_tmp_list_h})
+        list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+    endforeach()
 endif()

+if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
+else()
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
+endif ()
+
 if (IS_IOS)
-    add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
-elseif(ANDROID)
-    add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
 else()
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.h)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobile.mm)
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
+endif ()
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
+
+# NET default
+set(NET "default" CACHE STRING "select net type")
+set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets")
+include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
+
+
+# build library
+if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
+    list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
+    add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
+elseif(IS_IOS)
+    add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
+else ()
    add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
 endif ()

+# unit test
 if(DEBUGING)
-    add_subdirectory(test)
+    if(IS_IOS)
+    else()
+        add_subdirectory(test)
+    endif()
 endif()


--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -183,6 +183,9 @@ upstream

 接下来等待 review，如果有需要修改的地方，参照上述步骤更新 origin 中的对应分支即可。

+![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294877166787.jpg)
+之后就可以提交代码了
+
 ## 删除远程分支

 在 PR 被 merge 进主仓库后，我们可以在 PR 的页面删除远程仓库的分支。
@@ -219,7 +222,7 @@ upstream
     - 原因：如果仅仅修改一个文件但提交了十几个commit，每个commit只做了少量的修改，这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改，且不排除commit之间的修改存在相互覆盖的情况。
     - 建议：每次提交时，保持尽量少的commit，可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit，可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。
   - 请注意每个commit的名称：应能反映当前commit的内容，不能太随意。
-3. 如果解决了某个Issue的问题，请在该Pull Request的**第一个**评论框中加上：`fix #issue_number`，这样当该PUll Request被合并后，会自动关闭对应的Issue。关键词包括：close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved，请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。
+3. 如果解决了某个Issue的问题，请在该Pull Request的**第一个**评论框中加上：`fix #issue_number`，这样当该Pull Request被合并后，会自动关闭对应的Issue。关键词包括：close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved，请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。

 此外，在回复评审人意见时，请您遵守以下约定：


--- a/Dockerfile
+++ b/Dockerfile
+FROM ubuntu:16.04
+
+RUN echo '\
+deb <mirror> <version> main restricted universe multiverse\n\
+deb <mirror> <version>-updates main restricted universe multiverse\n\
+deb <mirror> <version>-backports main restricted universe multiverse\n\
+deb <mirror> <version>-security main restricted universe multiverse\n'\
+> /etc/apt/sources.list
+RUN sed -ie 's|<mirror>|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|' /etc/apt/sources.list
+RUN sed -ie 's|<version>|xenial|' /etc/apt/sources.list
+
+RUN apt-get update && apt-get upgrade -y
+RUN apt-get install -y --no-install-recommends \
+        curl \
+        unzip \
+        git \
+        make \
+        cmake-curses-gui \
+        python \
+        python-pip \
+        python-setuptools \
+        clang-format-5.0 \
+        graphviz \
+        g++-arm-linux-gnueabi \
+        gcc-arm-linux-gnueabi
+RUN apt-get autoremove -y && apt-get clean
+RUN ln -s clang-format-5.0 /usr/bin/clang-format
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --upgrade pip
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel
+RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit
+RUN cd /tmp && curl -O http://mirrors.neusoft.edu.cn/android/repository/android-ndk-r17b-linux-x86_64.zip
+RUN curl -O https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \
+        tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \
+        mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \
+        mv /usr/bin/cmake /usr/bin/cmake.bak && ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \
+        mv /usr/bin/ccmake /usr/bin/ccmake.bak && ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake
+RUN cd /opt && unzip /tmp/android-ndk-r17b-linux-x86_64.zip
+ENV NDK_ROOT /opt/android-ndk-r17b
--- a/README.md
+++ b/README.md
-# Paddle-Mobile 
-
+# Paddle-Mobile
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/paddle-mobile.svg?branch=develop&longCache=true&style=flat-square)](https://travis-ci.org/PaddlePaddle/paddle-mobile)
-[![License](https://img.shields.io/badge/license-Apache%202-brightgreen.svg)](LICENSE)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/doc)
+[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
+
+<!--[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle-Mobile.svg)](https://github.com/PaddlePaddle/Paddle-Mobile/releases)
+[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)-->
+
+
+欢迎来到 Paddle-Mobile GitHub 项目。
+
+Paddle-Mobile是PaddlePaddle组织下的项目，是一个致力于嵌入式平台的深度学习的框架。Paddle-Mobile设计思想和PaddlePaddle的最新版fluid版本保持了高度一致，同时针对嵌入式做了大量优化。设计之初就对嵌入式的性能、体积、能耗、硬件平台覆盖等方面做了考虑。
+
+## 简单搜索线上效果
+
+如下gif是简单搜索app的线上主体检测应用效果
+
+![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)
+
+## Demo目录
+
+[点我](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)
+
+## Features
+
+- **ARM CPU**
+
+|mobilenet arm v7|1线程|2线程|4线程|
+|------------|----|-----|-----|
+|麒麟960(ms)|110.586|70.897|47.474|
+|||||
+|mobilenetssd arm v7|1线程|2线程|4线程|
+|麒麟960(ms)|222.124|138.952|90.856|
+|||||
+|googlenet(v1) arm v7|1线程|2线程|4线程|
+|麒麟960(ms)|348.018|240.304|169.998|
+|||||
+|squeezenet arm v7|1线程|2线程|4线程|
+|麒麟960(ms)|84.685|56.544|38.833|
+|||||
+|yolo arm v7|1线程|2线程|4线程|
+|麒麟960(ms)|131.831|88.990|60.905|
+
+    arm cpu是paddle-mobile的主要支持方向，cpu的通用性一直是其优势。嵌入式深度学习，需要大量的cpu汇编实现。我们正在紧锣密鼓的编码，为的是能充分利用硬件的每一点加速能力。
+    arm cpu的优化工作还在进行中，现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是110+ms，显然这不是我们的最终目标，我们正在用大量的汇编改写，后续性能仍会有巨大提升空间, 目前只支持armv7, 未来我们也会支持armv8。
+    
+- **Mali GPU**
+
+    Mali GPU是百度和ARM合作开发的，双方团队近期都在致力于将paddle的op能无缝运行在ACL(arm compute library)。目前已经支持squeezenet，googlenet，resnet等几个网络模型，后续会继续加大力度。使全部移动端paddle op能高效运行在mali gpu上。 
+
+- **苹果设备的GPU Metal实现**
+
+    基于Metal实现的苹果设备的GPU预测库，也已经在实现中，近期也会有相应可运行版本。
+     
+- **FPGA**
+
+    FPGA实现正在进行中，是基于Xilinx的ZU5目标开发板。
+
+- **灵活性**
+
+    * paddle-mobile cpu版不依赖任何第三方库, 可进行快速集成。
+    * 使用泛型特化进行平台切换, 可灵活切换 cpu、gpu 和其他协处理器。
+    * 可根据特定的常见网络, 进行编译特定的 op, 降低编译时间, 减小包大小。
+    * 使用 docker 编译, 提供统一的编译环境。
+    * 高可拓展性, 方便拓展其他协处理器, 提供高性能 arm 算子实现, 方便其他协处理器开发者集成开发。
+    * 直接兼容 paddle-fluid 模型, 不需要额外的转换操作。
+
+- **体积**
+
+    paddle-mobile从设计之初就深入考虑到移动端的包体积的问题，cpu实现中没有外部依赖。在编译过程中，如果该网络不需要的op是完全不会被打入的。同时编译选项优化也为体积压缩提供了帮助。
+    除了二进制体积，我们对代码体积极力避免过大。整个仓库的代码体积也非常小。
+
+
+## 文档
+
+### 设计文档
+
+关于paddle-mobile设计文档在下面链接中。如果想了解更多内容，[issue](https://github.com/PaddlePaddle/paddle-mobile/issues)中会有很多早期的设计和讨论过程。
+[设计文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/design_doc.md)
+
+### 开发文档
+
+开发文档主要是关于编译、运行等问题。作为开发者，它可以和贡献文档共同结合使用。
+[开发文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_doc.md)

+### 贡献文档
+- [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
+- 上面文档中涵盖了主要的贡献代码流程，如果在实践中您还遇到了其他问题，可以发[issue](https://github.com/PaddlePaddle/paddle-mobile/issues)。我们看到后会尽快处理。

-This project is used to develop the next version deep learning freamwork for mobile device.

-# Development
+## 模型获得
+目前Paddle-Mobile仅支持Paddle fluid训练的模型。如果你手中的模型是不同种类的模型，需要进行模型转换才可以运行。
+### 1. 直接使用Paddle Fluid训练
+该方式最为可靠，推荐方式
+### 2. caffe转为Paddle Fluid模型
+[链接](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
+### 3. ONNX
+ONNX全称为“Open Neural Network Exchange”，即“开放的神经网络交换”。该项目的目的是让不同的神经网络开发框架做到互通互用。

-[Used model in development](https://mms-mis.cdn.bcebos.com/paddle-mobile/models.zip)
+除直接使用PaddlePaddle训练fluid版本的模型外，还可以通过onnx转换得到个别Paddle fluid模型。

-## cross-compilation to android
+目前，百度也在做onnx支持工作。相关转换项目在这里：[paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)。

-* NDK is required
-* ANDROID_NDK environment variable is required
+![](http://7xop3k.com1.z0.glb.clouddn.com/15311951836000.jpg)

-```bash 
-sh build.sh android
-```
+### 4. 部分测试模型和测试图片下载
+[下载链接](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)

-## build for x86
-paddle-mobile is to run on arm platform. x86 only used to test not arm assembly code. So do not recommend compiling x86.
+## 问题解决

-Now only support osx.
+欢迎提出或解决我们的问题，有疑问可以发issue. [Github Issues](https://github.com/PaddlePaddle/paddle-mobile/issues).

-```
-sh build.sh mac
-```
+## Copyright and License
+Paddle-Mobile 提供相对宽松的Apache-2.0开源协议 [Apache-2.0 license](LICENSE).

-## Old Version of Mobile-Deep-Learning
-The old version of MDL was I moved to here [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning) 

+## 旧版 Mobile-Deep-Learning
+原MDL(Mobile-Deep-Learning)工程被迁移到了这里 [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning) 


--- a/demo/ReadMe.md
+++ b/demo/ReadMe.md
+## 如何运行demo
+- Android demo下载路径   
+ http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip
+- iOS demo下载路径：   
+  http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip
+  
+在demo目录下执行下载demo的脚本
+```
+sh getDemo.sh
+```
+demo工程就下载解压到当前目录中了。
\ No newline at end of file
--- a/demo/getDemo.sh
+++ b/demo/getDemo.sh
+#!/usr/bin/env bash
+wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobile_Android.zip
+wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FPaddleMobileDemo_iOS.zip
+unzip paddle-mobile%2FPaddleMobile_Android.zip
+unzip paddle-mobile%2FPaddleMobileDemo_iOS.zip
+rm -rf paddle-mobile%2FPaddleMobile_Android.zip
+rm -rf paddle-mobile%2FPaddleMobileDemo_iOS.zip
+rm -rf __MACOSX
\ No newline at end of file
--- a/doc/build.md
+++ b/doc/build.md
+# 环境搭建
+## 使用 docker
+### 1. 安装 docker
+安装 docker 的方式，参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/)
+### 2. 使用 docker 搭建构建环境
+首先进入 paddle-mobile 的目录下，执行 `docker build`
+以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行)
+```
+$ docker build -t paddle-mobile:dev - < Dockerfile
+```
+使用 `docker images` 可以看到我们新建的 image
+```
+$ docker images
+REPOSITORY      TAG     IMAGE ID       CREATED         SIZE
+paddle-mobile   dev     33b146787711   45 hours ago    372MB
+```
+### 3. 使用 docker 构建
+进入 paddle-mobile 目录，执行 docker run
+```
+$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
+root@5affd29d4fc5:/ # cd /paddle-mobile
+###
+### paddle-mobile 支持 arm 架构下的各种平台，包括 android 以及 linux 等，可以使用不同的
+### toolchain 文件生成满足需要的 makefile
+###
+# 生成构建 android 产出的 Makefile
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
+
+# 生成构建 linux 产出的 Makefile
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
+```
+### 4. 设置编译选项
+可以通过 ccmake 设置编译选项
+```
+root@5affd29d4fc5:/ # ccmake .
+                                                     Page 1 of 1
+ CMAKE_ASM_FLAGS
+ CMAKE_ASM_FLAGS_DEBUG
+ CMAKE_ASM_FLAGS_RELEASE
+ CMAKE_BUILD_TYPE
+ CMAKE_INSTALL_PREFIX             /usr/local
+ CMAKE_TOOLCHAIN_FILE             /paddle-mobile/tools/toolchains/arm-android-neon.cmake
+ CPU                              ON
+ DEBUGING                         ON
+ FPGA                             OFF
+ LOG_PROFILE                      ON
+ MALI_GPU                         OFF
+ NET                              googlenet
+ USE_EXCEPTION                    ON
+ USE_OPENMP                       OFF
+```
+修改选项后，按 `c`, `g` 更新 Makefile
+### 5. 构建
+使用 make 命令进行构建
+```
+root@5affd29d4fc5:/ # make
+```
+### 6. 查看构建产出
+构建产出可以在 host 机器上查看，在 paddle-mobile 的目录下，build 以及 test/build 下，可以使用 adb 指令或者 scp 传输到 device 上执行
+
+## 不使用 docker
+不使用 docker 的方法，可以直接用 cmake 生成 makefile 后构建。使用 ndk 构建 android 应用需要正确设置 NDK_ROOT。构建 linux 应用需要安装 arm-linux-gnueabi-gcc 或者类似的交叉编译工具，可能需要设置 CC，CXX 环境变量，或者在 tools/toolchains/ 中修改 arm-linux-gnueabi.cmake，或者增加自己需要的 toolchain file。
--- a/doc/design_doc.md
+++ b/doc/design_doc.md
+# paddle-mobile 设计文档
+
+
+#### 以下是 paddle-mobile 代码的执行流程图:
+
+![执行流程图](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305189473720.png)
+
+
+#### 主要分为: Loader 模块、 Program 模块、 Executor 模块、 op 模块、 kernel 模块、scope variable Tensor 模块
+
+#### 下面展开说一下各个模块的作用以及设计思路
+
+### 一. Loader
+先来看一下模型, 模型分为两种结构:
+ 一种为参数文件是散开的, 如下图, 红框为模型结构的 protobuf 文件, 其余为参数文件
+
+![模型描述](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305190629577.png)
+
+
+另一种为参数文件结合在一起的, 如下图, 红框内为模型结构描述的 protobuf 文件, 另一个文件为结合在一起的参数文件
+
+![模型描述combined](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191057130.png)
+
+
+loader 模块的作用是将模型结构信息 load 进内存, 将红框内的 protobuf 文件 load 进内存, 并对模型结构进行优化(如将几个细粒度的 op 融合成 粗粒度的 op, 如将 conv、 add、 batchnorm、 relu 融合为 conv\_add\_batchnorm\_relu).
+方便进行算法优化.
+
+__那么为什么融合在一起能够做算法优化 ?__
+
+如果未融合的 conv add batchnorm relu 运算是这样的
+
+```
+[n]
+[conv_res] = conv([n])
+
+for &res in conv_res {
+	res = add_biase(res)
+}
+
+for &res in conv_res {
+	res = batchnorm(res)
+}
+
+for &res in conv_res {
+	res = relu(res)
+}
+
+```
+融合后的 conv\_add\_batchnorm\_relu 运算是这样的:
+
+```
+[n]
+[conv_res] = conv([n])
+
+for &res in conv_res {
+	res = relu(batchnorm(add_biase(res)))
+}
+
+```
+由于 conv 可以转换为两个大矩阵相乘, 更进一步可以分为若干个一行一列的小矩阵相乘, 那最终的运算是这样的:
+
+```
+[n]
+for &res in [res] {
+	res = relu(batchnorm(add_biase(A * B)))
+}
+
+其中 A 和 B 为 1 * k 和 k * 1 矩阵
+
+```
+
+
+
+### 二. Program
+
+program 为 loader 模块的结果, 包含了优化前的模型结构对象, 以及优化后的模型结构对象, 此模块基本对应着 paddle 模型的结构, 关于paddle 模型的一些概念的定义, 详细设计可以参考 [program.md](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), 以下是一个简单的概况: 
+
+* programDesc 中包含着若干个(googlenet mobilenet yolo squeezenet resnet 常见的模型只有一个)可以嵌套的 block, blocks中的第一个block中的某个 op 可能会执行 blocks 中后边 block 中的一系列 op 运算(只有多个block才会有此概念)
+* block 包含着 ops 和 vars
+* ops 为一系列 op 的描述, 描述着每个 op 的类型, 输入输出, 所需参数
+* vars 里包含的为所有 op 运算所需的参数描述
+
+### 三. Executor
+
+executor 主要是用于 op 运算的上层调度操作, 主要有两个操作,  executor 实例化 和 暴露给上层的 predict 方法
+
+* executor 实例化过程中, 主要进行了这几个操作 
+	1. 根据 loader 产出的 program 初始化 operator 对象 
+	2. 分配所有需要用到的内存, 包括每个op 的输入输出, 权重参数, 目前模型的权重参数文件的内存格式为 NCHW, op 的输入输出中间矩阵参数也是 NCHW 格式
+	3. 调用每个 op 的 init 方法, init 方法是每个 op 实现者进行参数预处理的地方, 有助于减少 predict 的耗时
+
+* predict, 主要用于拿到外部的输入, 顺序调用 op 的 run 方法进行运算, 并返回最终的结果.
+
+
+### 四. op
+关于 op 模块代码的详细设计可以参考 [operator部分代码设计](https://github.com/PaddlePaddle/paddle-mobile/issues/300), operator主要包含一个kernel用于运算、一个 param 用于存储属性, operator 主要有三个操作, Init、RunImp、InferShape
+
+* Init: Init 函数主要用于参数预处理, 如对 batchNorm 参数进行预处理, 可以将 batchNorm 运算转化为 a * x + b 形式的运算, 这个函数也会调用, kernel 的 Init 函数对 kernel 进行初始化
+* RunImp: RunImp 函数会调用自己的kernel 的 compute 方法进行运算
+* InferShape: InferShape 函数会根据输入和参数得出输出的形状, 这个函数会在 executor 实例化时, 内存初始化前调用
+
+每个 operator 都需要进行注册才可以被使用, 以 conv 为例, 需在 conv_op.cpp 底部这样写: 
+
+```c++
+// 三个平台都注册了 conv op
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(conv2d);
+REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
+#endif
+
+#ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(conv2d);
+REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
+#endif
+
+#ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(conv2d);
+REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
+#endif
+
+```
+
+__一个关于包大小的优化__:
+
+每个 operator 都由一个宏控制编译, 如 conv_op.h(除了 conv_op.h ,  conv_op.cpp、conv_kernel.h、conv_kernel.cpp 也都需要加此宏控制)
+
+```c++
+
+#ifdef CONV_OP    //这个宏控制着 conv_op 是否被编译, 除了 conv_op.h ,  conv_op.cpp、conv_kernel.h、conv_kernel.cpp 也都需要加此宏控制
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/conv_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+template <typename DeviceType, typename T>
+class ConvOp
+	//impl  
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
+
+```
+这样做的目的是为了根据不同类型的网络编译特定的op, 在 cmake 中已经配置好不同网络编译的宏, 如果你要进行编译支持 yolo 的模型, 仅需执行:
+
+```sh
+cd tools
+sh build.sh android yolo
+
+```
+这样只会编译 yolo 所包含的四种 op, 极大的减小了包体积和编译时间
+
+### 五. kernel
+kernel 为 op 的底层运算实现, 主要有两个函数, Init 和 Compute, 分别用来初始化、预处理 和 运算操作, 值得提出的是, kernel 会根据泛型特化到不同的平台, 如图所示:
+
+![设备特化](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191401976.png)
+
+不同平台的 kernel 实现, 为同一个 kernel 类不同泛型的特化实现, 目前有三个平台, arm、mali、fpga, 图中的 central-arm-func\ 目录为 op kernel 的 arm 实现, 它承担了 arm\ 目录下 kernel 的底层实现, 同时 arm 处理器作为中央处理器, central-arm-func\ 也可以作为其他协处理器的底层实现, 如: fpga 的某一个 op kernel 还没有 fpga 协处理器的实现, 就可以直接调用使用这里的 arm 实现.
+
+__如果你有兴趣新增一个协处理器实现, 就可以在此添加一个 kernel 目录, 提供协处理器实现, 如果某个 kernel 你没有实现完, 你也可以直接使用 arm 实现__
+
+### 六. scope variable Tensor
+* scope 用来存储管理所需用到的所有 variable(用来存储不同类型的对象, 主要是矩阵Tensor, 也就是说 scope 管理着 op 运算过程中所有参数矩阵, 输入输出矩阵), 可以将 scope 理解为一个 map, 这里在 map 上封了一层 scope 的概念是为了方便内存管理
+* variable 可以用来存储不同类型的对象, paddle-mobile 里主要用它来存储矩阵 Tensor
+* tensor 代表着矩阵, 通过泛型可以用来存储不同类型的矩阵, 但需要注意的是, 存入和取出时的类型必须保持一致, 如果类型不一致,  使用 inline const T \*data() const 获取指针会不能通过类型检查, 通过  inline T \*mutable_data() 获取指针会重新分配内存, 以下是关于 Tensor 的一些小概念:
+	1. DDim: 用来存储矩阵的维度信息.
+	2. Slice(): 这个函数用来获取 N 维 (NCHW中的 N) 上切片
+	3. 当实例化未分配内存时, 调用 inline T *mutable_data() 会分配内存
+
+
+
+
+
+
--- a/doc/development_doc.md
+++ b/doc/development_doc.md
+### iOS&Android开发文档
+
+# iOS开发文档
+
+## 编译
+
+```sh
+
+# 在 paddle-mobile 目录下:
+cd tools
+
+sh build.sh ios
+
+# 如果只想编译某个特定模型的 op, 则需执行以下命令
+sh build.sh ios googlenet
+
+# 在这个文件夹下, 你可以拿到生成的 .a 库
+cd ../build/release/ios/build
+
+```
+#### 常见问题:
+
+1. No iOS SDK's found in default search path ...
+
+    这个问题是因为 tools/ios-cmake/ios.toolchain.cmake 找不到你最近使用的 iOS SDK 路径, 所以需要自己进行指定, 
+    以我当前的环境为例: 在 tools/ios-cmake/ios.toolchain.cmake 143行前添加我本地的 iOS SDK 路径: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk")
+
+## 集成
+
+```
+将上一步生成的:
+libpaddle-mobile.a
+
+/src/ios_io/ 下的
+PaddleMobile.h
+```
+拖入工程
+
+#### oc 接口
+
+接口如下:
+
+```
+/*
+	创建对象
+*/
+- (instancetype)init;
+
+/*
+	load 模型, 开辟内存
+*/
+- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath;
+
+/*
+	进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim means:(NSArray<NSNumber *> *)means scale:(float)scale;
+
+/*
+	进行预测
+*/
+- (NSArray *)predict:(CGImageRef)image dim:(NSArray<NSNumber *> *)dim;
+
+/*
+	清理内存
+*/
+- (void)clear;
+
+```
+
+
+# Android开发文档
+
+用户可通过如下两种方式，交叉编译Android平台上适用的paddle-mobile库：
+
+- 基于Docker容器编译
+- 基于Linux交叉编译
+
+
+## 基于Docker容器编译
+### 1. 安装 docker
+安装 docker 的方式，参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/)
+### 2. 使用 docker 搭建构建环境
+首先进入 paddle-mobile 的目录下，执行 `docker build`
+以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行)
+
+```
+$ docker build -t paddle-mobile:dev - < Dockerfile
+```
+使用 `docker images` 可以看到我们新建的 image
+
+```
+$ docker images
+REPOSITORY      TAG     IMAGE ID       CREATED         SIZE
+paddle-mobile   dev     33b146787711   45 hours ago    372MB
+```
+### 3. 使用 docker 构建
+进入 paddle-mobile 目录，执行 docker run
+
+```
+$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
+root@5affd29d4fc5:/ # cd /paddle-mobile
+# 生成构建 android 产出的 Makefile
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
+# 生成构建 linux 产出的 Makefile
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
+```
+### 4. 设置编译选项
+可以通过 ccmake 设置编译选项
+
+```
+root@5affd29d4fc5:/ # ccmake .
+                                                     Page 1 of 1
+ CMAKE_ASM_FLAGS
+ CMAKE_ASM_FLAGS_DEBUG
+ CMAKE_ASM_FLAGS_RELEASE
+ CMAKE_BUILD_TYPE
+ CMAKE_INSTALL_PREFIX             /usr/local
+ CMAKE_TOOLCHAIN_FILE             /paddle-mobile/tools/toolchains/arm-android-neon.cmake
+ CPU                              ON
+ DEBUGING                         ON
+ FPGA                             OFF
+ LOG_PROFILE                      ON
+ MALI_GPU                         OFF
+ NET                              googlenet
+ USE_EXCEPTION                    ON
+ USE_OPENMP                       OFF
+```
+修改选项后，按 `c`, `g` 更新 Makefile
+### 5. 构建
+使用 make 命令进行构建
+
+```
+root@5affd29d4fc5:/ # make
+```
+### 6. 查看构建产出
+构建产出可以在 host 机器上查看，在 paddle-mobile 的目录下，build 以及 test/build 下，可以使用 adb 指令或者 scp 传输到 device 上执行
+
+## 基于Linux交叉编译
+### 交叉编译环境准备
+##### 下载Android NDK
+
+从源码交叉编译paddle-mobile,用户需要提前准备好交叉编译环境。Android平台使用的C/C++交叉编译工具链是[Android NDK](https://developer.android.com/ndk/)，用户可以自行前往下载，也可以通过以下命令获取：
+- Mac平台
+```
+wget https://dl.google.com/android/repository/android-ndk-r17b-darwin-x86_64.zip
+unzip android-ndk-r17b-darwin-x86_64.zip
+
+```
+- Linux平台
+```
+wget https://dl.google.com/android/repository/android-ndk-r17b-linux-x86_64.zip
+unzip android-ndk-r17b-linux-x86_64.zip
+```
+
+##### 设置环境变量
+工程中自带的独立工具链会根据环境变量NDK_ROOT查找NDK，因此需要配置环境变量：
+
+```
+export NDK_ROOT="path to ndk"
+```
+### 执行编译
+在paddle-mobile根目录中，执行以下命令：
+
+```
+cd tools
+sh build.sh android
+
+```
+执行完毕后，生成的so位于build目录中，单测可执行文件位于test/build目录中。
+##### Tips:
+如果想要获得体积更小的库，可选择编译支持指定模型结构的库。
+如执行如下命令：
+
+```
+sh build.sh android googlenet
+```
+会得到一个支持googlenet的体积更小的库。
+
+## 测试
+在编译完成后，我们提供了自动化的测试脚本，帮助用户将运行单测文件所需要的模型及库文件push到Android设备中，执行以下命令：
+
+```
+cd tools/android-debug-script
+sh run_on_android.sh (npm) 可选参数npm,用于选择是否传输模型文件到手机上
+```
+出现如下提示：
+
+```
+**** choose OP or NET to test ****
+which to test :
+```
+输入名称即可运行对应的测试文件。
+
+## 部署
+Android应用可通过JNI接口调用底层C/C++，paddle-mobile对外提供的JNI接口如下：
+
+##### 1 load接口  加载模型参数
+- 用于加载参数文件分散的模型
+```
+/**
+     * Load separated parameters
+     * @param modelDir
+     * @return
+     */
+    public static native boolean load(String modelDir);
+```
+- 用于加载参数文件合并的模型文件
+```
+/**
+     * Load combined parameters
+     * @param modelPath
+     * @param paramPath
+     * @return
+     */
+    public static native boolean loadCombined(String modelPath,String paramPath);
+
+```
+##### 2 predict接口 执行预测
+- 接受预处理过的RGB数组的predict接口
+```
+/**
+*@param buf 输入数据
+*@return 输出数据
+*/
+JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
+    JNIEnv *env, jclass thiz, jfloatArray buf);
+```
+- 接受原始yuv数据的predict接口
+```
+ /**
+     *
+     * @param buf yuv420格式的字节数组
+     * @param imgWidth yuv数据的宽
+     * @param imgHeight yuv数据的高
+     * @param ddims 输入数据的形状
+     * @param meanValues 模型训练时各通道的均值
+     * @return
+     */
+
+    public static native float[] predictYuv(byte[] buf, int imgWidth, int imgHeight, int[] ddims, float[]meanValues);
+
+```
+##### 3 clear接口 销毁实例、清理内存操作
+
+```
+JNIEXPORT void JNICALL Java_com_baidu_paddle_PMLL_clear(JNIEnv *env,
+                                                        jclass thiz);
+```
+
+
--- a/doc/images/devices.png
+++ b/doc/images/devices.png
--- a/doc/images/flow_chart.png
+++ b/doc/images/flow_chart.png
--- a/doc/images/model_desc.png
+++ b/doc/images/model_desc.png
--- a/doc/images/model_desc_combined.png
+++ b/doc/images/model_desc_combined.png
--- a/doc/quantification.md
+++ b/doc/quantification.md
+# Quantification 模型量化、反量化
+
+## 背景故事
+部分网络如AlexNet训练出的模型体积较大，不适宜在移动设备上使用。
+
+
+## 解决模型过大办法
+1. 选用适合移动端的模型结构如：mobilenet、googlenet、 yolo、squeezenet 等；
+2. 使用我们提供的量化工具，可以在几乎不影响精度的情况下将float32模型减小至原模型的 1/4；
+
+- - - - - 
+## 量化工具介绍
+
+### 模型转化工具目录：
+
+- [量化工具目录](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/tools/quantification)
+
+- [模型转化工具](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/convert.cpp)
+
+#### 使用说明
+- [工具使用](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/README.md)
+
+## 如何读取量化后的模型
+load方法中添加了 quantification 参数，默认为false。 如果需要load量化后的模型，按需传参即可。
+
+[我是源代码](https://github.com/PaddlePaddle/paddle-mobile/blob/55302b33ea3bd68c9797d8f65e527544792b8095/src/io/paddle_mobile.h)
+
+```c++
+bool Load(const std::string &dirname, bool optimize = false,
+            bool quantification = false, int batch_size = 1);
+```
+
+- - - - - 
+
+
+
+
+
+
--- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Info.plist
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Info.plist
@@ -35,8 +35,6 @@
 	<key>UISupportedInterfaceOrientations</key>
 	<array>
 		<string>UIInterfaceOrientationPortrait</string>
-		<string>UIInterfaceOrientationLandscapeLeft</string>
-		<string>UIInterfaceOrientationLandscapeRight</string>
 	</array>
 	<key>UISupportedInterfaceOrientations~ipad</key>
 	<array>

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ModelHelper.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ModelHelper.swift
@@ -12,23 +12,17 @@ import Foundation
 import paddle_mobile
 import MetalPerformanceShaders

-class PreProccess: CusomKernel {
-    init(device: MTLDevice) {
-        let s = CusomKernel.Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
-        super.init(device: device, inFunctionName: "preprocess", outputDim: s, usePaddleMobileLib: false)
-    }
-}
-
-let modelHelperMap: [SupportModel : ModelHelper] = [.mobilenet : MobileNetHelper.init()]
+let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()]  // one Net instance per selectable model; ViewController.modelHelper looks entries up by modelType

 enum SupportModel: String{
    case mobilenet = "mobilenet"
+    case mobilenet_ssd = "mobilenetssd"
    static func supportedModels() -> [SupportModel] {
-        return [.mobilenet]
+        return [.mobilenet, .mobilenet_ssd]
    }
 }

-protocol ModelHelper {
+protocol Net {
    var dim: [Int] { get }
    var modelPath: String { get }
    var paramPath: String { get }
@@ -38,7 +32,7 @@ protocol ModelHelper {
    func resultStr(res: [Float]) -> String
 }

-extension ModelHelper {
+extension Net {
    func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) {
        let texture = try? MetalHelper.shared.textureLoader.newTexture(cgImage: image, options: [:]) ?! " texture loader error"
        MetalHelper.scaleTexture(queue: MetalHelper.shared.queue, input: texture!, size: (224, 224)) { (resTexture) in
@@ -47,7 +41,15 @@ extension ModelHelper {
    }
 }

-struct MobileNetHelper: ModelHelper{
+struct MobileNet: Net{
+    
+    class MobilenetPreProccess: CusomKernel {  // CusomKernel subclass bound to the "preprocess" Metal function
+        init(device: MTLDevice) {
+            let s = CusomKernel.Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)  // 224x224, 3-channel shape handed to the kernel as outputDim
+            super.init(device: device, inFunctionName: "preprocess", outputDim: s, usePaddleMobileLib: false)  // NOTE(review): usePaddleMobileLib: false presumably resolves the function from the app's own Metal library — confirm
+        }
+    }
+    
    class PreWords {
        var contents: [String] = []
        init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
@@ -84,6 +86,33 @@ struct MobileNetHelper: ModelHelper{
        modelPath = Bundle.main.path(forResource: "model", ofType: nil) ?! "model null"
        paramPath = Bundle.main.path(forResource: "params", ofType: nil) ?! "para null"
        modelDir = ""
-        preprocessKernel = PreProccess.init(device: MetalHelper.shared.device)
+        preprocessKernel = MobilenetPreProccess.init(device: MetalHelper.shared.device)
+    }
+}
+
+struct MobileNet_ssd_hand: Net{  // Net configuration for the SSD hand model: 300x300x3 input plus the bundled ssd_hand_model / ssd_hand_params files
+    class MobilenetssdPreProccess: CusomKernel {  // CusomKernel subclass bound to the "mobilenet_ssd_preprocess" Metal function
+        init(device: MTLDevice) {
+            let s = CusomKernel.Shape.init(inWidth: 300, inHeight: 300, inChannel: 3)  // same 300x300, 3-channel size as `dim` below
+            super.init(device: device, inFunctionName: "mobilenet_ssd_preprocess", outputDim: s, usePaddleMobileLib: false)
+        }
+    }
+    
+    func resultStr(res: [Float]) -> String {  // formats the raw output values for display; the demo calls this after the last predict run
+        return " \(res)"  // return the values instead of trapping, so predicting with this model no longer crashes the app
+    }
+    
+    var preprocessKernel: CusomKernel
+    let dim = [1, 300, 300, 3]
+    let modelPath: String
+    let paramPath: String
+    let modelDir: String
+    
+    init() {
+        modelPath = Bundle.main.path(forResource: "ssd_hand_model", ofType: nil) ?! "model null"  // `?!` is a project operator; the message is presumably raised when the resource is missing — confirm
+        paramPath = Bundle.main.path(forResource: "ssd_hand_params", ofType: nil) ?! "para null"
+        modelDir = ""
+        preprocessKernel = MobilenetssdPreProccess.init(device: MetalHelper.shared.device)
    }
 }
+
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/PreProcessKernel.metal
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/PreProcessKernel.metal
@@ -39,6 +39,34 @@ kernel void preprocess_half(
 }


+kernel void mobilenet_ssd_preprocess(  // per-texel input normalization: reads inTexture, writes the normalized, channel-swapped value to outTexture
+                       texture2d<float, access::read> inTexture [[texture(0)]],
+                       texture2d<float, access::write> outTexture [[texture(1)]],
+                       uint2 gid [[thread_position_in_grid]])
+{
+    if (gid.x >= outTexture.get_width() ||  // threads outside the output texture do nothing
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);  // per-channel offsets subtracted after the texel is scaled by 255
+    const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;  // scale texel by 255, subtract means, multiply by 0.017
+    outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);  // x and z are swapped on write (presumably RGB -> BGR — confirm); fourth component is zeroed
+}
+
+kernel void mobilenet_ssd_preprocess_half(  // half-precision variant of the per-texel normalization: reads inTexture, writes the result to outTexture
+                            texture2d<half, access::read> inTexture [[texture(0)]],
+                            texture2d<half, access::write> outTexture [[texture(1)]],
+                            uint2 gid [[thread_position_in_grid]])
+{
+    if (gid.x >= outTexture.get_width() ||  // threads outside the output texture do nothing
+        gid.y >= outTexture.get_height()) {
+        return;
+    }
+    const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f);  // per-channel offsets subtracted after the texel is scaled by 255
+    const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;  // scale texel by 255, subtract means, multiply by 0.017
+    outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid);  // x and z are swapped on write (presumably RGB -> BGR — confirm); fourth component is zeroed
+}
+



--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -20,17 +20,17 @@ import MetalPerformanceShaders
 let threadSupport = [1]

 class ViewController: UIViewController {
+    @IBOutlet weak var resultTextView: UITextView!
    @IBOutlet weak var selectImageView: UIImageView!
    @IBOutlet weak var elapsedTimeLabel: UILabel!
-    @IBOutlet weak var resultLabel: UILabel!
    @IBOutlet weak var modelPickerView: UIPickerView!
    @IBOutlet weak var threadPickerView: UIPickerView!
    var selectImage: UIImage?
-
    var program: Program?
    var executor: Executor<Float32>?
    var modelType: SupportModel = .mobilenet
-    var modelHelper: ModelHelper {
+    var toPredictTexture: MTLTexture?
+    var modelHelper: Net {
        return modelHelperMap[modelType] ?! " has no this type "
    }
    var threadNum = 1
@@ -65,39 +65,40 @@ class ViewController: UIViewController {
    }
    
    @IBAction func predictAct(_ sender: Any) {        
-        guard let inImage = selectImage, let cgImage = inImage.cgImage else {
-            resultLabel.text = "请选择图片 ! "
+        guard let inTexture = toPredictTexture else {  // the texture is prepared by the getTexture callback when an image is selected
+            resultTextView.text = "请选择图片 ! "
            return
        }
        
        guard let inExecutor = executor else {
-            resultLabel.text = "请先 load ! "
+            resultTextView.text = "请先 load ! "
            return
        }
-        
-        modelHelper.getTexture(image: cgImage) { [weak self] (texture) in
-            guard let sSelf = self else {
-                fatalError()
-            }
-            do {
-                try inExecutor.predict(input: texture, expect: [1, 224, 224, 3], completionHandle: { (result) in
-                }, preProcessKernle: sSelf.modelHelper.preprocessKernel)
-                
-                let startDate = Date.init()
-                for i in 0..<10 {
-                    try inExecutor.predict(input: texture, expect: [1, 224, 224, 3], completionHandle: { (result) in
-                        if i == 9 {
-                            let time = Date.init().timeIntervalSince(startDate)
-                            DispatchQueue.main.async {
-                                sSelf.resultLabel.text = sSelf.modelHelper.resultStr(res: result.resultArr)
-                                sSelf.elapsedTimeLabel.text = "平均耗时: \(time/10.0) ms"
-                            }
+
+        do {
+            let max = 10  // total predict calls; the first half is warm-up and is excluded from the timing
+            var startDate = Date.init()
+            for i in 0..<max {
+                try inExecutor.predict(input: inTexture, expect: modelHelper.dim, completionHandle: { [weak self] (result) in
+                    guard let sSelf = self else {
+                        return  // the view controller is already gone: drop the result instead of crashing the app
+                    }
+                    
+                    if i == (max / 2 - 1) {  // last warm-up run finished: restart the clock for the timed half
+                        startDate = Date.init()
+                    }
+                    
+                    if i == max - 1 {  // final run finished: publish the result and the average latency
+                        let time = Date.init().timeIntervalSince(startDate)
+                        DispatchQueue.main.async {
+                            sSelf.resultTextView.text = sSelf.modelHelper.resultStr(res: result.resultArr)
+                            sSelf.elapsedTimeLabel.text = "平均耗时: \(time/Double(max/2) * 1000.0) ms"  // seconds averaged over max/2 timed runs, converted to ms
                        }
-                    }, preProcessKernle: sSelf.modelHelper.preprocessKernel)
-                }
-            } catch let error {
-                print(error)
+                    }
+                }, preProcessKernle: self.modelHelper.preprocessKernel)
            }
+        } catch let error {
+            print(error)
        }
    }

@@ -110,6 +111,9 @@ class ViewController: UIViewController {
        
        selectImage = UIImage.init(named: "banana.jpeg")
        selectImageView.image = selectImage
+        modelHelper.getTexture(image: selectImage!.cgImage!) {[weak self] (texture) in
+            self?.toPredictTexture = texture
+        }
    }
 }

@@ -163,6 +167,9 @@ extension ViewController:  UIImagePickerControllerDelegate, UINavigationControll
            }
            sSelf.selectImage = image
            sSelf.selectImageView.image = image
+            sSelf.modelHelper.getTexture(image: image.cgImage!, getTexture: { (texture) in
+                sSelf.toPredictTexture = texture
+            })
        }
    }
 }

--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test.xcodeproj/project.pbxproj
@@ -8,6 +8,12 @@

 /* Begin PBXBuildFile section */
 		168DA950D7D6CF91EBF70A17 /* Pods_paddle_mobile_unit_test.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8BCD4792E483BFEE9F5523DE /* Pods_paddle_mobile_unit_test.framework */; };
+		FC607427211DF3B100B17547 /* synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FC60734E211DF3B000B17547 /* synset.txt */; };
+		FC607428211DF3B100B17547 /* banana.jpeg in Resources */ = {isa = PBXBuildFile; fileRef = FC60734F211DF3B000B17547 /* banana.jpeg */; };
+		FC607429211DF3B100B17547 /* iphone.JPG in Resources */ = {isa = PBXBuildFile; fileRef = FC607350211DF3B000B17547 /* iphone.JPG */; };
+		FC60742A211DF3B100B17547 /* paddle-mobile.png in Resources */ = {isa = PBXBuildFile; fileRef = FC607351211DF3B000B17547 /* paddle-mobile.png */; };
+		FC60742B211DF3B100B17547 /* params in Resources */ = {isa = PBXBuildFile; fileRef = FC607354211DF3B000B17547 /* params */; };
+		FC60742C211DF3B100B17547 /* model in Resources */ = {isa = PBXBuildFile; fileRef = FC607355211DF3B000B17547 /* model */; };
 		FC91818D211DAE9A00B6F354 /* paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FC91818C211DAE9A00B6F354 /* paddle_mobile.framework */; };
 		FC91818E211DAE9A00B6F354 /* paddle_mobile.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = FC91818C211DAE9A00B6F354 /* paddle_mobile.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
 		FCDFD409211D9185005AB38B /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDFD408211D9185005AB38B /* AppDelegate.swift */; };
@@ -35,6 +41,12 @@
 		5CC132C848027BE970FB2637 /* Pods-paddle-mobile-unit-test.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-unit-test.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-unit-test/Pods-paddle-mobile-unit-test.debug.xcconfig"; sourceTree = "<group>"; };
 		72F34AE9677943FC580DE7F4 /* Pods-paddle-mobile-unit-test.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-unit-test.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-unit-test/Pods-paddle-mobile-unit-test.release.xcconfig"; sourceTree = "<group>"; };
 		8BCD4792E483BFEE9F5523DE /* Pods_paddle_mobile_unit_test.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile_unit_test.framework; sourceTree = BUILT_PRODUCTS_DIR; };
+		FC60734E211DF3B000B17547 /* synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = synset.txt; sourceTree = "<group>"; };
+		FC60734F211DF3B000B17547 /* banana.jpeg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = banana.jpeg; sourceTree = "<group>"; };
+		FC607350211DF3B000B17547 /* iphone.JPG */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = iphone.JPG; sourceTree = "<group>"; };
+		FC607351211DF3B000B17547 /* paddle-mobile.png */ = {isa = PBXFileReference; lastKnownFileType = image.png; path = "paddle-mobile.png"; sourceTree = "<group>"; };
+		FC607354211DF3B000B17547 /* params */ = {isa = PBXFileReference; lastKnownFileType = file; path = params; sourceTree = "<group>"; };
+		FC607355211DF3B000B17547 /* model */ = {isa = PBXFileReference; lastKnownFileType = file; path = model; sourceTree = "<group>"; };
 		FC91818C211DAE9A00B6F354 /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		FCDFD405211D9185005AB38B /* paddle-mobile-unit-test.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "paddle-mobile-unit-test.app"; sourceTree = BUILT_PRODUCTS_DIR; };
 		FCDFD408211D9185005AB38B /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; };
@@ -75,6 +87,36 @@
 			name = Pods;
 			sourceTree = "<group>";
 		};
+		FC60734D211DF3B000B17547 /* images */ = {
+			isa = PBXGroup;
+			children = (
+				FC60734E211DF3B000B17547 /* synset.txt */,
+				FC60734F211DF3B000B17547 /* banana.jpeg */,
+				FC607350211DF3B000B17547 /* iphone.JPG */,
+				FC607351211DF3B000B17547 /* paddle-mobile.png */,
+			);
+			name = images;
+			path = ../../images;
+			sourceTree = "<group>";
+		};
+		FC607352211DF3B000B17547 /* models */ = {
+			isa = PBXGroup;
+			children = (
+				FC607353211DF3B000B17547 /* mobilenet */,
+			);
+			name = models;
+			path = ../../models;
+			sourceTree = "<group>";
+		};
+		FC607353211DF3B000B17547 /* mobilenet */ = {
+			isa = PBXGroup;
+			children = (
+				FC607354211DF3B000B17547 /* params */,
+				FC607355211DF3B000B17547 /* model */,
+			);
+			path = mobilenet;
+			sourceTree = "<group>";
+		};
 		FCDFD3FC211D9185005AB38B = {
 			isa = PBXGroup;
 			children = (
@@ -97,6 +139,8 @@
 		FCDFD407211D9185005AB38B /* paddle-mobile-unit-test */ = {
 			isa = PBXGroup;
 			children = (
+				FC60734D211DF3B000B17547 /* images */,
+				FC607352211DF3B000B17547 /* models */,
 				FCDFD408211D9185005AB38B /* AppDelegate.swift */,
 				FCDFD40A211D9185005AB38B /* ViewController.swift */,
 				FCDFD40C211D9185005AB38B /* Main.storyboard */,
@@ -168,6 +212,12 @@
 			isa = PBXResourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				FC607427211DF3B100B17547 /* synset.txt in Resources */,
+				FC60742B211DF3B100B17547 /* params in Resources */,
+				FC607428211DF3B100B17547 /* banana.jpeg in Resources */,
+				FC60742A211DF3B100B17547 /* paddle-mobile.png in Resources */,
+				FC607429211DF3B100B17547 /* iphone.JPG in Resources */,
+				FC60742C211DF3B100B17547 /* model in Resources */,
 				FCDFD413211D9187005AB38B /* LaunchScreen.storyboard in Resources */,
 				FCDFD410211D9187005AB38B /* Assets.xcassets in Resources */,
 				FCDFD40E211D9185005AB38B /* Main.storyboard in Resources */,

--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
@@ -13,7 +13,6 @@ class AppDelegate: UIResponder, UIApplicationDelegate {

    var window: UIWindow?

-
    func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool {
        // Override point for customization after application launch.
        return true
@@ -43,4 +42,3 @@ class AppDelegate: UIResponder, UIApplicationDelegate {


 }
-
--- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
+++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
@@ -17,4 +17,3 @@ class ViewController: UIViewController {
    }

 }
-
--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -46,6 +46,20 @@
 		FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D037F20E22FBB000F735A /* FeedOp.swift */; };
 		FC9D038220E2312E000F735A /* FetchOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D038120E2312E000F735A /* FetchOp.swift */; };
 		FC9D038420E23B01000F735A /* Texture.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D038320E23B01000F735A /* Texture.swift */; };
+		FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */; };
+		FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */; };
+		FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */; };
+		FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5C2122F8A100D94F7E /* DepthwiseConvOp.swift */; };
+		FCBCCC5F2122FB3B00D94F7E /* PriorBoxOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5E2122FB3B00D94F7E /* PriorBoxOp.swift */; };
+		FCBCCC612122FBDF00D94F7E /* PriorBoxKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */; };
+		FCBCCC632122FCC000D94F7E /* TransposeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC622122FCC000D94F7E /* TransposeKernel.swift */; };
+		FCBCCC652122FCD700D94F7E /* TransposeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC642122FCD700D94F7E /* TransposeOp.swift */; };
+		FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC66212306B000D94F7E /* ConcatOp.swift */; };
+		FCBCCC69212306D300D94F7E /* ConcatKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC68212306D300D94F7E /* ConcatKernel.swift */; };
+		FCBCCC6B2123071700D94F7E /* BoxcoderOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */; };
+		FCBCCC6D2123073A00D94F7E /* BoxcoderKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */; };
+		FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */; };
+		FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */; };
 		FCD04E6620F314C50007374F /* PoolOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6520F314C50007374F /* PoolOp.swift */; };
 		FCD04E6820F315020007374F /* PoolKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6720F315020007374F /* PoolKernel.swift */; };
 		FCD04E6A20F319EC0007374F /* SoftmaxOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6920F319EC0007374F /* SoftmaxOp.swift */; };
@@ -104,6 +118,20 @@
 		FC9D037F20E22FBB000F735A /* FeedOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FeedOp.swift; sourceTree = "<group>"; };
 		FC9D038120E2312E000F735A /* FetchOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FetchOp.swift; sourceTree = "<group>"; };
 		FC9D038320E23B01000F735A /* Texture.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Texture.swift; sourceTree = "<group>"; };
+		FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DwConvBNReluOp.swift; sourceTree = "<group>"; };
+		FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluOp.swift; sourceTree = "<group>"; };
+		FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluKernel.swift; sourceTree = "<group>"; };
+		FCBCCC5C2122F8A100D94F7E /* DepthwiseConvOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DepthwiseConvOp.swift; sourceTree = "<group>"; };
+		FCBCCC5E2122FB3B00D94F7E /* PriorBoxOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PriorBoxOp.swift; sourceTree = "<group>"; };
+		FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PriorBoxKernel.swift; sourceTree = "<group>"; };
+		FCBCCC622122FCC000D94F7E /* TransposeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TransposeKernel.swift; sourceTree = "<group>"; };
+		FCBCCC642122FCD700D94F7E /* TransposeOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TransposeOp.swift; sourceTree = "<group>"; };
+		FCBCCC66212306B000D94F7E /* ConcatOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConcatOp.swift; sourceTree = "<group>"; };
+		FCBCCC68212306D300D94F7E /* ConcatKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConcatKernel.swift; sourceTree = "<group>"; };
+		FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BoxcoderOp.swift; sourceTree = "<group>"; };
+		FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BoxcoderKernel.swift; sourceTree = "<group>"; };
+		FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MulticlassNMSOp.swift; sourceTree = "<group>"; };
+		FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MulticlassNMSKernel.swift; sourceTree = "<group>"; };
 		FCD04E6520F314C50007374F /* PoolOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolOp.swift; sourceTree = "<group>"; };
 		FCD04E6720F315020007374F /* PoolKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolKernel.swift; sourceTree = "<group>"; };
 		FCD04E6920F319EC0007374F /* SoftmaxOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SoftmaxOp.swift; sourceTree = "<group>"; };
@@ -219,6 +247,14 @@
 				FCD04E6920F319EC0007374F /* SoftmaxOp.swift */,
 				FCD04E6D20F31B4B0007374F /* ReshapeOp.swift */,
 				FCD04E7120F343420007374F /* ConvAddOp.swift */,
+				FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */,
+				FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */,
+				FCBCCC5C2122F8A100D94F7E /* DepthwiseConvOp.swift */,
+				FCBCCC5E2122FB3B00D94F7E /* PriorBoxOp.swift */,
+				FCBCCC642122FCD700D94F7E /* TransposeOp.swift */,
+				FCBCCC66212306B000D94F7E /* ConcatOp.swift */,
+				FCBCCC6A2123071700D94F7E /* BoxcoderOp.swift */,
+				FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */,
 			);
 			path = Operators;
 			sourceTree = "<group>";
@@ -257,6 +293,12 @@
 				FCD04E6B20F31A280007374F /* SoftmaxKernel.swift */,
 				FCD04E6F20F31B720007374F /* ReshapeKernel.swift */,
 				FCD04E7320F3437E0007374F /* ConvAddKernel.swift */,
+				FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */,
+				FCBCCC602122FBDF00D94F7E /* PriorBoxKernel.swift */,
+				FCBCCC622122FCC000D94F7E /* TransposeKernel.swift */,
+				FCBCCC68212306D300D94F7E /* ConcatKernel.swift */,
+				FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */,
+				FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */,
 			);
 			path = Kernels;
 			sourceTree = "<group>";
@@ -375,6 +417,7 @@
 				FC039B9F20E11CB20081E9F8 /* Tensor.swift in Sources */,
 				FC0E2DBC20EE45FE009C1FAC /* ConvKernel.swift in Sources */,
 				FC039BAA20E11CBC0081E9F8 /* ElementwiseAddOp.swift in Sources */,
+				FCBCCC6B2123071700D94F7E /* BoxcoderOp.swift in Sources */,
 				FC039B9B20E11CA00081E9F8 /* Executor.swift in Sources */,
 				FCD04E7020F31B720007374F /* ReshapeKernel.swift in Sources */,
 				FCD04E7220F343420007374F /* ConvAddOp.swift in Sources */,
@@ -383,6 +426,7 @@
 				FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */,
 				FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */,
 				FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */,
+				FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */,
 				FCEBC0F420F1FDD90099DBAF /* ConvAddBatchNormReluOp.swift in Sources */,
 				FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */,
 				FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */,
@@ -393,8 +437,10 @@
 				FC039BB820E11CC20081E9F8 /* framework.pb.swift in Sources */,
 				FC039B9920E11C9A0081E9F8 /* Types.swift in Sources */,
 				FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */,
+				FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */,
 				FC039BA920E11CBC0081E9F8 /* ConvOp.swift in Sources */,
 				FC9D038420E23B01000F735A /* Texture.swift in Sources */,
+				FCBCCC652122FCD700D94F7E /* TransposeOp.swift in Sources */,
 				FCD04E6E20F31B4B0007374F /* ReshapeOp.swift in Sources */,
 				FC039B9820E11C9A0081E9F8 /* Errors.swift in Sources */,
 				FC039BBF20E11CC20081E9F8 /* Attribute.swift in Sources */,
@@ -402,22 +448,32 @@
 				FC039BB920E11CC20081E9F8 /* Scope.swift in Sources */,
 				FCD04E6620F314C50007374F /* PoolOp.swift in Sources */,
 				FC039BAC20E11CBC0081E9F8 /* BatchNormOp.swift in Sources */,
+				FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */,
 				FC039BBC20E11CC20081E9F8 /* VarDesc.swift in Sources */,
+				FCBCCC632122FCC000D94F7E /* TransposeKernel.swift in Sources */,
+				FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */,
 				FCDC0FEB21099A1D00DC9EFB /* Tools.swift in Sources */,
 				FC0E2DBA20EE3B8D009C1FAC /* ReluKernel.swift in Sources */,
+				FCBCCC6D2123073A00D94F7E /* BoxcoderKernel.swift in Sources */,
+				FCBCCC69212306D300D94F7E /* ConcatKernel.swift in Sources */,
 				FC82735920E3C04200BE430A /* OpCreator.swift in Sources */,
+				FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */,
 				FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */,
 				FC039BAB20E11CBC0081E9F8 /* Operator.swift in Sources */,
 				FCD04E6A20F319EC0007374F /* SoftmaxOp.swift in Sources */,
+				FCBCCC612122FBDF00D94F7E /* PriorBoxKernel.swift in Sources */,
+				FCBCCC5F2122FB3B00D94F7E /* PriorBoxOp.swift in Sources */,
 				FC9D038220E2312E000F735A /* FetchOp.swift in Sources */,
 				FC039BBD20E11CC20081E9F8 /* Program.swift in Sources */,
 				FC039BA220E11CB70081E9F8 /* Loader.swift in Sources */,
+				FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */,
 				FCD04E6C20F31A280007374F /* SoftmaxKernel.swift in Sources */,
 				FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */,
 				FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */,
 				FC039BC020E11CC20081E9F8 /* BlockDesc.swift in Sources */,
 				FCD04E6820F315020007374F /* PoolKernel.swift in Sources */,
 				FC039BAD20E11CBC0081E9F8 /* ReluOp.swift in Sources */,
+				FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */,
 				FC039BBE20E11CC20081E9F8 /* OpDesc.swift in Sources */,
 				FC039B9720E11C9A0081E9F8 /* Extensions.swift in Sources */,
 			);

--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift
@@ -43,14 +43,20 @@ class OpCreator<P: PrecisionType> {
        [gConvType                  :     ConvOp<P>.creat,
         gBatchNormType             :     BatchNormOp<P>.creat,
         gReluType                  :     ReluOp<P>.creat,
-         gElementwiseAdd            :     ElementwiseAddOp<P>.creat,
+         gElementwiseAddType        :     ElementwiseAddOp<P>.creat,
         gFeedType                  :     FeedOp<P>.creat,
         gFetchType                 :     FetchOp<P>.creat,
         gConvAddBatchNormReluType  :     ConvAddBatchNormReluOp<P>.creat,
         gPooType                   :     PoolOp<P>.creat,
         gSoftmaxType               :     SoftmaxOp<P>.creat,
         gReshapeType               :     ReshapeOp<P>.creat,
-         gConvAddType               :     ConvAddOp<P>.creat]
+         gConvAddType               :     ConvAddOp<P>.creat,
+         gDepthConvType             :     DepthConvOp<P>.creat,
+         gConcatType                :     ConcatOp<P>.creat,
+         gBoxcoderType              :     BoxcoderOp<P>.creat,
+         gConvBnReluType            :     ConvBNReluOp<P>.creat,
+         gDwConvBnReluType          :     DwConvBNReluOp<P>.creat,
+         gMulticlassNMSType         :     MulticlassNMSOp<P>.creat]
    
    private init(){}
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift
@@ -118,22 +118,37 @@ let gFeedType                   = "feed"
 let gConvType                   = "conv2d"
 let gBatchNormType              = "batch_norm"
 let gReluType                   = "relu"
-let gElementwiseAdd             = "elementwise_add"
+let gElementwiseAddType         = "elementwise_add"
 let gConvAddBatchNormReluType   = "conv_add_batchnorm_relu"
 let gPooType                    = "pool2d"
 let gSoftmaxType                = "softmax"
 let gReshapeType                = "reshape"
 let gConvAddType                = "conv_add"
-
+let gDepthConvType              = "depthwise_conv2d"
+let gPriorBoxType               = "prior_box"
+let gTransposeType              = "transpose"
+let gConcatType                 = "concat"
+let gBoxcoderType               = "box_coder"
+let gMulticlassNMSType          = "multiclass_nms"
+let gConvBnReluType             = "conv_bn_relu"
+let gDwConvBnReluType           = "depth_conv_bn_relu"

 let opInfos = [gConvType                    : (inputs: ["Input"], outputs: ["Output"]),
               gBatchNormType               : (inputs: ["X"], outputs: ["Y"]),
               gReluType                    : (inputs: ["X"], outputs: ["Out"]),
-               gElementwiseAdd              : (inputs: ["X"], outputs: ["Out"]),
+               gElementwiseAddType          : (inputs: ["X"], outputs: ["Out"]),
               gFeedType                    : (inputs: ["X"], outputs: ["Out"]),
               gFetchType                   : (inputs: ["X"], outputs: ["Out"]),
               gConvAddBatchNormReluType    : (inputs: ["Input"], outputs: ["Out"]),
               gPooType                     : (inputs: ["X"], outputs: ["Out"]),
               gSoftmaxType                 : (inputs: ["X"], outputs: ["Out"]),
               gReshapeType                 : (inputs: ["X"], outputs: ["Out"]),
-               gConvAddType                 : (inputs: ["Input"], outputs: ["Out"])]
+               gConvAddType                 : (inputs: ["Input"], outputs: ["Out"]),
+               gDepthConvType               : (inputs: ["Input"], outputs: ["Output"]),
+               gConcatType                  : (inputs: ["X"], outputs: ["Out"]),
+               gBoxcoderType                : (inputs: ["PriorBox", "PriorBoxVar", "TargetBox"], outputs: ["OutputBox"]),
+               gTransposeType               : (inputs: ["X"], outputs: ["Out"]),
+               gConvBnReluType              : (inputs: ["Input"], outputs: ["Out"]),
+               gDwConvBnReluType            : (inputs: ["Input"], outputs: ["Out"]),
+               gMulticlassNMSType           : (inputs: ["BBoxes", "Scores"], outputs: ["Out"]),
+               gPriorBoxType                : (inputs: ["Input", "Image"], outputs: ["Boxes", "Variances"])]
--- a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
+///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. */
+
+import Foundation
+
+/// Parameter bundle consumed by `BoxcoderOp` / `BoxcoderKernel`.
+///
+/// Placeholder: the initializer traps unconditionally, so `input` and
+/// `output` are never populated from the op description.
+class BoxcoderParam<P: PrecisionType>: OpParam {
+    typealias ParamPrecisionType = P
+
+    let input: Texture<P>
+    var output: Texture<P>
+
+    /// Terminates the process via `fatalError()`; neither `opDesc` nor
+    /// `inScope` is read.
+    required init(opDesc: OpDesc, inScope: Scope) throws {
+        fatalError()
+    }
+}
+
+/// Operator that pairs `BoxcoderKernel` with `BoxcoderParam`.
+class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P>>, Runable, Creator, InferShaperable{
+    typealias OpType = BoxcoderOp<P>
+
+    /// Gives the output the same dimensions as the input.
+    func inferShape() {
+        let inputDim = para.input.dim
+        para.output.dim = inputDim
+    }
+
+    /// Encodes the kernel into `buffer`. Any error thrown by the kernel is
+    /// propagated to the caller unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+}
+
+
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
+///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. */
+
+import Foundation
+
+/// Parameter bundle consumed by `ConcatOp` / `ConcatKernel`.
+///
+/// Placeholder: the initializer traps unconditionally, so `input` and
+/// `output` are never populated from the op description.
+class ConcatParam<P: PrecisionType>: OpParam {
+    typealias ParamPrecisionType = P
+
+    let input: Texture<P>
+    var output: Texture<P>
+
+    /// Terminates the process via `fatalError()`; neither `opDesc` nor
+    /// `inScope` is read.
+    required init(opDesc: OpDesc, inScope: Scope) throws {
+        fatalError()
+    }
+}
+
+/// Operator that pairs `ConcatKernel` with `ConcatParam`.
+class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Runable, Creator, InferShaperable{
+    typealias OpType = ConcatOp<P>
+
+    /// Gives the output the same dimensions as the (single) input.
+    func inferShape() {
+        let inputDim = para.input.dim
+        para.output.dim = inputDim
+    }
+
+    /// Encodes the kernel into `buffer`. Any error thrown by the kernel is
+    /// propagated to the caller unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+}
+
+
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
@@ -92,7 +92,7 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer
    static func fusionNode() -> Node {
        let beginNode = Node.init(inType: gConvType)
        _ = beginNode
-            --> Node.init(inType: gElementwiseAdd)
+            --> Node.init(inType: gElementwiseAddType)
            --> Node.init(inType: gBatchNormType)
            --> Node.init(inType: gReluType)
        return beginNode

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
@@ -46,7 +46,7 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
    static func fusionNode() -> Node {
        let beginNode = Node.init(inType: gConvType)
        _ = beginNode
-            --> Node.init(inType: gElementwiseAdd)
+            --> Node.init(inType: gElementwiseAddType)
        return beginNode
    }
    

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Parameters of the fused conv + batch-norm + relu operator.
+///
+/// Everything except `newScale` / `newBiase` is resolved from the op
+/// description and scope in the initializer; those two buffers start out nil
+/// and are assigned by whoever folds the batch-norm statistics.
+class ConvBNReluParam<P: PrecisionType>: OpParam {
+    typealias ParamPrecisionType = P
+
+    // Activation textures.
+    let input: Texture<P>
+    var output: Texture<P>
+
+    // Convolution weights and attributes.
+    let filter: Tensor<ParamPrecisionType>
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
+
+    // Batch-norm tensors and attribute.
+    let variance: Tensor<ParamPrecisionType>
+    let bias: Tensor<ParamPrecisionType>
+    let mean: Tensor<ParamPrecisionType>
+    let scale: Tensor<ParamPrecisionType>
+    let epsilon: Float32
+
+    // Derived buffers, not set by this initializer.
+    var newScale: MTLBuffer?
+    var newBiase: MTLBuffer?
+
+    /// Looks up every input, output and attribute; the first lookup that
+    /// fails throws and aborts initialization.
+    required init(opDesc: OpDesc, inScope: Scope) throws {
+        filter = try ConvBNReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+        input = try ConvBNReluParam.input(inputs: opDesc.inputs, from: inScope)
+        output = try ConvBNReluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+        stride = try ConvBNReluParam.getAttr(key: "strides", attrs: opDesc.attrs)
+        paddings = try ConvBNReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+        dilations = try ConvBNReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+        epsilon = try ConvBNReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs)
+
+        groups = try ConvBNReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
+        variance = try ConvBNReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope)
+        bias = try ConvBNReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope)
+        scale = try ConvBNReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
+        mean = try ConvBNReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
+    }
+}
+
+/// Fused operator matching the graph pattern conv2d -> batch_norm -> relu.
+class ConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+    typealias OpType = ConvBNReluOp<P>
+
+    /// Derives the output dimensions from the input, filter, stride, padding
+    /// and dilation values.
+    ///
+    /// The result is `[input[0], <one extent per stride entry>, filter[0]]`,
+    /// where each extent is `(in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1`.
+    func inferShape() {
+        let inputDim = para.input.dim
+        let kernelDim = para.filter.dim
+
+        var shape = [inputDim[0]]
+        for (axis, rawStride) in para.stride.enumerated() {
+            let step = Int(rawStride)
+            let pad = Int(para.paddings[axis])
+            let dilatedKernel = Int(para.dilations[axis]) * (kernelDim[axis + 1] - 1) + 1
+            shape.append((inputDim[axis + 1] + 2 * pad - dilatedKernel) / step + 1)
+        }
+        shape.append(kernelDim[0])
+        para.output.dim = Dim.init(inDim: shape)
+    }
+
+    /// Encodes the kernel into `buffer`; kernel errors propagate unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+
+    /// Pattern to fuse: a `gConvType` node followed by `gBatchNormType` and
+    /// then `gReluType`.
+    static func fusionNode() -> Node {
+        let head = Node.init(inType: gConvType)
+        _ = head
+            --> Node.init(inType: gBatchNormType)
+            --> Node.init(inType: gReluType)
+        return head
+    }
+
+    /// No key renames are requested for the fused op.
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
+
+    /// Type string of the fused op.
+    static func fusionType() -> String {
+        return gConvBnReluType
+    }
+
+    /// Debug helper: reads the output texture back as an array, hands it to
+    /// `writeToLibrary` under a fixed file name and prints a confirmation.
+    func delogOutput() {
+        let dumped = para.output.metalTexture.floatArray { (value: P) -> P in
+            return value
+        }
+        writeToLibrary(fileName: "output_112x112x32_2", array: dumped)
+        print(" write done")
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Operator for `depthwise_conv2d`; reuses `ConvKernel` and `ConvParam`.
+class DepthConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, Creator, InferShaperable {
+    typealias OpType = DepthConvOp<P>
+
+    /// Forwards to the superclass initializer; its errors propagate unchanged.
+    required init(device: MTLDevice, opDesc: OpDesc, inScope: Scope) throws {
+        try super.init(device: device, opDesc: opDesc, inScope: inScope)
+    }
+
+    /// Derives the output dimensions from the input, filter, stride, padding
+    /// and dilation values.
+    ///
+    /// The result is `[input[0], <one extent per stride entry>, filter[0]]`,
+    /// where each extent is `(in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1`.
+    func inferShape() {
+        let inputDim = para.input.dim
+        let kernelDim = para.filter.dim
+
+        var shape = [inputDim[0]]
+        for (axis, rawStride) in para.stride.enumerated() {
+            let step = Int(rawStride)
+            let pad = Int(para.paddings[axis])
+            let dilatedKernel = Int(para.dilations[axis]) * (kernelDim[axis + 1] - 1) + 1
+            shape.append((inputDim[axis + 1] + 2 * pad - dilatedKernel) / step + 1)
+        }
+        shape.append(kernelDim[0])
+        para.output.dim = Dim.init(inDim: shape)
+    }
+
+    /// Encodes the kernel into `buffer`; kernel errors propagate unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+
+    /// Debug helper: prints a header followed by the output texture object.
+    func delogOutput() {
+        print("conv output : ")
+        print(para.output.metalTexture)
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Fused operator matching the graph pattern
+/// depthwise_conv2d -> batch_norm -> relu.
+///
+/// It shares `ConvBNReluKernel` and `ConvBNReluParam` with `ConvBNReluOp`;
+/// only the fusion pattern and the fused type string differ.
+class DwConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+    // The op type must name this class. It previously pointed at
+    // ConvBNReluOp<P> (copy/paste slip), unlike every sibling operator,
+    // whose OpType is its own class.
+    typealias OpType = DwConvBNReluOp<P>
+
+    /// Derives the output dimensions from the input, filter, stride, padding
+    /// and dilation values.
+    ///
+    /// The result is `[input[0], <one extent per stride entry>, filter[0]]`,
+    /// where each extent is `(in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1`.
+    func inferShape() {
+        let inDims = para.input.dim
+        let filterDim = para.filter.dim
+        let strides = para.stride
+        let paddings = para.paddings
+        let dilations = para.dilations
+
+        var outDim = [inDims[0]]
+        for i in 0..<strides.count {
+            let dilation: Int = Int(dilations[i])
+            let filterSize: Int = filterDim[i + 1]
+            let inputSize: Int = inDims[i + 1]
+            let padding: Int = Int(paddings[i])
+            let stride: Int = Int(strides[i])
+            // Extent covered by the filter once dilation gaps are included.
+            let dKernel = dilation * (filterSize - 1) + 1
+            let outputSize = (inputSize + 2 * padding - dKernel) / stride + 1
+            outDim.append(outputSize)
+        }
+        outDim.append(filterDim[0])
+        para.output.dim = Dim.init(inDim: outDim)
+    }
+
+    /// Encodes the kernel into `buffer`; kernel errors propagate unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+
+    /// Pattern to fuse: a `gDepthConvType` node followed by `gBatchNormType`
+    /// and then `gReluType`.
+    static func fusionNode() -> Node {
+        let beginNode = Node.init(inType: gDepthConvType)
+        _ = beginNode
+            --> Node.init(inType: gBatchNormType)
+            --> Node.init(inType: gReluType)
+        return beginNode
+    }
+
+    /// No key renames are requested for the fused op.
+    static func change() -> [String : [(from: String, to: String)]] {
+        return [:]
+    }
+
+    /// Type string of the fused op.
+    static func fusionType() -> String {
+        return gDwConvBnReluType
+    }
+
+    /// Debug helper: reads the output texture back as an array, hands it to
+    /// `writeToLibrary` under a fixed file name and prints a confirmation.
+    func delogOutput() {
+        let output = para.output.metalTexture.floatArray { (p: P) -> P in
+            return p
+        }
+        writeToLibrary(fileName: "output_112x112x32_2", array: output)
+        print(" write done")
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Compute kernel behind `BoxcoderOp`.
+class BoxcoderKernel<P: PrecisionType>: Kernel, Computable{
+    // NOTE(review): this loads the Metal function named "priorbox", the same
+    // name PriorBoxKernel uses - presumably a placeholder; confirm.
+    required init(device: MTLDevice, param: BoxcoderParam<P>) {
+        super.init(device: device, inFunctionName: "priorbox")
+    }
+
+    /// Encodes one dispatch with the input texture at index 0 and the output
+    /// texture at index 1.
+    ///
+    /// Throws `PaddleMobileError.predictError` when the command buffer cannot
+    /// provide a compute encoder.
+    func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        // Runs when the scope exits, i.e. right after the dispatch below.
+        defer { computeEncoder.endEncoding() }
+
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Compute kernel behind `ConcatOp`.
+class ConcatKernel<P: PrecisionType>: Kernel, Computable{
+    // NOTE(review): this loads the Metal function named "priorbox", the same
+    // name PriorBoxKernel uses - presumably a placeholder; confirm.
+    required init(device: MTLDevice, param: ConcatParam<P>) {
+        super.init(device: device, inFunctionName: "priorbox")
+    }
+
+    /// Encodes one dispatch with the input texture at index 0 and the output
+    /// texture at index 1.
+    ///
+    /// Throws `PaddleMobileError.predictError` when the command buffer cannot
+    /// provide a compute encoder.
+    func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        // Runs when the scope exits, i.e. right after the dispatch below.
+        defer { computeEncoder.endEncoding() }
+
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Raw Metal resources used to drive `ConvBNReluKernel.test` directly,
+/// without going through an op description.
+struct ConvBNReluTestParam: TestParam {
+    let inputTexture: MTLTexture
+    let outputTexture: MTLTexture
+    var metalParam: MetalConvParam
+    let filterBuffer: MTLBuffer
+    let biaseBuffer: MTLBuffer
+    let newScaleBuffer: MTLBuffer
+    let newBiaseBuffer: MTLBuffer
+    let filterSize: (width: Int, height: Int, channel: Int)
+
+    /// Stores every argument in the field of the same name (minus the `in` prefix).
+    init(inInputTexture: MTLTexture,
+         inOutputTexture: MTLTexture,
+         inMetalParam: MetalConvParam,
+         inFilterBuffer: MTLBuffer,
+         inBiaseBuffer: MTLBuffer,
+         inNewScaleBuffer: MTLBuffer,
+         inNewBiaseBuffer: MTLBuffer,
+         inFilterSize: (width: Int, height: Int, channel: Int)) {
+        self.inputTexture = inInputTexture
+        self.outputTexture = inOutputTexture
+        self.metalParam = inMetalParam
+        self.filterBuffer = inFilterBuffer
+        self.biaseBuffer = inBiaseBuffer
+        self.newScaleBuffer = inNewScaleBuffer
+        self.newBiaseBuffer = inNewBiaseBuffer
+        self.filterSize = inFilterSize
+    }
+}
+
+/// Metal kernel wrapper for the fused conv + batch-norm + relu operator.
+///
+/// The Metal function is chosen from the filter geometry. When built from a
+/// `ConvBNReluParam`, the batch-norm statistics are folded once into the
+/// per-element `newScale` / `newBiase` buffers stored on the param.
+class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
+    var metalParam: MetalConvParam!
+
+    /// Picks the Metal function for a filter: the 1x1 variant when width and
+    /// height are both 1, the depthwise 3x3 variant when the filter has a
+    /// single channel, and the regular 3x3 variant otherwise.
+    private static func functionName(width: Int, height: Int, channel: Int) -> String {
+        if width == 1 && height == 1 {
+            return "conv_add_batch_norm_relu_1x1"
+        }
+        if channel == 1 {
+            return "depthwise_conv_add_batch_norm_relu_3x3"
+        }
+        return "conv_add_batch_norm_relu_3x3"
+    }
+
+    /// Test-only initializer: selects the Metal function from
+    /// `testParam.filterSize` and does nothing else.
+    required init(device: MTLDevice, testParam: ConvBNReluTestParam) {
+        let size = testParam.filterSize
+        super.init(device: device, inFunctionName: ConvBNReluKernel.functionName(width: size.width, height: size.height, channel: size.channel))
+    }
+
+    /// Selects the Metal function, creates the tensor buffers, builds
+    /// `metalParam`, and folds batch-norm into `param.newScale` / `param.newBiase`:
+    ///
+    ///     inv      = 1 / sqrt(variance + epsilon)
+    ///     newScale = inv * scale
+    ///     newBiase = bias - mean * inv * scale
+    required init(device: MTLDevice, param: ConvBNReluParam<P>) {
+        super.init(device: device, inFunctionName: ConvBNReluKernel.functionName(width: param.filter.width, height: param.filter.height, channel: param.filter.channel))
+
+        param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
+
+        param.variance.initBuffer(device: device)
+        param.mean.initBuffer(device: device)
+        param.scale.initBuffer(device: device)
+        param.bias.initBuffer(device: device)
+
+        let offsetX = param.filter.width/2 - Int(param.paddings[0])
+        let offsetY = param.filter.height/2 - Int(param.paddings[1])
+
+        print("offset x: \(offsetX)")
+        print("offset y: \(offsetY)")
+
+        let offsetZ = 0.0
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
+
+        // Buffer lengths are byte counts; everything below that indexes or
+        // allocates typed memory works in elements of P.
+        let elementStride = MemoryLayout<P>.stride
+        let scaleBytes = param.scale.buffer.length
+        let biasBytes = param.bias.buffer.length
+        let varianceCount = param.variance.buffer.length / elementStride
+        let elementCount = scaleBytes / elementStride
+
+        var invs: [P] = []
+        invs.reserveCapacity(varianceCount)
+        let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
+        for i in 0..<varianceCount {
+            let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
+            invs.append(P(inv))
+        }
+
+        // Capacity is expressed in elements (the old code passed byte counts,
+        // over-allocating by a factor of the element stride). Round up so the
+        // byte-length copies in makeBuffer below can never read out of bounds,
+        // and make sure the bias scratch can hold every element the loop writes.
+        let scaleCapacity = (scaleBytes + elementStride - 1) / elementStride
+        let biasCapacity = max(elementCount, (biasBytes + elementStride - 1) / elementStride)
+        let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: scaleCapacity)
+        let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: biasCapacity)
+        defer {
+            // Only the first `elementCount` entries were initialized.
+            newScale.deinitialize(count: elementCount)
+            newScale.deallocate()
+            newBiase.deinitialize(count: elementCount)
+            newBiase.deallocate()
+        }
+
+        let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
+        let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
+        let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
+        // NOTE(review): assumes variance, mean and bias hold at least as many
+        // elements as scale - confirm against the model loader.
+        for i in 0..<elementCount {
+            (newScale + i).initialize(to: invs[i] * scaleContents[i])
+            (newBiase + i).initialize(to: biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i])
+        }
+
+        // makeBuffer copies the bytes, so the scratch memory can be released
+        // (by the defer above) as soon as this initializer returns.
+        param.newBiase = device.makeBuffer(bytes: newBiase, length: biasBytes)
+        param.newScale = device.makeBuffer(bytes: newScale, length: scaleBytes)
+    }
+
+    /// Encodes one dispatch: input/output textures at indices 0/1,
+    /// `metalParam` bytes at buffer index 0, the filter at 1 and the folded
+    /// scale / bias at 3 / 4.
+    ///
+    /// Throws `PaddleMobileError.predictError` when the folded buffers were
+    /// never created or when no compute encoder can be obtained.
+    func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam<P>) throws {
+        // Checked before an encoder is opened so that a failure cannot leave
+        // the command buffer with an un-ended encoder.
+        guard let newScale = param.newScale, let newBiase = param.newBiase else {
+            throw PaddleMobileError.predictError(message: " new scale or new biase buffer is nil")
+        }
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        // NOTE(review): buffer index 2 is bound in test(...) but not here -
+        // confirm the selected Metal functions do not read it.
+        encoder.setBuffer(newScale, offset: 0, index: 3)
+        encoder.setBuffer(newBiase, offset: 0, index: 4)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
+    }
+
+    /// Test entry point: encodes one dispatch using the raw resources in
+    /// `param`. Traps if no compute encoder can be obtained.
+    public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            fatalError()
+        }
+
+        encoder.setTexture(param.inputTexture, index: 0)
+        encoder.setTexture(param.outputTexture, index: 1)
+        // setBytes needs an inout value, hence the mutable local copy.
+        var inMetalParam = param.metalParam
+        encoder.setBytes(&inMetalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
+        encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2)
+        encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3)
+        encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4)
+        encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
+        encoder.endEncoding()
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Compute kernel behind `MulticlassNMSOp`.
+class MulticlassNMSKernel<P: PrecisionType>: Kernel, Computable{
+    // NOTE(review): this loads the Metal function named "priorbox", the same
+    // name PriorBoxKernel uses - presumably a placeholder; confirm.
+    required init(device: MTLDevice, param: MulticlassNMSParam<P>) {
+        super.init(device: device, inFunctionName: "priorbox")
+    }
+
+    /// Encodes one dispatch with the input texture at index 0 and the output
+    /// texture at index 1.
+    ///
+    /// Throws `PaddleMobileError.predictError` when the command buffer cannot
+    /// provide a compute encoder.
+    func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        // Runs when the scope exits, i.e. right after the dispatch below.
+        defer { computeEncoder.endEncoding() }
+
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Compute kernel behind the prior-box operator; runs the Metal function
+/// named "priorbox".
+class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
+    required init(device: MTLDevice, param: PriorBoxParam<P>) {
+        super.init(device: device, inFunctionName: "priorbox")
+    }
+
+    /// Encodes one dispatch with the input texture at index 0 and the output
+    /// texture at index 1.
+    ///
+    /// Throws `PaddleMobileError.predictError` when the command buffer cannot
+    /// provide a compute encoder.
+    func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam<P>) throws {
+        guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        // Runs when the scope exits, i.e. right after the dispatch below.
+        defer { computeEncoder.endEncoding() }
+
+        computeEncoder.setTexture(param.input.metalTexture, index: 0)
+        computeEncoder.setTexture(param.output.metalTexture, index: 1)
+        computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Kernel wrapper for the transpose operator.
+class TransposeKernel<P: PrecisionType>: Kernel, Computable{
+    /// Encodes one compute pass on `commandBuffer`: the input texture is bound
+    /// at index 0, the output texture at index 1, and the output texture is
+    /// also handed to `dispatch`. Throws `PaddleMobileError.predictError` when
+    /// no compute encoder can be created.
+    func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
+    }
+    
+    /// Forwards the device and a function name to the `Kernel` base initializer.
+    required init(device: MTLDevice, param: TransposeParam<P>) {
+        // NOTE(review): "priorbox" is the same function name PriorBoxKernel
+        // passes; it looks copied over. Confirm whether a dedicated transpose
+        // function should be named here.
+        super.init(device: device, inFunctionName: "priorbox")
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift
+///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. */
+
+import Foundation
+
+/// Parameter holder for the multiclass NMS operator.
+/// Construction is not implemented: the initializer always traps.
+class MulticlassNMSParam<P: PrecisionType>: OpParam {
+    typealias ParamPrecisionType = P
+
+    let input: Texture<P>
+    var output: Texture<P>
+
+    /// Placeholder initializer; calls `fatalError()` unconditionally, so
+    /// `input` and `output` are never populated.
+    required init(opDesc: OpDesc, inScope: Scope) throws {
+        fatalError()
+    }
+}
+
+/// Operator that pairs `MulticlassNMSKernel` with `MulticlassNMSParam`.
+class MulticlassNMSOp<P: PrecisionType>: Operator<MulticlassNMSKernel<P>, MulticlassNMSParam<P>>, Runable, Creator, InferShaperable {
+    typealias OpType = MulticlassNMSOp<P>
+
+    /// Copies the input dimensions onto the output.
+    func inferShape() {
+        para.output.dim = para.input.dim
+    }
+
+    /// Runs the kernel on `buffer`; an error thrown by `compute` propagates
+    /// to the caller unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+}
+
+
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
+///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. */
+
+import Foundation
+
+/// Parameter holder for the prior-box operator.
+/// Construction is not implemented: the initializer always traps.
+class PriorBoxParam<P: PrecisionType>: OpParam {
+    typealias ParamPrecisionType = P
+
+    let input: Texture<P>
+    var output: Texture<P>
+
+    /// Placeholder initializer; calls `fatalError()` unconditionally, so
+    /// `input` and `output` are never populated.
+    required init(opDesc: OpDesc, inScope: Scope) throws {
+        fatalError()
+    }
+}
+
+/// Operator that pairs `PriorBoxKernel` with `PriorBoxParam`.
+class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>>, Runable, Creator, InferShaperable {
+    typealias OpType = PriorBoxOp<P>
+
+    /// Copies the input dimensions onto the output.
+    func inferShape() {
+        para.output.dim = para.input.dim
+    }
+
+    /// Runs the kernel on `buffer`; an error thrown by `compute` propagates
+    /// to the caller unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+}
+
+
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
+///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License. */
+
+import Foundation
+
+/// Parameter holder for the transpose operator.
+/// Construction is not implemented: the initializer always traps.
+class TransposeParam<P: PrecisionType>: OpParam {
+    typealias ParamPrecisionType = P
+
+    let input: Texture<P>
+    var output: Texture<P>
+
+    /// Placeholder initializer; calls `fatalError()` unconditionally, so
+    /// `input` and `output` are never populated.
+    required init(opDesc: OpDesc, inScope: Scope) throws {
+        fatalError()
+    }
+}
+
+/// Operator that pairs `TransposeKernel` with `TransposeParam`.
+class TransposeOp<P: PrecisionType>: Operator<TransposeKernel<P>, TransposeParam<P>>, Runable, Creator, InferShaperable {
+    typealias OpType = TransposeOp<P>
+
+    /// Copies the input dimensions onto the output.
+    func inferShape() {
+        para.output.dim = para.input.dim
+    }
+
+    /// Runs the kernel on `buffer`; an error thrown by `compute` propagates
+    /// to the caller unchanged.
+    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+        try kernel.compute(commandBuffer: buffer, param: para)
+    }
+}
+
+
+
--- a/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift
@@ -143,7 +143,12 @@ extension Node: Equatable {
 }

 class ProgramOptimize<P: PrecisionType> {
-    let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self, ConvAddOp<P>.self]
+    // register fusion
+    let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
+                                    ConvAddOp<P>.self,
+                                    ConvBNReluOp<P>.self,
+                                    DwConvBNReluOp<P>.self]
+    
    func optimize(originProgramDesc: ProgramDesc) -> ProgramDesc {
        
        guard originProgramDesc.blocks.count == 1 else {

--- a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
@@ -22,7 +22,6 @@ class InputTexture {
        mtlTexture = inMTLTexture
        expectDim = inExpectDim
    }
-    
 }

 extension InputTexture {
@@ -54,19 +53,34 @@ public class Texture<P: PrecisionType>: Tensorial {
        } else if inDim.cout() == 4 {
            tmpTextureDes.height = inDim[1]
            tmpTextureDes.width = inDim[2]
-//            print("n : \(inDim[0])")
-//            print(inDim[3] * inDim[0])
            tmpTextureDes.depth = 1
            tmpTextureDes.arrayLength = (inDim[3] * inDim[0] + 3)/4
            tmpTextureDes.textureType = .type2DArray
        } else if inDim.cout() == 2 {
-            tmpTextureDes.height = 1
-            tmpTextureDes.width = 1
+//            tmpTextureDes.height = 1
+//            tmpTextureDes.width = 1
+//            tmpTextureDes.depth = 1
+//            tmpTextureDes.arrayLength = (inDim[0] * inDim[1] + 3)/4
+
+            tmpTextureDes.width = inDim[0]
+            tmpTextureDes.height = inDim[1]
            tmpTextureDes.depth = 1
-            tmpTextureDes.arrayLength = (inDim[0] * inDim[1] + 3)/4
+            tmpTextureDes.arrayLength = 1
            tmpTextureDes.textureType = .type2DArray
        } else {
-            fatalError(" not suuprt ")
+            /*
+             var name: box_coder_0.tmp_0
+             in var tensor desc dims size: 3
+                var tensor desc dim 0 value: -1
+                var tensor desc dim 1 value: 1917
+                var tensor desc dim 2 value: 4
+             */
+            
+            tmpTextureDes.height = inDim[1]
+            tmpTextureDes.width = inDim[2]
+            tmpTextureDes.depth = 1
+            tmpTextureDes.arrayLength = 1
+            tmpTextureDes.textureType = .type2DArray
        }
        
        if MemoryLayout<P>.size == 1 {
@@ -79,7 +93,7 @@ public class Texture<P: PrecisionType>: Tensorial {

        }
 //        tmpTextureDes.pixelFormat = .rgba16Float
-
+        
        tmpTextureDes.usage = [.shaderRead, .shaderWrite]
        tmpTextureDes.storageMode = .shared
        textureDesc = tmpTextureDes

--- a/metal/paddle-mobile/paddle-mobile/paddle_mobile.h
+++ b/metal/paddle-mobile/paddle-mobile/paddle_mobile.h
@@ -12,6 +12,8 @@
 See the License for the specific language governing permissions and
 limitations under the License. */

+#pragma once
+
 #import <UIKit/UIKit.h>

 //! Project version number for paddle_mobile.
@@ -20,6 +22,4 @@ FOUNDATION_EXPORT double paddle_mobileVersionNumber;
 //! Project version string for paddle_mobile.
 FOUNDATION_EXPORT const unsigned char paddle_mobileVersionString[];

-// In this header, you should import all the public headers of your framework using statements like #import <paddle_mobile/PublicHeader.h>
-

--- a/src/common/common.h
+++ b/src/common/common.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <chrono>
+
+// Time point type produced by std::chrono::high_resolution_clock.
+using Time = std::chrono::high_resolution_clock::time_point;
+
+// Returns the current reading of the high-resolution clock.
+inline Time time() {
+  const Time now = std::chrono::high_resolution_clock::now();
+  return now;
+}
+
+// Returns t2 - t1 in milliseconds. The interval is first truncated to whole
+// microseconds, so the result has microsecond granularity.
+inline double time_diff(Time t1, Time t2) {
+  const auto elapsed_us =
+      std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
+  return elapsed_us.count() / 1000.0;
+}
--- a/src/common/depCore.h
+++ b/src/common/depCore.h
--- a/src/common/enforce.h
+++ b/src/common/enforce.h
@@ -61,7 +61,14 @@ struct PaddleMobileException : public std::exception {
  }
 #else
 #define PADDLE_MOBILE_THROW_EXCEPTION(...)
-#define PADDLE_MOBILE_ENFORCE(stat, ...)
+
+#define PADDLE_MOBILE_ENFORCE(stat, ...) \
+  {                                      \
+    if (stat) {                          \
+    } else {                             \
+    }                                    \
+  }
+
 #endif

 }  // namespace paddle_mobile
--- a/src/common/log.h
+++ b/src/common/log.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <vector>
 #ifdef PADDLE_MOBILE_DEBUG
+#include <cstring>
 #include <iostream>
 #include <sstream>
 #include <string>
@@ -115,26 +116,29 @@ struct ToLog {
  Print printer_;
 };

-#define LOG(level)                                                             \
-  if (level > paddle_mobile::log_level) {                                      \
-  } else                                                                       \
-    paddle_mobile::ToLog(                                                      \
-        level,                                                                 \
-        (std::stringstream()                                                   \
-         << "[file: "                                                          \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
-         << "] [line: " << __LINE__ << "] ")                                   \
-            .str())
-
-#define DLOG                                                                   \
-  if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) {                  \
-  } else                                                                       \
-    paddle_mobile::ToLog(                                                      \
-        paddle_mobile::kLOG_DEBUG,                                             \
-        (std::stringstream()                                                   \
-         << "[file: "                                                          \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
-         << "] [line: " << __LINE__ << "] ")                                   \
+#define LOG(level)                                                           \
+  if (level > paddle_mobile::log_level) {                                    \
+  } else                                                                     \
+    paddle_mobile::ToLog(                                                    \
+        level, static_cast<const std::stringstream &>(                       \
+                   std::stringstream()                                       \
+                   << "[file: "                                              \
+                   << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \
+                                              : __FILE__)                    \
+                   << "] [line: " << __LINE__ << "] ")                       \
+                   .str())
+
+#define DLOG                                                          \
+  if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) {         \
+  } else                                                              \
+    paddle_mobile::ToLog(                                             \
+        paddle_mobile::kLOG_DEBUG,                                    \
+        static_cast<const std::stringstream &>(                       \
+            std::stringstream()                                       \
+            << "[file: "                                              \
+            << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \
+                                       : __FILE__)                    \
+            << "] [line: " << __LINE__ << "] ")                       \
            .str())

 #define LOGF(level, format, ...)          \
@@ -170,7 +174,10 @@ struct ToLog;
 struct Print {
  friend struct ToLog;
  template <typename T>
-  Print &operator<<(T const &value) {}
+  Print &operator<<(T const &value) {
+    // No-op sink: return this object (not a local) so chained << stays valid.
+    return *this;
+  }

 private:
 };

--- a/src/common/type_define.h
+++ b/src/common/type_define.h
@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once;
+#pragma once

+#include <functional>
 #include <map>
 #include <string>
 #include <vector>

--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -17,34 +17,47 @@ limitations under the License. */

 namespace paddle_mobile {

-const std::string G_OP_TYPE_CONV = "conv2d";
-const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
-const std::string G_OP_TYPE_BOX_CODER = "box_coder";
-const std::string G_OP_TYPE_CONCAT = "concat";
-const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
-const std::string G_OP_TYPE_FC = "fc";
-const std::string G_OP_TYPE_CONV_ADD = "conv_add";
-const std::string G_OP_TYPE_LRN = "lrn";
-const std::string G_OP_TYPE_MUL = "mul";
-const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
-const std::string G_OP_TYPE_POOL2D = "pool2d";
-const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
-const std::string G_OP_TYPE_RELU = "relu";
-const std::string G_OP_TYPE_RESHAPE = "reshape";
-const std::string G_OP_TYPE_SIGMOID = "sigmoid";
-const std::string G_OP_TYPE_SOFTMAX = "softmax";
-const std::string G_OP_TYPE_TRANSPOSE = "transpose";
-const std::string G_OP_TYPE_SPLIT = "split";
-const std::string G_OP_TYPE_FEED = "feed";
-const std::string G_OP_TYPE_FETCH = "fetch";
-const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
+const char *G_OP_TYPE_CONV = "conv2d";
+const char *G_OP_TYPE_BATCHNORM = "batch_norm";
+const char *G_OP_TYPE_BOX_CODER = "box_coder";
+const char *G_OP_TYPE_CONCAT = "concat";
+const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
+const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
+const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
+const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
+const char *G_OP_TYPE_FC = "fusion_fc";
+const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
+const char *G_OP_TYPE_LRN = "lrn";
+const char *G_OP_TYPE_MUL = "mul";
+const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
+const char *G_OP_TYPE_POOL2D = "pool2d";
+const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
+const char *G_OP_TYPE_RELU = "relu";
+const char *G_OP_TYPE_RESHAPE = "reshape";
+const char *G_OP_TYPE_SIGMOID = "sigmoid";
+const char *G_OP_TYPE_SOFTMAX = "softmax";
+const char *G_OP_TYPE_TRANSPOSE = "transpose";
+const char *G_OP_TYPE_SPLIT = "split";
+const char *G_OP_TYPE_FEED = "feed";
+const char *G_OP_TYPE_FETCH = "fetch";
+const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
+const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence";
+const char *G_OP_TYPE_DROPOUT = "dropout";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
+const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
+const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
+    "fusion_elementwise_add_relu";
+const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
+const char *G_OP_TYPE_REGION = "region";

 std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
    op_input_output_key = {
        {G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
-        {G_OP_TYPE_CONV_ADD, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_DWCONV_BN_RELU, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_BN_RELU, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
        {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
        {G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
@@ -59,11 +72,19 @@ std::unordered_map<
        {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}},
        {G_OP_TYPE_BOX_CODER,
         {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}},
+        {G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
        {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
        {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
        {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
        {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
        {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
-        {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}};
+        {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_ADD_BN, {{"Input"}, {"Y"}}},
+        {G_OP_TYPE_FUSION_POOL_BN, {{"X"}, {"Y"}}},
+        {G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU, {{"X", "Y"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_FC_RELU, {{"X", "Y", "Z"}, {"Out"}}},
+        {G_OP_TYPE_REGION, {{"X"}, {"Out"}}}};

 }  // namespace paddle_mobile
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -12,13 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once;
+#pragma once

 #include <string>
 #include <unordered_map>
+#include <utility>
+#include <vector>

 namespace paddle_mobile {
-enum class Precision : int { FP32 = 0 };
+enum class Precision : int { FP32 = 0, FP16 = 1 };
+
+typedef int16_t half;

 template <Precision p>
 struct PrecisionTrait {
@@ -29,6 +33,10 @@ template <>
 struct PrecisionTrait<Precision::FP32> {
  typedef float ptype;
 };
+template <>
+struct PrecisionTrait<Precision::FP16> {
+  typedef half ptype;
+};

 //! device type
 enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
@@ -71,28 +79,40 @@ enum PMStatus {
  PMWrongDevice = 0x08     /*!< un-correct device. */
 };

-extern const std::string G_OP_TYPE_CONV;
-extern const std::string G_OP_TYPE_BATCHNORM;
-extern const std::string G_OP_TYPE_BOX_CODER;
-extern const std::string G_OP_TYPE_CONCAT;
-extern const std::string G_OP_TYPE_ELEMENTWISE_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
-extern const std::string G_OP_TYPE_FC;
-extern const std::string G_OP_TYPE_CONV_ADD;
-extern const std::string G_OP_TYPE_LRN;
-extern const std::string G_OP_TYPE_MUL;
-extern const std::string G_OP_TYPE_MULTICLASS_NMS;
-extern const std::string G_OP_TYPE_POOL2D;
-extern const std::string G_OP_TYPE_PRIOR_BOX;
-extern const std::string G_OP_TYPE_RELU;
-extern const std::string G_OP_TYPE_RESHAPE;
-extern const std::string G_OP_TYPE_SIGMOID;
-extern const std::string G_OP_TYPE_SOFTMAX;
-extern const std::string G_OP_TYPE_TRANSPOSE;
-extern const std::string G_OP_TYPE_SPLIT;
-extern const std::string G_OP_TYPE_FEED;
-extern const std::string G_OP_TYPE_FETCH;
-extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
+extern const char *G_OP_TYPE_CONV;
+extern const char *G_OP_TYPE_BATCHNORM;
+extern const char *G_OP_TYPE_BOX_CODER;
+extern const char *G_OP_TYPE_CONCAT;
+extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
+extern const char *G_OP_TYPE_FC;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
+
+extern const char *G_OP_TYPE_LRN;
+extern const char *G_OP_TYPE_MUL;
+extern const char *G_OP_TYPE_MULTICLASS_NMS;
+extern const char *G_OP_TYPE_POOL2D;
+extern const char *G_OP_TYPE_PRIOR_BOX;
+extern const char *G_OP_TYPE_RELU;
+extern const char *G_OP_TYPE_RESHAPE;
+extern const char *G_OP_TYPE_SIGMOID;
+extern const char *G_OP_TYPE_SOFTMAX;
+extern const char *G_OP_TYPE_TRANSPOSE;
+extern const char *G_OP_TYPE_SPLIT;
+extern const char *G_OP_TYPE_FEED;
+extern const char *G_OP_TYPE_FETCH;
+extern const char *G_OP_TYPE_DEPTHWISE_CONV;
+extern const char *G_OP_TYPE_IM2SEQUENCE;
+extern const char *G_OP_TYPE_DROPOUT;
+
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN;
+extern const char *G_OP_TYPE_FUSION_POOL_BN;
+extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
+extern const char *G_OP_TYPE_FUSION_FC_RELU;
+extern const char *G_OP_TYPE_REGION;

 extern std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>

--- a/src/common/variant.h
+++ b/src/common/variant.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <cstdlib>
+
 #include "common/enforce.h"
 #include "common/log.h"

@@ -82,7 +84,8 @@ struct Variant {
    if (type_id == typeid(T).hash_code()) {
      return *const_cast<T *>(reinterpret_cast<const T *>(&data));
    } else {
-      PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant ");
+      PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");
+      exit(0);
    }
  }


--- a/src/fpga/api/fpga_api.cpp
+++ b/src/fpga/api/fpga_api.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <map>
+
+#include "fpga/api/fpga_api.h"
+
+namespace paddle_mobile {
+namespace fpga {
+
+static int fd = -1;
+static const char *device_path = "/dev/fpgadrv0";
+
+// Issues one ioctl request against the cached FPGA device descriptor `fd`.
+// `req` is the request code and `arg` is passed through unchanged as the
+// request's argument pointer. Returns ioctl()'s result; if the device was
+// never opened, `fd` is still -1 and the call fails.
+static inline int do_ioctl(int req, const void *arg) {
+  return ioctl(fd, req, arg);
+}
+
+// Opens the FPGA device node on first use and caches the descriptor in `fd`;
+// later calls return the cached value. Returns -1 if open() failed.
+int open_device() {
+  if (fd == -1) {
+    fd = open(device_path, O_RDWR);
+  }
+  return fd;
+}
+
+// Closes the cached descriptor, if any, and resets it to -1 so that
+// open_device() can reopen the device. Returns 0 when nothing was open,
+// otherwise close()'s result. fpga_api.h declares this function, so it needs
+// a definition for callers to link.
+int close_device() {
+  if (fd == -1) {
+    return 0;
+  }
+  const int status = close(fd);
+  fd = -1;
+  return status;
+}
+
+// memory management;
+// Length of every live mapping handed out by fpga_malloc, keyed by its base
+// address, so that fpga_free can pass the real length to munmap().
+// NOTE(review): no locking here; confirm whether callers use these functions
+// from more than one thread.
+static std::map<void *, size_t> mapped_sizes;
+
+// Maps `size` bytes of the FPGA device (shared, read/write, offset 0).
+// Returns the base address of the mapping, or nullptr if mmap64() failed.
+void *fpga_malloc(size_t size) {
+  void *ptr = mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  if (ptr == MAP_FAILED) {
+    return nullptr;
+  }
+  mapped_sizes[ptr] = size;
+  return ptr;
+}
+
+// Unmaps a region previously returned by fpga_malloc. Pointers this module
+// did not hand out (including nullptr) are ignored. munmap() rejects a zero
+// length, so the recorded mapping size is used.
+void fpga_free(void *ptr) {
+  auto entry = mapped_sizes.find(ptr);
+  if (entry == mapped_sizes.end()) {
+    return;
+  }
+  munmap(ptr, entry->second);
+  mapped_sizes.erase(entry);
+}
+
+// Copies `num` bytes from `src` to `dest` using memcpy.
+void fpga_copy(void *dest, const void *src, size_t num) {
+  std::memcpy(dest, src, num);
+}
+
+// Thin wrappers: each passes the address of its argument struct to do_ioctl()
+// with the matching IOCTL_CONFIG_* request code and returns that result.
+
+// Sends `args` with the IOCTL_CONFIG_CONV request.
+int ComputeFpgaConv(const struct ConvArgs &args) {
+  const int status = do_ioctl(IOCTL_CONFIG_CONV, &args);
+  return status;
+}
+
+// Sends `args` with the IOCTL_CONFIG_POOLING request.
+int ComputeFpgaPool(const struct PoolingArgs &args) {
+  const int status = do_ioctl(IOCTL_CONFIG_POOLING, &args);
+  return status;
+}
+
+// Sends `args` with the IOCTL_CONFIG_EW request.
+int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
+  const int status = do_ioctl(IOCTL_CONFIG_EW, &args);
+  return status;
+}
+
+// Sends `args` with the IOCTL_CONFIG_BYPASS request.
+int PerformBypass(const struct BypassArgs &args) {
+  const int status = do_ioctl(IOCTL_CONFIG_BYPASS, &args);
+  return status;
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/api/fpga_api.h
+++ b/src/fpga/api/fpga_api.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <cstddef>
+#include <iostream>
+#include <limits>
+
+// memory management;
+
+namespace paddle_mobile {
+namespace fpga {
+
+int open_device();
+int close_device();
+
+void* fpga_malloc(size_t size);
+void fpga_free(void* ptr);
+void fpga_copy(void* dst, const void* src, size_t num);
+
+// Element-type conversion selector; used as BypassArgs::convert_type.
+enum DataConvertType {
+  DATA_NO_CONVERT = 0,
+  DATA_FP32_TO_FP16 = 1,
+  DATA_FP16_TO_FP32 = 2,
+};
+
+// Memory-layout conversion selector; used as BypassArgs::layout_type.
+enum LayoutConvertType {
+  LAYOUT_NO_CONVERT = 0,
+  LAYOUT_CHW_TO_HWC = 1,
+  LAYOUT_HWC_TO_CHW = 2,
+};
+
+// Argument type named by the IOCTL_VERSION request.
+struct VersionArgs {
+  void* buffer;
+};
+
+// Argument type named by the IOCTL_MEM_COPY request.
+struct MemoryCopyArgs {
+  void* src;
+  void* dest;
+  size_t size;
+};
+
+// NOTE(review): no ioctl macro in this header names BNArgs; presumably it
+// carries batch-norm bias/scale buffers — confirm where it is consumed.
+struct BNArgs {
+  bool enabled;
+  void* bias_address;
+  void* scale_address;
+};
+
+/**
+Conv and Pooling kernel: window size and strides. Embedded in ConvArgs and
+PoolingArgs as their `kernel` member.
+*/
+struct KernelArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t stride_w;
+  uint32_t stride_h;
+};
+
+// Description of one input feature map; embedded in ConvArgs, PoolingArgs,
+// EWAddArgs and BypassArgs.
+struct ImageInputArgs {
+  void* address;         // input featuremap virtual address
+  float* scale_address;  // input scale address;
+  uint32_t channels;
+  uint32_t width;  // featuremap width
+  uint32_t height;
+  uint32_t pad_width;  // padding width;
+  uint32_t pad_height;
+};
+
+// Destination of an operation's result; embedded in ConvArgs, PoolingArgs,
+// EWAddArgs and BypassArgs.
+struct ImageOutputArgs {
+  void* address;         // output result address;
+  float* scale_address;  // output scale address;
+};
+
+// Argument type named by the IOCTL_CONFIG_CONV request; passed by
+// ComputeFpgaConv().
+struct ConvArgs {
+  bool relu_enabled;
+  void* sb_address;  // scale and bias are interlaced;
+  void* filter_address;
+  float* filter_scale_address;
+  uint32_t filter_num;
+  uint32_t group_num;
+
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+
+// Argument type named by the IOCTL_CONFIG_POOLING request; passed by
+// ComputeFpgaPool().
+struct PoolingArgs {
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
+};
+
+// elementwise add arguments
+// Argument type named by the IOCTL_CONFIG_EW request; passed by
+// ComputeFpgaEWAdd().
+struct EWAddArgs {
+  bool relu_enabled;
+
+  float const0;  // output0 = const0 x input0 + const1 x input1;
+  float const1;
+  struct ImageInputArgs image0;
+  struct ImageInputArgs image1;
+  struct ImageOutputArgs output;
+};
+
+// Argument type named by the IOCTL_CONFIG_BYPASS request; passed by
+// PerformBypass(). Selects an element-type and a layout conversion.
+struct BypassArgs {
+  enum DataConvertType convert_type;
+  enum LayoutConvertType layout_type;
+  struct ImageInputArgs image;
+  struct ImageOutputArgs output;
+};
+
+// Argument type named by the IOCTL_FPGA_REG_WRITE request.
+struct FpgaRegWriteArgs {
+  uint64_t address;  //
+  uint64_t value;
+};
+
+// ioctl request codes, built with _IOW from a magic value, a sequence number
+// and the argument struct type.
+// NOTE(review): _IOW is not provided by any header included in this file
+// (stdint.h, cstddef, iostream, limits); expanding these macros relies on the
+// including translation unit having pulled in <sys/ioctl.h> first.
+// NOTE(review): 'FPGA' is a multi-character constant, whose value is
+// implementation-defined — confirm it matches the value the driver expects.
+#define IOCTL_FPGA_MAGIC 'FPGA'
+
+#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
+
+#define IOCTL_SEPARATOR_0 10
+
+#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
+
+#define IOCTL_SEPARATOR_1 20
+
+#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
+#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
+#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
+#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs)
+// NOTE(review): struct FpgaRegReadArgs is not defined in this header, so
+// IOCTL_FPGA_REG_READ cannot be expanded until that type is declared.
+#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
+#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
+
+// Negative FPGA status codes.
+// NOTE(review): presumably these are the values the driver reports through
+// the ioctl-based calls declared below — confirm against the driver source.
+enum FPGA_ERR_TYPE {
+  ERR_IOCTL_CMD = -1,
+  ERR_TIMEOUT = -2,
+  ERR_COMPLETION_TIMEOUT = -3,
+  ERR_INVALID_FPGA_ADDR = -4,
+  ERR_NOMEM = -5,
+  ERR_NO_RESERVE_MEM = -6,
+  ERR_COPY_FROM_USER = -7,
+  ERR_COPY_TO_USER = -8,
+  ERR_DEL_TIMER = -9,
+  ERR_ENABLE_MSI = -10,
+  ERR_REGISTER_IRQ = -11,
+  ERR_PCIE_REGISTER = -12,
+  ERR_PCIE_PROBE = -13,
+  ERR_REGISTER_BLOCK = -14,
+  ERR_ALLOC_GENDISK = -15,
+  ERR_INIT_QUEUE = -16,
+  ERR_WAIT = -17,
+  ERR_ECC_ERROR = -31,
+  ERR_FPGA_FAIL_STOP = -64,
+  ERR_FPGA_DEBUG_STOP = -113,
+  DEV_TMP_UNAVAILABLE = -128
+};
+
+//============================== API =============================
+
+int PerformBypass(const struct BypassArgs& args);
+int ComputeFpgaConv(const struct ConvArgs& args);
+int ComputeFpgaPool(const struct PoolingArgs& args);
+int ComputeFpgaEWAdd(const struct EWAddArgs& args);
+
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/fpga_quantilization.cpp
+++ b/src/fpga/fpga_quantilization.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/fpga_quantilization.h"
+#include <algorithm>
+
+namespace paddle_mobile {
+namespace fpga {
+
+// Reorders `num` images from channel-major (CHW) to channel-last (HWC).
+//   data_in  : num * channel * height * width elements, read sequentially in
+//              N, C, H, W order.
+//   data_out : buffer of the same element count, written in N, H, W, C order;
+//              must not alias data_in.
+template <typename Dtype>
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
+                       int height, int width) {
+  const int amount_per_row = width * channel;
+  const int amount_per_image = height * amount_per_row;
+
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        const int offset_height = h * amount_per_row;
+        for (int w = 0; w < width; w++) {
+          *(data_out + offset_height + w * channel + c) = *(data_in++);
+        }
+      }
+    }
+    // Step to the next image. One image holds channel * height * width
+    // elements; advancing by `num` would make consecutive images overlap.
+    data_out += amount_per_image;
+  }
+}
+
+// Returns the largest absolute value among the first `num` elements of
+// `data`, or 0 when `num` is 0. The quantization range must cover negative
+// values as well, so the magnitude is compared rather than the signed value.
+// For a signed integer Dtype, the most negative value cannot be negated
+// within Dtype; it is not reported as the maximum.
+template <typename Dtype>
+static Dtype find_max(Dtype* data, int num) {
+  Dtype max = 0;
+  for (int i = 0; i < num; ++i) {
+    const Dtype value = data[i];
+    const Dtype magnitude = value < 0 ? static_cast<Dtype>(-value) : value;
+    max = std::max(max, magnitude);
+  }
+  return max;
+}
+
+// template <typename Dtype>
+// Produces an 8-bit copy of a 4-D filter, reordered from NCHW to NHWC, and
+// stores the quantization scale through fpga_args().scale_pointer().
+//
+// The four dims are read as batch, channel, height, width.
+//   - float input: scale = max / 127. Each value is divided by the scale,
+//     clamped to [-127, 127] and truncated to int8. The result is written to
+//     a newly allocated tensor, which is returned; the caller still owns the
+//     float tensor.
+//   - any other input is treated as int8 and reordered in place; `filter`
+//     itself is returned.
+//
+// NOTE(review): assumes Tensor::Resize(dims) is available to size the new
+// tensor before mutable_data<int8_t>() — confirm against framework/tensor.h.
+// NOTE(review): for float input the scale is stored on the returned
+// (quantized) tensor — confirm a freshly constructed tensor's scale_pointer()
+// is valid.
+framework::Tensor* quantify_filter(framework::Tensor* filter) {
+  float scale = 0;
+  float fix_range = static_cast<float>((1 << (8 - 1)) - 1);
+
+  const int batch_size = filter->dims()[0];
+  const int channel = filter->dims()[1];
+  const int height = filter->dims()[2];
+  const int width = filter->dims()[3];
+  const auto count = filter->numel();
+
+  int8_t* int_data = nullptr;
+  int8_t* tmp_data = new int8_t[count];
+
+  // 32bit filter -> 8bit filter;
+  if (filter->type() == typeid(float)) {
+    float* float_data = filter->data<float>();
+    float max = find_max(float_data, count);
+
+    scale = (max / fix_range);
+
+    framework::Tensor* quant_filter = new framework::Tensor();
+    quant_filter->Resize(filter->dims());
+
+    int_data = quant_filter->mutable_data<int8_t>();
+    for (int i = 0; i < count; ++i) {
+      // Divide first, then narrow: casting before scaling would truncate the
+      // float to an integer and lose the value. A zero scale (all-zero data)
+      // maps everything to 0 instead of dividing by zero.
+      float scaled = scale > 0 ? float_data[i] / scale : 0.0f;
+      scaled = std::min(fix_range, std::max(-fix_range, scaled));
+      tmp_data[i] = static_cast<int8_t>(scaled);
+    }
+    filter = quant_filter;
+  } else {
+    int8_t max = find_max(filter->data<int8_t>(), count);
+    scale = (max / fix_range);
+
+    int_data = filter->data<int8_t>();
+    for (int i = 0; i < count; ++i) {
+      tmp_data[i] = int_data[i];
+    }
+    int_data = filter->mutable_data<int8_t>();
+  }
+  // NCHW -> NHWC;
+  chw_to_hwc<int8_t>(tmp_data, int_data, batch_size, channel, height, width);
+  delete[] tmp_data;  // allocated with new[]
+  *(filter->fpga_args().scale_pointer()) = scale;
+  return filter;
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/fpga/fpga_quantilization.h
+++ b/src/fpga/fpga_quantilization.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "common/types.h"
+#include "framework/lod_tensor.h"
+#include "framework/tensor.h"
+
+namespace paddle_mobile {
+namespace fpga {
+
+template <typename Dtype>
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
+                       int height, int width);
+
+// template <typename Dtype>
+framework::Tensor* quantify_filter(framework::Tensor* filter);
+}  // namespace fpga
+}  // namespace paddle_mobile
--- a/src/framework/attribute.h
+++ b/src/framework/attribute.h
@@ -14,7 +14,9 @@ limitations under the License. */

 #pragma once

+#include <cstdlib>
 #include <string>
+#include <typeinfo>
 #include <unordered_map>
 #include <vector>

@@ -128,6 +130,7 @@ class Attribute {
      return vistor(attr.variant_.Get<int64_t>());
    } else {
      PADDLE_MOBILE_THROW_EXCEPTION("type not support");
+      exit(0);
    }
  }


--- a/src/framework/data_layout.h
+++ b/src/framework/data_layout.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <cctype>
+#include <cstdlib>
 #include <string>

 namespace paddle_mobile {
@@ -40,6 +41,7 @@ inline DataLayout StringToDataLayout(const std::string &str) {
    return DataLayout::kAnyLayout;
  } else {
    PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
+    exit(0);
  }
 }

@@ -52,6 +54,8 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) {
    case DataLayout::kAnyLayout:
      return "ANY_LAYOUT";
    default:
+      PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string ")
+      exit(0);
      break;
  }
 }

--- a/src/framework/ddim.h
+++ b/src/framework/ddim.h
@@ -14,8 +14,11 @@ limitations under the License. */

 #pragma once

+#include <cstdlib>
 #include <initializer_list>
+#include <typeinfo>
 #include <vector>
+
 #include "common/enforce.h"
 #include "common/variant.h"
 #include "dim.h"
@@ -57,7 +60,8 @@ struct DDim {
    } else if (d.var.TypeId() == typeid(Dim<9>).hash_code()) {
      return vistor(d.var.Get<Dim<9>>());
    } else {
-      DLOG << " dim not support";
+      PADDLE_MOBILE_ENFORCE(false, " dim not support");
+      exit(0);
    }
  }


--- a/src/framework/dim.h
+++ b/src/framework/dim.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <cstdlib>
 #include "common/enforce.h"
 namespace paddle_mobile {
 namespace framework {
@@ -129,6 +130,7 @@ int64_t &indexer(Dim<D> &dim, int idx) {
 template <>
 int64_t &indexer<0>(Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
+  exit(EXIT_FAILURE);  // error path: never report success to the caller
 }

 template <int D>
@@ -145,6 +147,7 @@ int64_t indexer(const Dim<D> &dim, int idx) {
 template <>
 int64_t indexer<0>(const Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
+  exit(EXIT_FAILURE);  // error path: never report success to the caller
 }

 }  // namespace

--- a/src/framework/lod_tensor.cpp
+++ b/src/framework/lod_tensor.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "lod_tensor.h"
+#include <algorithm>

 namespace paddle_mobile {
 namespace framework {

--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -28,6 +28,16 @@ vector<string> OperatorBase<Dtype>::GetOutKeys() const {
  return it->second.second;
 }

+// Returns the input keys registered for this operator's type, or an empty
+// list (after a debug log) when the type has no registered entry.
+template <typename Dtype>
+vector<string> OperatorBase<Dtype>::GetInputKeys() const {
+  const auto it = op_input_output_key.find(type_);
+  if (it != op_input_output_key.end()) return it->second.first;
+  DLOG << type_ << " has no inputs";
+  return {};
+}
+
 template <typename Dtype>
 OperatorBase<Dtype>::OperatorBase(const std::string &type,
                                  const VariableNameMap &inputs,
@@ -49,6 +59,11 @@ template <typename Dtype>
 void OperatorBase<Dtype>::Run() const {
  RunImpl();
 #ifdef PADDLE_MOBILE_DEBUG
+  vector<string> input_keys = GetInputKeys();
+  for (const auto key : input_keys) {
+    Tensor *input = GetVarValue<framework::LoDTensor>(key, inputs_, *scope_);
+    DLOG << type_ << " input- " << key << "=" << *input;
+  }
  vector<string> output_keys = GetOutKeys();
  for (const auto key : output_keys) {
    Tensor *out_ = GetVarValue<framework::LoDTensor>(key, outputs_, *scope_);

--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -61,8 +61,10 @@ class OperatorBase {
  virtual ~OperatorBase() {}
  void Run() const;
  std::vector<string> GetOutKeys() const;
+  std::vector<string> GetInputKeys() const;
  virtual void RunImpl() const = 0;

+  virtual void Init() = 0;
  /*
   * @b op 运算所需的输入, 如上一层的输出结果、卷积核
   * */
@@ -110,15 +112,21 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
                     const VariableNameMap &outputs, const AttributeMap &attrs,
                     std::shared_ptr<Scope> scope)
      : OperatorBase<Dtype>(type, inputs, outputs, attrs, scope),
-        param_(inputs, outputs, attrs, *scope) {
-    PADDLE_MOBILE_ENFORCE(kernel_.Init(param_), "  %s kernel init failed",
-                          this->type_.c_str());
-  }
+        param_(inputs, outputs, attrs, *scope) {}

  virtual void RunImpl() const { this->kernel_.Compute(this->param_); }

  virtual void InferShape() const = 0;

+  // Initializes the kernel with this operator's parameters. The call is made
+  // outside the macro so it runs even if the macro skips its condition.
+  void Init() {
+    const bool kernel_ready = kernel_.Init(&param_);
+    (void)kernel_ready;  // keeps the variable "used" if the macro is empty
+    PADDLE_MOBILE_ENFORCE(kernel_ready, "  %s kernel init failed",
+                          this->type_.c_str());
+  }
+
 protected:
  KernelType kernel_;
  ParamType param_;
@@ -135,9 +143,21 @@ class OpKernelBase {
   * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体,
   *    所有结构体存在与: paddle-mobile/src/operators/op_param.h
   * */
+#ifdef PADDLE_MOBILE_MALI_GPU
+  OpKernelBase() { acl_op_ = nullptr; }
+  void *GetAclOp() const { return acl_op_; }
+  void SetAclOp(void *op, void *ob) const {
+    reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
+  }
+#endif
  virtual void Compute(const P &para) const = 0;
-  virtual bool Init(const P &para) const { return true; };
+  virtual bool Init(P *para) { return true; }
  virtual ~OpKernelBase() = default;
+
+ private:
+#ifdef PADDLE_MOBILE_MALI_GPU
+  void *acl_op_;
+#endif
 };

 #define DEFINE_OP_CONSTRUCTOR(cls, parent_cls)                                 \

--- a/src/framework/program/block_desc.cpp
+++ b/src/framework/program/block_desc.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "block_desc.h"
+#include <algorithm>

 namespace paddle_mobile {
 namespace framework {

--- a/src/framework/program/program-optimize/fusion_op_register.h
+++ b/src/framework/program/program-optimize/fusion_op_register.h
@@ -14,11 +14,13 @@ limitations under the License. */

 #pragma once

+#include <algorithm>
 #include <map>
 #include <string>
+#include <vector>

 #include "framework/operator.h"
-#include "node.h"
+#include "framework/program/program-optimize/node.h"

 namespace paddle_mobile {
 namespace framework {
@@ -34,12 +36,25 @@ class FusionOpRegister {
  }

  void regist(FusionOpMatcher* matcher) {
+    if (matchers_.find(matcher->Type()) != matchers_.end()) {
+      return;
+    }
+
    std::shared_ptr<FusionOpMatcher> shared_matcher(matcher);
    matchers_[matcher->Type()] = shared_matcher;
  }

-  const std::map<std::string, std::shared_ptr<FusionOpMatcher>> Matchers() {
-    return matchers_;
+  // Matchers ordered by descending begin-node depth (ties keep key order).
+  const std::vector<std::shared_ptr<FusionOpMatcher>> Matchers() {
+    std::vector<std::shared_ptr<FusionOpMatcher>> matchers;
+    for (const auto& match : matchers_) matchers.push_back(match.second);
+    std::stable_sort(
+        matchers.begin(), matchers.end(),
+        [](const std::shared_ptr<FusionOpMatcher>& first,
+           const std::shared_ptr<FusionOpMatcher>& second) {
+          return first->BeginNode().Depth() > second->BeginNode().Depth();
+        });
+    return matchers;
   }

 private:

--- a/src/framework/program/program-optimize/node.cpp
+++ b/src/framework/program/program-optimize/node.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "framework/program/program-optimize/node.h"
+#include <algorithm>
 #include "framework/operator.h"

 namespace paddle_mobile {
@@ -43,23 +44,6 @@ bool Node::operator==(const Node &in) {
  return true;
 }

-std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(int size) {
-  std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
-  OpDescs(size - 1, &op_descs);
-  return op_descs;
-}
-
-void Node::OpDescs(int index,
-                   std::vector<std::shared_ptr<framework::OpDesc>> *op_desc) {
-  if (index == 0) {
-    return;
-  }
-  op_desc->push_back(this->op_desc_);
-  for (auto &output : outputs_) {
-    output->OpDescs(index, op_desc);
-  }
-}
-
 std::shared_ptr<Node> Node::To(int size) {
  std::shared_ptr<Node> node = std::make_shared<Node>();
  this->To(size - 1, node);
@@ -92,7 +76,8 @@ int Node::Depth(int begin) {

 Node &Node::Folder(
    int size, std::string type,
-    std::map<std::string, std::pair<std::string, std::string>> change,
+    std::map<std::string, std::vector<std::pair<std::string, std::string>>>
+        change,
    std::vector<std::shared_ptr<Node>> *removed_nodes) {
  std::shared_ptr<framework::OpDesc> op_desc =
      std::make_shared<framework::OpDesc>();
@@ -109,12 +94,15 @@ Node &Node::Folder(
 void Node::Folder(
    std::shared_ptr<framework::OpDesc> op_desc,
    std::vector<std::shared_ptr<Node>> *outputs, int index,
-    std::map<std::string, std::pair<std::string, std::string>> *change,
+    std::map<std::string, std::vector<std::pair<std::string, std::string>>>
+        *change,
    Node *begin_node, std::vector<std::shared_ptr<Node>> *removed_nodes) {
  if (change->find(this->type_) != change->end()) {
-    auto change_pair = (*change)[this->type_];
-    op_desc->GetInputs()[change_pair.second] =
-        this->op_desc_->GetInputs()[change_pair.first];
+    auto change_pairs = (*change)[this->type_];
+    for (const auto &change_pair : change_pairs) {
+      op_desc->GetInputs()[change_pair.second] =
+          this->op_desc_->GetInputs()[change_pair.first];
+    }
  }

  for (auto &attr_pair : this->op_desc_->attrs_) {

--- a/src/framework/program/program-optimize/node.h
+++ b/src/framework/program/program-optimize/node.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <cinttypes>
 #include <map>
 #include <string>
+#include <utility>
 #include <vector>
 #include "common/log.h"
 #include "framework/program/op_desc.h"
@@ -43,20 +44,19 @@ class Node {
  int Depth(int begin = 0);
  Node &Folder(
      int size, std::string type,
-      std::map<std::string, std::pair<std::string, std::string>> change_map,
+      std::map<std::string, std::vector<std::pair<std::string, std::string>>>
+          change,
      std::vector<std::shared_ptr<Node>> *removed_nodes);
-  std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(int size);
  std::shared_ptr<framework::OpDesc> OpDescOfNode() { return op_desc_; }
  std::string Type() { return type_; }

 private:
-  void OpDescs(int size,
-               std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
  void To(int index, std::shared_ptr<Node>);
  void Folder(
      std::shared_ptr<framework::OpDesc> op_desc,
      std::vector<std::shared_ptr<Node>> *outputs, int index,
-      std::map<std::string, std::pair<std::string, std::string>> *change,
+      std::map<std::string, std::vector<std::pair<std::string, std::string>>>
+          *change,
      Node *begin_node, std::vector<std::shared_ptr<Node>> *removed_nodes);
  std::shared_ptr<framework::OpDesc> op_desc_;
 #ifdef PADDLE_MOBILE_DEBUG

--- a/src/framework/program/program-optimize/program_optimize.cpp
+++ b/src/framework/program/program-optimize/program_optimize.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "framework/program/program-optimize/program_optimize.h"
+#include <algorithm>
 #include "framework/program/program-optimize/fusion_op_register.h"

 namespace paddle_mobile {
@@ -77,9 +78,8 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
    }

    for (auto &registed : FusionOpRegister::Instance()->Matchers()) {
-      std::string fusion_type = registed.first;
-      std::shared_ptr<FusionOpMatcher> matcher = registed.second;
-      //      DLOG << " registed node \n " << matcher->BeginNode();
+      std::string fusion_type = registed->Type();
+      std::shared_ptr<FusionOpMatcher> matcher = registed;

      auto match_vector = type_map[matcher->BeginType()];


--- a/src/framework/program/program.h
+++ b/src/framework/program/program.h
@@ -29,7 +29,8 @@ class Program {
  std::shared_ptr<Scope> scope;
  std::string model_path;
  std::string para_path;
-  bool is_commbine = false;
+  bool combined = false;
+  bool quantification = false;

 private:
 };

--- a/src/framework/scope.cpp
+++ b/src/framework/scope.cpp
@@ -14,6 +14,7 @@ limitations under the License. */

 #include "framework/scope.h"

+#include <algorithm>
 #include <set>
 #include <string>
 #include <vector>

--- a/src/framework/scope.h
+++ b/src/framework/scope.h
@@ -23,7 +23,17 @@ namespace framework {
 class Scope {
 public:
  Scope() = default;
-  ~Scope() = default;
+
+  // Deletes every variable and child scope held by this scope and then
+  // empties both containers.
+  // NOTE(review): assumes both containers hold raw owning pointers.
+  ~Scope() {
+    for (auto &name_and_var : vars_) delete name_and_var.second;
+    vars_.clear();
+
+    for (auto child : kids_) delete child;
+    kids_.clear();
+  }

  Scope &NewScope() const;


--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -16,13 +16,15 @@ limitations under the License. */

 #include <cstdint>
 #include <cstring>
+#include <fstream>
 #include <memory>
+#include <string>
 #include <type_traits>
 #include <typeindex>
 #include <vector>
-#include "common/enforce.h"

 #include "common/enforce.h"
+#include "common/types.h"
 #include "framework/data_layout.h"
 #include "framework/ddim.h"
 #include "memory/t_malloc.h"
@@ -62,7 +64,8 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
 };

 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
+  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
+      functor;
  size_t size = functor(type);

  PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
@@ -131,11 +134,27 @@ class Tensor {
    return reinterpret_cast<T *>(mutable_data(typeid(T)));
  }

+#ifdef PADDLE_MOBILE_DEBUG
+  // Debug-only helper: writes every element (read as T) to `filename`,
+  // space separated, followed by a label and the tensor's dimensions.
+  template <typename T>
+  inline void dump(std::string filename) const {
+    const T *elements = data<T>();
+    std::ofstream stream(filename.c_str());
+    // Element values first ...
+    for (int idx = 0; idx < numel(); ++idx) stream << elements[idx] << " ";
+    // ... then the literal label (Chinese for "shape:") and each dim.
+    stream << "形状：";
+    for (int d = 0; d < dims_.size(); ++d) stream << dims_[d] << " ";
+    stream.close();
+  }
+#endif
+
  inline void *mutable_data(std::type_index type) {
    if (holder_ != nullptr) {
      holder_->set_type(type);
    }
-    PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor'snumel must >=0.")
+    PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
    int64_t size = numel() * SizeOfType(type);
    if (holder_ == nullptr || holder_->size() < size + offset_) {
      holder_.reset(new PlaceholderImpl(size, type));
@@ -234,6 +253,18 @@ class Tensor {
                          "Tensor's dims_ is out of bound. ");
  }

+#ifdef PADDLE_MOBILE_FPGA
+  struct FPGAArgs {
+    float scale;
+    inline float *scale_pointer() { return &scale; }
+  };
+
+  // Non-const access hands out the stored args so that writes made through
+  // scale_pointer() persist; const access still returns a copy as before.
+  FPGAArgs &fpga_args() { return fpgaArgs_; }
+  struct FPGAArgs fpga_args() const { return fpgaArgs_; }
+#endif
+
 private:
  /**
   * @note    Placeholder hides type T, so it doesn't appear as a
@@ -300,6 +331,10 @@ class Tensor {
   * begins.
   */
  size_t offset_;
+
+#ifdef PADDLE_MOBILE_FPGA
+  FPGAArgs fpgaArgs_;
+#endif
 };

 #ifdef PADDLE_MOBILE_DEBUG

--- a/src/io/api.cc
+++ b/src/io/api.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "cstring"
+#include "io/paddle_inference_api.h"
+
+namespace paddle_mobile {
+
+// Returns the size in bytes of one element of `dtype`.
+// Any other dtype trips an assert; when asserts are compiled out the
+// function returns -1 instead.
+int PaddleDtypeSize(PaddleDType dtype) {
+  if (dtype == PaddleDType::FLOAT32) {
+    return sizeof(float);
+  }
+  if (dtype == PaddleDType::INT64) {
+    return sizeof(int64_t);
+  }
+  assert(false);
+  return -1;
+}
+
+// Move constructor: takes over `other`'s pointer, length and ownership flag,
+// then leaves `other` empty and non-owning so it will not free the storage.
+PaddleBuf::PaddleBuf(PaddleBuf&& other)
+    : data_(other.data_),
+      length_(other.length_),
+      memory_owned_(other.memory_owned_) {
+  other.data_ = nullptr;
+  other.length_ = 0;
+  other.memory_owned_ = false;
+}
+
+// Copy constructor: delegates to the copy-assignment operator.
+// NOTE(review): the assignment path can read this object's members (via
+// Resize) before this constructor has set them, so it presumably relies on
+// default member initializers in the class declaration — confirm.
+PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
+
+// Copy assignment.
+//  - If `other` wraps external memory, this buffer becomes another non-owning
+//    view of the same memory.
+//  - If `other` owns its memory, this buffer ends up owning a private copy.
+// Like Resize(), this assumes the members of `*this` are already initialized.
+PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
+  // Self-assignment would otherwise memcpy a buffer onto itself.
+  if (this == &other) return *this;
+
+  if (!other.memory_owned_) {
+    // Release any storage we own before aliasing, otherwise it leaks.
+    Free();
+    data_ = other.data_;
+    length_ = other.length_;
+    memory_owned_ = false;
+    return *this;
+  }
+
+  // Reuse the current storage only when we own it and it has the right size.
+  // In particular, never copy into external memory and then claim ownership
+  // of it: that would later delete[] a buffer this object did not allocate.
+  const size_t bytes = other.length_;
+  if (!memory_owned_ || data_ == nullptr || length_ != bytes) {
+    Free();  // no-op unless we own storage
+    data_ = bytes > 0 ? new char[bytes] : nullptr;
+  }
+  if (bytes > 0) {
+    memcpy(data_, other.data_, bytes);
+  }
+  length_ = bytes;
+  memory_owned_ = true;
+  return *this;
+}
+
+// Makes the buffer `length` bytes long.
+// Does nothing when the length already matches. Otherwise any storage owned
+// by this buffer is freed (external memory is left untouched) and a fresh
+// buffer is allocated, which this object then owns. Contents are not kept.
+void PaddleBuf::Resize(size_t length) {
+  if (length != length_) {
+    if (memory_owned_) {
+      Free();
+    }
+    data_ = new char[length];
+    length_ = length;
+    memory_owned_ = true;
+  }
+}
+
+// Points this buffer at caller-provided memory without taking ownership.
+// Storage currently owned by this buffer is freed first.
+void PaddleBuf::Reset(void* data, size_t length) {
+  Free();
+  data_ = data;
+  length_ = length;
+  memory_owned_ = false;
+}
+
+// Releases the storage when this buffer owns it and holds a non-null
+// pointer; otherwise leaves every member unchanged.
+void PaddleBuf::Free() {
+  const bool owns_storage = memory_owned_ && data_;
+  if (!owns_storage) {
+    return;
+  }
+  assert(length_ > 0);
+  delete[] static_cast<char*>(data_);
+  length_ = 0;
+  data_ = nullptr;
+}
+
+}  // namespace paddle_mobile
--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "io/api_paddle_mobile.h"
+#include <vector>
+#include "framework/tensor.h"
+
+namespace paddle_mobile {
+
+// Builds a predictor from `config`: runs Init() to load the model and keeps
+// a copy of the configuration.
+template <typename Dtype, Precision P>
+PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
+    const PaddleMobileConfig &config) {
+  // Call Init() outside the enforce macro so the model is loaded even in a
+  // build configuration where the macro does not evaluate its condition.
+  // NOTE(review): presumably such a configuration exists — confirm in
+  // common/enforce.h.
+  const bool initialized = Init(config);
+  (void)initialized;  // keeps the variable "used" if the macro is empty
+  PADDLE_MOBILE_ENFORCE(initialized, "paddle mobile predictor init failed!");
+  config_ = config;
+}
+
+// Creates the underlying PaddleMobile engine and loads the model named by
+// `config`: from `model_dir` when it is set, otherwise from the
+// `prog_file` / `param_file` pair. Logs and returns false when neither source
+// is configured; the engine object has already been replaced at that point.
+template <typename Dtype, Precision P>
+bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
+  paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
+
+  const bool has_model_dir = !config.model_dir.empty();
+  const bool has_file_pair =
+      !config.prog_file.empty() && !config.param_file.empty();
+  if (!has_model_dir && !has_file_pair) {
+    LOG(kLOG_ERROR) << "fail to load inference model!";
+    return false;
+  }
+
+  if (has_model_dir) {
+    paddle_mobile_->Load(config.model_dir, config.optimize,
+                         config.quantification, config.batch_size);
+  } else {
+    paddle_mobile_->Load(config.prog_file, config.param_file, config.optimize,
+                         config.quantification, config.batch_size);
+  }
+
+  // Pass on the configured thread count (relevant when OpenMP is enabled).
+  paddle_mobile_->SetThreadNum(config.thread_num);
+  return true;
+}
+
+// Runs one inference pass.
+//
+// Only the first entry of `inputs` is consumed and only the first entry of
+// `output_data` is filled, so the caller must pass at least one of each.
+// The input must be 4-dimensional and its buffer must hold at least
+// product(shape) elements of the predictor's precision type.
+// `batch_size` is currently unused.
+// Returns false (after logging) on invalid arguments, true otherwise.
+template <typename Dtype, Precision P>
+bool PaddleMobilePredictor<Dtype, P>::Run(
+    const std::vector<PaddleTensor> &inputs,
+    std::vector<PaddleTensor> *output_data, int batch_size) {
+  typedef typename PrecisionTrait<P>::ptype PType;
+
+  if (inputs.empty()) {
+    LOG(kLOG_ERROR) << "At least one input tensor should be provided.";
+    return false;
+  }
+  // Validate the output slot before spending time on the prediction.
+  if (output_data == nullptr || output_data->empty()) {
+    LOG(kLOG_ERROR) << "At least one output should be set with tensors' names.";
+    return false;
+  }
+
+  // Bind by reference: copying the tensor would presumably deep-copy its
+  // buffer whenever that buffer owns its memory.
+  const auto &input = inputs[0];
+  if (input.shape.size() != 4) {
+    LOG(kLOG_ERROR) << "input shape not equal to 4!";
+    return false;
+  }
+  std::vector<int64_t> dims;
+  for (auto d : input.shape) {
+    dims.push_back(static_cast<int64_t>(d));
+  }
+
+  // use tensor
+  framework::DDim ddim =
+      framework::make_ddim({dims[0], dims[1], dims[2], dims[3]});
+
+  framework::Tensor input_tensor;
+  input_tensor.Resize(ddim);
+
+  // Refuse to read past the end of the caller's buffer. A negative element
+  // count wraps to a huge size_t and is rejected by the same check.
+  const size_t input_bytes =
+      static_cast<size_t>(framework::product(ddim)) * sizeof(PType);
+  if (input.data.length() < input_bytes) {
+    LOG(kLOG_ERROR) << "input buffer is smaller than its shape requires!";
+    return false;
+  }
+
+  auto input_ptr = input_tensor.mutable_data<PType>();
+  memcpy(input_ptr, static_cast<const PType *>(input.data.data()),
+         input_bytes);
+  auto output_tensor = paddle_mobile_->Predict(input_tensor);
+
+  auto &output = (*output_data)[0];
+  const size_t output_bytes =
+      static_cast<size_t>(output_tensor->numel()) * sizeof(PType);
+
+  // Replace (rather than append to) any shape left over from an earlier run.
+  output.shape.clear();
+  for (auto d : framework::vectorize(output_tensor->dims())) {
+    output.shape.push_back(static_cast<int>(d));
+  }
+
+  if (output.data.length() < output_bytes) {
+    output.data.Resize(output_bytes);
+  }
+
+  memcpy(output.data.data(), output_tensor->template data<PType>(),
+         output_bytes);
+
+  return true;
+}
+
+// Factory specialization for the paddle-mobile engine.
+// Returns a predictor for the requested device, or nullptr (after logging)
+// when the precision or the device is not supported. Only FP32 is handled.
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>(
+    const PaddleMobileConfig &config) {
+  if (config.precision != PaddleMobileConfig::FP32) {
+    LOG(kLOG_ERROR) << "unsupport precision type!";
+    return nullptr;
+  }
+
+  std::unique_ptr<PaddlePredictor> x;
+  if (config.device == PaddleMobileConfig::kCPU) {
+    x.reset(new PaddleMobilePredictor<CPU, Precision::FP32>(config));
+  } else if (config.device == PaddleMobileConfig::kFPGA) {
+    x.reset(new PaddleMobilePredictor<FPGA, Precision::FP32>(config));
+  } else if (config.device == PaddleMobileConfig::kGPU_MALI) {
+    x.reset(new PaddleMobilePredictor<GPU_MALI, Precision::FP32>(config));
+  } else {
+    LOG(kLOG_ERROR) << "unsupport device type!";
+    return nullptr;
+  }
+  // Returning the local by name lets the compiler move or elide it; wrapping
+  // it in std::move would only inhibit copy elision.
+  return x;
+}
+
+}  // namespace paddle_mobile
--- a/src/io/api_paddle_mobile.h
+++ b/src/io/api_paddle_mobile.h
--- a/src/io/io.cpp
+++ b/src/io/io.cpp
--- a/src/io/io.h
+++ b/src/io/io.h
--- a/src/io/loader.cpp
+++ b/src/io/loader.cpp
--- a/src/io/loader.h
+++ b/src/io/loader.h
--- a/src/io/paddle_inference_api.h
+++ b/src/io/paddle_inference_api.h
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
--- a/src/ios_io/PaddleMobile.h
+++ b/src/ios_io/PaddleMobile.h
--- a/src/ios_io/PaddleMobile.mm
+++ b/src/ios_io/PaddleMobile.mm
--- a/src/ios_io/op_symbols.h
+++ b/src/ios_io/op_symbols.h
--- a/src/jni/paddle_mobile_jni.cpp
+++ b/src/jni/paddle_mobile_jni.cpp
--- a/src/jni/paddle_mobile_jni.h
+++ b/src/jni/paddle_mobile_jni.h
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
--- a/src/operators/batchnorm_op.cpp
+++ b/src/operators/batchnorm_op.cpp
@@ -26,16 +26,16 @@ void BatchNormOp<Dtype, T>::InferShape() const {
  auto x_dims = this->param_.InputX()->dims();
  this->param_.OutputY()->Resize(x_dims);
 }
-template class BatchNormOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile

 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
-USE_OP_CPU(batch_norm);
 REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/batchnorm_op.h
+++ b/src/operators/batchnorm_op.h
--- a/src/operators/box_coder_op.cpp
+++ b/src/operators/box_coder_op.cpp
--- a/src/operators/box_coder_op.h
+++ b/src/operators/box_coder_op.h
--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
--- a/src/operators/concat_op.h
+++ b/src/operators/concat_op.h
--- a/src/operators/conv_op.cpp
+++ b/src/operators/conv_op.cpp
--- a/src/operators/conv_op.h
+++ b/src/operators/conv_op.h
--- a/src/common/openmp-fix.cpp
+++ b/src/common/openmp-fix.cpp
--- a/src/operators/conv_transpose_op.h
+++ b/src/operators/conv_transpose_op.h
--- a/src/operators/depthwise_conv_op.cpp
+++ b/src/operators/depthwise_conv_op.cpp
--- a/src/operators/depthwise_conv_op.h
+++ b/src/operators/depthwise_conv_op.h
--- a/src/operators/dropout_op.cpp
+++ b/src/operators/dropout_op.cpp
--- a/src/operators/dropout_op.h
+++ b/src/operators/dropout_op.h
--- a/src/operators/elementwise_add_op.cpp
+++ b/src/operators/elementwise_add_op.cpp
--- a/src/operators/elementwise_add_op.h
+++ b/src/operators/elementwise_add_op.h
--- a/src/operators/feed_op.cpp
+++ b/src/operators/feed_op.cpp
--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
--- a/src/operators/fetch_op.cpp
+++ b/src/operators/fetch_op.cpp
--- a/src/operators/fetch_op.h
+++ b/src/operators/fetch_op.h
--- a/src/operators/fusion_conv_add.cpp
+++ b/src/operators/fusion_conv_add.cpp
--- a/src/operators/fusion_conv_add.h
+++ b/src/operators/fusion_conv_add.h
--- a/src/operators/fusion_conv_add_bn_op.cpp
+++ b/src/operators/fusion_conv_add_bn_op.cpp
--- a/src/operators/fusion_conv_add_bn_op.h
+++ b/src/operators/fusion_conv_add_bn_op.h
--- a/src/operators/fusion_conv_add_bn_relu_op.cpp
+++ b/src/operators/fusion_conv_add_bn_relu_op.cpp
--- a/src/operators/fusion_conv_add_bn_relu_op.h
+++ b/src/operators/fusion_conv_add_bn_relu_op.h
--- a/src/operators/fusion_conv_add_relu_op.cpp
+++ b/src/operators/fusion_conv_add_relu_op.cpp
--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
--- a/src/operators/fusion_conv_bn_relu_op.cpp
+++ b/src/operators/fusion_conv_bn_relu_op.cpp
--- a/src/operators/fusion_conv_bn_relu_op.h
+++ b/src/operators/fusion_conv_bn_relu_op.h
--- a/src/operators/fusion_dwconv_bn_relu_op.cpp
+++ b/src/operators/fusion_dwconv_bn_relu_op.cpp
--- a/src/operators/fusion_dwconv_bn_relu_op.h
+++ b/src/operators/fusion_dwconv_bn_relu_op.h
--- a/src/operators/fusion_elementwise_add_relu_op.cpp
+++ b/src/operators/fusion_elementwise_add_relu_op.cpp
--- a/src/operators/fusion_elementwise_add_relu_op.h
+++ b/src/operators/fusion_elementwise_add_relu_op.h
--- a/src/operators/fusion_fc_op.cpp
+++ b/src/operators/fusion_fc_op.cpp
--- a/src/operators/fusion_fc_op.h
+++ b/src/operators/fusion_fc_op.h
--- a/src/operators/fusion_fc_relu_op.cpp
+++ b/src/operators/fusion_fc_relu_op.cpp
--- a/src/operators/fusion_fc_relu_op.h
+++ b/src/operators/fusion_fc_relu_op.h
--- a/src/operators/im2sequence_op.cpp
+++ b/src/operators/im2sequence_op.cpp
--- a/src/operators/im2sequence_op.h
+++ b/src/operators/im2sequence_op.h
--- a/src/operators/kernel/arm/batchnorm_kernel.cpp
+++ b/src/operators/kernel/arm/batchnorm_kernel.cpp
--- a/src/operators/kernel/arm/box_coder_kernel.cpp
+++ b/src/operators/kernel/arm/box_coder_kernel.cpp
--- a/src/operators/kernel/arm/concat_kernel.cpp
+++ b/src/operators/kernel/arm/concat_kernel.cpp
--- a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
--- a/src/operators/kernel/arm/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_kernel.cpp
--- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_relu_kernel.cpp
--- a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
--- a/src/operators/kernel/arm/conv_transpose_kernel.cpp
+++ b/src/operators/kernel/arm/conv_transpose_kernel.cpp
--- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp
+++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp
--- a/src/operators/kernel/arm/dropout_kernel.cpp
+++ b/src/operators/kernel/arm/dropout_kernel.cpp
--- a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp
--- a/src/operators/kernel/arm/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/arm/elementwise_add_kernel.cpp
--- a/src/operators/kernel/arm/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/arm/fusion_fc_kernel.cpp
--- a/src/operators/kernel/arm/im2sequence_kernel.cpp
+++ b/src/operators/kernel/arm/im2sequence_kernel.cpp
--- a/src/operators/kernel/arm/lrn_kernel.cpp
+++ b/src/operators/kernel/arm/lrn_kernel.cpp
--- a/src/operators/kernel/arm/mul_kernel.cpp
+++ b/src/operators/kernel/arm/mul_kernel.cpp
--- a/src/operators/kernel/arm/multiclass_nms_kernel.cpp
+++ b/src/operators/kernel/arm/multiclass_nms_kernel.cpp
--- a/src/operators/kernel/arm/pool_kernel.cpp
+++ b/src/operators/kernel/arm/pool_kernel.cpp
--- a/src/operators/kernel/arm/prelu_kernel.cpp
+++ b/src/operators/kernel/arm/prelu_kernel.cpp
--- a/src/operators/kernel/arm/prior_box_kernel.cpp
+++ b/src/operators/kernel/arm/prior_box_kernel.cpp
--- a/src/operators/kernel/arm/relu_kernel.cpp
+++ b/src/operators/kernel/arm/relu_kernel.cpp
--- a/src/operators/kernel/arm/reshape_kernel.cpp
+++ b/src/operators/kernel/arm/reshape_kernel.cpp
--- a/src/operators/kernel/arm/resize_kernel.cpp
+++ b/src/operators/kernel/arm/resize_kernel.cpp
--- a/src/operators/kernel/arm/scale_kernel.cpp
+++ b/src/operators/kernel/arm/scale_kernel.cpp
--- a/src/operators/kernel/arm/sigmoid_kernel.cpp
+++ b/src/operators/kernel/arm/sigmoid_kernel.cpp
--- a/src/operators/kernel/arm/slice_kernel.cpp
+++ b/src/operators/kernel/arm/slice_kernel.cpp
--- a/src/operators/kernel/arm/softmax_kernel.cpp
+++ b/src/operators/kernel/arm/softmax_kernel.cpp
--- a/src/operators/kernel/arm/transpose_kernel.cpp
+++ b/src/operators/kernel/arm/transpose_kernel.cpp
--- a/src/operators/kernel/batchnorm_kernel.h
+++ b/src/operators/kernel/batchnorm_kernel.h
--- a/src/operators/kernel/box_coder_kernel.h
+++ b/src/operators/kernel/box_coder_kernel.h
--- a/src/operators/kernel/central-arm-func/batchnorm_arm_func.h
+++ b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h
--- a/src/operators/kernel/central-arm-func/box_coder_arm_func.h
+++ b/src/operators/kernel/central-arm-func/box_coder_arm_func.h
--- a/src/operators/kernel/central-arm-func/concat_arm_func.h
+++ b/src/operators/kernel/central-arm-func/concat_arm_func.h
--- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h
--- a/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
--- a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
--- a/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
--- a/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h
--- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
--- a/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
--- a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
+++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h
--- a/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
+++ b/src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
--- a/src/operators/kernel/central-arm-func/lrn_arm_func.h
+++ b/src/operators/kernel/central-arm-func/lrn_arm_func.h
--- a/src/operators/kernel/central-arm-func/mul_arm_func.h
+++ b/src/operators/kernel/central-arm-func/mul_arm_func.h
--- a/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h
+++ b/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h
--- a/src/operators/kernel/central-arm-func/pool_arm_func.h
+++ b/src/operators/kernel/central-arm-func/pool_arm_func.h
--- a/src/operators/kernel/central-arm-func/prior_box_arm_func.h
+++ b/src/operators/kernel/central-arm-func/prior_box_arm_func.h
--- a/src/operators/kernel/central-arm-func/relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/relu_arm_func.h
--- a/src/operators/kernel/central-arm-func/reshape_arm_func.h
+++ b/src/operators/kernel/central-arm-func/reshape_arm_func.h
--- a/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
+++ b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h
--- a/src/operators/kernel/central-arm-func/softmax_arm_func.h
+++ b/src/operators/kernel/central-arm-func/softmax_arm_func.h
--- a/src/operators/kernel/central-arm-func/transpose_arm_func.h
+++ b/src/operators/kernel/central-arm-func/transpose_arm_func.h
--- a/src/operators/kernel/concat_kernel.h
+++ b/src/operators/kernel/concat_kernel.h
--- a/src/operators/kernel/conv_add_bn_kernel.h
+++ b/src/operators/kernel/conv_add_bn_kernel.h
--- a/src/operators/kernel/conv_add_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_add_bn_relu_kernel.h
--- a/src/operators/kernel/conv_add_kernel.h
+++ b/src/operators/kernel/conv_add_kernel.h
--- a/src/operators/kernel/conv_add_relu_kernel.h
+++ b/src/operators/kernel/conv_add_relu_kernel.h
--- a/src/operators/kernel/conv_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_bn_relu_kernel.h
--- a/src/operators/kernel/conv_kernel.h
+++ b/src/operators/kernel/conv_kernel.h
--- a/src/operators/kernel/conv_transpose_kernel.h
+++ b/src/operators/kernel/conv_transpose_kernel.h
--- a/src/operators/kernel/depthwise_conv_kernel.h
+++ b/src/operators/kernel/depthwise_conv_kernel.h
--- a/src/operators/kernel/dropout_kernel.h
+++ b/src/operators/kernel/dropout_kernel.h
--- a/src/operators/kernel/dwconv_bn_relu_kernel.h
+++ b/src/operators/kernel/dwconv_bn_relu_kernel.h
--- a/src/operators/kernel/elementwise_add_kernel.h
+++ b/src/operators/kernel/elementwise_add_kernel.h
--- a/src/operators/kernel/elementwise_add_relu_kernel.h
+++ b/src/operators/kernel/elementwise_add_relu_kernel.h
--- a/src/operators/kernel/fc_relu_kernel.h
+++ b/src/operators/kernel/fc_relu_kernel.h
--- a/src/operators/kernel/fpga/concat_kernel.cpp
+++ b/src/operators/kernel/fpga/concat_kernel.cpp
--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
--- a/src/operators/kernel/fpga/conv_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_kernel.cpp
--- a/src/operators/kernel/fpga/dropout_kernel.cpp
+++ b/src/operators/kernel/fpga/dropout_kernel.cpp
--- a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp
--- a/src/operators/kernel/fpga/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp
--- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
--- a/src/operators/kernel/fpga/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/pool_kernel.cpp
--- a/src/operators/kernel/fusion_fc_kernel.h
+++ b/src/operators/kernel/fusion_fc_kernel.h
--- a/src/operators/kernel/im2sequence_kernel.h
+++ b/src/operators/kernel/im2sequence_kernel.h
--- a/src/operators/kernel/lrn_kernel.h
+++ b/src/operators/kernel/lrn_kernel.h
--- a/ACL_Android @ 591027fc
+++ b/ACL_Android @ 591027fc
--- a/src/operators/kernel/mali/acl_operator.cc
+++ b/src/operators/kernel/mali/acl_operator.cc
--- a/src/operators/kernel/mali/acl_operator.h
+++ b/src/operators/kernel/mali/acl_operator.h
--- a/src/operators/kernel/mali/acl_tensor.cc
+++ b/src/operators/kernel/mali/acl_tensor.cc
--- a/src/operators/kernel/mali/acl_tensor.h
+++ b/src/operators/kernel/mali/acl_tensor.h
--- a/src/operators/kernel/mali/batchnorm_kernel.cpp
+++ b/src/operators/kernel/mali/batchnorm_kernel.cpp
--- a/src/operators/kernel/mali/concat_kernel.cpp
+++ b/src/operators/kernel/mali/concat_kernel.cpp
--- a/src/operators/kernel/mali/conv_add_kernel.cpp
+++ b/src/operators/kernel/mali/conv_add_kernel.cpp
--- a/src/operators/kernel/mali/conv_kernel.cpp
+++ b/src/operators/kernel/mali/conv_kernel.cpp
--- a/src/operators/kernel/mali/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp
--- a/src/operators/kernel/mali/fushion_fc_kernel.cpp
+++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp
--- a/src/operators/kernel/mali/lrn_kernel.cpp
+++ b/src/operators/kernel/mali/lrn_kernel.cpp
--- a/src/operators/kernel/mali/mul_kernel.cpp
+++ b/src/operators/kernel/mali/mul_kernel.cpp
--- a/src/operators/kernel/mali/pool_kernel.cpp
+++ b/src/operators/kernel/mali/pool_kernel.cpp
--- a/src/operators/kernel/mali/relu_kernel.cpp
+++ b/src/operators/kernel/mali/relu_kernel.cpp
--- a/src/operators/kernel/mali/reshape_kernel.cpp
+++ b/src/operators/kernel/mali/reshape_kernel.cpp
--- a/src/operators/kernel/mali/softmax_kernel.cpp
+++ b/src/operators/kernel/mali/softmax_kernel.cpp
--- a/src/operators/kernel/mul_kernel.h
+++ b/src/operators/kernel/mul_kernel.h
--- a/src/operators/kernel/multiclass_nms_kernel.h
+++ b/src/operators/kernel/multiclass_nms_kernel.h
--- a/src/operators/kernel/pool_kernel.h
+++ b/src/operators/kernel/pool_kernel.h
--- a/src/operators/kernel/prelu_kernel.h
+++ b/src/operators/kernel/prelu_kernel.h
--- a/src/operators/kernel/prior_box_kernel.h
+++ b/src/operators/kernel/prior_box_kernel.h
--- a/src/operators/kernel/relu_kernel.h
+++ b/src/operators/kernel/relu_kernel.h
--- a/src/operators/kernel/reshape_kernel.h
+++ b/src/operators/kernel/reshape_kernel.h
--- a/src/operators/kernel/resize_kernel.h
+++ b/src/operators/kernel/resize_kernel.h
--- a/src/operators/kernel/scale_kernel.h
+++ b/src/operators/kernel/scale_kernel.h
--- a/src/operators/kernel/sigmoid_kernel.h
+++ b/src/operators/kernel/sigmoid_kernel.h
--- a/src/operators/kernel/slice_kernel.h
+++ b/src/operators/kernel/slice_kernel.h
--- a/src/operators/kernel/softmax_kernel.h
+++ b/src/operators/kernel/softmax_kernel.h
--- a/src/operators/kernel/transpose_kernel.h
+++ b/src/operators/kernel/transpose_kernel.h
--- a/src/operators/lrn_op.cpp
+++ b/src/operators/lrn_op.cpp
--- a/src/operators/lrn_op.h
+++ b/src/operators/lrn_op.h
--- a/src/operators/math/conv_func.h
+++ b/src/operators/math/conv_func.h
--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
--- a/src/operators/math/depthwise_conv_3x3.h
+++ b/src/operators/math/depthwise_conv_3x3.h
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
--- a/src/operators/math/im2col.cpp
+++ b/src/operators/math/im2col.cpp
--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
--- a/src/operators/math/math_function.h
+++ b/src/operators/math/math_function.h
--- a/src/operators/math/pool_2x2.cpp
+++ b/src/operators/math/pool_2x2.cpp
--- a/src/operators/math/pool_2x2.h
+++ b/src/operators/math/pool_2x2.h
--- a/src/operators/math/pool_3x3.cpp
+++ b/src/operators/math/pool_3x3.cpp
--- a/src/operators/math/pool_3x3.h
+++ b/src/operators/math/pool_3x3.h
--- a/src/operators/math/pooling.cpp
+++ b/src/operators/math/pooling.cpp
--- a/src/operators/math/pooling.h
+++ b/src/operators/math/pooling.h
--- a/src/operators/math/softmax.cpp
+++ b/src/operators/math/softmax.cpp
--- a/src/operators/mul_op.cpp
+++ b/src/operators/mul_op.cpp
--- a/src/operators/mul_op.h
+++ b/src/operators/mul_op.h
--- a/src/operators/multiclass_nms_op.cpp
+++ b/src/operators/multiclass_nms_op.cpp
--- a/src/operators/multiclass_nms_op.h
+++ b/src/operators/multiclass_nms_op.h
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
--- a/src/operators/pool_op.cpp
+++ b/src/operators/pool_op.cpp
--- a/src/operators/pool_op.h
+++ b/src/operators/pool_op.h
--- a/src/operators/prelu_op.cpp
+++ b/src/operators/prelu_op.cpp
--- a/src/operators/prelu_op.h
+++ b/src/operators/prelu_op.h
--- a/src/operators/prior_box_op.cpp
+++ b/src/operators/prior_box_op.cpp
--- a/src/operators/prior_box_op.h
+++ b/src/operators/prior_box_op.h
--- a/src/operators/relu_op.cpp
+++ b/src/operators/relu_op.cpp
--- a/src/operators/relu_op.h
+++ b/src/operators/relu_op.h
--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
--- a/src/operators/reshape_op.h
+++ b/src/operators/reshape_op.h
--- a/src/operators/resize_op.cpp
+++ b/src/operators/resize_op.cpp
--- a/src/operators/resize_op.h
+++ b/src/operators/resize_op.h
--- a/src/operators/scale_op.cpp
+++ b/src/operators/scale_op.cpp
--- a/src/operators/scale_op.h
+++ b/src/operators/scale_op.h
--- a/src/operators/sigmoid_op.cpp
+++ b/src/operators/sigmoid_op.cpp
--- a/src/operators/sigmoid_op.h
+++ b/src/operators/sigmoid_op.h
--- a/src/operators/slice_op.cpp
+++ b/src/operators/slice_op.cpp
--- a/src/operators/slice_op.h
+++ b/src/operators/slice_op.h
--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
--- a/src/operators/softmax_op.h
+++ b/src/operators/softmax_op.h
--- a/src/operators/transpose_op.cpp
+++ b/src/operators/transpose_op.cpp
--- a/src/operators/transpose_op.h
+++ b/src/operators/transpose_op.h
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
--- a/test/common/test_gemm_accuracy.cpp
+++ b/test/common/test_gemm_accuracy.cpp
--- a/test/common/test_gemm.cpp
+++ b/test/common/test_gemm.cpp
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
--- a/test/fpga/test_tensor_quant.cpp
+++ b/test/fpga/test_tensor_quant.cpp
--- a/test/framework/test_inference_api.cpp
+++ b/test/framework/test_inference_api.cpp
--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
--- a/test/framework/test_optimize.cpp
+++ b/test/framework/test_optimize.cpp
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
--- a/test/net/test_mobilenet+ssd.cpp
+++ b/test/net/test_mobilenet+ssd.cpp
--- a/test/net/test_mobilenet.cpp
+++ b/test/net/test_mobilenet.cpp
--- a/test/net/test_mobilenet_combine.cpp
+++ b/test/net/test_mobilenet_combine.cpp
--- a/test/net/test_resnet.cpp
+++ b/test/net/test_resnet.cpp
--- a/test/net/test_squeezenet.cpp
+++ b/test/net/test_squeezenet.cpp
--- a/test/net/test_yolo.cpp
+++ b/test/net/test_yolo.cpp
--- a/test/operators/test_fusion_conv_add_bn_relu_op.cpp
+++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp
--- a/test/operators/test_im2sequence_op.cpp
+++ b/test/operators/test_im2sequence_op.cpp
--- a/test/operators/test_prelu_op.cpp
+++ b/test/operators/test_prelu_op.cpp
--- a/test/operators/test_resize_op.cpp
+++ b/test/operators/test_resize_op.cpp
--- a/test/operators/test_scale_op.cpp
+++ b/test/operators/test_scale_op.cpp
--- a/test/operators/test_sigmoid_op.cpp
+++ b/test/operators/test_sigmoid_op.cpp
--- a/test/operators/test_slice_op.cpp
+++ b/test/operators/test_slice_op.cpp
--- a/test/test_helper.h
+++ b/test/test_helper.h
--- a/test/test_include.h
+++ b/test/test_include.h
--- a/tools/android-cmake/android.toolchain.cmake
+++ b/tools/android-cmake/android.toolchain.cmake
--- a/tools/push2android.sh
+++ b/tools/push2android.sh
--- a/tools/scripts/run_on_android.sh
+++ b/tools/scripts/run_on_android.sh
--- a/tools/arm-platform.cmake
+++ b/tools/arm-platform.cmake
--- a/tools/build.sh
+++ b/tools/build.sh
--- a/tools/ios-cmake/ios.toolchain.cmake
+++ b/tools/ios-cmake/ios.toolchain.cmake
--- a/tools/net-detail.awk
+++ b/tools/net-detail.awk
--- a/tools/net.awk
+++ b/tools/net.awk
--- a/tools/op.cmake
+++ b/tools/op.cmake
--- a/tools/pre-commit.hooks/clang-format.hook
+++ b/tools/pre-commit.hooks/clang-format.hook
--- a/tools/quantification/CMakeLists.txt
+++ b/tools/quantification/CMakeLists.txt
--- a/tools/quantification/README.md
+++ b/tools/quantification/README.md
--- a/tools/quantification/convert.cpp
+++ b/tools/quantification/convert.cpp
--- a/tools/quantification/src/block_desc_local.cpp
+++ b/tools/quantification/src/block_desc_local.cpp
--- a/tools/quantification/src/block_desc_local.h
+++ b/tools/quantification/src/block_desc_local.h
--- a/tools/quantification/src/enforce.h
+++ b/tools/quantification/src/enforce.h
--- a/tools/quantification/src/framework.pb-c.c
+++ b/tools/quantification/src/framework.pb-c.c
--- a/tools/quantification/src/framework.pb-c.h
+++ b/tools/quantification/src/framework.pb-c.h
--- a/tools/quantification/src/program_desc.cpp
+++ b/tools/quantification/src/program_desc.cpp
--- a/tools/quantification/src/program_desc.h
+++ b/tools/quantification/src/program_desc.h
--- a/tools/quantification/src/protobuf-c.c
+++ b/tools/quantification/src/protobuf-c.c
--- a/tools/quantification/src/protobuf-c.h
+++ b/tools/quantification/src/protobuf-c.h
--- a/tools/quantification/src/tensor_desc.h
+++ b/tools/quantification/src/tensor_desc.h
--- a/tools/quantification/src/var_desc.h
+++ b/tools/quantification/src/var_desc.h
--- a/tools/run.sh
+++ b/tools/run.sh
--- a/tools/toolchains/arm-android-neon.cmake
+++ b/tools/toolchains/arm-android-neon.cmake
--- a/tools/toolchains/arm-linux-gnueabi.cmake
+++ b/tools/toolchains/arm-linux-gnueabi.cmake