diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..f81f3717a4ead833784b63da35185f2d07409983 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "src/operators/kernel/mali/ACL_Android"] + path = src/operators/kernel/mali/ACL_Android + url = https://github.com/halsay/ACL_Android.git diff --git a/CMakeLists.txt b/CMakeLists.txt index f60846e98aa9ca36bd6bd68cccdda6e3d2ff616a..6feabdbe4374c9200c4282f620fadc27f3128bc9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,39 +1,104 @@ cmake_minimum_required(VERSION 3.0) project(paddle-mobile) -add_definitions(-DPADDLE_MOBILE_DEBUG) -add_definitions(-DENABLE_EXCEPTION) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -set(CMAKE_BUILD_TYPE RelWithDebInfo) -set(CMAKE_VERBOSE_MAKEFILE ON) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build) +option(DEBUGING "enable debug mode" ON) +option(USE_OPENMP "openmp support" OFF) +option(USE_EXCEPTION "use std exception" ON) +option(LOG_PROFILE "log profile" ON) +# select the platform to build +option(CPU "armv7 with neon" ON) +option(MALI_GPU "mali gpu" ON) +option(FPGA "fpga" OFF) +set(DEBUGING ON) +if (CPU) + add_definitions(-DPADDLE_MOBILE_CPU) +endif() + +if (MALI_GPU) + add_definitions(-DPADDLE_MOBILE_MALI_GPU) + add_definitions(-DUSE_ACL=1) + add_definitions(-DUSE_OPENCL) + set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android) + include_directories(${ACL_ROOT} ${ACL_ROOT}/include) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL") + 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1") +endif() + +if(FPGA) + add_definitions(-DPADDLE_MOBILE_FPGA) +endif() + + +set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}") + +if (DEBUGING) + message(STATUS "debug") + set(CMAKE_BUILD_TYPE Debug) + set(CMAKE_CXX_FLAGS_DEBUG "-g -DNDEBUG") + add_definitions(-DPADDLE_MOBILE_DEBUG) + if (ANDROID_NDK_TOOLCHAIN_INCLUDED) + add_definitions(-DARMV7) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog") + endif () +else () + set(CMAKE_BUILD_TYPE Release) + set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG") + add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) +endif () + +if (USE_EXCEPTION) + message(STATUS "use exception") + add_definitions(-DENABLE_EXCEPTION) + add_definitions(-fexceptions) +else() + add_definitions(-fno-exceptions) +endif () + +if (LOG_PROFILE) + add_definitions(-DPADDLE_MOBILE_PROFILE) +endif() + +if(USE_OPENMP) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") + add_definitions(-DPADDLE_MOBILE_USE_OPENMP) +endif() + file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c) file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) -# include headers +if (NOT ANDROID_NDK_TOOLCHAIN_INCLUDED) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.cpp) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h) +endif () + include_directories(src/) -#include(ExternalProject) -#ExternalProject_Add(openblas_proj -# GIT_REPOSITORY "https://github.com/xianyi/OpenBLAS.git" -# GIT_TAG "v0.2.20" -# SOURCE_DIR "openblas/" -# BUILD_IN_SOURCE 1 -# CONFIGURE_COMMAND "" -# BUILD_COMMAND "make" "ONLY_CBLAS=1" -# INSTALL_COMMAND "make" "PREFIX=${CMAKE_BINARY_DIR}/" "install" -# ) -#set_target_properties(openblas_proj PROPERTIES EXCLUDE_FROM_ALL 1) +set(CMAKE_VERBOSE_MAKEFILE ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build) 
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build) + +include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake") -#add_dependencies(paddle-mobile openblas_proj) +# if (IS_IOS) +# add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) +if (ANDROID_NDK_TOOLCHAIN_INCLUDED) + list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS) + add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) +else () + add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) +endif () -# gen static -ADD_LIBRARY(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) +if(DEBUGING) + add_subdirectory(test) +endif() -#add_dependencies(paddle-mobile openblas_proj) -add_subdirectory(test) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..1a25d65e02afb09dabc96e1ec241346cff34f6f2 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,233 @@ +# 贡献代码 + +欢迎您对Paddle-Mobile项目的贡献。 +我们诚挚的感谢你的贡献,这个文档描述了我们的工作方式和工作流程。Paddle-Mobile在PaddlePaddle org下,和服务器版本的Paddle工程的代码规范基本相同,开发者也可以同时参考Paddle的相关文档。 + +## Workflow + +Paddle-Mobile 开发中使用到的几种模型在这个链接下载 [点我](https://mms-mis.cdn.bcebos.com/paddle-mobile/models.zip). +之后是贡献代码的主要流程。 + +### Fork + +* Paddle-Mobile采用Pull Request的方式提交代码,禁止直接push,所有的代码都需要人工review。首先要fork一份Paddle-Moble的代码 ["Fork" button](https://help.github.com/articles/fork-a-repo/). 
+* 跳转到[Paddle-Mobile](https://github.com/PaddlePaddle/paddle-mobile) GitHub首页,然后单击 `Fork` 按钮,生成自己目录下的仓库,比如 。 + +### Clone(克隆) +将远程仓库 clone 到本地: + +```bash +➜ git clone https://github.com/你的用户名/paddle-mobile +➜ cd Paddle +``` + +### 创建本地分支 + +Paddle-Mobile 和Paddle一样,目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发,测试,发行和维护,具体请参考 [Paddle 分支规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范)。 + +所有的 feature 和 bug fix 的开发工作都应该在一个新的分支上完成,一般从 `develop` 分支上创建新分支。 + +使用 `git checkout -b` 创建并切换到新分支。 + +```bash +➜ git checkout -b my-cool-stuff +``` + +值得注意的是,在 checkout 之前,需要保持当前分支目录 clean,否则会把 untracked 的文件也带到新分支上,这可以通过 `git status` 查看。 + +### 使用 `pre-commit` 钩子 + +Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码(C++,Python),在提交(commit)前自动检查一些基本事宜(如每个文件只有一个 EOL,Git 中不要添加大文件等)。 + +`pre-commit`测试是 Travis-CI 中单元测试的一部分,不满足钩子的 PR 不能被提交到 Paddle,首先安装并在当前目录运行它: + +```bash +pip install pre-commit +pre-commit -v -a +``` + +Paddle-Mobile 使用 `clang-format` 来调整 C/C++ 源代码格式,在格式化代码时不同的`clang-format`版本会有不同的表现形态,和Paddle不同的是,Paddle-Mobile开发人员使用的是更的5.0版本的llvm工具集。所以为了防止无法CI,请确保 `clang-format` 版本是 5.0 版本。 + +> 另外:通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同的,Paddle 开发人员使用的是`pip install pre-commit`。 + + + +## 开始开发 + +在本例中,我删除了 README.md 中的一行,并创建了一个新文件。 + +通过 `git status` 查看当前状态,这会提示当前目录的一些变化,同时也可以通过 `git diff` 查看文件具体被修改的内容。 + +```bash +➜ git status +On branch test +Changes not staged for commit: + (use "git add ..." to update what will be committed) + (use "git checkout -- ..." to discard changes in working directory) + + modified: README.md + +Untracked files: + (use "git add ..." to include in what will be committed) + + test + +no changes added to commit (use "git add" and/or "git commit -a") +``` + +## 构建 + +paddle-mobile是为了移动端版本开发的,而移动端大多以arm平台为主。所以我们要交叉编译到arm平台。以cpu为例: + +1. 安装NDK最新版 +2. 配置ANDROID_NDK和NDK_ROOT环境变量 +3. 开发,并写单元测试 +4. 
sh build.sh + +## 提交(commit) + +接下来我们取消对 README.md 文件的改变,然后提交新添加的 test 文件。 + +```bash +➜ git checkout -- README.md +➜ git status +On branch test +Untracked files: + (use "git add ..." to include in what will be committed) + + test + +nothing added to commit but untracked files present (use "git add" to track) +➜ git add test +``` + +Git 每次提交代码,都需要写提交说明,这可以让其他人知道这次提交做了哪些改变,这可以通过`git commit` 完成。 + +```bash +▶ pre-commit run -a -v +[remove-crlf] CRLF end-lines remover........................................Passed +[remove-tabs] Tabs remover..................................................Passed +[check-added-large-files] Check for added large files.......................Passed +[check-merge-conflict] Check for merge conflicts............................Passed +[check-symlinks] Check for broken symlinks..................................Passed +[detect-private-key] Detect Private Key.....................................Passed +[end-of-file-fixer] Fix End of Files........................................Passed +[trailing-whitespace] Trim Trailing Whitespace..............................Passed +[copyright] copyright.......................................................Passed +[clang-format] clang-format.................................................Passed +[cpplint] cpplint...........................................................Passed +hookid: cpplint + +Ignoring build_bak.sh; not a valid file name (c, cc, h, hpp, c++, h++, cu, cpp, hxx, cxx, cuh) +Done processing build_bak.sh +Ignoring build_bak.sh; not a valid file name (c, cc, h, hpp, c++, h++, cu, cpp, hxx, cxx, cuh) +Done processing build_bak.sh +``` + +## 保持本地仓库最新 + +在准备发起 Pull Request 之前,需要同步原仓库()最新的代码。 + +首先通过 `git remote` 查看当前远程仓库的名字。 + +```bash +➜ git remote +origin +➜ git remote -v +origin https://github.com/USERNAME/paddle-mobile (fetch) +origin https://github.com/USERNAME/paddle-mobile (push) +``` + +这里 origin 是我们 clone 的远程仓库的名字,也就是自己用户名下的 paddle-mobile,接下来我们创建一个原始 paddle-mobile 仓库的远程主机,命名为 upstream。 + 
+```bash +➜ git remote add upstream https://github.com/PaddlePaddle/paddle-mobile +➜ git remote +origin +upstream +``` + +获取 upstream 的最新代码并更新当前分支。 + +```bash +➜ git fetch upstream +➜ git pull upstream develop +``` + +## Push 到远程仓库 + +将本地的修改推送到 GitHub 上,也就是 https://github.com/USERNAME/paddle-mobile。 + +```bash +# 推送到远程仓库 origin 的 my-cool-stuff 分支上 +➜ git push origin my-cool-stuff +``` + +## 建立 Issue 并完成 Pull Request + +建立一个 Issue 描述问题,并记录它的编号。 + +切换到所建分支,然后点击 `New pull request`。 + +在 PR 的描述说明中,填写 `resolve #Issue编号` 可以在这个 PR 被 merge 后,自动关闭对应的 Issue +> 具体请见 + + +## review + +在接到PR后,可以看到该pr页面内正在运行CI。如果运行出现问题,可以点Details进入Travis平台上看详细内容。 +![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294833030073.jpg) + +可以在travis上看到更加详细的信息。 +![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294833651326.jpg) + +接下来等待 review,如果有需要修改的地方,参照上述步骤更新 origin 中的对应分支即可。 + +## 删除远程分支 + +在 PR 被 merge 进主仓库后,我们可以在 PR 的页面删除远程仓库的分支。 + +screen shot 2017-04-26 at 9 18 24 pm + +也可以使用 `git push origin :分支名` 删除远程分支,如: + +```bash +➜ git push origin :my-cool-stuff +``` + +## 删除本地分支 + +最后,删除本地分支。 + +```bash +# 切换到 develop 分支 +➜ git checkout develop + +# 删除 my-cool-stuff 分支 +➜ git branch -D my-cool-stuff +``` + +至此,我们就完成了一次代码贡献的过程。 + +## 提交代码的一些约定 + +为了使评审人在评审代码时更好地专注于代码本身,请您每次提交代码时,遵守以下约定: + +1. 请保证Travis-CI 中单元测试能顺利通过。如果没过,说明提交的代码存在问题,评审人一般不做评审。 +2. 提交Pull Request前: + - 请注意commit的数量: + - 原因:如果仅仅修改一个文件但提交了十几个commit,每个commit只做了少量的修改,这会给评审人带来很大困扰。评审人需要逐一查看每个commit才能知道做了哪些修改,且不排除commit之间的修改存在相互覆盖的情况。 + - 建议:每次提交时,保持尽量少的commit,可以通过`git commit --amend`补充上次的commit。对已经Push到远程仓库的多个commit,可以参考[squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed)。 + - 请注意每个commit的名称:应能反映当前commit的内容,不能太随意。 +3. 
如果解决了某个Issue的问题,请在该Pull Request的**第一个**评论框中加上:`fix #issue_number`,这样当该PUll Request被合并后,会自动关闭对应的Issue。关键词包括:close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved,请选择合适的词汇。详细可参考[Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages)。 + +此外,在回复评审人意见时,请您遵守以下约定: + +1. 评审人的每个意见都必须回复(这是开源社区的基本礼貌,别人帮了忙,应该说谢谢): + - 对评审意见同意且按其修改完的,给个简单的`Done`即可; + - 对评审意见不同意的,请给出您自己的反驳理由。 +2. 如果评审意见比较多: + - 请给出总体的修改情况。 + - 请采用[start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/)进行回复,而非直接回复的方式。原因是每个回复都会发送一封邮件,会造成邮件灾难。 + + diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..82a08b3a2764841b13a1380647efadb3399fc3ec --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +FROM ubuntu:16.04 + +RUN echo '\ +deb main restricted universe multiverse\n\ +deb -updates main restricted universe multiverse\n\ +deb -backports main restricted universe multiverse\n\ +deb -security main restricted universe multiverse\n'\ +> /etc/apt/sources.list +RUN sed -ie 's||http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|' /etc/apt/sources.list +RUN sed -ie 's||xenial|' /etc/apt/sources.list + +RUN apt-get update && apt-get upgrade -y +RUN apt-get install -y --no-install-recommends \ + curl \ + unzip \ + git \ + make \ + cmake \ + cmake-curses-gui \ + python \ + python-pip \ + python-setuptools \ + clang-format-5.0 \ + graphviz \ + g++-arm-linux-gnueabi \ + gcc-arm-linux-gnueabi +RUN apt-get autoremove -y && apt-get clean +RUN pip install --upgrade pip +RUN pip install wheel && pip install pre-commit +RUN ln -s clang-format-5.0 /usr/bin/clang-format +# RUN cd /tmp && curl -O http://mirrors.neusoft.edu.cn/android/repository/android-ndk-r17b-linux-x86_64.zip +# RUN cd /opt && unzip /tmp/android-ndk-r17b-linux-x86_64.zip +# ENV NDK_ROOT /opt/android-ndk-r17b diff --git a/build.sh b/build.sh deleted file mode 100755 index 
dc31f3b47f47975309e46c063bee142d1cc1a14f..0000000000000000000000000000000000000000 --- a/build.sh +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env bash - -build_for_mac() { - if [ ! `which brew` ]; then - echo "building failed! homebrew not found, please install homebrew." - return - fi - if [ ! `which cmake` ]; then - echo "installing cmake." - brew install cmake - if [ ! $? ]; then - echo "cmake install failed." - return - fi - fi - PLATFORM="x86" - MODE="Release" - CXX_FLAGS="-std=c++11 -O3 -s" - BUILD_DIR=build/release/"${PLATFORM}" - mkdir -p ${BUILD_DIR}/build - - mkdir -p ${BUILD_DIR}/test - cp -r test/models ${BUILD_DIR}/test/models - - cmake . \ - -B"${BUILD_DIR}" \ - -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ - -DIS_MAC=true - - cd ${BUILD_DIR} - make -j 8 -} - -build_for_android() { - if [ -z "${ANDROID_NDK}" ]; then - echo "ANDROID_NDK not found!" - exit -1 - fi - - PLATFORM="arm-v7a" -# PLATFORM="arm-v8a" - - if [ "${PLATFORM}" = "arm-v7a" ]; then - ABI="armeabi-v7a with NEON" - ARM_PLATFORM="V7" - CXX_FLAGS="-O3 -std=c++11 -s -march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security -llog" - elif [ "${PLATFORM}" = "arm-v8a" ]; then - ABI="arm64-v8a" - ARM_PLATFORM="V8" - CXX_FLAGS="-O3 -std=c++11 -s -march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog" - else - echo "unknown platform!" - exit -1 - fi - - MODE="Release" - ANDROID_PLATFORM_VERSION="android-15" - TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" - ANDROID_ARM_MODE="arm" - - cmake . 
\ - -B"build/release/${PLATFORM}" \ - -DANDROID_ABI="${ABI}" \ - -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ - -DANDROID_PLATFORM="${ANDROID_PLATFORM_VERSION}" \ - -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ - -DANDROID_STL=c++_static \ - -DANDROID=true \ - -D"${ARM_PLATFORM}"=true - - cd "./build/release/${PLATFORM}" - make -j 8 -} - -build_for_ios() { - PLATFORM="ios" - MODE="Release" - BUILD_DIR=build/release/"${PLATFORM}" - TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" - C_FLAGS="-fobjc-abi-version=2 -fobjc-arc -isysroot ${CMAKE_OSX_SYSROOT}" - CXX_FLAGS="-fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT}" - mkdir -p "${BUILD_DIR}" - - cmake . \ - -B"${BUILD_DIR}" \ - -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ - -DIOS_PLATFORM=OS \ - -DCMAKE_C_FLAGS="${C_FLAGS}" \ - -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ - -DIS_IOS="true" \ - - cd "${BUILD_DIR}" - make -j 8 -} - -build_error() { - echo "unknown argument" -} - -if [ $# -lt 1 ]; then - echo "error: target missing!" - echo "available targets: mac|linux|ios|android" - echo "sample usage: ./build.sh mac" -else - if [ $1 = "mac" ]; then - build_for_mac - elif [ $1 = "linux" ]; then - build_for_linux - elif [ $1 = "android" ]; then - build_for_android - elif [ $1 = "ios" ]; then - build_for_ios - else - build_error - fi -fi diff --git a/doc/build.md b/doc/build.md new file mode 100644 index 0000000000000000000000000000000000000000..6a8521b593ccdeab464687e7eae79192d131d51b --- /dev/null +++ b/doc/build.md @@ -0,0 +1,59 @@ +# 环境搭建 +## 使用 docker +### 1. 安装 docker +安装 docker 的方式,参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/) +### 2. 
使用 docker 搭建构建环境 +首先进入 paddle-mobile 的目录下,执行 `docker build` +以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行) +``` +$ docker build -t paddle-mobile:dev - < Dockerfile +``` +使用 `docker images` 可以看到我们新建的 image +``` +$ docker images +REPOSITORY TAG IMAGE ID CREATED SIZE +paddle-mobile dev 33b146787711 45 hours ago 372MB +``` +### 3. 使用 docker 构建 +进入 paddle-mobile 目录,执行 docker run +``` +$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev +root@5affd29d4fc5:/ # cd /paddle-mobile +# 生成构建 android 产出的 Makefile +root@5affd29d4fc5:/ # rm CMakeCache.txt +root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake +# 生成构建 linux 产出的 Makefile +root@5affd29d4fc5:/ # rm CMakeCache.txt +root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake +``` +### 4. 设置编译选项 +可以通过 ccmake 设置编译选项 +``` +root@5affd29d4fc5:/ # ccmake . + Page 1 of 1 + CMAKE_ASM_FLAGS + CMAKE_ASM_FLAGS_DEBUG + CMAKE_ASM_FLAGS_RELEASE + CMAKE_BUILD_TYPE + CMAKE_INSTALL_PREFIX /usr/local + CMAKE_TOOLCHAIN_FILE /paddle-mobile/tools/toolchains/arm-android-neon.cmake + CPU ON + DEBUGING ON + FPGA OFF + LOG_PROFILE ON + MALI_GPU OFF + NET googlenet + USE_EXCEPTION ON + USE_OPENMP OFF +``` +修改选项后,按 `c`, `g` 更新 Makefile +### 5. 构建 +使用 make 命令进行构建 +``` +root@5affd29d4fc5:/ # make +``` +### 6. 
查看构建产出 +构架产出可以在 host 机器上查看,在 paddle-mobile 的目录下,build 以及 test/build 下,可以使用 adb 指令或者 scp 传输到 device 上执行 + +## 不使用 docker +不使用 docker 的方法,可以直接用 cmake 生成 makefile 后构建。使用 ndk 构建 android 应用需要正确设置 NDK_ROOT。构建 linux 应用需要安装 arm-linux-gnueabi-gcc 或者类似的交叉编译工具,可能需要设置 CC,CXX 环境变量,或者在 tools/toolchains/ 中修改 arm-linux-gnueabi.cmake,或者增加自己需要的 toolchain file。 \ No newline at end of file diff --git a/scripts/push2android.sh b/scripts/push2android.sh deleted file mode 100644 index 44b0ee32e99ccddf5cc6060882dc37158c149693..0000000000000000000000000000000000000000 --- a/scripts/push2android.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env sh - -push_fn () { -MODELS_PATH="../test/models/*" -EXE_FILE="../test/build/*" -EXE_DIR="data/local/tmp/bin" -MODELS_DIR="data/local/tmp/models" -LIB_PATH="../build/release/arm-v7a/build/*" -adb push ${EXE_FILE} ${EXE_DIR} -adb push ${LIB_PATH} ${EXE_DIR} -adb push ${MODELS_PATH} ${MODELS_DIR} -echo "test files sync completed" -} -push_fn diff --git a/src/common/dep_core.h b/src/common/dep_core.h new file mode 100644 index 0000000000000000000000000000000000000000..d9873a3896d1ac83cfc45e0666ca8491a645ed8e --- /dev/null +++ b/src/common/dep_core.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#ifdef PADDLE_EXECUTOR_MULTITHREAD +#include +#include +#include +#include "framework/operator.h" + +namespace paddle_mobile { + +class depCore { + public: + template + void analysisDep( + const std::vector>>& ops) { + std::unordered_map vars; + size_t nop = ops.size(); + deps.resize(nop); + next.resize(nop); + for (size_t i = 0; i < nop; i++) { + const auto& op = ops[i]; + for (const auto& kv : op->Inputs()) { + for (const auto& v : kv.second) { + if (vars.find(v) == vars.end()) { + continue; + } + int di = vars[v]; + if (di == i) { + continue; + } + if (std::find(deps[i].begin(), deps[i].end(), di) != deps[i].end()) { + continue; + } + deps[i].push_back(di); + next[di].push_back(i); + } + } + for (const auto& kv : op->Outputs()) { + for (const auto& v : kv.second) { + vars[v] = i; + } + } + } + } + const std::vector& getNext(int i) { return next[i]; } + const std::vector& getDeps(int i) { return deps[i]; } + std::vector> deps; + std::vector> next; +}; +} // namespace paddle_mobile + +#endif diff --git a/src/common/enforce.h b/src/common/enforce.h index 52bda2258a00c7444762fe8297380c1c7752dd42..51d2110e32433686d1b3353bc63b92a564a13e9d 100644 --- a/src/common/enforce.h +++ b/src/common/enforce.h @@ -17,8 +17,6 @@ limitations under the License. 
*/ #ifdef ENABLE_EXCEPTION #include #include -#include -#include #include #endif @@ -32,12 +30,11 @@ struct PaddleMobileException : public std::exception { PaddleMobileException(const char *header, const char *detail, const char *file, const int line) { - std::stringstream ss; - ss << exception_prefix << "| " << header << "\n"; - ss << "| [in file] : " << file << " \n"; - ss << "| [on line] : " << line << " \n"; - ss << "| [detail] : " << detail; - message = ss.str(); + char buffer[1500]; + snprintf(buffer, sizeof(buffer), + "%s| %s \n| [in file] : %s\n| [on line] : %d\n| [detail] : %s\n", + exception_prefix.c_str(), header, file, line, detail); + message = std::string(buffer); } const char *what() const noexcept { return message.c_str(); } }; diff --git a/src/common/log.h b/src/common/log.h index 052fb7df2ba74177205ef26cbebbc88c08e03e09..a3cefe2541e310897ce753b8eb69711242762122 100644 --- a/src/common/log.h +++ b/src/common/log.h @@ -16,15 +16,43 @@ limitations under the License. */ #include #ifdef PADDLE_MOBILE_DEBUG +#include #include #include #include #endif +#ifdef ANDROID +#include +#endif namespace paddle_mobile { #ifdef PADDLE_MOBILE_DEBUG +#ifdef ANDROID + +extern const char *ANDROID_LOG_TAG; + +#define ANDROIDLOGI(...) \ + __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, __VA_ARGS__); \ + printf(__VA_ARGS__) +#define ANDROIDLOGW(...) \ + __android_log_print(ANDROID_LOG_WARNING, ANDROID_LOG_TAG, __VA_ARGS__); \ + printf(__VA_ARGS__) +#define ANDROIDLOGD(...) \ + __android_log_print(ANDROID_LOG_DEBUG, ANDROID_LOG_TAG, __VA_ARGS__); \ + printf(__VA_ARGS__) +#define ANDROIDLOGE(...) \ + __android_log_print(ANDROID_LOG_ERROR, ANDROID_LOG_TAG, __VA_ARGS__); \ + printf(__VA_ARGS__) +#else +#define ANDROIDLOGI(...) +#define ANDROIDLOGW(...) +#define ANDROIDLOGD(...) +#define ANDROIDLOGE(...) 
+ +#endif + enum LogLevel { kNO_LOG, kLOG_ERROR, @@ -88,26 +116,29 @@ struct ToLog { Print printer_; }; -#define LOG(level) \ - if (level > paddle_mobile::log_level) { \ - } else \ - paddle_mobile::ToLog( \ - level, \ - (std::stringstream() \ - << "[file: " \ - << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \ - << "] [line: " << __LINE__ << "] ") \ - .str()) - -#define DLOG \ - if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \ - } else \ - paddle_mobile::ToLog( \ - paddle_mobile::kLOG_DEBUG, \ - (std::stringstream() \ - << "[file: " \ - << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \ - << "] [line: " << __LINE__ << "] ") \ +#define LOG(level) \ + if (level > paddle_mobile::log_level) { \ + } else \ + paddle_mobile::ToLog( \ + level, static_cast( \ + std::stringstream() \ + << "[file: " \ + << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \ + : __FILE__) \ + << "] [line: " << __LINE__ << "] ") \ + .str()) + +#define DLOG \ + if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \ + } else \ + paddle_mobile::ToLog( \ + paddle_mobile::kLOG_DEBUG, \ + static_cast( \ + std::stringstream() \ + << "[file: " \ + << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \ + : __FILE__) \ + << "] [line: " << __LINE__ << "] ") \ .str()) #define LOGF(level, format, ...) \ @@ -122,6 +153,11 @@ struct ToLog { #else +#define ANDROIDLOGI(...) +#define ANDROIDLOGW(...) +#define ANDROIDLOGD(...) +#define ANDROIDLOGE(...) + enum LogLevel { kNO_LOG, kLOG_ERROR, diff --git a/src/common/log.cpp b/src/common/macros.h similarity index 89% rename from src/common/log.cpp rename to src/common/macros.h index dbc4554c5f2ef6ea2dc9ec76714277e8e24b0c8a..ee38f19c9285b369e48c550b67f6c397695e73cf 100644 --- a/src/common/log.cpp +++ b/src/common/macros.h @@ -12,6 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "log.h" +#pragma once -namespace paddle_mobile {} +#define EXPORT __attribute__((visibility("default"))) diff --git a/src/platform/macros.h b/src/common/openmp-fix.cpp similarity index 51% rename from src/platform/macros.h rename to src/common/openmp-fix.cpp index ce133562cae0e4cd8720973c8f71ebca0e7e897d..8c31ef45c68227c612155e826e664367a7917501 100644 --- a/src/platform/macros.h +++ b/src/common/openmp-fix.cpp @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - -// Disable the copy and assignment operator for a class. -#ifndef DISABLE_COPY_AND_ASSIGN -#define DISABLE_COPY_AND_ASSIGN(classname) \ - private: \ - classname(const classname &) = delete; \ - classname(classname &&) = delete; \ - classname &operator=(const classname &) = delete; \ - classname &operator=(classname &&) = delete +#ifdef PADDLE_MOBILE_USE_OPENMP +/** + * android-ndk-r17 has a problem when linking with openmp. + * if paddle-mobile enables -fopenmp, but didn't use those omp_* functions, + * after linking another binary with libpaddle-mobile.so, the omp_get_thread_num + * will not work. see test/common/test_openmp.cc the detailed reason is still + * unclear, but this trick will work. a better solution is hacking the linker, + * try some flags to make it link omp_* functions, but I didn't find out how to + * make it work. 
+ */ +#include +static int _ = omp_get_num_procs(); #endif diff --git a/src/common/protobuf-c.c b/src/common/protobuf-c.c index fd0e3d80a21282fe7bb600c2fdb174411fa315a3..1092e3f78b02a343d8c8965ea7b2d777a6fac9ae 100644 --- a/src/common/protobuf-c.c +++ b/src/common/protobuf-c.c @@ -711,47 +711,6 @@ static inline size_t uint32_pack(uint32_t value, uint8_t *out) { return rv; } -/** - * Pack a signed 32-bit integer and return the number of bytes written. - * Negative numbers are encoded as two's complement 64-bit integers. - * - * \param value - * Value to encode. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. - */ -static inline size_t int32_pack(int32_t value, uint8_t *out) { - if (value < 0) { - out[0] = value | 0x80; - out[1] = (value >> 7) | 0x80; - out[2] = (value >> 14) | 0x80; - out[3] = (value >> 21) | 0x80; - out[4] = (value >> 28) | 0x80; - out[5] = out[6] = out[7] = out[8] = 0xff; - out[9] = 0x01; - return 10; - } else { - return uint32_pack(value, out); - } -} - -/** - * Pack a signed 32-bit integer using ZigZag encoding and return the number of - * bytes written. - * - * \param value - * Value to encode. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. - */ -static inline size_t sint32_pack(int32_t value, uint8_t *out) { - return uint32_pack(zigzag32(value), out); -} - /** * Pack a 64-bit unsigned integer using base-128 varint encoding and return the * number of bytes written. @@ -789,116 +748,6 @@ static size_t uint64_pack(uint64_t value, uint8_t *out) { return rv; } -/** - * Pack a 64-bit signed integer in ZigZag encoding and return the number of - * bytes written. - * - * \param value - * Value to encode. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. - */ -static inline size_t sint64_pack(int64_t value, uint8_t *out) { - return uint64_pack(zigzag64(value), out); -} - -/** - * Pack a 32-bit quantity in little-endian byte order. 
Used for protobuf wire - * types fixed32, sfixed32, float. Similar to "htole32". - * - * \param value - * Value to encode. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. - */ -static inline size_t fixed32_pack(uint32_t value, void *out) { -#if !defined(WORDS_BIGENDIAN) - memcpy(out, &value, 4); -#else - uint8_t *buf = out; - - buf[0] = value; - buf[1] = value >> 8; - buf[2] = value >> 16; - buf[3] = value >> 24; -#endif - return 4; -} - -/** - * Pack a 64-bit quantity in little-endian byte order. Used for protobuf wire - * types fixed64, sfixed64, double. Similar to "htole64". - * - * \todo The big-endian impl is really only good for 32-bit machines, a 64-bit - * version would be appreciated, plus a way to decide to use 64-bit math where - * convenient. - * - * \param value - * Value to encode. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. - */ -static inline size_t fixed64_pack(uint64_t value, void *out) { -#if !defined(WORDS_BIGENDIAN) - memcpy(out, &value, 8); -#else - fixed32_pack(value, out); - fixed32_pack(value >> 32, ((char *)out) + 4); -#endif - return 8; -} - -/** - * Pack a boolean value as an integer and return the number of bytes written. - * - * \todo Perhaps on some platforms *out = !!value would be a better impl, b/c - * that is idiomatic C++ in some STL implementations. - * - * \param value - * Value to encode. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. - */ -static inline size_t boolean_pack(protobuf_c_boolean value, uint8_t *out) { - *out = value ? TRUE : FALSE; - return 1; -} - -/** - * Pack a NUL-terminated C string and return the number of bytes written. The - * output includes a length delimiter. - * - * The NULL pointer is treated as an empty string. This isn't really necessary, - * but it allows people to leave required strings blank. (See Issue #13 in the - * bug tracker for a little more explanation). 
- * - * \param str - * String to encode. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. - */ -static inline size_t string_pack(const char *str, uint8_t *out) { - if (str == NULL) { - out[0] = 0; - return 1; - } else { - size_t len = strlen(str); - size_t rv = uint32_pack(len, out); - memcpy(out + rv, str, len); - return rv + len; - } -} - /** * Pack a ProtobufCBinaryData and return the number of bytes written. The output * includes a length delimiter. @@ -918,30 +767,6 @@ static inline size_t binary_data_pack(const ProtobufCBinaryData *bd, return rv + len; } -/** - * Pack a ProtobufCMessage and return the number of bytes written. The output - * includes a length delimiter. - * - * \param message - * ProtobufCMessage object to pack. - * \param[out] out - * Packed message. - * \return - * Number of bytes written to `out`. - */ -static inline size_t prefixed_message_pack(const ProtobufCMessage *message, - uint8_t *out) { - if (message == NULL) { - out[0] = 0; - return 1; - } else { - size_t rv = protobuf_c_message_pack(message, out + 1); - uint32_t rv_packed_size = uint32_size(rv); - if (rv_packed_size != 1) memmove(out + rv_packed_size, out + 1, rv); - return uint32_pack(rv, out) + rv; - } -} - /** * Pack a field tag. * @@ -963,143 +788,6 @@ static size_t tag_pack(uint32_t id, uint8_t *out) { return uint64_pack(((uint64_t)id) << 3, out); } -/** - * Pack a required field and return the number of bytes written. - * - * \param field - * Field descriptor. - * \param member - * The field member. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. 
- */ -static size_t required_field_pack(const ProtobufCFieldDescriptor *field, - const void *member, uint8_t *out) { - size_t rv = tag_pack(field->id, out); - - switch (field->type) { - case PROTOBUF_C_TYPE_SINT32: - out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT; - return rv + sint32_pack(*(const int32_t *)member, out + rv); - case PROTOBUF_C_TYPE_ENUM: - case PROTOBUF_C_TYPE_INT32: - out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT; - return rv + int32_pack(*(const int32_t *)member, out + rv); - case PROTOBUF_C_TYPE_UINT32: - out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT; - return rv + uint32_pack(*(const uint32_t *)member, out + rv); - case PROTOBUF_C_TYPE_SINT64: - out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT; - return rv + sint64_pack(*(const int64_t *)member, out + rv); - case PROTOBUF_C_TYPE_INT64: - case PROTOBUF_C_TYPE_UINT64: - out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT; - return rv + uint64_pack(*(const uint64_t *)member, out + rv); - case PROTOBUF_C_TYPE_SFIXED32: - case PROTOBUF_C_TYPE_FIXED32: - case PROTOBUF_C_TYPE_FLOAT: - out[0] |= PROTOBUF_C_WIRE_TYPE_32BIT; - return rv + fixed32_pack(*(const uint32_t *)member, out + rv); - case PROTOBUF_C_TYPE_SFIXED64: - case PROTOBUF_C_TYPE_FIXED64: - case PROTOBUF_C_TYPE_DOUBLE: - out[0] |= PROTOBUF_C_WIRE_TYPE_64BIT; - return rv + fixed64_pack(*(const uint64_t *)member, out + rv); - case PROTOBUF_C_TYPE_BOOL: - out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT; - return rv + boolean_pack(*(const protobuf_c_boolean *)member, out + rv); - case PROTOBUF_C_TYPE_STRING: - out[0] |= PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED; - return rv + string_pack(*(char *const *)member, out + rv); - case PROTOBUF_C_TYPE_BYTES: - out[0] |= PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED; - return rv + - binary_data_pack((const ProtobufCBinaryData *)member, out + rv); - case PROTOBUF_C_TYPE_MESSAGE: - out[0] |= PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED; - return rv + prefixed_message_pack(*(ProtobufCMessage *const *)member, - out + rv); - } - PROTOBUF_C__ASSERT_NOT_REACHED(); - return 0; -} - -/** - * 
Pack a oneof field and return the number of bytes written. Only packs the - * field that is selected by the case enum. - * - * \param field - * Field descriptor. - * \param oneof_case - * Enum value that selects the field in the oneof. - * \param member - * The field member. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. - */ -static size_t oneof_field_pack(const ProtobufCFieldDescriptor *field, - uint32_t oneof_case, const void *member, - uint8_t *out) { - if (oneof_case != field->id) { - return 0; - } - if (field->type == PROTOBUF_C_TYPE_MESSAGE || - field->type == PROTOBUF_C_TYPE_STRING) { - const void *ptr = *(const void *const *)member; - if (ptr == NULL || ptr == field->default_value) return 0; - } - return required_field_pack(field, member, out); -} - -/** - * Pack an optional field and return the number of bytes written. - * - * \param field - * Field descriptor. - * \param has - * Whether the field is set. - * \param member - * The field member. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. - */ -static size_t optional_field_pack(const ProtobufCFieldDescriptor *field, - const protobuf_c_boolean has, - const void *member, uint8_t *out) { - if (field->type == PROTOBUF_C_TYPE_MESSAGE || - field->type == PROTOBUF_C_TYPE_STRING) { - const void *ptr = *(const void *const *)member; - if (ptr == NULL || ptr == field->default_value) return 0; - } else { - if (!has) return 0; - } - return required_field_pack(field, member, out); -} - -/** - * Pack an unlabeled field and return the number of bytes written. - * - * \param field - * Field descriptor. - * \param member - * The field member. - * \param[out] out - * Packed value. - * \return - * Number of bytes written to `out`. 
- */ -static size_t unlabeled_field_pack(const ProtobufCFieldDescriptor *field, - const void *member, uint8_t *out) { - if (field_is_zeroish(field, member)) return 0; - return required_field_pack(field, member, out); -} - /** * Given a field type, return the in-memory size. * @@ -1139,236 +827,6 @@ static inline size_t sizeof_elt_in_repeated_array(ProtobufCType type) { return 0; } -/** - * Pack an array of 32-bit quantities. - * - * \param[out] out - * Destination. - * \param[in] in - * Source. - * \param[in] n - * Number of elements in the source array. - */ -static void copy_to_little_endian_32(void *out, const void *in, - const unsigned n) { -#if !defined(WORDS_BIGENDIAN) - memcpy(out, in, n * 4); -#else - unsigned i; - const uint32_t *ini = in; - for (i = 0; i < n; i++) fixed32_pack(ini[i], (uint32_t *)out + i); -#endif -} - -/** - * Pack an array of 64-bit quantities. - * - * \param[out] out - * Destination. - * \param[in] in - * Source. - * \param[in] n - * Number of elements in the source array. - */ -static void copy_to_little_endian_64(void *out, const void *in, - const unsigned n) { -#if !defined(WORDS_BIGENDIAN) - memcpy(out, in, n * 8); -#else - unsigned i; - const uint64_t *ini = in; - for (i = 0; i < n; i++) fixed64_pack(ini[i], (uint64_t *)out + i); -#endif -} - -/** - * Get the minimum number of bytes required to pack a field value of a - * particular type. - * - * \param type - * Field type. - * \return - * Number of bytes. - */ -static unsigned get_type_min_size(ProtobufCType type) { - if (type == PROTOBUF_C_TYPE_SFIXED32 || type == PROTOBUF_C_TYPE_FIXED32 || - type == PROTOBUF_C_TYPE_FLOAT) { - return 4; - } - if (type == PROTOBUF_C_TYPE_SFIXED64 || type == PROTOBUF_C_TYPE_FIXED64 || - type == PROTOBUF_C_TYPE_DOUBLE) { - return 8; - } - return 1; -} - -/** - * Get the packed size of an array of same field type. - * - * \param field - * Field descriptor. - * \param count - * Number of elements of this type. 
- * \param array - * The elements to get the size of. - * \return - * Number of bytes required. - */ -static size_t get_packed_payload_length(const ProtobufCFieldDescriptor *field, - unsigned count, const void *array) { - unsigned rv = 0; - unsigned i; - - switch (field->type) { - case PROTOBUF_C_TYPE_SFIXED32: - case PROTOBUF_C_TYPE_FIXED32: - case PROTOBUF_C_TYPE_FLOAT: - return count * 4; - case PROTOBUF_C_TYPE_SFIXED64: - case PROTOBUF_C_TYPE_FIXED64: - case PROTOBUF_C_TYPE_DOUBLE: - return count * 8; - case PROTOBUF_C_TYPE_ENUM: - case PROTOBUF_C_TYPE_INT32: { - const int32_t *arr = (const int32_t *)array; - for (i = 0; i < count; i++) rv += int32_size(arr[i]); - break; - } - case PROTOBUF_C_TYPE_SINT32: { - const int32_t *arr = (const int32_t *)array; - for (i = 0; i < count; i++) rv += sint32_size(arr[i]); - break; - } - case PROTOBUF_C_TYPE_UINT32: { - const uint32_t *arr = (const uint32_t *)array; - for (i = 0; i < count; i++) rv += uint32_size(arr[i]); - break; - } - case PROTOBUF_C_TYPE_SINT64: { - const int64_t *arr = (const int64_t *)array; - for (i = 0; i < count; i++) rv += sint64_size(arr[i]); - break; - } - case PROTOBUF_C_TYPE_INT64: - case PROTOBUF_C_TYPE_UINT64: { - const uint64_t *arr = (const uint64_t *)array; - for (i = 0; i < count; i++) rv += uint64_size(arr[i]); - break; - } - case PROTOBUF_C_TYPE_BOOL: - return count; - default: - PROTOBUF_C__ASSERT_NOT_REACHED(); - } - return rv; -} - -/** - * Pack an array of same field type to a virtual buffer. - * - * \param field - * Field descriptor. - * \param count - * Number of elements of this type. - * \param array - * The elements to get the size of. - * \param[out] buffer - * Virtual buffer to append data to. - * \return - * Number of bytes packed. 
- */ -static size_t pack_buffer_packed_payload(const ProtobufCFieldDescriptor *field, - unsigned count, const void *array, - ProtobufCBuffer *buffer) { - uint8_t scratch[16]; - size_t rv = 0; - unsigned i; - - switch (field->type) { - case PROTOBUF_C_TYPE_SFIXED32: - case PROTOBUF_C_TYPE_FIXED32: - case PROTOBUF_C_TYPE_FLOAT: -#if !defined(WORDS_BIGENDIAN) - rv = count * 4; - goto no_packing_needed; -#else - for (i = 0; i < count; i++) { - unsigned len = fixed32_pack(((uint32_t *)array)[i], scratch); - buffer->append(buffer, len, scratch); - rv += len; - } - break; -#endif - case PROTOBUF_C_TYPE_SFIXED64: - case PROTOBUF_C_TYPE_FIXED64: - case PROTOBUF_C_TYPE_DOUBLE: -#if !defined(WORDS_BIGENDIAN) - rv = count * 8; - goto no_packing_needed; -#else - for (i = 0; i < count; i++) { - unsigned len = fixed64_pack(((uint64_t *)array)[i], scratch); - buffer->append(buffer, len, scratch); - rv += len; - } - break; -#endif - case PROTOBUF_C_TYPE_ENUM: - case PROTOBUF_C_TYPE_INT32: - for (i = 0; i < count; i++) { - unsigned len = int32_pack(((int32_t *)array)[i], scratch); - buffer->append(buffer, len, scratch); - rv += len; - } - break; - case PROTOBUF_C_TYPE_SINT32: - for (i = 0; i < count; i++) { - unsigned len = sint32_pack(((int32_t *)array)[i], scratch); - buffer->append(buffer, len, scratch); - rv += len; - } - break; - case PROTOBUF_C_TYPE_UINT32: - for (i = 0; i < count; i++) { - unsigned len = uint32_pack(((uint32_t *)array)[i], scratch); - buffer->append(buffer, len, scratch); - rv += len; - } - break; - case PROTOBUF_C_TYPE_SINT64: - for (i = 0; i < count; i++) { - unsigned len = sint64_pack(((int64_t *)array)[i], scratch); - buffer->append(buffer, len, scratch); - rv += len; - } - break; - case PROTOBUF_C_TYPE_INT64: - case PROTOBUF_C_TYPE_UINT64: - for (i = 0; i < count; i++) { - unsigned len = uint64_pack(((uint64_t *)array)[i], scratch); - buffer->append(buffer, len, scratch); - rv += len; - } - break; - case PROTOBUF_C_TYPE_BOOL: - for (i = 0; i < count; 
i++) { - unsigned len = boolean_pack(((protobuf_c_boolean *)array)[i], scratch); - buffer->append(buffer, len, scratch); - rv += len; - } - return count; - default: - PROTOBUF_C__ASSERT_NOT_REACHED(); - } - return rv; - -#if !defined(WORDS_BIGENDIAN) -no_packing_needed: - buffer->append(buffer, rv, array); - return rv; -#endif -} - static inline int int_range_lookup(unsigned n_ranges, const ProtobufCIntRange *ranges, int value) { unsigned n; @@ -2638,147 +2096,3 @@ protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *message) { typedef void (*GenericHandler)(void *service, const ProtobufCMessage *input, ProtobufCClosure closure, void *closure_data); -void protobuf_c_service_invoke_internal(ProtobufCService *service, - unsigned method_index, - const ProtobufCMessage *input, - ProtobufCClosure closure, - void *closure_data) { - GenericHandler *handlers; - GenericHandler handler; - - /* - * Verify that method_index is within range. If this fails, you are - * likely invoking a newly added method on an old service. (Although - * other memory corruption bugs can cause this assertion too.) - */ - assert(method_index < service->descriptor->n_methods); - - /* - * Get the array of virtual methods (which are enumerated by the - * generated code). - */ - handlers = (GenericHandler *)(service + 1); - - /* - * Get our method and invoke it. - * \todo Seems like handler == NULL is a situation that needs handling. 
- */ - handler = handlers[method_index]; - (*handler)(service, input, closure, closure_data); -} - -void protobuf_c_service_generated_init( - ProtobufCService *service, const ProtobufCServiceDescriptor *descriptor, - ProtobufCServiceDestroy destroy) { - ASSERT_IS_SERVICE_DESCRIPTOR(descriptor); - service->descriptor = descriptor; - service->destroy = destroy; - service->invoke = protobuf_c_service_invoke_internal; - memset(service + 1, 0, descriptor->n_methods * sizeof(GenericHandler)); -} - -void protobuf_c_service_destroy(ProtobufCService *service) { - service->destroy(service); -} - -/* --- querying the descriptors --- */ - -const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value_by_name( - const ProtobufCEnumDescriptor *desc, const char *name) { - unsigned start = 0; - unsigned count; - - if (desc == NULL || desc->values_by_name == NULL) return NULL; - - count = desc->n_value_names; - - while (count > 1) { - unsigned mid = start + count / 2; - int rv = strcmp(desc->values_by_name[mid].name, name); - if (rv == 0) - return desc->values + desc->values_by_name[mid].index; - else if (rv < 0) { - count = start + count - (mid + 1); - start = mid + 1; - } else - count = mid - start; - } - if (count == 0) return NULL; - if (strcmp(desc->values_by_name[start].name, name) == 0) - return desc->values + desc->values_by_name[start].index; - return NULL; -} - -const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value( - const ProtobufCEnumDescriptor *desc, int value) { - int rv = int_range_lookup(desc->n_value_ranges, desc->value_ranges, value); - if (rv < 0) return NULL; - return desc->values + rv; -} - -const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field_by_name( - const ProtobufCMessageDescriptor *desc, const char *name) { - unsigned start = 0; - unsigned count; - const ProtobufCFieldDescriptor *field; - - if (desc == NULL || desc->fields_sorted_by_name == NULL) return NULL; - - count = desc->n_fields; - - while (count > 1) { - unsigned mid 
= start + count / 2; - int rv; - field = desc->fields + desc->fields_sorted_by_name[mid]; - rv = strcmp(field->name, name); - if (rv == 0) - return field; - else if (rv < 0) { - count = start + count - (mid + 1); - start = mid + 1; - } else - count = mid - start; - } - if (count == 0) return NULL; - field = desc->fields + desc->fields_sorted_by_name[start]; - if (strcmp(field->name, name) == 0) return field; - return NULL; -} - -const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field( - const ProtobufCMessageDescriptor *desc, unsigned value) { - int rv = int_range_lookup(desc->n_field_ranges, desc->field_ranges, value); - if (rv < 0) return NULL; - return desc->fields + rv; -} - -const ProtobufCMethodDescriptor * -protobuf_c_service_descriptor_get_method_by_name( - const ProtobufCServiceDescriptor *desc, const char *name) { - unsigned start = 0; - unsigned count; - - if (desc == NULL || desc->method_indices_by_name == NULL) return NULL; - - count = desc->n_methods; - - while (count > 1) { - unsigned mid = start + count / 2; - unsigned mid_index = desc->method_indices_by_name[mid]; - const char *mid_name = desc->methods[mid_index].name; - int rv = strcmp(mid_name, name); - - if (rv == 0) return desc->methods + desc->method_indices_by_name[mid]; - if (rv < 0) { - count = start + count - (mid + 1); - start = mid + 1; - } else { - count = mid - start; - } - } - if (count == 0) return NULL; - if (strcmp(desc->methods[desc->method_indices_by_name[start]].name, name) == - 0) - return desc->methods + desc->method_indices_by_name[start]; - return NULL; -} diff --git a/src/common/protobuf-c.h b/src/common/protobuf-c.h index a04559d79a02cf11d2b8f5f168c34377c313bc9a..bd85695b868af6c7b91590196339bc4f7826a256 100644 --- a/src/common/protobuf-c.h +++ b/src/common/protobuf-c.h @@ -798,76 +798,6 @@ uint32_t protobuf_c_version_number(void); */ #define PROTOBUF_C_MIN_COMPILER_VERSION 1000000 -/** - * Look up a `ProtobufCEnumValue` from a `ProtobufCEnumDescriptor` by 
name. - * - * \param desc - * The `ProtobufCEnumDescriptor` object. - * \param name - * The `name` field from the corresponding `ProtobufCEnumValue` object to - * match. - * \return - * A `ProtobufCEnumValue` object. - * \retval NULL - * If not found or if the optimize_for = CODE_SIZE option was set. - */ -PROTOBUF_C__API -const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value_by_name( - const ProtobufCEnumDescriptor *desc, const char *name); - -/** - * Look up a `ProtobufCEnumValue` from a `ProtobufCEnumDescriptor` by numeric - * value. - * - * \param desc - * The `ProtobufCEnumDescriptor` object. - * \param value - * The `value` field from the corresponding `ProtobufCEnumValue` object to - * match. - * - * \return - * A `ProtobufCEnumValue` object. - * \retval NULL - * If not found. - */ -PROTOBUF_C__API -const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value( - const ProtobufCEnumDescriptor *desc, int value); - -/** - * Look up a `ProtobufCFieldDescriptor` from a `ProtobufCMessageDescriptor` by - * the name of the field. - * - * \param desc - * The `ProtobufCMessageDescriptor` object. - * \param name - * The name of the field. - * \return - * A `ProtobufCFieldDescriptor` object. - * \retval NULL - * If not found or if the optimize_for = CODE_SIZE option was set. - */ -PROTOBUF_C__API -const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field_by_name( - const ProtobufCMessageDescriptor *desc, const char *name); - -/** - * Look up a `ProtobufCFieldDescriptor` from a `ProtobufCMessageDescriptor` by - * the tag value of the field. - * - * \param desc - * The `ProtobufCMessageDescriptor` object. - * \param value - * The tag value of the field. - * \return - * A `ProtobufCFieldDescriptor` object. - * \retval NULL - * If not found. 
- */ -PROTOBUF_C__API -const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field( - const ProtobufCMessageDescriptor *desc, unsigned value); - /** * Determine the number of bytes required to store the serialised message. * @@ -947,33 +877,6 @@ PROTOBUF_C__API void protobuf_c_message_init(const ProtobufCMessageDescriptor *descriptor, void *message); -/** - * Free a service. - * - * \param service - * The service object to free. - */ -PROTOBUF_C__API -void protobuf_c_service_destroy(ProtobufCService *service); - -/** - * Look up a `ProtobufCMethodDescriptor` by name. - * - * \param desc - * Service descriptor. - * \param name - * Name of the method. - * - * \return - * A `ProtobufCMethodDescriptor` object. - * \retval NULL - * If not found or if the optimize_for = CODE_SIZE option was set. - */ -PROTOBUF_C__API -const ProtobufCMethodDescriptor * -protobuf_c_service_descriptor_get_method_by_name( - const ProtobufCServiceDescriptor *desc, const char *name); - /** * Initialise a `ProtobufCBufferSimple` object. */ @@ -1011,18 +914,6 @@ PROTOBUF_C__API void protobuf_c_buffer_simple_append(ProtobufCBuffer *buffer, size_t len, const unsigned char *data); -PROTOBUF_C__API -void protobuf_c_service_generated_init( - ProtobufCService *service, const ProtobufCServiceDescriptor *descriptor, - ProtobufCServiceDestroy destroy); - -PROTOBUF_C__API -void protobuf_c_service_invoke_internal(ProtobufCService *service, - unsigned method_index, - const ProtobufCMessage *input, - ProtobufCClosure closure, - void *closure_data); - /**@}*/ PROTOBUF_C__END_DECLS diff --git a/src/common/threadpool.h b/src/common/threadpool.h new file mode 100644 index 0000000000000000000000000000000000000000..bf7894dd94a20f4f51df23c6355d26d6da3af01d --- /dev/null +++ b/src/common/threadpool.h @@ -0,0 +1,126 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace paddle_mobile { +class ThreadPool { + public: + static ThreadPool& getThreadPool(); + static int getThreadPoolThreadId(); + explicit ThreadPool(size_t); + template + auto enqueue(F&& f, Args&&... args) + -> std::future::type>; + ~ThreadPool(); + int getTid(const std::thread::id& id) { + for (int i = 0; i < workers.size(); i++) { + if (workers[i].get_id() == id) { + return i; + } + } + return -1; + } + + private: + // need to keep track of threads so we can join them + std::vector workers; + // the task queue + std::queue> tasks; + + // synchronization + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; +}; + +// the constructor just launches some amount of workers +inline ThreadPool::ThreadPool(size_t threads) : stop(false) { + for (size_t i = 0; i < threads; ++i) + workers.emplace_back([this] { + for (;;) { + std::function task; + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait( + lock, [this] { return this->stop || !this->tasks.empty(); }); + // for (;;) { + // if (this->stop || !this->tasks.empty()) { + // break; + // } + // lock.unlock(); + // lock.lock(); + // } + if (this->stop && this->tasks.empty()) return; + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + + task(); + } + }); +} + +// add new work item to the pool +template +auto 
ThreadPool::enqueue(F&& f, Args&&... args) + -> std::future::type> { + using return_type = typename std::result_of::type; + + auto task = std::make_shared>( + std::bind(std::forward(f), std::forward(args)...)); + + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + + // don't allow enqueueing after stopping the pool + // if(stop) + // throw std::runtime_error("enqueue on stopped ThreadPool"); + + tasks.emplace([task]() { (*task)(); }); + } + condition.notify_one(); + return res; +} + +// the destructor joins all threads +inline ThreadPool::~ThreadPool() { + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for (std::thread& worker : workers) worker.join(); +} + +ThreadPool& ThreadPool::getThreadPool() { + static ThreadPool threadPool(3); + return threadPool; +} + +int ThreadPool::getThreadPoolThreadId() { + return getThreadPool().getTid(std::this_thread::get_id()); +} +} // namespace paddle_mobile diff --git a/src/common/type_define.h b/src/common/type_define.h index 63665bf7933f773b2b3de40ade9c700e3e93e6a9..389f9a715f8cec3f0b494ae3b43b3952e49677f8 100644 --- a/src/common/type_define.h +++ b/src/common/type_define.h @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once; +#pragma once +#include #include #include -#include #include #include "framework/attribute.h" #include "framework/scope.h" @@ -40,13 +40,6 @@ using OpCreator = std::function *( const framework::AttributeMap & /*attrs*/, std::shared_ptr /*scope*/)>; -using GradOpMakerFN = - std::function>( - const framework::OpDesc &, - const std::unordered_set & /*no_grad_set*/, - std::unordered_map * /*grad_to_var*/, - const std::vector &grad_block)>; - using InferVarTypeFN = std::function; diff --git a/src/common/types.cpp b/src/common/types.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8c06b6f0f69f8b1a85ca24bfcde2fd217ecc8f0d --- /dev/null +++ b/src/common/types.cpp @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "common/types.h" +#include + +namespace paddle_mobile { + +const std::string G_OP_TYPE_CONV = "conv2d"; +const std::string G_OP_TYPE_BATCHNORM = "batch_norm"; +const std::string G_OP_TYPE_BOX_CODER = "box_coder"; +const std::string G_OP_TYPE_CONCAT = "concat"; +const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; +const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu"; +const std::string G_OP_TYPE_FC = "fc"; +const std::string G_OP_TYPE_CONV_ADD = "conv_add"; +const std::string G_OP_TYPE_LRN = "lrn"; +const std::string G_OP_TYPE_MUL = "mul"; +const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms"; +const std::string G_OP_TYPE_POOL2D = "pool2d"; +const std::string G_OP_TYPE_PRIOR_BOX = "prior_box"; +const std::string G_OP_TYPE_RELU = "relu"; +const std::string G_OP_TYPE_RESHAPE = "reshape"; +const std::string G_OP_TYPE_SIGMOID = "sigmoid"; +const std::string G_OP_TYPE_SOFTMAX = "softmax"; +const std::string G_OP_TYPE_TRANSPOSE = "transpose"; +const std::string G_OP_TYPE_SPLIT = "split"; +const std::string G_OP_TYPE_FEED = "feed"; +const std::string G_OP_TYPE_FETCH = "fetch"; +const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d"; +const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence"; + +std::unordered_map< + std::string, std::pair, std::vector>> + op_input_output_key = { + {G_OP_TYPE_CONV, {{"Input"}, {"Output"}}}, + {G_OP_TYPE_CONV_ADD, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, + {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}}, + {G_OP_TYPE_MUL, {{"X"}, {"Out"}}}, + {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}}, + {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}}, + {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}}, + {G_OP_TYPE_LRN, {{"X"}, {"Out"}}}, + {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}}, + {G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FEED, {{"X"}, {"Out"}}}, + {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}, + {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_BOX_CODER, + {{"PriorBox", "PriorBoxVar", 
"TargetBox"}, {"OutputBox"}}}, + {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}}, + {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, + {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, + {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, + {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}}}; + +} // namespace paddle_mobile diff --git a/src/common/types.h b/src/common/types.h index 77aeec0426b1e3c627bb493ab11eabe3ca3a6d98..e632c0b52f5b6ba578b8dd9d62c587b0e949768f 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once; +#pragma once #include #include -#include +#include namespace paddle_mobile { enum class Precision : int { FP32 = 0 }; @@ -72,50 +72,32 @@ enum PMStatus { PMWrongDevice = 0x08 /*!< un-correct device. 
*/ }; -static const std::string G_OP_TYPE_CONV = "conv2d"; -static const std::string G_OP_TYPE_BATCHNORM = "batch_norm"; -static const std::string G_OP_TYPE_BOX_CODER = "box_coder"; -static const std::string G_OP_TYPE_CONCAT = "concat"; -static const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; -static const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = - "fusion_conv_add_relu"; -static const std::string G_OP_TYPE_FC = "fc"; -static const std::string G_OP_TYPE_LRN = "lrn"; -static const std::string G_OP_TYPE_MUL = "mul"; -static const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms"; -static const std::string G_OP_TYPE_POOL2D = "pool2d"; -static const std::string G_OP_TYPE_PRIOR_BOX = "prior_box"; -static const std::string G_OP_TYPE_RELU = "relu"; -static const std::string G_OP_TYPE_RESHAPE = "reshape"; -static const std::string G_OP_TYPE_SIGMOID = "sigmoid"; -static const std::string G_OP_TYPE_SOFTMAX = "softmax"; -static const std::string G_OP_TYPE_TRANSPOSE = "transpose"; -static const std::string G_OP_TYPE_SPLIT = "split"; -static const std::string G_OP_TYPE_FEED = "feed"; -static const std::string G_OP_TYPE_FETCH = "fetch"; -static const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d"; -static const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence"; +extern const std::string G_OP_TYPE_CONV; +extern const std::string G_OP_TYPE_BATCHNORM; +extern const std::string G_OP_TYPE_BOX_CODER; +extern const std::string G_OP_TYPE_CONCAT; +extern const std::string G_OP_TYPE_ELEMENTWISE_ADD; +extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU; +extern const std::string G_OP_TYPE_FC; +extern const std::string G_OP_TYPE_CONV_ADD; +extern const std::string G_OP_TYPE_LRN; +extern const std::string G_OP_TYPE_MUL; +extern const std::string G_OP_TYPE_MULTICLASS_NMS; +extern const std::string G_OP_TYPE_POOL2D; +extern const std::string G_OP_TYPE_PRIOR_BOX; +extern const std::string G_OP_TYPE_RELU; +extern const std::string G_OP_TYPE_RESHAPE; 
+extern const std::string G_OP_TYPE_SIGMOID; +extern const std::string G_OP_TYPE_SOFTMAX; +extern const std::string G_OP_TYPE_TRANSPOSE; +extern const std::string G_OP_TYPE_SPLIT; +extern const std::string G_OP_TYPE_FEED; +extern const std::string G_OP_TYPE_FETCH; +extern const std::string G_OP_TYPE_DEPTHWISE_CONV; +extern const std::string G_OP_TYPE_IM2SEQUENCE; -static std::unordered_map< +extern std::unordered_map< std::string, std::pair, std::vector>> - op_input_output_key = { - {G_OP_TYPE_CONV, {{"Input"}, {"Output"}}}, - {G_OP_TYPE_RELU, {{"X"}, {"Out"}}}, - {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}}, - {G_OP_TYPE_MUL, {{"X"}, {"Out"}}}, - {G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}}, - {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}}, - {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}}, - {G_OP_TYPE_LRN, {{"X"}, {"Out"}}}, - {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}}, - {G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}}, - {G_OP_TYPE_FEED, {{"X"}, {"Out"}}}, - {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}, - {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}}, - {G_OP_TYPE_BOX_CODER, - {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}}, - {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}}, - {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, - {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, - {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}}}; + op_input_output_key; + } // namespace paddle_mobile diff --git a/src/common/variant.cpp b/src/common/variant.cpp deleted file mode 100644 index 6bbf34eae933d69d00517c723326111901444ab0..0000000000000000000000000000000000000000 --- a/src/common/variant.cpp +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ diff --git a/src/common/variant.h b/src/common/variant.h index c198ff511c2e90e0387238aede9f1373f0379d3c..7fbf0ec0772f102165770dc9c8e053f469965f10 100644 --- a/src/common/variant.h +++ b/src/common/variant.h @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - +#include "common/enforce.h" #include "common/log.h" #pragma once @@ -57,15 +56,11 @@ class RawData { char data[size]; RawData() {} RawData(const RawData &raw_data) { strcpy(data, raw_data.data); } - // void operator=(const RawData &raw_data){ - // strcpy(data, raw_data.data); - // } }; template struct Variant { Variant(const Variant &variant) { - // std::cout << " 赋值构造函数 " << std::endl; type_id = variant.type_id; data = variant.data; } @@ -87,8 +82,7 @@ struct Variant { if (type_id == typeid(T).hash_code()) { return *const_cast(reinterpret_cast(&data)); } else { - // std::cout << " bad cast in variant " << std::endl; - throw std::bad_cast(); + PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant "); } } diff --git a/src/framework/attribute.cpp b/src/framework/attribute.cpp index 01b0ed523c2ccf125c4bb81d3d50ff5e4b289c7e..8b150f4e9e6aa3ccc30f13f661ff9cd6be79ae7a 100644 --- a/src/framework/attribute.cpp +++ b/src/framework/attribute.cpp @@ -17,14 +17,8 @@ limitations under the License. 
*/ namespace paddle_mobile { namespace framework { -/* - * Variant, std::vector, - std::vector, bool, std::vector, BlockDesc *, - int64_t> - * */ - struct PrintVistor : Vistor { - PrintVistor(Print &printer) : printer_(printer) {} + explicit PrintVistor(Print &printer) : printer_(printer) {} template Print &operator()(const T &value) { printer_ << value; diff --git a/src/framework/attribute.h b/src/framework/attribute.h index b77d94521e8be9bdfdfd00ca1628bdefc60d688d..478fc1b3f5ae95d9720b057d3ab0d2e8912e7093 100644 --- a/src/framework/attribute.h +++ b/src/framework/attribute.h @@ -14,7 +14,11 @@ limitations under the License. */ #pragma once +#include +#include #include +#include + #include "common/enforce.h" #include "common/log.h" #include "common/variant.h" @@ -22,28 +26,15 @@ limitations under the License. */ namespace paddle_mobile { namespace framework { +using std::string; +using std::vector; class BlockDesc; class Attribute { public: - /* - * PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT = 0, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT = 1, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING = 2, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS = 3, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS = 4, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS = 5, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN = 6, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8, - PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = 9 - PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE) - * - * */ static Attribute GetAttrValue( PaddleMobile__Framework__Proto__OpDesc__Attr *attr_desc) { - // std::cout << "begin get attr value" << std::endl; Attribute attr; switch (attr_desc->type) { case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN: { @@ -63,35 +54,35 @@ class Attribute { break; } case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS: { - 
std::vector val(attr_desc->n_bools); + vector val(attr_desc->n_bools); for (int i = 0; i < attr_desc->n_bools; ++i) { val[i] = attr_desc->bools[i]; } - attr.Set>(val); + attr.Set>(val); break; } case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS: { - std::vector val(attr_desc->n_ints); + vector val(attr_desc->n_ints); for (int i = 0; i < attr_desc->n_ints; ++i) { val[i] = attr_desc->ints[i]; } - attr.Set>(val); + attr.Set>(val); break; } case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS: { - std::vector val(attr_desc->n_floats); + vector val(attr_desc->n_floats); for (int i = 0; i < attr_desc->n_floats; ++i) { val[i] = attr_desc->floats[i]; } - attr.Set>(val); + attr.Set>(val); break; } case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS: { - std::vector val(attr_desc->n_strings); + vector val(attr_desc->n_strings); for (int i = 0; i < attr_desc->n_strings; ++i) { val[i] = attr_desc->strings[i]; } - attr.Set>(val); + attr.Set>(val); break; } case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG: { @@ -122,47 +113,41 @@ class Attribute { return vistor(attr.variant_.Get()); } else if (attr.variant_.TypeId() == typeid(float).hash_code()) { return vistor(attr.variant_.Get()); - } else if (attr.variant_.TypeId() == typeid(std::string).hash_code()) { - return vistor(attr.variant_.Get()); - } else if (attr.variant_.TypeId() == typeid(std::vector).hash_code()) { - return vistor(attr.variant_.Get>()); - } else if (attr.variant_.TypeId() == - typeid(std::vector).hash_code()) { - return vistor(attr.variant_.Get>()); - } else if (attr.variant_.TypeId() == - typeid(std::vector).hash_code()) { - return vistor(attr.variant_.Get>()); + } else if (attr.variant_.TypeId() == typeid(string).hash_code()) { + return vistor(attr.variant_.Get()); + } else if (attr.variant_.TypeId() == typeid(vector).hash_code()) { + return vistor(attr.variant_.Get>()); + } else if (attr.variant_.TypeId() == typeid(vector).hash_code()) { + return vistor(attr.variant_.Get>()); + } else if 
(attr.variant_.TypeId() == typeid(vector).hash_code()) { + return vistor(attr.variant_.Get>()); } else if (attr.variant_.TypeId() == typeid(bool).hash_code()) { return vistor(attr.variant_.Get()); - } else if (attr.variant_.TypeId() == - typeid(std::vector).hash_code()) { - return vistor(attr.variant_.Get>()); + } else if (attr.variant_.TypeId() == typeid(vector).hash_code()) { + return vistor(attr.variant_.Get>()); } else if (attr.variant_.TypeId() == typeid(int64_t).hash_code()) { return vistor(attr.variant_.Get()); } else { - throw std::bad_exception(); + PADDLE_MOBILE_THROW_EXCEPTION("type not support"); } } private: - Variant, std::vector, - std::vector, bool, std::vector, BlockDesc *, - int64_t> + Variant, vector, vector, bool, + vector, BlockDesc *, int64_t> variant_; }; -using AttributeMap = std::unordered_map; +using AttributeMap = std::unordered_map; class AttrReader { public: explicit AttrReader(const AttributeMap &attrs) : attrs_(attrs) {} template - inline T Get(const std::string &name) const { - // PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should - // be in - // AttributeMap", - // name); + inline T Get(const string &name) const { + PADDLE_MOBILE_ENFORCE(attrs_.count(name) != 0, + "%s should be in AttributeMap", name); return ((Attribute)attrs_.at(name)).Get(); } diff --git a/src/framework/data_layout.h b/src/framework/data_layout.h index 72c16c36733c0660ae2cf46de31031370eed444a..3b31445707a887a2715afd0b9e7192ad76724351 100644 --- a/src/framework/data_layout.h +++ b/src/framework/data_layout.h @@ -15,7 +15,6 @@ limitations under the License. 
*/ #pragma once #include -#include #include namespace paddle_mobile { @@ -40,7 +39,7 @@ inline DataLayout StringToDataLayout(const std::string &str) { } else if (s == "ANYLAYOUT") { return DataLayout::kAnyLayout; } else { - // std::cout << "Unknown storage order string: %s", s; + PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str()) } } @@ -54,14 +53,8 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) { return "ANY_LAYOUT"; default: break; - // std::cout << "unknown DataLayou %d", data_layout; } } -inline std::ostream &operator<<(std::ostream &out, const DataLayout &l) { - out << DataLayoutToString(l); - return out; -} - } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/data_transform.cpp b/src/framework/data_transform.cpp deleted file mode 100644 index a6be4d2fcbbc6e0dd2adb9f71d644b2bd60d4259..0000000000000000000000000000000000000000 --- a/src/framework/data_transform.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "framework/data_transform.h" - -namespace paddle_mobile { -namespace framework { - -static void PassTensorData(Tensor *from, Tensor *to) { - to->ShareDataWith(*from); - *from = Tensor(); -} - -void DataTransform(const OpKernelType &expected_kernel_type, - const OpKernelType &kernel_type_for_var, - const Tensor &input_tensor, Tensor *output_tensor) { - bool transformed = false; - Tensor in; - in.ShareDataWith(input_tensor); - Tensor out; - - // // do layout transform - // if (NeedTransformLayout(expected_kernel_type.data_layout_, - // kernel_type_for_var.data_layout_)) { - // TransDataLayout(kernel_type_for_var, expected_kernel_type, in, - // &out); - // transformed = true; - // PassTensorData(&out, &in); - // } - // - // // do data type transform - // if (expected_kernel_type.data_type_ != - // kernel_type_for_var.data_type_) { - // TransDataType(kernel_type_for_var, expected_kernel_type, in, - // &out); - // transformed = true; - // PassTensorData(&out, &in); - // } - // - // // do device transform - // if (!platform::is_same_place(kernel_type_for_var.place_, - // expected_kernel_type.place_)) { - // TransDataDevice(in, expected_kernel_type.place_, &out); - // transformed = true; - // PassTensorData(&out, &in); - // } - // - // PADDLE_ENFORCE(transformed, "No transform is applied, please - // check!"); - // get output data - output_tensor->ShareDataWith(in); -} - -void CopyVariableWithTensor(const Variable &in_var, const Tensor &tensor, - Variable *out_var) { - // if (in_var.IsType()) { - // auto& in_lod_tensor = in_var.Get(); - // auto* tran_lod_tensor = out_var.GetMutable(); - // tran_lod_tensor->set_lod(in_lod_tensor.lod()); - // tran_lod_tensor->set_layout(in_lod_tensor.layout()); - // tran_lod_tensor->ShareDataWith(tensor); - // } else if (in_var.IsType()) { - // auto& in_selected_rows = in_var.Get(); - // auto* trans_selected_rows = - // out_var.GetMutable(); - // trans_selected_rows->set_height(in_selected_rows.height()); - // 
trans_selected_rows->set_rows(in_selected_rows.rows()); - // trans_selected_rows->mutable_value()->ShareDataWith(tensor); - // } else { - // PADDLE_THROW("unknown var type"); - // } -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/src/framework/data_type.h b/src/framework/data_type.h deleted file mode 100644 index ddfc0dcc4adf8e5897f5f4ea67f9514889863f32..0000000000000000000000000000000000000000 --- a/src/framework/data_type.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -namespace paddle_mobile { -namespace framework { - -// inline proto::VarType::Type ToDataType(std::type_index type) { -// using namespace paddle_mobile::framework::proto; -// if (typeid(float).hash_code() == type.hash_code()) { -// return proto::VarType::FP32; -// } else if (typeid(double).hash_code() == type.hash_code()) { -// return proto::VarType::FP64; -// } else if (typeid(int).hash_code() == type.hash_code()) { -// return proto::VarType::INT32; -// } else if (typeid(int64_t).hash_code() == type.hash_code()) { -// return proto::VarType::INT64; -// } else if (typeid(bool).hash_code() == type.hash_code()) { -// return proto::VarType::BOOL; -// } else { -//// PADDLE_THROW("Not supported"); -// } -// } -} // namespace framework -} // namespace paddle_mobile diff --git a/src/framework/ddim.cpp b/src/framework/ddim.cpp index db6f2cd6aba92fec6a42839c0e3198ac749807b0..3a4a3abb7cd4c632251e6f0190e32c99dd232c01 100644 --- a/src/framework/ddim.cpp +++ b/src/framework/ddim.cpp @@ -63,9 +63,6 @@ void make_ddim(DDim &ddim, const int64_t *dims, int n) { ddim = make_dim<9>(dims); break; default: - // std::cout << "Dynamic dimensions must have between [1, - // 9] - // dimensions."; break; } } @@ -133,9 +130,6 @@ int64_t DDim::operator[](int idx) const { int DDim::size() const { return arity(*this); } bool DDim::operator==(DDim d) const { - // if (var.which() != d.getVar().which()) { - // return false; - // } else { std::vector v1 = vectorize(*this); std::vector v2 = vectorize(d); @@ -157,7 +151,7 @@ DDim DDim::operator+(DDim d) const { std::vector v3; - assert(v1.size() == v2.size()); + PADDLE_MOBILE_ENFORCE(v1.size() == v2.size(), "v1.size() != v2.size()"); for (unsigned int i = 0; i < v1.size(); i++) { v3.push_back(v1[i] + v2[i]); @@ -172,7 +166,7 @@ DDim DDim::operator*(DDim d) const { std::vector v3; - assert(v1.size() == v2.size()); + PADDLE_MOBILE_ENFORCE(v1.size() == v2.size(), "v1.size() == v2.size()"); for (unsigned int i = 0; i < v1.size(); i++) 
{ v3.push_back(v1[i] * v2[i]); @@ -183,7 +177,7 @@ DDim DDim::operator*(DDim d) const { int64_t get(const DDim &ddim, int idx) { return ddim[idx]; } -void set(DDim &ddim, int idx, int value) { ddim[idx] = value; } +void set(DDim *ddim, int idx, int value) { (*ddim)[idx] = value; } /// @cond HIDDEN struct VectorizeVisitor : Vistor { @@ -235,13 +229,10 @@ struct SliceVectorizeVisitor : Vistor { SliceVectorizeVisitor(std::vector &v, int b, int e) : vector(v), begin(b), end(e) { - // PADDLE_ENFORCE(begin < end, - // "Begin index must be less than end index in - // ddim - // slice."); - // PADDLE_ENFORCE(begin >= 0, - // "Begin index can't be less than zero in - // ddim slice."); + PADDLE_MOBILE_ENFORCE( + begin < end, "Begin index must be less than end index in ddim slice."); + PADDLE_MOBILE_ENFORCE(begin >= 0, + "Begin index can't be less than zero in ddim slice."); } template @@ -267,9 +258,7 @@ DDim slice_ddim(const DDim &ddim, int begin, int end) { std::vector vec; vec.reserve(end - begin); SliceVectorizeVisitor visitor(vec, begin, end); - // boost::apply_visitor(visitor, dim); DDim::ApplyVistor(visitor, ddim); - // visitor(ddim.var.Get>()); return make_ddim(vec); } @@ -287,31 +276,19 @@ struct ArityVisitor : Vistor { int arity(const DDim &d) { ArityVisitor arityVisitor = ArityVisitor(); return DDim::ApplyVistor(arityVisitor, d); - // return arityVisitor(d.var.Get>()); - // return boost::apply_visitor(ArityVisitor(), d); } } -/// \cond HIDDEN - -/// \endcond -struct OSVistor : Vistor { - OSVistor(std::ostream &os) : os_(os) {} - - template - std::ostream &operator()(Dim dim) const { - return os_ << dim; +#ifdef PADDLE_MOBILE_DEBUG +Print &operator<<(Print &printer, const DDim &ddim) { + for (int j = 0; j < ddim.size(); ++j) { + printer << ddim[j] << " "; } - private: - std::ostream &os_; -}; - -std::ostream &operator<<(std::ostream &os, const DDim &ddim) { - auto vistor = OSVistor(os); - DDim::ApplyVistor(vistor, ddim); - return os; + return printer; } +#endif + 
DDim::DDim(std::initializer_list init_list) { *this = make_ddim(init_list); } diff --git a/src/framework/ddim.h b/src/framework/ddim.h index 88039b2e0a57b4f79247129d1d95e4d5954da6c6..ff94c24adac5bea1fd4c7a80857b212056c06d36 100644 --- a/src/framework/ddim.h +++ b/src/framework/ddim.h @@ -14,10 +14,10 @@ limitations under the License. */ #pragma once -#include #include -#include +#include #include +#include "common/enforce.h" #include "common/variant.h" #include "dim.h" @@ -58,9 +58,7 @@ struct DDim { } else if (d.var.TypeId() == typeid(Dim<9>).hash_code()) { return vistor(d.var.Get>()); } else { - printf(" dim not support \n"); - throw std::bad_exception(); - // return typename Vistor::type_t(); + DLOG << " dim not support"; } } @@ -83,17 +81,6 @@ struct DDim { int64_t operator[](int idx) const; - // template - // typename Visitor::result_type apply_visitor(Visitor& visitor) { - // return var.apply_visitor(visitor); - // } - // - // template - // typename Visitor::result_type apply_visitor(Visitor& visitor) - // const { - // return var.apply_visitor(visitor); - // } - DDimVar getVar() { return var; } bool operator==(DDim d) const; @@ -126,7 +113,7 @@ DDim make_ddim(std::initializer_list dims); int64_t get(const DDim &dim, int idx); -void set(DDim &dim, int idx, int val); +void set(DDim *dim, int idx, int val); std::vector vectorize(const DDim &ddim); @@ -151,8 +138,6 @@ DDim slice_ddim(const DDim &dim, int begin, int end); int arity(const DDim &ddim); -std::ostream &operator<<(std::ostream &, const DDim &); - // Reshape a tensor to a matrix. The matrix's first dimension(column // length) // will be the product of tensor's first `num_col_dims` dimensions. 
@@ -163,5 +148,9 @@ DDim flatten_to_1d(const DDim &src); DDim stride(const DDim &ddim); DDim stride_numel(const DDim &ddim); + +#ifdef PADDLE_MOBILE_DEBUG +Print &operator<<(Print &printer, const DDim &ddim); +#endif } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/dim.h b/src/framework/dim.h index 6740386c057d6e3a3466219170073cf65b29e03e..38e62df99519c3e869dc0fd2ae71beed28370122 100644 --- a/src/framework/dim.h +++ b/src/framework/dim.h @@ -14,13 +14,7 @@ limitations under the License. */ #pragma once -#include -#include -#include -#include - -#include "platform/hostdevice.h" - +#include "common/enforce.h" namespace paddle_mobile { namespace framework { @@ -30,42 +24,35 @@ struct Dim { static constexpr int dimensions = i; template - HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) { + Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) { static_assert(sizeof...(_tail) == i - 1, "Dim initialized with the wrong number of parameters"); } - HOSTDEVICE Dim(int64_t _head, const Dim &_tail) : head(_head), tail(_tail) {} - HOSTDEVICE Dim() : head(0), tail() {} /** Construct a Dim from a linear index and size. Uses Fortran * order * indexing. 
*/ - HOSTDEVICE Dim(int64_t idx, const Dim &size) : head(idx % size.head), tail(idx / size.head, size.tail) {} /** Construct a Dim with each dimension set to the given index */ - HOSTDEVICE Dim(int64_t idx) : head(idx), tail(idx) {} - HOSTDEVICE bool operator==(const Dim &o) const { return (head == o.head) && (tail == o.tail); } - HOSTDEVICE bool operator!=(const Dim &o) const { return !(*this == o); } - HOSTDEVICE int64_t &operator[](int idx); - HOSTDEVICE + int64_t operator[](int idx) const; - HOST std::string to_string() const; + std::string to_string() const; int64_t head; Dim tail; @@ -76,32 +63,22 @@ template <> struct Dim<0> { static constexpr int dimensions = 0; - HOSTDEVICE Dim(int64_t _head) {} - HOSTDEVICE Dim() {} - HOSTDEVICE Dim(int idx, const Dim<0> &size) { -#ifndef __CUDA_ARCH__ if (idx > 0) { - throw std::invalid_argument("Index out of range."); + PADDLE_MOBILE_THROW_EXCEPTION("Index out of range.") } -#else - PADDLE_ASSERT(idx == 0); -#endif } - HOSTDEVICE bool operator==(const Dim<0> &o) const { return true; } - HOSTDEVICE bool operator!=(const Dim<0> &o) const { return false; } - HOSTDEVICE int64_t &operator[](int idx); - HOSTDEVICE + int64_t operator[](int idx) const; }; @@ -112,12 +89,12 @@ template struct DimGetter { // Return a copy if Dim is const template - HOSTDEVICE static int64_t impl(const D &d) { + static int64_t impl(const D &d) { return DimGetter::impl(d.tail); } // Return a reference if Dim is mutable template - HOSTDEVICE static int64_t &impl(D &d) { + static int64_t &impl(D &d) { return DimGetter::impl(d.tail); } }; @@ -127,25 +104,22 @@ template <> struct DimGetter<0> { // Return a copy if Dim is const template - HOSTDEVICE static int64_t impl(const D &d) { + static int64_t impl(const D &d) { return d.head; } // Return a reference if Dim is mutable template - HOSTDEVICE static int64_t &impl(D &d) { + static int64_t &impl(D &d) { return d.head; } }; template -HOSTDEVICE int64_t &indexer(Dim &dim, int idx) { -#ifndef 
__CUDA_ARCH__ +int64_t &indexer(Dim &dim, int idx) { if (idx < 0) { - throw std::invalid_argument("Tried to access a negative dimension"); + PADDLE_MOBILE_THROW_EXCEPTION("Tried to access a negative dimension") } -#else - PADDLE_ASSERT(idx >= 0); -#endif + if (idx == 0) { return dim.head; } @@ -153,31 +127,15 @@ HOSTDEVICE int64_t &indexer(Dim &dim, int idx) { } template <> -HOSTDEVICE int64_t &indexer<0>(Dim<0> &dim, int idx) { -#ifndef __CUDA_ARCH__ - throw std::invalid_argument("Invalid index"); -#else - PADDLE_ASSERT(false); -#if CUDA_VERSION < 8000 - // On CUDA versions previous to 8.0, only __shared__ variables - // could be declared as static in the device code. - int64_t head = 0; -#else - static int64_t head = 0; -#endif - return head; -#endif +int64_t &indexer<0>(Dim<0> &dim, int idx) { + PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") } template -HOSTDEVICE int64_t indexer(const Dim &dim, int idx) { -#ifndef __CUDA_ARCH__ +int64_t indexer(const Dim &dim, int idx) { if (idx < 0) { - throw std::invalid_argument("Tried to access a negative dimension"); + PADDLE_MOBILE_THROW_EXCEPTION("Tried to access a negative dimension") } -#else - PADDLE_ASSERT(idx >= 0); -#endif if (idx == 0) { return dim.head; } @@ -185,102 +143,84 @@ HOSTDEVICE int64_t indexer(const Dim &dim, int idx) { } template <> -HOSTDEVICE int64_t indexer<0>(const Dim<0> &dim, int idx) { -#ifndef __CUDA_ARCH__ - throw std::invalid_argument("Invalid index"); -#else - PADDLE_ASSERT(false); -#if CUDA_VERSION < 8000 - // On CUDA versions previous to 8.0, only __shared__ variables - // could be declared as static in the device code. 
- int64_t head = 0; -#else - static int64_t head = 0; -#endif - return head; -#endif +int64_t indexer<0>(const Dim<0> &dim, int idx) { + PADDLE_MOBILE_THROW_EXCEPTION("Invalid index") } } // namespace // Static access to constant Dim template -HOSTDEVICE int64_t get(const Dim &d) { +int64_t get(const Dim &d) { return DimGetter::impl(d); } // Static access to mutable Dim template -HOSTDEVICE int64_t &get(Dim &d) { +int64_t &get(Dim &d) { return DimGetter::impl(d); } // Dynamic access to constant Dim template -HOSTDEVICE int64_t Dim::operator[](int i) const { +int64_t Dim::operator[](int i) const { // std::cout << "l: " << l << std::endl; return indexer(*this, i); } // Dynamic access to mutable Dim template -HOSTDEVICE int64_t &Dim::operator[](int i) { +int64_t &Dim::operator[](int i) { return indexer(*this, i); } // Dynamic access to constant Dim -inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const { - return indexer(*this, i); -} +inline int64_t Dim<0>::operator[](int i) const { return indexer(*this, i); } // Dynamic access to mutable Dim -inline HOSTDEVICE int64_t &Dim<0>::operator[](int i) { - return indexer(*this, i); -} +inline int64_t &Dim<0>::operator[](int i) { return indexer(*this, i); } // Dynamic access to constant Dim // without std::enable_if will try to instantiate this on get<0>(d) template -HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim &d, - int i) { +typename std::enable_if<(l > 0), int64_t>::type get(const Dim &d, int i) { return d[i]; } // Dynamic access to mutable Dim template -HOSTDEVICE typename std::enable_if<(l > 0), int64_t &>::type get(Dim &d, - int i) { +typename std::enable_if<(l > 0), int64_t &>::type get(Dim &d, int i) { return d[i]; } // Dot product of two dims template -HOSTDEVICE int64_t linearize(const Dim &a, const Dim &b) { +int64_t linearize(const Dim &a, const Dim &b) { return a.head * b.head + linearize(a.tail, b.tail); } // Base case dot product of two Dims // Notice it is inline because it is 
no longer a template template <> -HOSTDEVICE inline int64_t linearize(const Dim<0> &a, const Dim<0> &b) { +inline int64_t linearize(const Dim<0> &a, const Dim<0> &b) { return 0; } // Product of a Dim template -HOSTDEVICE int64_t product(const Dim &a, int prod = 1) { +int64_t product(const Dim &a, int prod = 1) { return prod * a.head * product(a.tail); } // Base case product of a Dim // Notice it is inline because it is no longer a template template <> -HOSTDEVICE inline int64_t product(const Dim<0> &a, int prod) { +inline int64_t product(const Dim<0> &a, int prod) { return prod; } // Is 0 <= idx_i < size_i for all i? template -HOSTDEVICE bool contained(const Dim &idx, const Dim &size) { +bool contained(const Dim &idx, const Dim &size) { return ((0 <= idx.head) && (idx.head < size.head) && contained(idx.tail, size.tail)); } @@ -288,7 +228,7 @@ HOSTDEVICE bool contained(const Dim &idx, const Dim &size) { // Base case of is 0 <= idx_i < size_i ? // Notice it is inline because it is no longer a template template <> -HOSTDEVICE inline bool contained(const Dim<0> &idx, const Dim<0> &size) { +inline bool contained(const Dim<0> &idx, const Dim<0> &size) { return true; } @@ -296,7 +236,7 @@ HOSTDEVICE inline bool contained(const Dim<0> &idx, const Dim<0> &size) { * \brief Compute exclusive prefix-multiply of a Dim. 
*/ template -HOSTDEVICE Dim ex_prefix_mul(const Dim &src, int mul = 1) { +Dim ex_prefix_mul(const Dim &src, int mul = 1) { return Dim(mul, ex_prefix_mul(src.tail, mul * src.head)); } @@ -304,7 +244,7 @@ HOSTDEVICE Dim ex_prefix_mul(const Dim &src, int mul = 1) { // Base case of ex_prefix_mul // Notice it is inline because it is no longer a template template <> -HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0> &src, int mul) { +inline Dim<0> ex_prefix_mul(const Dim<0> &src, int mul) { return Dim<0>(); } ///\endcond @@ -313,18 +253,18 @@ HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0> &src, int mul) { * Add two dimensions together */ template -HOSTDEVICE Dim dim_plus(const Dim &a, const Dim &b) { +Dim dim_plus(const Dim &a, const Dim &b) { return Dim(a.head + b.head, dim_plus(a.tail, b.tail)); } // Base case template <> -HOSTDEVICE inline Dim<0> dim_plus(const Dim<0> &a, const Dim<0> &b) { +inline Dim<0> dim_plus(const Dim<0> &a, const Dim<0> &b) { return Dim<0>(); } template -HOSTDEVICE Dim operator+(const Dim &lhs, const Dim &rhs) { +Dim operator+(const Dim &lhs, const Dim &rhs) { return dim_plus(lhs, rhs); } @@ -332,18 +272,18 @@ HOSTDEVICE Dim operator+(const Dim &lhs, const Dim &rhs) { * Multiply two dimensions together */ template -HOSTDEVICE Dim dim_mult(const Dim &a, const Dim &b) { +Dim dim_mult(const Dim &a, const Dim &b) { return Dim(a.head * b.head, dim_mult(a.tail, b.tail)); } // Base case template <> -HOSTDEVICE inline Dim<0> dim_mult(const Dim<0> &a, const Dim<0> &b) { +inline Dim<0> dim_mult(const Dim<0> &a, const Dim<0> &b) { return Dim<0>(); } template -HOSTDEVICE Dim operator*(const Dim &lhs, const Dim &rhs) { +Dim operator*(const Dim &lhs, const Dim &rhs) { return dim_mult(lhs, rhs); } @@ -358,7 +298,7 @@ HOSTDEVICE Dim operator*(const Dim &lhs, const Dim &rhs) { */ template -HOSTDEVICE Dim normalize_strides(const Dim &size, const Dim &stride) { +Dim normalize_strides(const Dim &size, const Dim &stride) { int norm_stride = size.head == 1 ? 
0 : stride.head; return Dim(norm_stride, normalize_strides(size.tail, stride.tail)); } @@ -366,8 +306,7 @@ HOSTDEVICE Dim normalize_strides(const Dim &size, const Dim &stride) { ///\cond HIDDEN template <> -HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0> &size, - const Dim<0> &stride) { +inline Dim<0> normalize_strides(const Dim<0> &size, const Dim<0> &stride) { return Dim<0>(); } @@ -382,54 +321,9 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0> &size, */ template -HOSTDEVICE Dim make_dim(Args... idxes) { +Dim make_dim(Args... idxes) { return Dim(idxes...); } -// Allows us to output a Dim -// XXX For some reason, overloading fails to resolve this correctly -template -typename std::enable_if<(i > 1), std::ostream &>::type operator<<( - std::ostream &os, const Dim &d) { - os << d.head << ", " << d.tail; - return os; -} - -// Base case that allows us to output a Dim -// XXX I wish this could be an overload instead of a template -template -typename std::enable_if<(i == 1), std::ostream &>::type operator<<( - std::ostream &os, const Dim &d) { - os << d.head; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, const Dim<0> &d) { - return os; -} - -template -HOST std::string Dim::to_string() const { - std::stringstream stream; - - stream << *this; - - return stream.str(); -} - -template -HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { - Dim result; - - for (int i = 0; i < D - 1; ++i) { - result[i] = linear_index % extents[i]; - linear_index /= extents[i]; - } - - result[D - 1] = linear_index; - - return result; -} - } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/lod_tensor.cpp b/src/framework/lod_tensor.cpp index 48c2c46989e2d477ed0a005f25a8252da0955f13..e165e55507ed04a9b63e4ad5eb002f206c71d96c 100644 --- a/src/framework/lod_tensor.cpp +++ b/src/framework/lod_tensor.cpp @@ -13,72 +13,56 @@ See the License for the specific language governing permissions and limitations under the 
License. */ #include "lod_tensor.h" -#include -#include #include -#include namespace paddle_mobile { namespace framework { -std::ostream &operator<<(std::ostream &os, const LoD &lod) { - os << "{"; - for (auto &v : lod) { - os << "{"; - bool is_first = true; - for (auto &i : v) { - if (is_first) { - os << i; - is_first = false; - } else { - os << ", " << i; - } - } - os << "}"; - } - os << "}"; - - return os; -} - -std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { - // PADDLE_ENFORCE(t.type().hash_code() == - // typeid(float).hash_code()); - - // if (!platform::is_cpu_place(t.place())) { - // LoDTensor tt; - // framework::TensorCopy(t, platform::CPUPlace(), &tt); - // platform::DeviceContextPool &pool = - // platform::DeviceContextPool::Instance(); auto &dev_ctx = - // *pool.Get(t.place()); dev_ctx.Wait(); - // - // os << tt; - // return os; - // } - - os << "dim: " << t.dims() << "\n"; - os << "lod: " << t.lod() << "\n"; - - // only print first ten elements - int64_t size = t.numel() < 10 ? t.numel() : 10; - for (int64_t i = 0; i < size; ++i) { - os << t.data()[i] << " "; - } - - return os; -} - -std::string LoDToString(const LoD &lod) { - std::ostringstream stream; - stream << lod; - return stream.str(); -} +// std::ostream &operator<<(std::ostream &os, const LoD &lod) { +// os << "{"; +// for (auto &v : lod) { +// os << "{"; +// bool is_first = true; +// for (auto &i : v) { +// if (is_first) { +// os << i; +// is_first = false; +// } else { +// os << ", " << i; +// } +// } +// os << "}"; +// } +// os << "}"; +// +// return os; +//} +// +// std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { +// PADDLE_MOBILE_ENFORCE(t.type().hash_code() == typeid(float).hash_code(), +// "t.type() is not float"); +// os << "dim: " << t.dims() << "\n"; +// os << "lod: " << t.lod() << "\n"; +// // only print first ten elements +// int64_t size = t.numel() < 10 ? 
t.numel() : 10; +// for (int64_t i = 0; i < size; ++i) { +// os << t.data()[i] << " "; +// } +// +// return os; +//} + +// std::string LoDToString(const LoD &lod) { +// std::ostringstream stream; +// stream << lod; +// return stream.str(); +//} LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, size_t elem_end) { - // PADDLE_ENFORCE_LT(level, in.size()); - // PADDLE_ENFORCE_LT(elem_end, in[level].size()); - + PADDLE_MOBILE_ENFORCE(level < in.size(), "level should >= in.size()"); + PADDLE_MOBILE_ENFORCE(elem_end < in[level].size(), + "elem_end >= in[level].size()"); LoD res; res.resize(in.size() - level); // copy the first level @@ -152,7 +136,7 @@ bool CheckLoD(const LoD &in, int tensor_height) { if (a < b) return true; return false; })) { - std::cout << "ascending error"; + PADDLE_MOBILE_THROW_EXCEPTION("ascending error") return false; } } @@ -211,8 +195,9 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, LoD sub_lod; for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) { - // PADDLE_ENFORCE_LE(start_idx, end_idx); - // PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size()); + PADDLE_MOBILE_ENFORCE(start_idx <= end_idx, "start_idx > end_idx"); + PADDLE_MOBILE_ENFORCE(end_idx < lod[level_idx].size(), + "end_idx >= lod[level_idx].size()"); std::vector level_lens; for (size_t i = start_idx; i < end_idx; ++i) { level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]); @@ -226,10 +211,9 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, } void AppendLoD(LoD *lod, const LoD &lod_length) { - // PADDLE_ENFORCE( - // lod->empty() || lod->size() == lod_length.size(), - // "The lod_length should has the same size with the appended - // lod."); + PADDLE_MOBILE_ENFORCE( + lod->empty() || lod->size() == lod_length.size(), + "The lod_length should has the same size with the appended lod."); if (lod->empty()) { for (size_t i = 0; i < lod_length.size(); ++i) { lod->emplace_back(1, 0); // size 
= 1, value = 0; diff --git a/src/framework/lod_tensor.h b/src/framework/lod_tensor.h index bab3db1389610e7ed5db1a387004bdf95267867f..3b34d664d3608dd361ed7c7bb549870284adcc33 100644 --- a/src/framework/lod_tensor.h +++ b/src/framework/lod_tensor.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include -#include #include #include "tensor.h" #include "tensor_util.h" diff --git a/src/framework/op_info.h b/src/framework/op_info.h index 7475d155232e31cf00dab6273200f5bc4671f2e9..16b3487955ce05721e6e3f3e79b6d8ebd180e020 100644 --- a/src/framework/op_info.h +++ b/src/framework/op_info.h @@ -25,9 +25,8 @@ template struct OpInfo { OpCreator creator_; const OpCreator &Creator() const { - // PADDLE_ENFORCE_NOT_NULL(creator_, - // "Operator Creator has not been - // registered"); + PADDLE_MOBILE_ENFORCE(creator_ != nullptr, + "Operator Creator has not been registered"); return creator_; } }; @@ -48,17 +47,15 @@ class OpInfoMap { } void Insert(const std::string &type, const OpInfo &info) { - // PADDLE_ENFORCE(!Has(type), "Operator %s has been - // registered", type); + PADDLE_MOBILE_ENFORCE(!Has(type), "Operator %s has been registered", + type.c_str()); map_.insert({type, info}); } const OpInfo &Get(const std::string &type) const { auto op_info_ptr = GetNullable(type); - // PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not - // been - // registered", - // type); + PADDLE_MOBILE_ENFORCE(op_info_ptr != nullptr, + "Operator %s has not been registered", type.c_str()); return *op_info_ptr; } diff --git a/src/framework/op_registry.h b/src/framework/op_registry.h index 62398dcb15dc61ef2f778b738da0afd073b37908..8a7beae993be1a9f2a52fb48d4930754aba784e1 100644 --- a/src/framework/op_registry.h +++ b/src/framework/op_registry.h @@ -96,24 +96,39 @@ class OpRegistry { } }; -#define REGISTER_OPERATOR(op_type, op_class) \ - template \ - class _OpClass_##op_type##_ : public op_class { \ - public: \ - DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class); \ - }; \ - static 
paddle_mobile::framework::OperatorRegistrar< \ - paddle_mobile::CPU, _OpClass_##op_type##_> \ - __op_registrar_##op_type##__(#op_type); \ - int TouchOpRegistrar_##op_type() { \ - __op_registrar_##op_type##__.Touch(); \ - return 0; \ +#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \ + template \ + class _OpClass_##op_type##_##device_name : public op_class { \ + public: \ + DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class); \ + }; \ + static paddle_mobile::framework::OperatorRegistrar< \ + device_type, _OpClass_##op_type##_##device_name> \ + __op_registrar_##op_type##_##device_name(#op_type); \ + int TouchOpRegistrar_##op_type##_##device_name() { \ + __op_registrar_##op_type##_##device_name.Touch(); \ + return 0; \ } -#define USE_OP(op_type) \ - extern int TouchOpRegistrar_##op_type(); \ - static int use_op_itself_##op_type##_ __attribute__((unused)) = \ - TouchOpRegistrar_##op_type() +#define REGISTER_OPERATOR_CPU(op_type, op_class) \ + REGISTER_OPERATOR(op_type, op_class, cpu, paddle_mobile::CPU); + +#define REGISTER_OPERATOR_MALI_GPU(op_type, op_class) \ + REGISTER_OPERATOR(op_type, op_class, mali_gpu, paddle_mobile::GPU_MALI); + +#define REGISTER_OPERATOR_FPGA(op_type, op_class) \ + REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA); + +#define USE_OP(op_type, device_name) \ + extern int TouchOpRegistrar_##op_type##_##device_name(); \ + static int use_op_itself_##op_type##_##device_name __attribute__((unused)) = \ + TouchOpRegistrar_##op_type##_##device_name() + +#define USE_OP_CPU(op_type) USE_OP(op_type, cpu); + +#define USE_OP_MALI_GPU(op_type) USE_OP(op_type, mali_gpu); + +#define USE_OP_FPGA(op_type) USE_OP(op_type, fpga); } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index f798d7ade200208c1b199aee0410ed4c297ed7fd..36b4663cb603d29bb60cfc297899d1c300e8ca91 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp 
@@ -58,7 +58,8 @@ void OperatorBase::Run() const { } template class OperatorBase; -template class OperatorWithKernel; +template class OperatorBase; +template class OperatorBase; } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/operator.h b/src/framework/operator.h index cb27985244a1dd9e92a54edce9f15fd3d8defaad..c68744a676030413e81570ded0db5671cdf4ba7a 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include -#include #include #include "common/enforce.h" @@ -27,7 +26,6 @@ limitations under the License. */ #include "framework/op_info.h" #include "framework/op_kernel_type.h" #include "framework/op_registry.h" -#include "framework/paddle_mobile_object.h" #include "framework/program/block_desc.h" #include "framework/program/program-optimize/node.h" #include "framework/scope.h" @@ -52,7 +50,7 @@ static T *GetVarValue(const string &key, const VariableNameMap &var_map, } template -class OperatorBase : PaddleMobileObject { +class OperatorBase { public: /* * @b op 基类的实例化方法, op 获取到了 输入、参数以及提前分配好的输出 tensor @@ -65,6 +63,7 @@ class OperatorBase : PaddleMobileObject { std::vector GetOutKeys() const; virtual void RunImpl() const = 0; + virtual void Init() const = 0; /* * @b op 运算所需的输入, 如上一层的输出结果、卷积核 * */ @@ -105,31 +104,55 @@ class OperatorBase : PaddleMobileObject { /* * @b 这个类为所有带有运算的 op 的父类, 这个 op 继承与 OperatorBase * */ -template +template class OperatorWithKernel : public OperatorBase { public: OperatorWithKernel(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope) - : OperatorBase(type, inputs, outputs, attrs, scope) {} + : OperatorBase(type, inputs, outputs, attrs, scope), + param_(inputs, outputs, attrs, *scope) {} + + virtual void RunImpl() const { this->kernel_.Compute(this->param_); } - virtual void RunImpl() const = 0; virtual void InferShape() const = 0; + + void Init() 
const { + PADDLE_MOBILE_ENFORCE(kernel_.Init(param_), " %s kernel init failed", + this->type_.c_str()); + } + + protected: + KernelType kernel_; + ParamType param_; }; /* * @b 所有kernel的父类 * */ template -class OpKernelBase : PaddleMobileObject { +class OpKernelBase { public: /* * @b 所有kernel 需实现 Compute 方法 * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体, * 所有结构体存在与: paddle-mobile/src/operators/op_param.h * */ +#ifdef PADDLE_MOBILE_MALI_GPU + OpKernelBase() { acl_op_ = nullptr; } + void *GetAclOp() const { return acl_op_; } + void SetAclOp(void *op, void *ob) const { + reinterpret_cast *>(ob)->acl_op_ = op; + } +#endif virtual void Compute(const P ¶) const = 0; + virtual bool Init(const P ¶) const { return true; }; virtual ~OpKernelBase() = default; + + private: +#ifdef PADDLE_MOBILE_MALI_GPU + void *acl_op_; +#endif }; #define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \ @@ -139,20 +162,23 @@ class OpKernelBase : PaddleMobileObject { std::shared_ptr<::paddle_mobile::framework::Scope> scope) \ : parent_cls(type, inputs, outputs, attrs, scope) {} -class FusionOpMatcher : PaddleMobileObject { +class FusionOpMatcher { public: FusionOpMatcher() {} virtual std::string Type() = 0; - virtual void FolderNodes(Node *node) { - node->Folder(node_.Depth(), Type(), {}); + virtual void FolderNodes( + Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), {}, removed_nodes); } virtual Node &BeginNode() { return node_; } std::string BeginType() { return node_.Type(); } + // virtual bool Fusion(); protected: Node node_; std::string type_; diff --git a/src/framework/paddle_mobile_object.cpp b/src/framework/paddle_mobile_object.cpp deleted file mode 100644 index acf37a3c117af3bf2bc70aac624335399b4a796b..0000000000000000000000000000000000000000 --- a/src/framework/paddle_mobile_object.cpp +++ /dev/null @@ -1,15 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle_mobile_object.h" diff --git a/src/framework/program/block_desc.cpp b/src/framework/program/block_desc.cpp index 21322f0825636a321b022220e535cad0e4b8cf41..4e3eb79d07d0c8c363a6c3a9556cf718ebdc08f2 100644 --- a/src/framework/program/block_desc.cpp +++ b/src/framework/program/block_desc.cpp @@ -13,17 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "block_desc.h" +#include namespace paddle_mobile { namespace framework { -std::vector> BlockDesc::Vars() const { - std::vector> res; - for (const auto &p : vars_) { - res.push_back(p.second); - } - return res; -} +std::vector> BlockDesc::Vars() const { return vars_; } std::vector> BlockDesc::Ops() const { return ops_; } @@ -31,10 +26,14 @@ BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc) : index_(desc->idx), parent_index_(desc->idx) { for (int i = 0; i < desc->n_vars; ++i) { PaddleMobile__Framework__Proto__VarDesc *var_desc = desc->vars[i]; - vars_[std::string(var_desc->name)] = - std::shared_ptr(new VarDesc(var_desc)); + vars_.emplace_back(std::shared_ptr(new VarDesc(var_desc))); } + std::sort(vars_.begin(), vars_.end(), + [](std::shared_ptr left, std::shared_ptr right) { + return left->Name() < right->Name(); + }); + for (int j = 0; j < desc->n_ops; ++j) { PaddleMobile__Framework__Proto__OpDesc *op_desc = desc->ops[j]; ops_.emplace_back(new framework::OpDesc(op_desc)); diff --git 
a/src/framework/program/block_desc.h b/src/framework/program/block_desc.h index 84d7a90fc11ddf360eacb01be9456ced4a30dad8..dd33a274266cb503cea0b960c026276d90cea57a 100644 --- a/src/framework/program/block_desc.h +++ b/src/framework/program/block_desc.h @@ -15,14 +15,13 @@ limitations under the License. */ #pragma once #include "framework/framework.pb-c.h" -#include "framework/paddle_mobile_object.h" #include "framework/program/op_desc.h" #include "framework/program/var_desc.h" namespace paddle_mobile { namespace framework { -class BlockDesc : PaddleMobileObject { +class BlockDesc { public: friend class Node; friend class ProgramOptimize; @@ -35,10 +34,9 @@ class BlockDesc : PaddleMobileObject { ops_.push_back(copy_op_desc); } - for (auto &var_desc : block_desc.vars_) { - std::shared_ptr copy_var_desc = - std::make_shared(*var_desc.second); - vars_[var_desc.first] = copy_var_desc; + for (int i = 0; i < block_desc.vars_.size(); ++i) { + auto &var_desc = block_desc.vars_[i]; + vars_.emplace_back(std::make_shared(*var_desc)); } } @@ -64,7 +62,7 @@ class BlockDesc : PaddleMobileObject { bool multi_thread_; int parent_index_; std::vector> ops_; - std::unordered_map> vars_; + std::vector> vars_; }; } // namespace framework diff --git a/src/framework/program/op_desc.h b/src/framework/program/op_desc.h index 07b903085d5d9044b93e3e9309390c9a3976580d..4fdfac253f0525b288983e8bcf9c1b4eff8f393d 100644 --- a/src/framework/program/op_desc.h +++ b/src/framework/program/op_desc.h @@ -20,12 +20,11 @@ limitations under the License. 
*/ #include "common/log.h" #include "common/type_define.h" #include "framework/framework.pb-c.h" -#include "framework/paddle_mobile_object.h" namespace paddle_mobile { namespace framework { -class OpDesc : PaddleMobileObject { +class OpDesc { public: friend class ProgramOptimize; friend class FusionOpMatcher; diff --git a/src/framework/program/program-optimize/fusion_op_register.cpp b/src/framework/program/program-optimize/fusion_op_register.cpp deleted file mode 100644 index 010585166cc0828612a48c128f8753338ef16ff9..0000000000000000000000000000000000000000 --- a/src/framework/program/program-optimize/fusion_op_register.cpp +++ /dev/null @@ -1,15 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fusion_op_register.h" diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp index c165b6568aa37e850d9dfd4dfbec63f8c4d85b02..89385e12d9c5f20a21f6ee6f3987c088c4b15563 100644 --- a/src/framework/program/program-optimize/node.cpp +++ b/src/framework/program/program-optimize/node.cpp @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include - -#include "framework/operator.h" #include "framework/program/program-optimize/node.h" +#include +#include "framework/operator.h" namespace paddle_mobile { @@ -45,54 +44,13 @@ bool Node::operator==(const Node &in) { return true; } -bool Node::CanSplit(std::unordered_set complex_compute_set) { - bool split = false; - CanSplit(&split, false, 0, &complex_compute_set, this); - return split; -} - -void Node::CanSplit(bool *split, bool spliting, int complex_count, - std::unordered_set *complex_compute_set, - Node *pre_node) { - if (spliting) { - if (complex_compute_set->find(this->type_) != complex_compute_set->end()) { - complex_count++; - } - } - - if (inputs_.size() > 1 && pre_node != inputs_.back()) { - return; - } - if (inputs_.size() > 1 && pre_node == inputs_.back()) { - if (complex_count > 1) { - *split = true; - return; - } - } - - // multi output, to check - if (outputs_.size() > 1) { - spliting = true; - complex_compute_set = 0; - } else { - if (spliting == true && inputs_.size() > 0) { - spliting = false; - } else { - } - } - - for (auto &output : outputs_) { - output->CanSplit(split, spliting, complex_count, complex_compute_set, this); - } -} - -std::vector> Node::OpDescs(uint size) { +std::vector> Node::OpDescs(int size) { std::vector> op_descs; OpDescs(size - 1, &op_descs); return op_descs; } -void Node::OpDescs(uint index, +void Node::OpDescs(int index, std::vector> *op_desc) { if (index == 0) { return; @@ -103,107 +61,6 @@ void Node::OpDescs(uint index, } } -void Node::OpDescs(std::vector> *op_desc, - Node *node, bool adding_thread, int thread_num) { - if (outputs_.size() > 1) { - adding_thread = false; - } - - bool can_add_split = false; - // 如果当前节点有多个输出 并且 只有当前节点对应的 op_desc_ 输出数为 1 时支持 - if (outputs_.size() > 1 && - op_input_output_key[op_desc_->type_].second.size() == 1) { - can_add_split = true; - - // 遍历当前节点的 output 节点 - for (const auto &output : outputs_) { - // 不支持 output 有多个 output 的情况 - if (output->outputs_.size() > 0) { - 
can_add_split = false; - break; - } - - //与节点关联的 OpDesc - std::shared_ptr &op_desc = output->op_desc_; - - //获取这个 op 的 inputs key 和 outputs key - auto inputs_and_outputs = op_input_output_key[op_desc->type_]; - - //判断现在 是否存在这个 op - //判断这个 output 和 input key 的 size 等于 1 - if (op_input_output_key.find(op_desc->type_) != - op_input_output_key.end() && - inputs_and_outputs.first.size() == 1 && - inputs_and_outputs.second.size() == 1) { - auto inputs_of_output = op_desc->Input(inputs_and_outputs.first[0]); - auto outputs_of_output = op_desc->Output(inputs_and_outputs.second[0]); - - // 判断一下, 如果输入和输出没有同名, 是支持的 - for (int i = 0; i < inputs_of_output.size(); ++i) { - std::string input_of_output = inputs_of_output[i]; - for (int j = 0; j < outputs_of_output.size(); ++j) { - std::string output_of_output = outputs_of_output[j]; - if (input_of_output == output_of_output) { - DLOG << "output的 output 包含 input" << input_of_output; - can_add_split = false; - break; - } - } - } - } else { // 如果模型中包含没有的 op, 则不支持添加 split - DLOG << "找不到 这个 op 类型: " << output->op_desc_->type_; - can_add_split = false; - } - } - } - - if (inputs_.size() > 1 && node != inputs_.back()) { - return; - } else if (inputs_.size() > 1 && node == inputs_.back()) { - adding_thread = false; - op_desc->push_back(this->op_desc_); - } else { - op_desc->push_back(this->op_desc_); - } - if (adding_thread) { - Attribute attr; - attr.Set(thread_num); - this->op_desc_->attrs_["thread"] = attr; - } - - if (can_add_split) { - adding_thread = true; - std::shared_ptr split_op_desc = std::make_shared(); - split_op_desc->type_ = G_OP_TYPE_SPLIT; - auto outputs = this->op_desc_->Output( - op_input_output_key[this->op_desc_->Type()].second[0]); - split_op_desc->inputs_ = { - {op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}}; - auto &split_outputs = - split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]]; - for (const auto &output : outputs_) { - split_outputs.push_back(outputs[0]); - } - DLOG << "add 
split"; - op_desc->push_back(split_op_desc); - } - - for (int i = 0; i < outputs_.size(); ++i) { - auto &output = outputs_[i]; - if (can_add_split) { - output->OpDescs(op_desc, this, adding_thread, i); - } else { - output->OpDescs(op_desc, this, adding_thread, thread_num); - } - } -} - -std::vector> Node::OpDescs() { - std::vector> op_descs; - OpDescs(&op_descs, this, false, 0); - return op_descs; -} - std::shared_ptr Node::To(int size) { std::shared_ptr node = std::make_shared(); this->To(size - 1, node); @@ -224,24 +81,25 @@ void Node::To(int index, std::shared_ptr node) { } } -uint Node::Depth(uint begin) { - uint depth = 0; +int Node::Depth(int begin) { + int depth = 0; begin++; for (int i = 0; i < outputs_.size(); ++i) { - uint output_depth = outputs_[i]->Depth(begin); + int output_depth = outputs_[i]->Depth(begin); depth = output_depth > depth ? output_depth : depth; } return begin > depth ? begin : depth; } Node &Node::Folder( - uint size, std::string type, - std::map> change) { + int size, std::string type, + std::map> change, + std::vector> *removed_nodes) { std::shared_ptr op_desc = std::make_shared(); op_desc->inputs_ = this->op_desc_->inputs_; std::vector> outputs; - this->Folder(op_desc, &outputs, size - 1, &change, this); + this->Folder(op_desc, &outputs, size - 1, &change, this, removed_nodes); this->outputs_ = outputs; this->type_ = type; this->op_desc_ = op_desc; @@ -251,9 +109,9 @@ Node &Node::Folder( void Node::Folder( std::shared_ptr op_desc, - std::vector> *outputs, uint index, + std::vector> *outputs, int index, std::map> *change, - Node *begin_node) { + Node *begin_node, std::vector> *removed_nodes) { if (change->find(this->type_) != change->end()) { auto change_pair = (*change)[this->type_]; op_desc->GetInputs()[change_pair.second] = @@ -266,7 +124,9 @@ void Node::Folder( if (index > 0) { --index; for (auto output : outputs_) { - output->Folder(op_desc, outputs, index, change, begin_node); + removed_nodes->push_back(output); + 
output->Folder(op_desc, outputs, index, change, begin_node, + removed_nodes); } } else { for (auto &op_output : this->op_desc_->outputs_) { @@ -285,7 +145,7 @@ void Node::Folder( } } } - +#ifdef PADDLE_MOBILE_DEBUG std::string Node::ToString(std::string blank, const Node *node) const { std::stringstream ss; ss << type_ << "-> \n"; @@ -316,6 +176,7 @@ Print &operator<<(Print &printer, const Node &node) { printer << node.ToString(); return printer; } +#endif } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h index 8ef26f897d2052db97780d7bdc23db1641fc4f6f..7236ffdd1782dfb39af73195da9b3756030c9117 100644 --- a/src/framework/program/program-optimize/node.h +++ b/src/framework/program/program-optimize/node.h @@ -14,20 +14,17 @@ limitations under the License. */ #pragma once +#include #include #include -#include -#include #include - #include "common/log.h" -#include "framework/paddle_mobile_object.h" #include "framework/program/op_desc.h" namespace paddle_mobile { namespace framework { -class Node : PaddleMobileObject { +class Node { friend class ProgramOptimize; public: @@ -37,35 +34,34 @@ class Node : PaddleMobileObject { : op_desc_(op_desc), type_(op_desc->Type()) {} Node &operator>(std::shared_ptr node); bool operator==(const Node &in); - bool CanSplit(std::unordered_set complex_compute_set); + +#ifdef PADDLE_MOBILE_DEBUG std::string ToString() const; + void Description(); +#endif std::shared_ptr To(int size); - uint Depth(uint begin = 0); + int Depth(int begin = 0); Node &Folder( - uint size, std::string type, - std::map> change_map); - std::vector> OpDescs(uint size); - std::vector> OpDescs(); + int size, std::string type, + std::map> change_map, + std::vector> *removed_nodes); + std::vector> OpDescs(int size); std::shared_ptr OpDescOfNode() { return op_desc_; } std::string Type() { return type_; } - void Description(); private: - void CanSplit(bool *split, 
bool spliting, int complex_count, - std::unordered_set *complex_compute_set, - Node *pre_node); - void OpDescs(std::vector> *op_desc, - Node *node, bool adding_thread, int thread_num); - void OpDescs(uint size, + void OpDescs(int size, std::vector> *op_desc); void To(int index, std::shared_ptr); void Folder( std::shared_ptr op_desc, - std::vector> *outputs, uint index, + std::vector> *outputs, int index, std::map> *change, - Node *begin_node); + Node *begin_node, std::vector> *removed_nodes); std::shared_ptr op_desc_; +#ifdef PADDLE_MOBILE_DEBUG std::string ToString(std::string blank, const Node *node) const; +#endif std::vector> outputs_; std::vector inputs_; std::string type_; diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp index d9c3c51c3c8ab528d71d992b3710e981a5087729..3619bc79f576651245aa322992df9d318c810cd4 100644 --- a/src/framework/program/program-optimize/program_optimize.cpp +++ b/src/framework/program/program-optimize/program_optimize.cpp @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "framework/program/program-optimize/program_optimize.h" +#include #include "framework/program/program-optimize/fusion_op_register.h" namespace paddle_mobile { namespace framework { -std::shared_ptr ProgramOptimize::FushionOptimize( +std::shared_ptr ProgramOptimize::FusionOptimize( std::shared_ptr ori_des, bool add_split) { // ProgramDesc *optimize_program = new ProgramDesc(*ori_des); std::shared_ptr optimize_program = @@ -31,6 +32,8 @@ std::shared_ptr ProgramOptimize::FushionOptimize( std::unordered_map>> type_map; + std::vector> nodes; + std::shared_ptr begin_node; auto block = optimize_program->Block(i); // DLOG << " ops size: " << block->Ops().size(); @@ -38,11 +41,13 @@ std::shared_ptr ProgramOptimize::FushionOptimize( auto op = block->Ops()[j]; auto op_type = op->Type(); if (op_input_output_key.find(op->Type()) == op_input_output_key.end()) { - LOG(kLOG_ERROR) << "return null "; + LOG(kLOG_ERROR) << "has not support op return null " + << " op type: " << op->Type(); return nullptr; } std::shared_ptr node = std::make_shared(op); + nodes.push_back(node); // type_map[op->Type()].push_back(node); @@ -87,21 +92,29 @@ std::shared_ptr ProgramOptimize::FushionOptimize( // DLOG << " match success " << " fusion node: \n" << // matcher->BeginNode() << "\nsub node: \n" << *sub_node; // DLOG << "match node\n"<< *match_node; - matcher->FolderNodes(match_node.get()); - // DLOG << " after match node\n"<< *match_node; - // match_node->Description(); - // DLOG << "begin node: \n" << *begin_node; + std::vector> removed_nodes; + matcher->FolderNodes(match_node.get(), &removed_nodes); + + for (int j = 0; j < removed_nodes.size(); ++j) { + auto removed_node = removed_nodes[j]; + auto removed_ite = + std::find(nodes.begin(), nodes.end(), removed_node); + nodes.erase(removed_ite); + } } } } - // DLOG << "node: \n" << *begin_node; - std::vector> op_descs; - // bool can_splite = begin_node->CanSplit({G_OP_TYPE_CONV, - // G_OP_TYPE_BATCHNORM, G_OP_TYPE_DEPTHWISE_CONV}); - 
GenerateOps(&op_descs, begin_node.get()); + if (add_split) { + GenerateOps(&op_descs, begin_node.get(), add_split); + } else { + for (int m = 0; m < nodes.size(); ++m) { + auto &node = nodes[m]; + op_descs.push_back(node->op_desc_); + } + } block->ops_ = op_descs; } @@ -118,6 +131,14 @@ void ProgramOptimize::GenerateOps( Node *current_node) { if (current_node->inputs_.size() > 1 && input_node != current_node->inputs_.back()) { + DLOG << " current type " << current_node->type_; + + DLOG << " inputs size of current node > 0 "; + + for (int i = 0; i < current_node->inputs_.size(); ++i) { + DLOG << " input i: " << current_node->inputs_[i]->type_; + } + return; } else if (current_node->inputs_.size() > 1 && input_node == current_node->inputs_.back()) { @@ -250,12 +271,12 @@ void ProgramOptimize::GenerateOps( } void ProgramOptimize::GenerateOps( - std::vector> *op_descs, - Node *begin_node) { + std::vector> *op_descs, Node *begin_node, + bool can_add_split) { // std::vector> *op_desc, // Node *input_node, Node *current_node, bool adding_thread, int // thread_num - if (false) { + if (can_add_split) { this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr); } else { this->GenerateOps(op_descs, begin_node, begin_node); diff --git a/src/framework/program/program-optimize/program_optimize.h b/src/framework/program/program-optimize/program_optimize.h index 93943cf83951565d91f67bfa77881dbcb130278d..57b282926d443fb11db86169ecb46a6724a88829 100644 --- a/src/framework/program/program-optimize/program_optimize.h +++ b/src/framework/program/program-optimize/program_optimize.h @@ -27,14 +27,14 @@ namespace framework { class ProgramOptimize { public: ProgramOptimize() {} - std::shared_ptr FushionOptimize( + std::shared_ptr FusionOptimize( std::shared_ptr ori_des, bool add_split = false); private: int current_block_; std::vector> new_blocks_; void GenerateOps(std::vector> *op_descs, - Node *begin_node); + Node *begin_node, bool can_add_split); void 
GenerateOps(std::vector> *op_desc, Node *input_node, Node *current_node); void GenerateOps(std::vector> *op_desc, diff --git a/src/framework/program/program.cpp b/src/framework/program/program.cpp deleted file mode 100644 index 83e389917c76df50ea0380795b36ff012da01568..0000000000000000000000000000000000000000 --- a/src/framework/program/program.cpp +++ /dev/null @@ -1,17 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -namespace paddle_mobile { -namespace framework {} -} // namespace paddle_mobile diff --git a/src/framework/program/program.h b/src/framework/program/program.h index 3a9cbfc1d9e3f3d099bcfeac32260613fc4dccc3..bb82fa7334a7d1941734dcd846c8e66befdbdd10 100644 --- a/src/framework/program/program.h +++ b/src/framework/program/program.h @@ -15,7 +15,6 @@ limitations under the License. 
*/ #pragma once #include "common/types.h" -#include "framework/paddle_mobile_object.h" #include "framework/program/program_desc.h" #include "framework/scope.h" @@ -23,12 +22,14 @@ namespace paddle_mobile { namespace framework { template -class Program : PaddleMobileObject { +class Program { public: std::shared_ptr originProgram; std::shared_ptr optimizeProgram; std::shared_ptr scope; std::string model_path; + std::string para_path; + bool is_commbine = false; private: }; diff --git a/src/framework/program/program_desc.h b/src/framework/program/program_desc.h index 6aa7dd44ce5880d8b1db0e2b3ffad2e0bd31d46e..5c87f565e13df1564343b43150a5696c3adaca39 100644 --- a/src/framework/program/program_desc.h +++ b/src/framework/program/program_desc.h @@ -18,13 +18,12 @@ limitations under the License. */ #include "common/types.h" #include "framework/framework.pb-c.h" -#include "framework/paddle_mobile_object.h" #include "framework/program/block_desc.h" namespace paddle_mobile { namespace framework { -class ProgramDesc : PaddleMobileObject { +class ProgramDesc { public: friend class Node; friend class ProgramOptimize; diff --git a/src/framework/program/var_desc.h b/src/framework/program/var_desc.h index 5ab2fc56178b8d48d2dfb637817eca13b53677d5..f6f04f2c7026166e1024dcc1a4b2a233deac649b 100644 --- a/src/framework/program/var_desc.h +++ b/src/framework/program/var_desc.h @@ -14,40 +14,14 @@ limitations under the License. 
*/ #pragma once +#include + #include "framework/framework.pb-c.h" -#include "framework/paddle_mobile_object.h" #include "framework/program/tensor_desc.h" namespace paddle_mobile { namespace framework { -/* - -PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL = 0, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16 = 1, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32 = 2, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64 = 3, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6, - - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR = 7, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS = 8, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH = 9, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST = 10, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES = 11, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE = 12, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_ARRAY = 13, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST = 14, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER = 15, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL = 16, - - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW = 17, - PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE = 18 - - - */ - class VarDesc { public: VarDesc(const VarDesc &var_desc) { @@ -56,14 +30,6 @@ class VarDesc { this->persistable_ = var_desc.persistable_; this->tensor_desc_ = var_desc.tensor_desc_; this->type_ = var_desc.type_; - /* - * - * std::string name_; - bool persistable_; - TensorDesc tensor_desc_; - VarType_Type type_; - VarType_Type data_type_; - * */ } VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) { type_ = (VarType_Type)desc->type->type; @@ -102,39 +68,6 @@ class VarDesc { const TensorDesc &Tensor_desc() const { return 
tensor_desc_; } - // const proto::VarType::ChannelDesc &channel_desc() const { - // switch (desc_.type().type()) { - // case proto::VarType::CHANNEL: - // return desc_.type().channel(); - // default: - // break; - // } - // } - - // proto::VarType::Type GetDataType() const { - // switch (desc_.type().type()) { - // case proto::VarType::CHANNEL: - // return channel_desc().data_type(); - // break; - // default: - // return tensor_desc().data_type(); - // } - // } - - // template - // std::vector RepeatedToVector( - // const google::protobuf::RepeatedField &repeated_field) const { - // std::vector ret; - // ret.reserve(repeated_field.size()); - // std::copy(repeated_field.begin(), repeated_field.end(), - // std::back_inserter(ret)); - // return ret; - // } - - // std::vector GetShape() const { - // return this->RepeatedToVector(tensor_desc().dims()); - // } - private: std::string name_; bool persistable_; diff --git a/src/framework/scope.cpp b/src/framework/scope.cpp index c5ee2d39fa7a7bf4c1c7b1c2f3fb8f1e92f4e455..a1f5789aa52d2a70f54cef5c622c3a15907a4683 100644 --- a/src/framework/scope.cpp +++ b/src/framework/scope.cpp @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "framework/scope.h" +#include #include #include #include @@ -22,7 +23,6 @@ namespace paddle_mobile { namespace framework { Scope &Scope::NewScope() const { - std::unique_lock lock(mutex_); kids_.push_back(new Scope(this)); return *kids_.back(); } @@ -72,11 +72,9 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope *scope) const { - std::unique_lock lock(mutex_); auto it = std::find(kids_.begin(), kids_.end(), scope); kids_.erase(it); delete scope; - // deferent } void Scope::EraseVars(const std::vector &var_names) { @@ -104,14 +102,6 @@ void Scope::Rename(const std::string &origin_name, vars_[new_name] = origin_it->second; vars_.erase(origin_it); } -// -// std::string Scope::Rename(const std::string& origin_name) -// const { -// auto var_name = string::Sprintf("%p.%d", this, -// vars_.size()); -// Rename(origin_name, var_name); -// return var_name; -// } Variable *Scope::FindVarLocally(const std::string &name) const { auto it = vars_.find(name); diff --git a/src/framework/scope.h b/src/framework/scope.h index 8b194654f61d7502184b45c7eb07d655b70784dc..d714f61af3bd443c09fcef7aacee2416b90b5e02 100644 --- a/src/framework/scope.h +++ b/src/framework/scope.h @@ -14,17 +14,16 @@ limitations under the License. 
*/ #pragma once -#include //std::list -#include //std::mutex -#include //std::unordered_map +#include +#include #include "variable.h" namespace paddle_mobile { namespace framework { class Scope { public: - Scope() {} - ~Scope() {} + Scope() = default; + ~Scope() = default; Scope &NewScope() const; @@ -70,8 +69,6 @@ class Scope { mutable std::unordered_map vars_; mutable std::list kids_; Scope const *parent_{nullptr}; - - mutable std::mutex mutex_; }; } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/selected_rows.h b/src/framework/selected_rows.h deleted file mode 100644 index f59bd1aabfb7a0571b484fa21375acb4cb8254d3..0000000000000000000000000000000000000000 --- a/src/framework/selected_rows.h +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "lod_tensor.h" -#include "tensor.h" - -namespace paddle_mobile { -namespace framework { - -class SelectedRows { - public: - SelectedRows(const std::vector &rows, const int64_t &height) - : rows_(rows), height_(height) { - value_.reset(new Tensor()); - } - - SelectedRows() { - height_ = 0; - value_.reset(new Tensor()); - } - - const Tensor &value() const { return *value_; } - - Tensor *mutable_value() { return value_.get(); } - - int64_t height() const { return height_; } - - void set_height(int64_t height) { height_ = height; } - - const std::vector &rows() const { return rows_; } - - std::vector *mutable_rows() { return &rows_; } - - void set_rows(const std::vector &rows) { rows_ = rows; } - - /** - * get the index of id in rows - */ - int64_t index(int64_t id) const { - auto it = std::find(rows_.begin(), rows_.end(), id); - // PADDLE_ENFORCE(it != rows_.end(), "id should be in rows"); - return static_cast(std::distance(rows_.begin(), it)); - } - - DDim GetCompleteDims() const { - std::vector dims = vectorize(value_->dims()); - dims[0] = height_; - return make_ddim(dims); - } - - private: - // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} - // here. - // SelectedRows are simply concated when adding together. Until a - // SelectedRows add a Tensor, will the duplicate rows be handled. - std::vector rows_; - std::unique_ptr value_{nullptr}; - int64_t height_; -}; - -} // namespace framework -} // namespace paddle_mobile diff --git a/src/framework/tensor.h b/src/framework/tensor.h index 674edd67733ef8d0520d28f5c131e9da6746ad17..a221a26aa1435000646cf7d58321df28f3322834 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -14,14 +14,15 @@ limitations under the License. 
*/ #pragma once -#include #include #include #include #include #include #include +#include "common/enforce.h" +#include "common/enforce.h" #include "framework/data_layout.h" #include "framework/ddim.h" #include "memory/t_malloc.h" @@ -84,6 +85,12 @@ class Tensor { } } + Tensor(const Tensor &inTensor) { + this->dims_ = inTensor.dims_; + this->holder_ = inTensor.holder_; + this->offset_ = inTensor.offset_; + } + /*! Return a pointer to mutable memory block. */ template inline T *data() { @@ -130,7 +137,6 @@ class Tensor { } PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor'snumel must >=0.") int64_t size = numel() * SizeOfType(type); - /* some versions of boost::variant don't have operator!= */ if (holder_ == nullptr || holder_->size() < size + offset_) { holder_.reset(new PlaceholderImpl(size, type)); offset_ = 0; @@ -169,7 +175,9 @@ class Tensor { /*! The internal of two tensors share the same memory block. */ inline Tensor &ShareDataWith(const Tensor &src) { src.check_memory_size(); - *this = src; + if (holder_.get() != src.holder_.get()) { + *this = src; + } return *this; } @@ -198,7 +206,6 @@ class Tensor { size_t base = numel() / dims_[0]; Tensor dst; dst.holder_ = holder_; - dst.set_layout(layout_); DDim dst_dims = dims_; dst_dims[0] = end_idx - begin_idx; dst.Resize(dst_dims); @@ -227,10 +234,6 @@ class Tensor { "Tensor's dims_ is out of bound. "); } - inline DataLayout layout() const { return layout_; } - - inline void set_layout(const DataLayout layout) { layout_ = layout; } - private: /** * @note Placeholder hides type T, so it doesn't appear as a @@ -288,21 +291,6 @@ class Tensor { DDim dims_; - /** - * @brief the layout of memory block, default is NHWC. - * - * @note the memory allocation order, describe how weight/data is - * stored - * For example, in 4-D Tensor(rank=4), there are three - * commonly - * used layout. They are - * NCHW, NHWC, CHWN. - * N,C,H,W for respectively the batch size, the number of - * feature maps, the height, the width. 
- */ - - DataLayout layout_ = DataLayout::kNHWC; - /** * @brief A PlaceHolder may be shared by more than one tensor. * diff --git a/src/framework/tensor_util.cpp b/src/framework/tensor_util.cpp index 23b775b095d04c46764791f9f8438f2b888263bd..6722ec3e37b8219eee9e1b9913799b08d8f902bc 100644 --- a/src/framework/tensor_util.cpp +++ b/src/framework/tensor_util.cpp @@ -13,137 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "tensor_util.h" -#include -#include -#include namespace paddle_mobile { namespace framework { void TensorCopy(const Tensor &src, Tensor *dst) { - // VLOG(3) << "TensorCopy " << src.dims() << " from " << - // src.place() << " to - // " - // << dst_place; - src.check_memory_size(); - - dst->Resize(src.dims()); - dst->set_layout(src.layout()); - auto src_ptr = src.data(); - - auto dst_ptr = dst->mutable_data(src.type()); - - auto size = src.numel() * SizeOfType(src.type()); - - memory::Copy(dst_ptr, src_ptr, size); -} - -void TensorCopySync(const Tensor &src, Tensor *dst) { - // VLOG(3) << "TensorCopySync " << src.dims() << " from " << - // src.place() - // << " to " << dst_place; src.check_memory_size(); dst->Resize(src.dims()); - dst->set_layout(src.layout()); auto src_ptr = src.data(); auto dst_ptr = dst->mutable_data(src.type()); auto size = src.numel() * SizeOfType(src.type()); memory::Copy(dst_ptr, src_ptr, size); } -template -struct AnyDTypeVisitor { - Predicate predicate_; - const Tensor &tensor_; - Tensor *out_; - - AnyDTypeVisitor(Predicate predicate, const Tensor &tensor, Tensor *out) - : predicate_(predicate), tensor_(tensor), out_(out) {} - - template - void operator()() const { - // auto t = EigenVector::Flatten(tensor_); - // auto o = EigenScalar::From(*out_); - // return any of predicate_(t) is true. 
- // o.device(*ctx_.eigen_device()) = predicate_(t).any(); - } -}; - -template -inline void AnyImpl(Predicate predicate, const Tensor &tensor, - framework::Tensor *out) { - VisitDataType(ToDataType(tensor.type()), - AnyDTypeVisitor(predicate, tensor, out)); -} - -template -struct AnyVisitor { - const framework::Tensor &tensor_; - Predicate predicate_; - - AnyVisitor(const framework::Tensor &tensor, Predicate predicate) - : tensor_(tensor), predicate_(std::move(predicate)) {} - - bool operator()(void) const { - framework::Tensor out; - out.Resize({1}); - out.mutable_data(); - AnyImpl(predicate_, tensor_, &out); - return this->GetResult(out); - } - - bool GetResult(const framework::Tensor &out) const { - return *out.data(); - } -}; - -template -inline bool Any(const framework::Tensor &tensor, Predicate predicate) { - AnyVisitor visitor(tensor, predicate); - // return platform::VisitPlace(visitor); - return visitor(); -} - -struct ContainsNANPredicate { - template - auto operator()(const T &eigen_vec) const - -> decltype(std::declval().isnan()) { - // Cast eigen_vector to vector of bool. true if is inf. - return eigen_vec.isnan(); - } -}; - -bool TensorContainsNAN(const framework::Tensor &tensor) { - ContainsNANPredicate predicate; - return Any(tensor, predicate); -} - -struct ContainsInfPredicate { - template - auto operator()(const T &eigen_vec) const - -> decltype(std::declval().isinf()) { - // Cast eigen_vector to vector of bool. true if is inf. 
- return eigen_vec.isinf(); - } -}; - -bool TensorContainsInf(const framework::Tensor &tensor) { - ContainsInfPredicate predicate; - return Any(tensor, predicate); -} - -struct DeserializedDataFunctor { - DeserializedDataFunctor(void **buf, Tensor *tensor) - : buf_(buf), tensor_(tensor) {} - - template - void operator()() { - *buf_ = tensor_->mutable_data(); - } - - void **buf_; - Tensor *tensor_; -}; - } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/tensor_util.h b/src/framework/tensor_util.h index 9af873d34a914b966a20c79a9c8f815309cba680..f888049b395e48b9d10cea731b092c899952e3d8 100644 --- a/src/framework/tensor_util.h +++ b/src/framework/tensor_util.h @@ -15,51 +15,12 @@ limitations under the License. */ #pragma once #include #include "memory/t_malloc.h" -#include "platform/data_type.h" #include "tensor.h" namespace paddle_mobile { namespace framework { void TensorCopy(const Tensor &src, Tensor *dst); -void TensorCopySync(const Tensor &src, Tensor *dst); - -template -void TensorFromVector(const std::vector &src, Tensor *dst); - -template -void TesnorToVector(const Tensor &src, std::vector *dst); - -bool TensorContainsNAN(const framework::Tensor &tensor); -bool TensorContainsInf(const framework::Tensor &tensor); - -void TensorToStream(std::ostream &os, const Tensor &tensor); -void TensorFromStream(std::istream &is, Tensor *tensor); - -// -// The implementation of template functions. 
-// - -template -void TensorFromVector(const std::vector &src, Tensor *dst) { - auto src_ptr = static_cast(src.data()); - dst->Resize({static_cast(src.size())}); - auto dst_ptr = static_cast(dst->mutable_data()); - auto size = src.size() * sizeof(T); - - memory::Copy(dst_ptr, src_ptr, size); -} - -template -void TensorToVector(const Tensor &src, std::vector *dst) { - auto src_ptr = static_cast(src.data()); - auto size = src.numel() * sizeof(T); - - dst->resize(src.numel()); - auto dst_ptr = static_cast(dst->data()); - - memory::Copy(dst_ptr, src_ptr, size); -} } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/var_type.h b/src/framework/var_type.h deleted file mode 100644 index 5e132c73759bfa3a863023baf52df6ef41365047..0000000000000000000000000000000000000000 --- a/src/framework/var_type.h +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "framework.pb.h" -#include "lod_tensor.h" -#include "selected_rows.h" -#include "variable.h" - -namespace paddle_mobile { -namespace framework { -inline proto::VarType::Type ToVarType(std::type_index type) { - if (type.hash_code() == typeid(LoDTensor).hash_code()) { - return proto::VarType_Type_LOD_TENSOR; - } else if (type.hash_code() == typeid(SelectedRows).hash_code()) { - return proto::VarType_Type_SELECTED_ROWS; - } else { - // PADDLE_THROW("ToVarType:Unsupported type %s", - // type.name()); - } -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/src/framework/variable.h b/src/framework/variable.h index 07cb6377e0c9ca89f828eded887b8d1da2d8aae6..e1527b3a331eb67c31aec5011bf84de3dc9bc247 100644 --- a/src/framework/variable.h +++ b/src/framework/variable.h @@ -14,19 +14,17 @@ limitations under the License. */ #pragma once -#include #include #include #include #include #include "../common/variant.h" -#include "paddle_mobile_object.h" namespace paddle_mobile { namespace framework { using std::string; -class Variable : public PaddleMobileObject { +class Variable { public: template const T *Get() const { diff --git a/src/io.cpp b/src/io/io.cpp similarity index 51% rename from src/io.cpp rename to src/io/io.cpp index 8f6a07f2dd1f8f2c2daa09f220bddc463c268e9e..8b1577b0ee16b5a4e54fa2bcfc14abaa6497b699 100644 --- a/src/io.cpp +++ b/src/io/io.cpp @@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "io.h" -#include +#include "io/io.h" +#include #include -#include "common/log.h" - #include "common/enforce.h" +#include "common/log.h" #include "framework/framework.pb-c.h" #include "framework/lod_tensor.h" #include "framework/operator.h" @@ -26,20 +25,29 @@ limitations under the License. 
*/ #include "framework/program/var_desc.h" #include "framework/scope.h" #include "framework/tensor.h" +#ifdef PADDLE_EXECUTOR_MULTITHREAD +#include +#include +#include "common/threadpool.h" +#endif namespace paddle_mobile { using framework::Variable; -void ReadBinaryFile(const std::string &filename, std::string *contents) { - std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_MOBILE_ENFORCE(fin.is_open(), "open file: %s failed", +char *Get_binary_data(std::string filename) { + FILE *file = fopen(filename.c_str(), "rb"); + PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", filename.c_str()); - fin.seekg(0, std::ios::end); - contents->clear(); - contents->resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(contents->at(0)), contents->size()); - fin.close(); + fseek(file, 0, SEEK_END); + long size = ftell(file); + PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); + rewind(file); + char *data = new char[size]; + size_t bytes_read = fread(data, 1, size, file); + PADDLE_MOBILE_ENFORCE(bytes_read == size, + "read binary file bytes do not match with fseek"); + fclose(file); + return data; } static size_t ReadBuffer(const char *file_name, uint8_t **out) { @@ -66,110 +74,28 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) { } template -void Loader::LoadVar(framework::Variable *variable, - const framework::VarDesc &var_desc, - const std::string &file_path) { - auto tensor = variable->GetMutable(); - std::ifstream is(file_path); - PADDLE_MOBILE_ENFORCE(is.is_open(), "open file: %s failed", - file_path.c_str()); - - std::fpos pos; - pos = is.tellg(); // save current position - is.seekg(0, std::ios::end); - is.seekg(pos); // restore saved position - - // 1. 
version - uint32_t version; - is.read(reinterpret_cast(&version), sizeof(version)); - - // 2 Lod information - uint64_t lod_level; - is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); - auto &lod = *tensor->mutable_lod(); - lod.resize(lod_level); - for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size; - is.read(reinterpret_cast(&size), sizeof(size)); - std::vector tmp(size / sizeof(size_t)); - is.read(reinterpret_cast(tmp.data()), - static_cast(size)); - for (auto j : tmp) { - LOG(kLOG_DEBUG1) << " lod - " << j; - } - lod[i] = tmp; - } - - // 3. tensor version - uint32_t tensor_version; - is.read(reinterpret_cast(&tensor_version), sizeof(tensor_version)); - - // 4. tensor desc - int32_t size; - is.read(reinterpret_cast(&size), sizeof(size)); - std::unique_ptr buf(new char[size]); - is.read(reinterpret_cast(buf.get()), size); - - const framework::TensorDesc &desc = var_desc.Tensor_desc(); - - PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor_desc = NULL; - // void *v; - // PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure()(tensor_desc, - // buf.get()); - - // DLOG << "PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure- " << - // tensor_desc; - - // framework::TensorDesc &tensor_desc = variable-> - // PaddleMobile__Framework__Proto__ProgramDesc *c_program; - // uint8_t *proto_buf = NULL; - // size_t read_size = ReadBuffer(file_path.c_str(), &proto_buf); - // c_program = paddle_mobile__framework__proto__program_desc__unpack(NULL, - // read_size, buf); - - // paddle_mobile__framework__proto__var_type__tensor_desc__init() - - int memory_size = 1; - for (auto l : desc.Dims()) { - memory_size *= l; - } - - tensor->Resize(framework::make_ddim(desc.Dims())); - - void *memory = tensor; - int type_size = 0; - switch (desc.DataType()) { - case framework::VARTYPE_TYPE_FP16: - type_size = 2; - break; - case framework::VARTYPE_TYPE_FP32: - type_size = 4; - memory = tensor->mutable_data(); - break; - case framework::VARTYPE_TYPE_FP64: - 
type_size = 8; - break; - case framework::VARTYPE_TYPE_INT32: - type_size = 4; - break; - case framework::VARTYPE_TYPE_INT64: - type_size = 8; - break; - case framework::VARTYPE_TYPE_BOOL: - type_size = 1; - break; - default: - break; - } - - is.read(static_cast(memory), memory_size * type_size); - is.close(); +const framework::Program Loader::Load( + const std::string &dirname, bool optimize, bool can_add_split) { + auto program = + this->LoadProgram(dirname + "/__model__", optimize, can_add_split); + program.model_path = dirname; + return program; } template const framework::Program Loader::Load( - const std::string &dirname, bool optimize) { - std::string model_filename = dirname + "/__model__"; + const std::string &model_path, const std::string ¶_path, + bool optimize) { + auto program = this->LoadProgram(model_path, optimize); + program.para_path = para_path; + program.is_commbine = true; + return program; +} + +template +const framework::Program Loader::LoadProgram( + const std::string &model_path, bool optimize, bool can_add_split) { + std::string model_filename = model_path; PaddleMobile__Framework__Proto__ProgramDesc *c_program; uint8_t *buf = NULL; size_t read_size = ReadBuffer(model_filename.c_str(), &buf); @@ -183,22 +109,16 @@ const framework::Program Loader::Load( // DLOG << "n_ops: " << (*c_program->blocks)->n_ops; // - std::shared_ptr originProgramDesc = - std::make_shared(c_program); + auto originProgramDesc = std::make_shared(c_program); framework::Program program; - program.model_path = dirname; program.originProgram = originProgramDesc; - std::shared_ptr scope = - std::make_shared(); + auto scope = std::make_shared(); program.scope = scope; - originProgramDesc->Block(0); for (const auto &block : originProgramDesc->Blocks()) { - for (int i = 0; i < block->Vars().size(); ++i) { - std::shared_ptr var_desc = block->Vars()[i]; - // DLOG << "var name-- " << var_desc->Name(); + for (auto var_desc : block->Vars()) { auto var = 
scope->Var(var_desc->Name()); if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { @@ -224,7 +144,7 @@ const framework::Program Loader::Load( if (optimize) { framework::ProgramOptimize program_optimize; program.optimizeProgram = - program_optimize.FushionOptimize(originProgramDesc); + program_optimize.FusionOptimize(originProgramDesc, can_add_split); } if (optimize) { program.optimizeProgram->Description("optimize: "); @@ -237,9 +157,10 @@ const framework::Program Loader::Load( } template class Loader; +template class Loader; +template class Loader; #pragma mark - executor - template Executor::Executor(const framework::Program p, int batch_size, bool use_optimize) @@ -253,6 +174,9 @@ Executor::Executor(const framework::Program p, int batch_size, variable_ptr[0].SetValue(batch_size); const std::vector> blocks = to_predict_program_->Blocks(); +#ifdef PADDLE_EXECUTOR_MULTITHREAD + depManager.resize(blocks.size()); +#endif for (int i = 0; i < blocks.size(); ++i) { std::shared_ptr block_desc = blocks[i]; std::vector> ops = block_desc->Ops(); @@ -263,40 +187,54 @@ Executor::Executor(const framework::Program p, int batch_size, op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), program_.scope); op_base->InferShape(); - ops_of_block_[*block_desc.get()].push_back(op_base); +#ifdef PADDLE_EXECUTOR_MULTITHREAD + depManager[i].analysisDep(ops_of_block_[*block_desc.get()]); +#endif } } - InitMemory(); + if (program_.is_commbine) { + InitCombineMemory(); + } else { + InitMemory(); + } + + std::shared_ptr to_predict_block = + to_predict_program_->Block(0); + auto &ops = ops_of_block_[*to_predict_block.get()]; + for (const auto &op : ops) { + op->Init(); + } } template void Executor::LoadMemory(const framework::VarDesc var_desc, - framework::LoDTensor *tensor, - const std::string &file_path) { - std::ifstream is(file_path); - PADDLE_MOBILE_ENFORCE(is.is_open(), "open file: %s failed", - file_path.c_str()); - std::fpos pos; - pos = is.tellg(); // save current 
position - is.seekg(0, std::ios::end); - is.seekg(pos); // restore saved position - + framework::LoDTensor *tensor, char *&data) { // 1. version - uint32_t version; - is.read(reinterpret_cast(&version), sizeof(version)); + uint32_t version = *(uint32_t *)data; + data += sizeof(uint32_t); // 2 Lod information - uint64_t lod_level; - is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + uint64_t *lod_level_ptr = new uint64_t(); + memcpy(lod_level_ptr, data, sizeof(uint64_t)); + uint64_t lod_level = *lod_level_ptr; + delete lod_level_ptr; + data += sizeof(uint64_t); + auto &lod = *tensor->mutable_lod(); lod.resize(lod_level); for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size; - is.read(reinterpret_cast(&size), sizeof(size)); + uint64_t size = *(uint64_t *)data; + data += sizeof(uint64_t); + DLOG << "lod size: " << i << size; std::vector tmp(size / sizeof(size_t)); - is.read(reinterpret_cast(tmp.data()), - static_cast(size)); + + for (int k = 0; k < tmp.size(); ++k) { + tmp[k] = *(size_t *)data; + DLOG << "tmp[k]: " << k << *(size_t *)data; + data += sizeof(size_t); + } + for (auto j : tmp) { LOG(kLOG_DEBUG1) << " lod - " << j; } @@ -304,17 +242,20 @@ void Executor::LoadMemory(const framework::VarDesc var_desc, } // 3. tensor version - uint32_t tensor_version; - is.read(reinterpret_cast(&tensor_version), sizeof(tensor_version)); + uint32_t tensor_version = *(uint32_t *)data; + data += sizeof(uint32_t); // 4. 
tensor desc - int32_t size; - is.read(reinterpret_cast(&size), sizeof(size)); + int32_t size = *(int32_t *)data; + data += sizeof(int32_t); + std::unique_ptr buf(new char[size]); - is.read(reinterpret_cast(buf.get()), size); + for (int m = 0; m < size; ++m) { + buf.get()[m] = data[m]; + } + data += (sizeof(char) * size); const framework::TensorDesc &desc = var_desc.Tensor_desc(); - int memory_size = 1; for (auto l : desc.Dims()) { memory_size *= l; @@ -348,8 +289,10 @@ void Executor::LoadMemory(const framework::VarDesc var_desc, break; } - is.read(static_cast(memory), memory_size * type_size); - is.close(); + for (int n = 0; n < memory_size * type_size; ++n) { + static_cast(memory)[n] = data[n]; + } + data += (sizeof(char) * memory_size * type_size); } template @@ -362,8 +305,12 @@ void Executor::InitMemory() { if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { continue; } - LoadMemory(*var_desc, tensor, - program_.model_path + "/" + var_desc->Name()); + + char *origin_data = + Get_binary_data(program_.model_path + "/" + var_desc->Name()); + char *data = origin_data; + LoadMemory(*var_desc, tensor, data); + delete origin_data; } else { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { auto tensor = var->template GetMutable(); @@ -375,6 +322,32 @@ void Executor::InitMemory() { } } +template +void Executor::InitCombineMemory() { + LOG(kLOG_INFO) << " begin init combine memory"; + char *origin_data = Get_binary_data(program_.para_path); + char *data = origin_data; + for (const auto &block : to_predict_program_->Blocks()) { + for (const auto &var_desc : block->Vars()) { + auto var = program_.scope->Var(var_desc->Name()); + if (var_desc->Persistable()) { + auto tensor = var->template GetMutable(); + if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { + continue; + } + LoadMemory(*var_desc, tensor, data); + } else { + if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { + auto tensor = var->template GetMutable(); + 
tensor->template mutable_data(); + } + } + } + } + delete origin_data; + LOG(kLOG_INFO) << " end init combine memory "; +} + template std::shared_ptr Executor::Predict( const framework::Tensor &t) { @@ -385,19 +358,135 @@ std::shared_ptr Executor::Predict( feed_tensor->ShareDataWith(t); std::shared_ptr to_predict_block = to_predict_program_->Block(0); - for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { - auto op = ops_of_block_[*to_predict_block.get()][j]; - op->Run(); + auto &ops = ops_of_block_[*to_predict_block.get()]; +#ifdef PADDLE_MOBILE_PROFILE + std::vector profile(ops.size()); +#endif +#ifdef PADDLE_EXECUTOR_MULTITHREAD + std::mutex m; + std::condition_variable cv; + std::queue next; + next.push(0); + int rsize = ops.size(); + std::vector status(rsize, 0); + auto &threadPool = ThreadPool::getThreadPool(); + auto &dep = depManager[0]; + auto finishF = [&ops, &m, &cv, &next, &status, &rsize, &dep](int opi) { + std::lock_guard lk(m); + rsize--; + status[opi] = 2; + for (int i : dep.getNext(opi)) { + bool ok = true; + for (int j : dep.getDeps(i)) { + if (status[j] != 2) { + ok = false; + break; + } + } + if (ok && (status[i] == 0)) { + next.push(i); + } + } + cv.notify_one(); + }; + for (;;) { + std::unique_lock lk(m); + cv.wait(lk, [&next, &rsize] { return rsize == 0 || !next.empty(); }); + if (rsize == 0) { + break; + } + while (next.size() > 0) { + int opi = next.front(); + next.pop(); + status[opi] = 1; + threadPool.enqueue([opi, &ops, &finishF, &profile] { + auto &op = ops[opi]; +#ifdef PADDLE_MOBILE_PROFILE + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[opi].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; + profile[opi].tid = ThreadPool::getThreadPoolThreadId(); +#endif + ops[opi]->Run(); +#ifdef PADDLE_MOBILE_PROFILE + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[opi].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + finishF(opi); + }); + } } - auto ops = 
ops_of_block_[*to_predict_program_->Block(0)]; +#else + for (int i = 0; i < ops.size(); i++) { +#ifdef PADDLE_MOBILE_PROFILE + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + + // to Run + ops[i]->Run(); +#ifdef PADDLE_MOBILE_PROFILE + clock_gettime(CLOCK_MONOTONIC, &ts); + profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; +#endif + } +#endif auto last_op = ops.rbegin(); + auto output_map = (*last_op)->Outputs(); std::vector out_keys = (*last_op)->GetOutKeys(); PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output"); framework::LoDTensor *output_tensor = framework::GetVarValue(out_keys[0], output_map, *(program_.scope)); - return std::shared_ptr(output_tensor); +#ifdef PADDLE_MOBILE_PROFILE +#ifdef PADDLE_EXECUTOR_MULTITHREAD + // TODO expose profile info as an interface, user can get them to analysis + // the performance of their deepnet. + FILE *df = fopen("net.dot", "w"); + fprintf(df, "digraph {\n"); + for (int i = 0; i < ops.size(); i++) { + for (int j : dep.getNext(i)) { + fprintf(df, "op_%d -> op_%d\n", i, j); + } + } + for (int i = 0; i < ops.size(); i++) { + fprintf(df, "op_%d[label=\"%s (%d)\"]\n", i, ops[i]->Type().c_str(), i); + } + fprintf(df, "}\n"); + fclose(df); +#endif + FILE *pf = fopen("profile.out", "w"); + std::unordered_map _tp; + for (int i = 0; i < profile.size(); i++) { + const auto &pInfo = profile[i]; + uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; + _tp[ops[i]->Type()] += timeCost; + fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, ops[i]->Type().c_str(), + pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost); + } + fclose(pf); + printf("====================[ profile ]======================\n"); + using prof_t = std::pair; + std::vector _tv(_tp.begin(), _tp.end()); + uint64_t _ptotal = 0; + for (auto const &p : _tv) { + _ptotal += p.second; + } + auto compf = [](const prof_t &a, const prof_t &b) { + return a.second > 
b.second; + }; + std::sort(_tv.begin(), _tv.end(), compf); + _tv.push_back(std::make_pair("total", _ptotal)); + for (auto const &p : _tv) { + printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), (float)p.second, + (float)p.second / _ptotal * 100.0); + } + printf("====================[---------]======================\n"); +#endif + + return std::make_shared(framework::Tensor(*output_tensor)); } template std::shared_ptr Executor::Predict( @@ -420,5 +509,7 @@ std::vector::Ptype> Executor::Predict( } template class Executor; +template class Executor; +template class Executor; } // namespace paddle_mobile diff --git a/src/io.h b/src/io/io.h similarity index 53% rename from src/io.h rename to src/io/io.h index ae99197baa97c07d2a883f8721d533b85ab7873a..acae829339bdc049c5899b9c7f6a7a2c91693ae8 100644 --- a/src/io.h +++ b/src/io/io.h @@ -14,51 +14,80 @@ limitations under the License. */ #pragma once -#include #include +#include #include #include - #include "common/types.h" #include "framework/lod_tensor.h" #include "framework/operator.h" -#include "framework/paddle_mobile_object.h" #include "framework/program/program.h" #include "framework/tensor.h" +#ifdef PADDLE_EXECUTOR_MULTITHREAD +#include +#include +#include +#include "common/dep_core.h" +#endif namespace paddle_mobile { -template -class Loader : PaddleMobileObject { +template +class Loader { public: + /* + * @b load separate format fluid model + * @b 加载分开形式的 fluid 模型 + * */ const framework::Program Load(const std::string &dirname, - bool optimize = true); + bool optimize = false, + bool can_add_split = false); + + /* + * @b load combine format fluid mode + * @b 加载结合在一起格式的模型 + * */ + const framework::Program Load(const std::string &model_path, + const std::string ¶_path, + bool optimize = false); private: - void LoadVar(framework::Variable *variable, - const framework::VarDesc &var_desc, - const std::string &file_path); + const framework::Program LoadProgram(const std::string &model_path, + bool optimize = false, + 
bool can_add_split = false); }; -template +template class Executor { public: typedef typename PrecisionTrait

::ptype Ptype; + /* + * @b init executor with program load by Loader class + * @b 用 loader load 的 program 实例化 executor + * */ Executor(const framework::Program p, int batch_size = 1, bool use_optimize = true); + /* + * @b to predict + * */ std::shared_ptr Predict(const framework::Tensor &t); + /* + * @b to predict with vector and dim + * + * @b 使用 输入 和 输入的维度信息 进行预测 + * */ std::vector Predict(const std::vector &input, const std::vector &dims); protected: Executor() = default; - void InitMemory(); void LoadMemory(const framework::VarDesc var_desc, - framework::LoDTensor *tensor, const std::string &file_path); + framework::LoDTensor *tensor, char *&data); + void InitCombineMemory(); framework::Program program_; int batch_size_ = 1; std::shared_ptr to_predict_program_; @@ -68,6 +97,16 @@ class Executor { std::vector>>> ops_of_block_; bool use_optimize_ = false; +#ifdef PADDLE_EXECUTOR_MULTITHREAD + std::vector depManager; +#endif +#ifdef PADDLE_MOBILE_PROFILE + struct ProfInfo { + int tid = 0; + uint64_t runBegin = 0UL; + uint64_t runEnd = 0UL; + }; +#endif }; } // namespace paddle_mobile diff --git a/src/jni/paddle_mobile_jni.cpp b/src/jni/paddle_mobile_jni.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f663b78fd490f2c9f0af525c7dabd2cc513c3a53 --- /dev/null +++ b/src/jni/paddle_mobile_jni.cpp @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ANDROID + +#include "paddle_mobile_jni.h" +#ifdef __cplusplus +extern "C" { +#endif +namespace paddle_mobile { +namespace jni { +using framework::DDim; +using framework::Program; +using framework::Tensor; +using paddle_mobile::CPU; +using std::string; + +extern const char *ANDROID_LOG_TAG = + "paddle_mobile LOG built on " __DATE__ " " __TIME__; +static Executor *shared_executor_instance = nullptr; + +// toDo mutex lock +// static std::mutex shared_mutex; + +Executor *getExecutorInstance(const Program p, int batch_size, + bool use_optimize) { + if (nullptr == shared_executor_instance) { + shared_executor_instance = new Executor(p, batch_size, use_optimize); + } + return shared_executor_instance; +} + +string jstring2cppstring(JNIEnv *env, jstring jstr) { + const char *cstr = env->GetStringUTFChars(jstr, 0); + string cppstr(cstr); + env->ReleaseStringUTFChars(jstr, cstr); + return cppstr; +} + +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, + jclass thiz, + jstring modelPath) { + paddle_mobile::Loader loader; + bool optimize = true; + auto program = loader.Load(jstring2cppstring(env, modelPath), optimize); + shared_executor_instance = getExecutorInstance(program, 1, optimize); + return shared_executor_instance != nullptr ? 
JNI_TRUE : JNI_FALSE; +} + +JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage( + JNIEnv *env, jclass thiz, jfloatArray buf) { + jfloatArray result = NULL; + int count = 0; + float *dataPointer = nullptr; + if (nullptr != buf) { + dataPointer = env->GetFloatArrayElements(buf, NULL); + } + framework::Tensor input; + framework::DDim ddim = framework::make_ddim({1, 3, 224, 224}); + input.Resize(ddim); + auto input_ptr = input.mutable_data(); + for (int i = 0; i < framework::product(ddim); i++) { + input_ptr[i] = dataPointer[i]; + } + auto output = shared_executor_instance->Predict(input); + count = output->numel(); + result = env->NewFloatArray(count); + env->SetFloatArrayRegion(result, 0, count, output->data()); + return result; +} + +JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env, + jclass thiz) {} + +} // namespace jni +} // namespace paddle_mobile + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/jni/paddle_mobile_jni.h b/src/jni/paddle_mobile_jni.h new file mode 100644 index 0000000000000000000000000000000000000000..a262d4070c37013977e869fa816d52d78fbfa485 --- /dev/null +++ b/src/jni/paddle_mobile_jni.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#ifdef ANDROID +#include +#include "common/log.h" +#include "framework/tensor.h" +#include "io/io.h" + +#ifdef __cplusplus +extern "C" { +#endif +namespace paddle_mobile { +namespace jni { +/** + * load model & params of the net for android + */ +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, + jclass thiz, + jstring modelPath); + +/** + * object detection for anroid + */ +JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage( + JNIEnv *env, jclass thiz, jfloatArray buf); + +/** + * clear data of the net when destroy for android + */ +JNIEXPORT void JNICALL Java_com_baidu_paddle_PMLL_clear(JNIEnv *env, + jclass thiz); +} // namespace jni +} // namespace paddle_mobile +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp index 280391da5ac0b5c7bdbbbbe8df6772377ca075c5..0252f3c07c06487720586b0f650e2179d247234f 100644 --- a/src/memory/t_malloc.cpp +++ b/src/memory/t_malloc.cpp @@ -12,19 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once - -#include "t_malloc.h" +#include "memory/t_malloc.h" #include #include namespace paddle_mobile { namespace memory { -const int MALLOC_ALIGN = 16; +const int MALLOC_ALIGN = 64; void Copy(void *dst, const void *src, size_t num) { std::memcpy(dst, src, num); -}; +} void *Alloc(size_t size) { size_t offset = sizeof(void *) + MALLOC_ALIGN - 1; diff --git a/src/operators/batchnorm_op.cpp b/src/operators/batchnorm_op.cpp index 1f8a1698f4281174d2503650bde5deb0ef9825e9..5d94d54f88e33b168739b1bbdf9af0bea9fe1b4f 100644 --- a/src/operators/batchnorm_op.cpp +++ b/src/operators/batchnorm_op.cpp @@ -12,20 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef BATCHNORM_OP + #include "batchnorm_op.h" +#include "framework/op_proto_maker.h" +#include "framework/op_registry.h" namespace paddle_mobile { namespace operators { template void BatchNormOp::InferShape() const { - auto x_dims = param_.InputX()->dims(); - param_.OutputY()->Resize(x_dims); + auto x_dims = this->param_.InputX()->dims(); + this->param_.OutputY()->Resize(x_dims); } template class BatchNormOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(batch_norm); -REGISTER_OPERATOR(batch_norm, ops::BatchNormOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(batch_norm); +REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(batch_norm); +REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/batchnorm_op.h b/src/operators/batchnorm_op.h index 760466eeddcb472ed2a47625b786a021ce7c1ef5..9ee0b2dcf6b6ec46fcb08cac88d3df275d33f7d6 100644 --- a/src/operators/batchnorm_op.h +++ b/src/operators/batchnorm_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef BATCHNORM_OP + #pragma once #include @@ -23,27 +25,24 @@ namespace paddle_mobile { namespace operators { using std::string; template -class BatchNormOp : public framework::OperatorWithKernel { +class BatchNormOp + : public framework::OperatorWithKernel> { public: BatchNormOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap attrs, + const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - - void RunImpl() const { - operators::BatchNormKernel kernel; - kernel.Compute(param_); - } + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel::OperatorWithKernel; void InferShape() const override; protected: - BatchNormParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/box_coder_op.cpp b/src/operators/box_coder_op.cpp index ca653b5711241e77a9df308922aedb0551b1103f..31891ed74266d599898dd7426eed5cd28f320ab6 100644 --- a/src/operators/box_coder_op.cpp +++ b/src/operators/box_coder_op.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef BOXCODER_OP + #include "operators/box_coder_op.h" #include namespace paddle_mobile { @@ -19,11 +21,11 @@ namespace operators { template void BoxCoderOp::InferShape() const { - auto input_priorbox_dims = param_.InputPriorBox()->dims(); - auto input_priorboxvar_dims = param_.InputPriorBoxVar()->dims(); - auto input_targetbox_dims = param_.InputTargetBox()->dims(); + auto input_priorbox_dims = this->param_.InputPriorBox()->dims(); + auto input_priorboxvar_dims = this->param_.InputPriorBoxVar()->dims(); + auto input_targetbox_dims = this->param_.InputTargetBox()->dims(); - auto code_type = param_.CodeType(); + auto code_type = this->param_.CodeType(); if (code_type == "encode_center_size") { if (input_targetbox_dims.size() != 2) { @@ -42,7 +44,7 @@ void BoxCoderOp::InferShape() const { LOG(kLOG_ERROR) << " dimension not match"; } } - param_.OutputBox()->Resize(framework::make_ddim( + this->param_.OutputBox()->Resize(framework::make_ddim( {input_targetbox_dims[0], input_priorbox_dims[0], 4})); } template class BoxCoderOp; @@ -50,5 +52,13 @@ template class BoxCoderOp; } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(box_coder); -REGISTER_OPERATOR(box_coder, ops::BoxCoderOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(box_coder); +REGISTER_OPERATOR_CPU(box_coder, ops::BoxCoderOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/box_coder_op.h b/src/operators/box_coder_op.h index a2203e1d89f8b5b6270c1576711a4c008d927e34..33ff2358bc8285a026c217ed11c2250769395567 100644 --- a/src/operators/box_coder_op.h +++ b/src/operators/box_coder_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef BOXCODER_OP + #pragma once #include @@ -26,27 +28,27 @@ namespace operators { using paddle_mobile::framework::Tensor; template -class BoxCoderOp : public framework::OperatorWithKernel { +class BoxCoderOp + : public framework::OperatorWithKernel< + DeviceType, BoxCoderParam, operators::BoxCoderKernel> { public: BoxCoderOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap attrs, + const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - - void RunImpl() const { - operators::BoxCoderKernel kernel; - kernel.Compute(param_); - } + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel::OperatorWithKernel; + using framework::OperatorWithKernel< + DeviceType, BoxCoderParam, + operators::BoxCoderKernel>::OperatorWithKernel; void InferShape() const override; protected: - BoxCoderParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/concat_op.cpp b/src/operators/concat_op.cpp index 6744b47b7728558f95fad0435979841a73a7a6f6..fe0507dc812a3ddafcc0433c2659c3b49ea87f6e 100644 --- a/src/operators/concat_op.cpp +++ b/src/operators/concat_op.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef CONCAT_OP + #include "concat_op.h" namespace paddle_mobile { @@ -19,7 +21,7 @@ namespace operators { template void ConcatOp::InferShape() const { - auto inputs = param_.Inputs(); + auto inputs = this->param_.Inputs(); const size_t n = inputs.size(); std::vector inputs_dims; @@ -28,7 +30,7 @@ void ConcatOp::InferShape() const { inputs_dims.push_back(inputs[i]->dims()); } - auto axis = static_cast(param_.Axis()); + auto axis = static_cast(this->param_.Axis()); if (n == 1) { DLOG << "Warning: concat op have only one input, " @@ -52,7 +54,7 @@ void ConcatOp::InferShape() const { out_dims[axis] = -1; } - param_.Out()->Resize(out_dims); + this->param_.Out()->Resize(out_dims); } template class ConcatOp; @@ -60,5 +62,15 @@ template class ConcatOp; } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(concat); -REGISTER_OPERATOR(concat, ops::ConcatOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(concat); +REGISTER_OPERATOR_CPU(concat, ops::ConcatOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(concat); +REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/concat_op.h b/src/operators/concat_op.h index 15160e20a403d73bb11e982f5a527454f26b5dd6..93612c6b1b6d1f6aa992773ef5cccc0c93f1b6e8 100644 --- a/src/operators/concat_op.h +++ b/src/operators/concat_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef CONCAT_OP + #pragma once #include @@ -22,26 +24,26 @@ namespace paddle_mobile { namespace operators { using std::string; template -class ConcatOp : public framework::OperatorWithKernel { +class ConcatOp + : public framework::OperatorWithKernel< + DeviceType, ConcatParam, operators::ConcatKernel> { public: ConcatOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap attrs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - - void RunImpl() const { - operators::ConcatKernel kernel; - kernel.Compute(param_); - } + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel::OperatorWithKernel; + using framework::OperatorWithKernel< + DeviceType, ConcatParam, + operators::ConcatKernel>::OperatorWithKernel; void InferShape() const override; protected: - ConcatParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp index bfddcf14acbba016c4e4333e05fcc7dd6eebc509..01d284a06ed33142a8d16cdc32f304c3d1a75e28 100644 --- a/src/operators/conv_op.cpp +++ b/src/operators/conv_op.cpp @@ -12,42 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef CONV_OP + #include "operators/conv_op.h" #include -#include "framework/data_type.h" #include "framework/op_proto_maker.h" #include "framework/op_registry.h" +#include "operators/math/conv_func.h" namespace paddle_mobile { namespace operators { template void ConvOp::InferShape() const { - // std::cout << " begin get dims: " << std::endl; - - auto in_dims = param_.Input()->dims(); - - // std::cout << " end get in dims: " << std::endl; - - // std::cout << " in_dims: " << in_dims << std::endl; - - // std::cout << " begin get Filter " << std::endl; - - auto filter_dims = param_.Filter()->dims(); - - // std::cout << " end get Filter " << std::endl; - - // std::cout << " begin get Attrs " << std::endl; - - const std::vector &strides = param_.Strides(); - - // std::cout << " end get Attrs " << strides[0] << std::endl; - - std::vector paddings = param_.Paddings(); - - int groups = param_.Groups(); - - std::vector dilations = param_.Dilations(); + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector &strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector dilations = this->param_.Dilations(); PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && dilations.size() == paddings.size() && @@ -56,13 +39,13 @@ void ConvOp::InferShape() const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); } framework::DDim ddim = framework::make_ddim(output_shape); - param_.Output()->Resize(ddim); + this->param_.Output()->Resize(ddim); } template class ConvOp; @@ -71,5 +54,17 @@ template class ConvOp; } // namespace paddle_mobile 
namespace ops = paddle_mobile::operators; -USE_OP(conv2d); -REGISTER_OPERATOR(conv2d, ops::ConvOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(conv2d); +REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(conv2d); +REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +USE_OP_FPGA(conv2d); +REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp); +#endif + +#endif diff --git a/src/operators/conv_op.h b/src/operators/conv_op.h index f15f286b606db1403b0e0e609bfc38caac2c5105..f8e8952d47fd726c712c0f7817606d959095b65b 100644 --- a/src/operators/conv_op.h +++ b/src/operators/conv_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef CONV_OP + #pragma once #include @@ -22,34 +24,26 @@ namespace paddle_mobile { namespace operators { using std::string; template -class ConvOp : public framework::OperatorWithKernel { +class ConvOp + : public framework::OperatorWithKernel< + DeviceType, ConvParam, operators::ConvKernel> { public: ConvOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel::OperatorWithKernel; + using framework::OperatorWithKernel< + DeviceType, ConvParam, + operators::ConvKernel>::OperatorWithKernel; void InferShape() const override; - void RunImpl() const { - operators::ConvKernel kernel; - kernel.Compute(param_); - this->ClearVariables({"Filter", "Input"}); - } - private: - ConvParam param_; }; -inline int ConvOutputSize(int input_size, int filter_size, int dilation, - int padding, int stride) { - const int dkernel = dilation 
* (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; - return output_size; -} - } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/depthwise_conv_op.cpp b/src/operators/depthwise_conv_op.cpp index 2538298175c5ea40d7e44338caee853a73c089c4..46f2db30ba2fbff5839d6a737dda12fa6cd10b43 100644 --- a/src/operators/depthwise_conv_op.cpp +++ b/src/operators/depthwise_conv_op.cpp @@ -12,24 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef DEPTHWISECONV_OP + #include "operators/depthwise_conv_op.h" #include -#include "framework/data_type.h" #include "framework/op_proto_maker.h" #include "framework/op_registry.h" #include "operators/conv_op.h" +#include "operators/math/conv_func.h" namespace paddle_mobile { namespace operators { template void DepthwiseConvOp::InferShape() const { - auto in_dims = param_.Input()->dims(); - auto filter_dims = param_.Filter()->dims(); - const std::vector &strides = param_.Strides(); - std::vector paddings = param_.Paddings(); - int groups = param_.Groups(); - std::vector dilations = param_.Dilations(); + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector &strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector dilations = this->param_.Dilations(); PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && dilations.size() == paddings.size() && @@ -38,13 +40,13 @@ void DepthwiseConvOp::InferShape() const { std::vector output_shape({in_dims[0], filter_dims[0]}); for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); + output_shape.push_back( + 
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); } framework::DDim ddim = framework::make_ddim(output_shape); - param_.Output()->Resize(ddim); + this->param_.Output()->Resize(ddim); } template class DepthwiseConvOp; @@ -53,5 +55,13 @@ template class DepthwiseConvOp; } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(depthwise_conv2d); -REGISTER_OPERATOR(depthwise_conv2d, ops::DepthwiseConvOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(depthwise_conv2d); +REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/depthwise_conv_op.h b/src/operators/depthwise_conv_op.h index c47fa0ffcacd54a5ddf7280419ca1170173bde1b..75bcf44cb8790365e7f33719c481354c1a57c80a 100644 --- a/src/operators/depthwise_conv_op.h +++ b/src/operators/depthwise_conv_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef DEPTHWISECONV_OP + #pragma once #include @@ -22,28 +24,28 @@ namespace paddle_mobile { namespace operators { template -class DepthwiseConvOp : public framework::OperatorWithKernel { +class DepthwiseConvOp : public framework::OperatorWithKernel< + DeviceType, ConvParam, + operators::DepthwiseConvKernel> { public: DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - - using framework::OperatorWithKernel::OperatorWithKernel; + : framework::OperatorWithKernel< + DeviceType, ConvParam, + operators::DepthwiseConvKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, ConvParam, + operators::DepthwiseConvKernel>::OperatorWithKernel; void InferShape() const override; - void RunImpl() const { - operators::DepthwiseConvKernel kernel; - kernel.Compute(param_); - this->ClearVariables({"Filter", "Input"}); - } - private: - ConvParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/elementwise_add_op.cpp b/src/operators/elementwise_add_op.cpp index 1eff80152bfb193fc8cd3866d63b1ae4d55f4b9c..12c59da6452992e3dd73b985db685a651df02250 100644 --- a/src/operators/elementwise_add_op.cpp +++ b/src/operators/elementwise_add_op.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef ELEMENTWISEADD_OP + #include "elementwise_add_op.h" namespace paddle_mobile { @@ -19,13 +21,23 @@ namespace operators { template void ElementwiseAddOp::InferShape() const { - auto x_dim = param_.InputX()->dims(); - param_.Out()->Resize(x_dim); + auto x_dim = this->param_.InputX()->dims(); + this->param_.Out()->Resize(x_dim); } template class ElementwiseAddOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(elementwise_add); -REGISTER_OPERATOR(elementwise_add, ops::ElementwiseAddOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(elementwise_add); +REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(elementwise_add); +REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/elementwise_add_op.h b/src/operators/elementwise_add_op.h index 7dd7e147a0630450c3ad9f830d661b2b92a5f995..6cb80d06d0a4d66935c77a3c23a6264d0be53ecc 100644 --- a/src/operators/elementwise_add_op.h +++ b/src/operators/elementwise_add_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef ELEMENTWISEADD_OP + #pragma once #include @@ -23,26 +25,27 @@ namespace paddle_mobile { namespace operators { using std::string; template -class ElementwiseAddOp : public framework::OperatorWithKernel { +class ElementwiseAddOp : public framework::OperatorWithKernel< + DeviceType, ElementwiseAddParam, + operators::ElementwiseAddKernel> { public: ElementwiseAddOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap attrs, + const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - - void RunImpl() const { - operators::ElementwiseAddKernel kernel; - kernel.Compute(param_); - } - - using framework::OperatorWithKernel::OperatorWithKernel; + : framework::OperatorWithKernel< + DeviceType, ElementwiseAddParam, + operators::ElementwiseAddKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, ElementwiseAddParam, + operators::ElementwiseAddKernel>::OperatorWithKernel; void InferShape() const override; protected: - ElementwiseAddParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 25a82894ea96420e94d9d2e4d70809930a954642..bd5fd8cb32d484b7f76652139603f6b0f1b4b5d7 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -32,6 +32,8 @@ class FeedOp : public framework::OperatorBase { param_(inputs, outputs, attrs, *scope) {} void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } + void Init() const {} + void InferShape() const { auto out_dims = param_.Out()->dims(); out_dims[0] = param_.BatchSize(); @@ -43,8 +45,16 @@ class FeedOp : public framework::OperatorBase { }; namespace ops = paddle_mobile::operators; -USE_OP(feed); -REGISTER_OPERATOR(feed, ops::FeedOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(feed); 
+REGISTER_OPERATOR_CPU(feed, ops::FeedOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(feed); +REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/fetch_op.h b/src/operators/fetch_op.h index 31e17f2b562567de1b4194098995f6ee4cd3caa3..4b3680b58357d8295b1b6acf111d3573d4e4d1bd 100644 --- a/src/operators/fetch_op.h +++ b/src/operators/fetch_op.h @@ -33,6 +33,8 @@ class FetchOp : public framework::OperatorBase { param_(inputs, outputs, attrs, *scope) {} void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); } + void Init() const {} + void InferShape() const { auto x_dims = param_.InputX()->dims(); param_.Out()->Resize(x_dims); @@ -43,8 +45,16 @@ class FetchOp : public framework::OperatorBase { }; namespace ops = paddle_mobile::operators; -USE_OP(fetch); -REGISTER_OPERATOR(fetch, ops::FetchOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fetch); +REGISTER_OPERATOR_CPU(fetch, ops::FetchOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(fetch); +REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/fusion_conv_add.cpp b/src/operators/fusion_conv_add.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4c01603509b0a1d9da2c2dc31a38719d5117e05c --- /dev/null +++ b/src/operators/fusion_conv_add.cpp @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADD_OP + +#include "operators/fusion_conv_add.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template +void FusionConvAddOp::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector &strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} +template class FusionConvAddOp; +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(conv_add); +REGISTER_OPERATOR_CPU(conv_add, ops::FusionConvAddOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(conv_add); +REGISTER_OPERATOR_MALI_GPU(conv_add, ops::FusionConvAddOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/fusion_conv_add.h b/src/operators/fusion_conv_add.h new file mode 100644 index 
0000000000000000000000000000000000000000..24f1d3f63b3300db9b60a595466a0ced3b9e996b --- /dev/null +++ b/src/operators/fusion_conv_add.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#define FUSION_CONVADD_OP +#ifdef FUSION_CONVADD_OP + +#pragma once + +#include +#include +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "op_param.h" +#include "operators/kernel/conv_add_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionConvAddMatcher : public framework::FusionOpMatcher { + public: + FusionConvAddMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + vector> origin_descs = + node->OpDescs(node_.Depth()); + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes); + } + + std::string Type() { return G_OP_TYPE_CONV_ADD; } +}; + +template +class FusionConvAddOp : public framework::OperatorWithKernel< + DeviceType, FusionConvAddParam, + operators::ConvAddKernel> { + public: + FusionConvAddOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, 
scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionConvAddParam, + operators::ConvAddKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +#ifdef PADDLE_MOBILE_CPU +#ifndef CONV_ADD_REGISTER +static framework::FusionOpRegistrar convadd_registrar( + new FusionConvAddMatcher()); +#define CONV_ADD_REGISTER +#endif +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU + +#ifndef CONV_ADD_REGISTER +static framework::FusionOpRegistrar convadd_registrar( + new FusionConvAddMatcher()); +#define CONV_ADD_REGISTER +#endif + +#endif + +#ifdef PADDLE_MOBILE_FPGA +#endif + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp index 92f6fcf848f169eed141b1456c05e6fbd8ca9895..694e46af1f8dec3513c5a6d2ff26e3676e9204e4 100644 --- a/src/operators/fusion_conv_add_relu_op.cpp +++ b/src/operators/fusion_conv_add_relu_op.cpp @@ -12,4 +12,49 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef CONVADDRELU_OP + #include "fusion_conv_add_relu_op.h" +#include "operators/math/conv_func.h" + +namespace paddle_mobile { +namespace operators { + +template +void FusionConvAddReluOp::InferShape() const { + auto in_dims = this->param_.Input()->dims(); + auto filter_dims = this->param_.Filter()->dims(); + const std::vector &strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + int groups = this->param_.Groups(); + std::vector dilations = this->param_.Dilations(); + + PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() && + dilations.size() == paddings.size() && + paddings.size() == strides.size()), + "ConvParam is not suitable"); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back( + math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i], + paddings[i], strides[i])); + } + framework::DDim ddim = framework::make_ddim(output_shape); + this->param_.Output()->Resize(ddim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fusion_conv_add_relu); +REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h index e93c910d2b3132fb1894043a7c6aa3c8593dbb20..fd27005c8bef8f8cb91fbf5b6e5a852306c28a9b 100644 --- a/src/operators/fusion_conv_add_relu_op.h +++ b/src/operators/fusion_conv_add_relu_op.h @@ -12,38 +12,72 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef CONVADDRELU_OP + #pragma once #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/conv_add_relu_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { -class FushionConvAddReluOpMatcher : public framework::FusionOpMatcher { +class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher { public: - FushionConvAddReluOpMatcher() { + FusionConvAddReluOpMatcher() { node_ = framework::Node(G_OP_TYPE_CONV); node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > std::make_shared(G_OP_TYPE_RELU); } - void FolderNodes(framework::Node *node) { - std::vector> origin_descs = - node->OpDescs(node_.Depth()); + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}); + {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes); } std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; } }; -class FusionFcOp { +template +class FusionConvAddReluOp : public framework::OperatorWithKernel< + DeviceType, FusionConvAddReluParam, + operators::ConvAddReluKernel> { public: - private: + FusionConvAddReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionConvAddReluParam, + operators::ConvAddReluKernel>(type, inputs, outputs, + attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, FusionConvAddReluParam, + operators::ConvAddReluKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: }; -// static framework::FusionOpRegistrar fc_registrar( -// new FushionConvAddReluOpMatcher()); +#ifdef PADDLE_MOBILE_CPU + +#ifndef CONV_ADD_RELU_REGISTER +#define CONV_ADD_RELU_REGISTER +// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new +// 
FusionConvAddReluOpMatcher()); +#endif + +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/fusion_fc_op.cpp b/src/operators/fusion_fc_op.cpp index 0f1be5c29fee1f741b773bbfa11b50b5aa49b8b7..fae561348899dadc4c25f84ec3a0993d9ae693f9 100644 --- a/src/operators/fusion_fc_op.cpp +++ b/src/operators/fusion_fc_op.cpp @@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef FUSION_FC_OP + #include "operators/fusion_fc_op.h" namespace paddle_mobile { namespace operators { template -void FushionFcOp::InferShape() const { - auto x_dims = param_.InputX()->dims(); - auto y_dims = param_.InputY()->dims(); - int x_num_col_dims = param_.XNumColDims(); - int y_num_col_dims = param_.YNumColDims(); +void FusionFcOp::InferShape() const { + auto x_dims = this->param_.InputX()->dims(); + auto y_dims = this->param_.InputY()->dims(); + int x_num_col_dims = this->param_.XNumColDims(); + int y_num_col_dims = this->param_.YNumColDims(); assert(x_dims.size() > x_num_col_dims); assert(y_dims.size() > y_num_col_dims); @@ -45,12 +47,22 @@ void FushionFcOp::InferShape() const { } framework::DDim ddim = framework::make_ddim(output_dims); - param_.Out()->Resize(ddim); + this->param_.Out()->Resize(ddim); } -template class FushionFcOp; +template class FusionFcOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(fc); -REGISTER_OPERATOR(fc, ops::FushionFcOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(fc); +REGISTER_OPERATOR_CPU(fc, ops::FusionFcOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(fc); +REGISTER_OPERATOR_MALI_GPU(fc, ops::FusionFcOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/fusion_fc_op.h b/src/operators/fusion_fc_op.h index 
9019ef4d49641414682639b7a27cf93a20e43cf4..0ca4d2b27ad46b77ddba55b6b377e741c97bdc9e 100644 --- a/src/operators/fusion_fc_op.h +++ b/src/operators/fusion_fc_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef FUSION_FC_OP + #pragma once #include @@ -19,7 +21,7 @@ limitations under the License. */ #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/fushion_fc_kernel.h" +#include "operators/kernel/fusion_fc_kernel.h" namespace paddle_mobile { namespace operators { @@ -32,40 +34,55 @@ class FusionFcMatcher : public framework::FusionOpMatcher { node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD); } - void FolderNodes(framework::Node *node) { - vector> origin_descs = - node->OpDescs(node_.Depth()); + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { node->Folder(node_.Depth(), Type(), - {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}); + {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}, removed_nodes); } std::string Type() { return G_OP_TYPE_FC; } }; template -class FushionFcOp : public framework::OperatorWithKernel { +class FusionFcOp + : public framework::OperatorWithKernel< + DeviceType, FusionFcParam, operators::FusionFcKernel> { public: - FushionFcOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, - const framework::AttributeMap attrs, - std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - - void RunImpl() const { - operators::FushionFcKernel kernel; - kernel.Compute(param_); - } + FusionFcOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel>( + type, inputs, outputs, 
attrs, scope) {} - using framework::OperatorWithKernel::OperatorWithKernel; + using framework::OperatorWithKernel< + DeviceType, FusionFcParam, + operators::FusionFcKernel>::OperatorWithKernel; void InferShape() const override; protected: - FushionFcParam param_; }; +#ifdef PADDLE_MOBILE_CPU +#ifndef CONV_CPU_REGISTER +#define CONV_CPU_REGISTER +static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); +#endif +#endif + +#ifdef PADDLE_MOBILE_MALI_GPU +#ifndef CONV_CPU_REGISTER +#define CONV_CPU_REGISTER static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); +#endif +#endif + +#ifdef PADDLE_MOBILE_FPGA +#endif } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/batchnorm_kernel.cpp b/src/operators/kernel/arm/batchnorm_kernel.cpp index e28bdd7147f300cb181ffc5e0aeebec412ec45e7..964bf71f451e2ca48d3742ed5151e9784c516d5c 100644 --- a/src/operators/kernel/arm/batchnorm_kernel.cpp +++ b/src/operators/kernel/arm/batchnorm_kernel.cpp @@ -12,82 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +#ifdef BATCHNORM_OP #include "operators/kernel/batchnorm_kernel.h" +#include "operators/kernel/central-arm-func/batchnorm_arm_func.h" namespace paddle_mobile { namespace operators { template <> -void BatchNormKernel::Compute(const BatchNormParam ¶m) const { - /// todo: test. 
- const Tensor *input_x = param.InputX(); - auto input_x_ptr = input_x->data(); - const auto &x_dims = input_x->dims(); - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; - const int stride0 = C * H * W; - const int stride1 = H * W; - const int stride2 = W; - Tensor *out = param.OutputY(); - auto out_ptr = out->mutable_data(); - const float epsilon = param.Epsilon(); - const Tensor *mean = param.InputMean(); - const Tensor *variance = param.InputVariance(); - const Tensor *scale = param.InputScale(); - const Tensor *bias = param.InputBias(); - auto mean_ptr = mean->data(); - auto variance_ptr = variance->data(); - auto scale_ptr = scale->data(); - auto bias_ptr = bias->data(); - - Tensor inv_std; - auto inv_std_ptr = inv_std.mutable_data(make_ddim({C})); - if (C != variance->numel()) { - std::cout << "C must equal to variance.numel()" << std::endl; - } - assert(C == variance->numel()); - - /// std = (var + epsilon).sqrt(); - /// inv_std = 1 / std; - for (int i = 0; i < C; i++) { - inv_std_ptr[i] = - 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); - } - - Tensor new_scale; - auto new_scale_ptr = new_scale.mutable_data(make_ddim({C})); - Tensor new_bias; - auto new_bias_ptr = new_bias.mutable_data(make_ddim({C})); +bool BatchNormKernel::Init(const BatchNormParam ¶) const { + return true; +} - /// ((x - est_mean) * (inv_var) * scale + bias equal to - /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - for (int i = 0; i < C; i++) { - new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; - new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; - { - for (int n = 0; n < N; n++) { - for (int h = 0; h < H; h++) { - int tmp_index = n * stride0 + i * stride1 + h * stride2; - for (int w = 0; w < W; w++) { - int index = tmp_index + w; - out_ptr[index] = - input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i]; - } - } - } - } - } - DLOG << "input[2,5,1,0](input[102]) ,channel 5 
:"; - DLOG << "input_x_ptr : " << input_x_ptr[102]; - DLOG << "variance : " << variance_ptr[5]; - DLOG << "inv_std_ptr : " << inv_std_ptr[5]; - DLOG << "new_scale_ptr : " << new_scale_ptr[5]; - DLOG << "new_bias_ptr : " << new_bias_ptr[5]; - DLOG << "out_ptr : " << out_ptr[102]; +template <> +void BatchNormKernel::Compute(const BatchNormParam ¶m) const { + BatchnormCompute(param); } + } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/box_coder_kernel.cpp b/src/operators/kernel/arm/box_coder_kernel.cpp index d604c3d2a8d7f7fb1c817397a61cb156f1d0f392..df0a75f357658736ede4265a6cc57db30afee1d4 100644 --- a/src/operators/kernel/arm/box_coder_kernel.cpp +++ b/src/operators/kernel/arm/box_coder_kernel.cpp @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +#ifdef BOXCODER_OP #include "operators/kernel/box_coder_kernel.h" +#include namespace paddle_mobile { namespace operators { @@ -109,6 +110,11 @@ void DecodeCenterSize(const framework::Tensor& target_box, } } +template <> +bool BoxCoderKernel::Init(const BoxCoderParam& para) const { + return true; +} + template <> void BoxCoderKernel::Compute(const BoxCoderParam& param) const { const auto* input_priorbox = param.InputPriorBox(); @@ -135,3 +141,5 @@ void BoxCoderKernel::Compute(const BoxCoderParam& param) const { } } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/concat_kernel.cpp b/src/operators/kernel/arm/concat_kernel.cpp index 705b698dbe9e9768713417f85ae2879df66acf9e..0312047b8e8af1eb9dad57c751e392e8a5054878 100644 --- a/src/operators/kernel/arm/concat_kernel.cpp +++ b/src/operators/kernel/arm/concat_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +#ifdef CONCAT_OP #include "operators/kernel/concat_kernel.h" @@ -52,6 +52,11 @@ class ConcatFunctor { } }; +template <> +bool ConcatKernel::Init(const ConcatParam ¶) const { + return true; +} + template <> void ConcatKernel::Compute(const ConcatParam ¶m) const { auto inputs = param.Inputs(); @@ -85,3 +90,5 @@ void ConcatKernel::Compute(const ConcatParam ¶m) const { } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/conv_add_kernel.cpp b/src/operators/kernel/arm/conv_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4bde8289007415dccbc7a630c7646ac718087c55 --- /dev/null +++ b/src/operators/kernel/arm/conv_add_kernel.cpp @@ -0,0 +1,120 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_CONVADD_OP + +#include "operators/kernel/conv_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddKernel::Init(const FusionConvAddParam ¶) const { + return true; +} + +template <> +void ConvAddKernel::Compute(const FusionConvAddParam ¶m) const { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + int axis = param.Axis(); + Tensor *output = param.Output(); + math::expand_bias(bias, axis, output->dims()); + output->ShareDataWith(bias); + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + 
// convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(1)); + } + } +} +template class ConvAddKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/framework/program/var_desc.cpp b/src/operators/kernel/arm/conv_add_relu_kernel.cpp similarity index 55% rename from src/framework/program/var_desc.cpp rename to src/operators/kernel/arm/conv_add_relu_kernel.cpp index e54ae67b55c15540a0232dc6fdd97e70ae721ddb..d3c04179b37014adc6c81f32dd6c08f697283671 100644 --- a/src/framework/program/var_desc.cpp +++ b/src/operators/kernel/arm/conv_add_relu_kernel.cpp @@ -12,9 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "var_desc.h" +#ifdef FUSION_CONVADD_RELU_OP + +#include "operators/kernel/conv_add_relu_kernel.h" +#include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h" namespace paddle_mobile { +namespace operators { + +template <> +bool ConvAddReluKernel::Init( + const FusionConvAddReluParam ¶) const { + return true; +} -namespace framework {} // namespace framework +template <> +void ConvAddReluKernel::Compute( + const FusionConvAddReluParam ¶m) const { + ConvAddReluCompute(param); +} +template class ConvAddReluKernel; + +} // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index f04b8156c9d3c88520b1c74b60a20f41e7fedc98..049425d88f96a322a0b4cb47c18d85f2df03d577 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -12,103 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef CONV_OP + #include "operators/kernel/conv_kernel.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" namespace paddle_mobile { namespace operators { template <> -void ConvKernel::Compute(const ConvParam ¶m) const { - LOG(kLOG_DEBUG) << param; - - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor *output = param.Output(); - output->mutable_data(); - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - // DLOG << " compute end get Attrs " << strides[0]; - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - - std::vector output_shape_vec(framework::vectorize(output->dims())); - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - DLOG << " filter.dims() = " << filter.dims(); - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - 
int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } +bool ConvKernel::Init(const ConvParam ¶) const { + return true; +} - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0)); - } - } +template <> +void ConvKernel::Compute(const ConvParam ¶m) const { + ConvCompute(param); } template class ConvKernel; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/kernel/arm/depthwise_conv_kernel.cpp index 1da52fa8d469bd81d043843d7bcca3a7b01f6663..4cbfa23248e87e2bf3a8d97330fa19f92985a9d0 100644 --- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp +++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp @@ -12,115 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef DEPTHWISECONV_OP + #include "operators/kernel/depthwise_conv_kernel.h" -#include "operators/kernel/conv_kernel.h" +#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h" namespace paddle_mobile { namespace operators { template <> -void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { - LOG(kLOG_DEBUG) << param; - - const Tensor *input = param.Input(); - Tensor filter = *param.Filter(); - Tensor *output = param.Output(); - output->mutable_data(); - - int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); - - // DLOG << " compute end get Attrs " << strides[0]; - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(framework::vectorize(filter.dims())); - std::vector output_shape_vec(framework::vectorize(output->dims())); - - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = input->dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(framework::make_ddim(col_shape_vec)); - - framework::DDim col_matrix_shape = - framework::flatten_to_2d(col_shape, data_dim + 1); - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col.mutable_data(col_shape); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - // DLOG << " col_shape = " << col_shape; - // DLOG << " col_matrix_shape = " << col_matrix_shape; - - framework::DDim input_shape = framework::slice_ddim( - input->dims(), 1, static_cast(input->dims().size())); - // DLOG << " input_shape = " << input_shape; - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); 
- // DLOG << " filter.dims() = " << filter.dims(); - - framework::DDim output_matrix_shape = { - output->dims()[1], - output->numel() / (output->dims()[0] * output->dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(input->dims()[1]) / groups; - int out_step = static_cast(output->dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - // DLOG << " in_batch.dims() = " << in_batch.dims(); - // DLOG << " out_batch.dims() = " << out_batch.dims(); - - for (int g = 0; g < groups; g++) { - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(in_slice, dilations, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, - &col); - } else if (data_dim == 3U) { - // vol2col - vol2col(in_slice, dilations, strides, paddings, &col); - } +bool DepthwiseConvKernel::Init(const ConvParam ¶) const { + return true; +} - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - // DLOG << " out_slice " << out_slice.dims(); - // DLOG << " filter_slice " << filter_slice.dims(); - // DLOG << " col_matrix " << col_matrix.dims(); - math::matmul(filter_slice, false, col_matrix, false, - static_cast(1), &out_slice, - static_cast(0)); - auto filter_ptr = filter_slice.data(); - } - } +template <> +void DepthwiseConvKernel::Compute(const ConvParam ¶m) const { + DepthwiseConvCompute(param); } template class DepthwiseConvKernel; } // namespace operators } // namespace paddle_mobile + +#endif diff --git 
a/src/operators/kernel/arm/elementwise_add_kernel.cpp b/src/operators/kernel/arm/elementwise_add_kernel.cpp index f8d40ad17ff09d77c26a9f32a87190f1cdd6038a..2f5e26a37e4f2c1d370805ee7b565a60f4748b0a 100644 --- a/src/operators/kernel/arm/elementwise_add_kernel.cpp +++ b/src/operators/kernel/arm/elementwise_add_kernel.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef ELEMENTWISEADD_OP + #pragma once #include "operators/kernel/elementwise_add_kernel.h" @@ -24,6 +26,12 @@ struct AddFunctor { inline T operator()(T a, T b) const { return a + b; } }; +template <> +bool ElementwiseAddKernel::Init( + const ElementwiseAddParam ¶) const { + return true; +} + template <> void ElementwiseAddKernel::Compute( const ElementwiseAddParam ¶m) const { @@ -40,3 +48,5 @@ template class ElementwiseAddKernel; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/fushion_fc_kernel.cpp b/src/operators/kernel/arm/fusion_fc_kernel.cpp similarity index 90% rename from src/operators/kernel/arm/fushion_fc_kernel.cpp rename to src/operators/kernel/arm/fusion_fc_kernel.cpp index ebec90aa27154334488329d079b76d14630e3294..5fac70e40781593669abd15b8f28ff6272f7133c 100644 --- a/src/operators/kernel/arm/fushion_fc_kernel.cpp +++ b/src/operators/kernel/arm/fusion_fc_kernel.cpp @@ -12,15 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef FUSION_FC_OP + #pragma once -#include "operators/kernel/fushion_fc_kernel.h" +#include "operators/kernel/fusion_fc_kernel.h" namespace paddle_mobile { namespace operators { template <> -void FushionFcKernel::Compute(const FushionFcParam ¶m) const { +bool FusionFcKernel::Init(const FusionFcParam ¶) const { + return true; +} + +template <> +void FusionFcKernel::Compute(const FusionFcParam ¶m) const { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); const Tensor *input_z = param.InputZ(); @@ -65,3 +72,5 @@ void FushionFcKernel::Compute(const FushionFcParam ¶m) const { } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/lrn_kernel.cpp b/src/operators/kernel/arm/lrn_kernel.cpp index 47e64d487d72eb191e6b0ec8751c877363dd7b48..839c5ee95bd4d1e9d3fd80af3df0f8a45797434e 100644 --- a/src/operators/kernel/arm/lrn_kernel.cpp +++ b/src/operators/kernel/arm/lrn_kernel.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef LRN_OP + #pragma once #include "operators/kernel/lrn_kernel.h" @@ -19,17 +21,23 @@ limitations under the License. 
*/ namespace paddle_mobile { namespace operators { +template <> +bool LrnKernel::Init(const LrnParam ¶) const { + return true; +} + template <> void LrnKernel::Compute(const LrnParam ¶m) const { const Tensor *input_x = param.InputX(); auto x_dims = input_x->dims(); + Tensor *out = param.Out(); + out->mutable_data(); /// data_format = NCHW const int N = x_dims[0]; const int C = x_dims[1]; const int H = x_dims[2]; const int W = x_dims[3]; - Tensor *out = param.Out(); - out->mutable_data(); + const int n = param.N(); const float alpha = param.Alpha(); const float beta = param.Beta(); @@ -42,3 +50,5 @@ template class LrnKernel; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/mul_kernel.cpp b/src/operators/kernel/arm/mul_kernel.cpp index f1eea3950cebe8d4c27b3481bf527e75f26c99aa..b3bb2b8075fdf306d47640c2bee3f2fc00ef0bc0 100644 --- a/src/operators/kernel/arm/mul_kernel.cpp +++ b/src/operators/kernel/arm/mul_kernel.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef MUL_OP + #pragma once #include "operators/kernel/mul_kernel.h" @@ -19,6 +21,11 @@ limitations under the License. 
*/ namespace paddle_mobile { namespace operators { +template <> +bool MulKernel::Init(const MulParam ¶) const { + return true; +} + template <> void MulKernel::Compute(const MulParam ¶m) const { const Tensor *input_x = param.InputX(); @@ -48,3 +55,5 @@ template class MulKernel; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/multiclass_nms_kernel.cpp b/src/operators/kernel/arm/multiclass_nms_kernel.cpp index 61470ee31936f092e2f534c5534c1c78aaf5d44c..67cf8197ca4c3113fc4fde3d493d6ed209221b59 100644 --- a/src/operators/kernel/arm/multiclass_nms_kernel.cpp +++ b/src/operators/kernel/arm/multiclass_nms_kernel.cpp @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +#ifdef MULTICLASSNMS_OP #include "operators/kernel/multiclass_nms_kernel.h" - +#include namespace paddle_mobile { namespace operators { @@ -203,6 +203,12 @@ void MultiClassOutput(const Tensor& scores, const Tensor& bboxes, } } +template <> +bool MultiClassNMSKernel::Init( + const MultiClassNMSParam& para) const { + return true; +} + template <> void MultiClassNMSKernel::Compute( const MultiClassNMSParam& param) const { @@ -273,3 +279,5 @@ void MultiClassNMSKernel::Compute( } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/pool_kernel.cpp b/src/operators/kernel/arm/pool_kernel.cpp index 6aa1b76058fdf8a9828321a23f26b1c17134d7c9..09162a13a4d0c59220cc25a02d06369c3f21ed32 100644 --- a/src/operators/kernel/arm/pool_kernel.cpp +++ b/src/operators/kernel/arm/pool_kernel.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef POOL_OP + #include #include "common/log.h" @@ -33,6 +35,11 @@ inline void PoolBasic(std::string pooling_type, std::vector ksize, } } +template <> +bool PoolKernel::Init(const PoolParam ¶) const { + return true; +} + template <> void PoolKernel::Compute(const PoolParam ¶m) const { const Tensor *in_x = param.Input(); @@ -54,22 +61,25 @@ void PoolKernel::Compute(const PoolParam ¶m) const { paddings[i] = 0; ksize[i] = static_cast(in_x->dims()[i + 2]); } - } + } else if (ksize[0] == 3 && ksize[0] == ksize[1]) { + if (pooling_type == "max") { + math::Pool3x3Max(strides, paddings, in_x, out); + } else if (pooling_type == "avg") { + math::Pool3x3Avg(strides, paddings, in_x, out); + } + + } else if (ksize[0] == 2 && ksize[0] == ksize[1]) { + if (pooling_type == "max") { + math::Pool2x2Max(strides, paddings, in_x, out); + } else if (pooling_type == "avg") { + math::Pool2x2Avg(strides, paddings, in_x, out); + } - PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); - - // if (param.isGlobalPooling() || ksize[0] != ksize[1] || - // strides[0] != strides[1] || strides[1] != 2 || - // paddings[0] != paddings[1] || paddings[1] > 1) { - // PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); - // - // } else if (ksize[0] == 2) { - // - // } else if (ksize[0] == 3) { - // - // } else { - // PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); - // } + } else { + PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); + } } } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/prior_box_kernel.cpp b/src/operators/kernel/arm/prior_box_kernel.cpp index fc61f43f3fe363c1f6d67f81ef37fb2d950f9717..13939bc7bf27904405677560f17d2e0b85748310 100644 --- a/src/operators/kernel/arm/prior_box_kernel.cpp +++ b/src/operators/kernel/arm/prior_box_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +#ifdef PRIORBOX_OP #include "operators/kernel/prior_box_kernel.h" @@ -26,6 +26,11 @@ struct ClipFunctor { } }; +template <> +bool PriorBoxKernel::Init(const PriorBoxParam ¶) const { + return true; +} + template <> void PriorBoxKernel::Compute(const PriorBoxParam ¶m) const { const auto *input_ = param.Input(); @@ -143,3 +148,5 @@ void PriorBoxKernel::Compute(const PriorBoxParam ¶m) const { } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/relu_kernel.cpp b/src/operators/kernel/arm/relu_kernel.cpp index 586d981175184e2da03f2949390932b888d67f4a..5bc485b77a8fac9379adbd1a3bd4d406e5a82fcb 100644 --- a/src/operators/kernel/arm/relu_kernel.cpp +++ b/src/operators/kernel/arm/relu_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +#ifdef RELU_OP #include "operators/kernel/relu_kernel.h" #include @@ -25,6 +25,11 @@ struct ReluFunctor { inline T operator()(T in) const { return in > 0 ? in : 0; } }; +template <> +bool ReluKernel::Init(const ReluParam ¶) const { + return true; +} + /* * @b 特化到具体平台的实现, param 从 op 层传入 * */ @@ -35,13 +40,74 @@ void ReluKernel::Compute(const ReluParam ¶m) const { auto *out = param.Out(); auto *out_ptr = out->mutable_data(); + int numel = input_x->numel(); + // if (numel > 64) { + // asm volatile( + // "pld [%[input_x_ptr], #0] \n\t" + // "vmov.f32 q8, #0.0 \n\t" + // "subs %[num], %[num], #32 \n\t" + // "blt end_num_%= \n\t" + // "loop_num_%=: \n\t" + // "pld [%[input_x_ptr], #1024] \n\t" + // + // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" + // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" + // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" + // "vld1.32 {q6, q7}, [%[input_x_ptr]]! 
\n\t" + // + // "vmax.f32 q0, q0, q8 \n\t" + // "vmax.f32 q1, q1, q8 \n\t" + // "vmax.f32 q2, q2, q8 \n\t" + // "vmax.f32 q3, q3, q8 \n\t" + // "vmax.f32 q4, q4, q8 \n\t" + // "vmax.f32 q5, q5, q8 \n\t" + // "vmax.f32 q6, q6, q8 \n\t" + // "vmax.f32 q7, q7, q8 \n\t" + // + // "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" + // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" + // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" + // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" + // + // "subs %[num], %[num], #32 \n\t" + // "bge loop_num_%= \n\t" + // "end_num_%=: \n\t" + // "cmp %[num], #0 \n\t" + // "bge end_%= \n\t" + // "mov r6, #4 \n\t" + // "mul r5, %[num], r6 \n\t" + // "add %[input_x_ptr], %[input_x_ptr], r5 \n\t" + // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" + // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" + // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" + // "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" + // "vmax.f32 q0, q0, q8 \n\t" + // "vmax.f32 q1, q1, q8 \n\t" + // "vmax.f32 q2, q2, q8 \n\t" + // "vmax.f32 q3, q3, q8 \n\t" + // "vmax.f32 q4, q4, q8 \n\t" + // "vmax.f32 q5, q5, q8 \n\t" + // "vmax.f32 q6, q6, q8 \n\t" + // "vmax.f32 q7, q7, q8 \n\t" + // "add %[out_ptr], %[out_ptr], r5 \n\t" + // "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" + // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" + // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" + // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" + // "end_%=: \n\t" + // : + // : + // [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] + // "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + // "q7", "q8", "r5", + // "r6"); + // } else { ReluFunctor func_; math::Transform trans; - trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_); - - // for (int i = 0; i < input_x->numel(); i++) { - // out_ptr[i] = input_x_ptr[i] > 0 ? 
input_x_ptr[i] : 0; + trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_); // } } } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/reshape_kernel.cpp b/src/operators/kernel/arm/reshape_kernel.cpp index 7f7e80ece9f30631c109d0d27f4025e2617cec95..97364f9a3f7ce9fe8da5814ad2a483f858938bbf 100644 --- a/src/operators/kernel/arm/reshape_kernel.cpp +++ b/src/operators/kernel/arm/reshape_kernel.cpp @@ -12,13 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +#ifdef RESHAPE_OP #include "operators/kernel/reshape_kernel.h" namespace paddle_mobile { namespace operators { +template <> +bool ReshapeKernel::Init(const ReshapeParam ¶) const { + return true; +} + template <> void ReshapeKernel::Compute(const ReshapeParam ¶m) const { const auto *input_x = param.InputX(); @@ -49,3 +54,5 @@ void ReshapeKernel::Compute(const ReshapeParam ¶m) const { } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/sigmoid_kernel.cpp b/src/operators/kernel/arm/sigmoid_kernel.cpp index 74bc29878019dfe52de94f6fef966a416e04cc72..3e87bfacc5335e52ecdcb0b917f5826b80449ef4 100644 --- a/src/operators/kernel/arm/sigmoid_kernel.cpp +++ b/src/operators/kernel/arm/sigmoid_kernel.cpp @@ -12,11 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef SIGMOID_OP + #include "../sigmoid_kernel.h" #if __ARM_NEON #include "../../math/math_func_neon.h" #endif - +#include namespace paddle_mobile { namespace operators { @@ -25,35 +27,23 @@ using framework::Tensor; void sigmoid(const Tensor *X, Tensor *Y) { #if __ARM_NEON - DLOG << "step1"; const float *input = X->data(); - DLOG << "step11"; - float *output = Y->mutable_data(); - DLOG << "step2"; - const DDim &dDim = X->dims(); - DLOG << "step3"; - int axis_index = 1; if (dDim.size() < 4) { axis_index = 0; } - DLOG << "step4"; - DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1); DDim inner_ddim = paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size()); - DLOG << "step5"; - int out_size = paddle_mobile::framework::product(outer_ddim); int inner_size = paddle_mobile::framework::product(inner_ddim); - DLOG << "step6"; -#pragma omp parallel for DLOG << "outsize=" << out_size; DLOG << "innersize=" << inner_size; + #pragma omp parallel for for (int i = 0; i < out_size; ++i) { const float *input_outer_ptr = input + i * inner_size; float *output_outer_ptr = output + i * inner_size; @@ -81,6 +71,11 @@ void sigmoid(const Tensor *X, Tensor *Y) { #endif } +template <> +bool SigmoidKernel::Init(const SigmoidParam ¶) const { + return true; +} + template <> void SigmoidKernel::Compute(const SigmoidParam ¶m) const { const Tensor *in_x = param.InputX(); @@ -93,3 +88,5 @@ void SigmoidKernel::Compute(const SigmoidParam ¶m) const { template class SigmoidKernel; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/softmax_kernel.cpp b/src/operators/kernel/arm/softmax_kernel.cpp index 0a50fc0a0136b66df4f55c10decc84a541b52dce..8e966aa0af9ac84b70b154b33bad7dad9e79121d 100644 --- a/src/operators/kernel/arm/softmax_kernel.cpp +++ b/src/operators/kernel/arm/softmax_kernel.cpp @@ -12,11 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#ifdef SOFTMAX_OP + #include "../softmax_kernel.h" #include "../../math/softmax.h" namespace paddle_mobile { namespace operators { +template <> +bool SoftmaxKernel::Init(const SoftmaxParam ¶) const { + return true; +} + template <> void SoftmaxKernel::Compute(const SoftmaxParam ¶m) const { const Tensor *in_x = param.InputX(); @@ -29,3 +36,5 @@ void SoftmaxKernel::Compute(const SoftmaxParam ¶m) const { template class SoftmaxKernel; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/transpose_kernel.cpp b/src/operators/kernel/arm/transpose_kernel.cpp index 92b5916ec40d53bb55c1cc4aaf0ce6ec9a9bfaeb..a44ff22a2f228cc357c066a01e142de7cc4f2083 100644 --- a/src/operators/kernel/arm/transpose_kernel.cpp +++ b/src/operators/kernel/arm/transpose_kernel.cpp @@ -11,28 +11,32 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#pragma once +#ifdef TRANSPOSE_OP #include "operators/kernel/transpose_kernel.h" - namespace paddle_mobile { namespace operators { -template -void TransposeFunc(const int numel, const T* input, const vector axis, - const vector old_strides, const vector new_strides, - T* output) { - for (int i = 0; i < numel; ++i) { - int old_idx = 0; - int idx = i; - for (int j = 0; j < axis.size(); ++j) { - int order = axis[j]; - old_idx += (idx / new_strides[j]) * old_strides[order]; - idx %= new_strides[j]; - } - output[i] = input[old_idx]; - } +// vector pos; +// template +// void TransposeFunc(const int numel, const T* input, const vector axis, +// const vector old_strides, const vector +// new_strides, T* output) { +// for (int i = 0; i < numel; ++i) { +// int old_idx = 0; +// int idx = i; +// for (int j = 0; j < axis.size(); ++j) { +// int order = axis[j]; +// old_idx += (idx / new_strides[j]) * old_strides[order]; +// idx %= new_strides[j]; +// } +// output[i] = input[old_idx]; +// } +// } + +template <> +bool TransposeKernel::Init(const TransposeParam& para) const { + return true; } template <> @@ -44,29 +48,41 @@ void TransposeKernel::Compute(const TransposeParam& param) const { const auto* input_x_data = input_x->data(); auto* out_data = out->mutable_data(); - size_t axis_size = axis.size(); - std::vector new_dims; - new_dims.reserve(axis_size); - for (auto c : axis) { - new_dims.push_back(input_x_dims[c]); + size_t ndim = axis.size(); + std::vector xdim(ndim); + std::vector xstride(ndim); + std::vector xout(ndim); + for (int i = 0; i < ndim; i++) { + int j = ndim - 1 - i; + xdim[j] = input_x_dims[axis[i]]; + xstride[j] = 1; + for (int k = axis[i] + 1; k < ndim; k++) { + xstride[j] *= input_x_dims[k]; + } + xout[j] = xstride[j] * xdim[j]; } - std::vector old_strides; - std::vector new_strides; - for (int i = 0; i < axis.size(); i++) { - int temp_old = 1; - int temp_new = 1; - for (int j = i + 1; j < axis.size(); j++) { - temp_old *= input_x_dims[j]; - temp_new *= 
new_dims[j]; + auto numel = input_x->numel(); + size_t pind = 0; + std::vector ind(ndim); + for (int i = 0; i < numel; i++) { + out_data[i] = input_x_data[pind]; + ind[0]++; + pind += xstride[0]; + for (int j = 0; j < ndim - 1; j++) { + if (ind[j] == xdim[j]) { + ind[j + 1]++; + ind[j] = 0; + pind += xstride[j + 1]; + pind -= xout[j]; + } else { + break; + } } - old_strides.push_back(temp_old); - new_strides.push_back(temp_new); } - - TransposeFunc(input_x->numel(), input_x_data, axis, old_strides, - new_strides, out_data); } } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/batchnorm_kernel.h b/src/operators/kernel/batchnorm_kernel.h index ebace43e1c559df1bf997d05f68db862d1ed3cb4..6ef5329bc58fea8bfc17d9115b7004fed2bc4ed7 100644 --- a/src/operators/kernel/batchnorm_kernel.h +++ b/src/operators/kernel/batchnorm_kernel.h @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef BATCHNORM_OP + +#pragma once + #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; namespace paddle_mobile { namespace operators { @@ -26,7 +29,10 @@ class BatchNormKernel : public framework::OpKernelBase { public: void Compute(const BatchNormParam ¶m) const; + bool Init(const BatchNormParam ¶) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/box_coder_kernel.h b/src/operators/kernel/box_coder_kernel.h index 2d350202d091563f668f9209a1540bb0a32b6ac3..4c4206f52b3ffc5e60983bf1d6adb25292d01ac4 100644 --- a/src/operators/kernel/box_coder_kernel.h +++ b/src/operators/kernel/box_coder_kernel.h @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef BOXCODER_OP + +#pragma once + #include #include "framework/operator.h" #include "operators/math/transform.h" #include "operators/op_param.h" -#pragma once; - namespace paddle_mobile { namespace operators { @@ -28,6 +30,9 @@ class BoxCoderKernel : public framework::OpKernelBase { public: void Compute(const BoxCoderParam& param) const; + bool Init(const BoxCoderParam& para) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/batchnorm_arm_func.h b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..7f02d768b790b5f496ab0eac369fa3a4100ee733 --- /dev/null +++ b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h @@ -0,0 +1,234 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef BATCHNORM_OP + +#pragma once + +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void BatchnormCompute(const BatchNormParam ¶m) { + const Tensor *input_x = param.InputX(); + auto input_x_ptr = input_x->data(); + const auto &x_dims = input_x->dims(); + const int N = x_dims[0]; + const int C = x_dims[1]; + const int H = x_dims[2]; + const int W = x_dims[3]; + const int stride0 = C * H * W; + const int stride1 = H * W; + const int stride2 = W; + Tensor *out = param.OutputY(); + auto out_ptr = out->mutable_data(); + const float epsilon = param.Epsilon(); + const Tensor *mean = param.InputMean(); + const Tensor *variance = param.InputVariance(); + const Tensor *scale = param.InputScale(); + const Tensor *bias = param.InputBias(); + auto mean_ptr = mean->data(); + auto variance_ptr = variance->data(); + auto scale_ptr = scale->data(); + auto bias_ptr = bias->data(); + + // Tensor inv_std; + // auto inv_std_ptr = inv_std.mutable_data(make_ddim({C})); + + PADDLE_MOBILE_ENFORCE(C == variance->numel(), + "C must equal to variance.numel()"); + + int HXW = H * W; + if (HXW > 32) { + int NXC = N * C; + float *inv_std_ptr = new float[NXC * 4]; + float *volatile new_scale_ptr = new float[NXC * 4]; + float *volatile new_bias_ptr = new float[NXC * 4]; + + /// std = (var + epsilon).sqrt(); + /// inv_std = 1 / std; + for (int i = 0; i < C * 4; i += 4) { + int index = i / 4; + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[index] + epsilon), 0.5)); + inv_std_ptr[i + 1] = inv_std_ptr[i]; + inv_std_ptr[i + 2] = inv_std_ptr[i]; + inv_std_ptr[i + 3] = inv_std_ptr[i]; + + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[index]; + new_scale_ptr[i + 1] = new_scale_ptr[i]; + new_scale_ptr[i + 2] = new_scale_ptr[i]; + new_scale_ptr[i + 3] = new_scale_ptr[i]; + + new_bias_ptr[i] = + bias_ptr[index] - mean_ptr[index] * inv_std_ptr[i] * scale_ptr[index]; + + new_bias_ptr[i + 1] = new_bias_ptr[i]; + new_bias_ptr[i + 2] = 
new_bias_ptr[i]; + new_bias_ptr[i + 3] = new_bias_ptr[i]; + } + + for (int j = C * 4; j < NXC * 4; ++j) { + new_scale_ptr[j] = new_scale_ptr[j - C * 4]; + new_bias_ptr[j] = new_bias_ptr[j - C * 4]; + } + + asm volatile( + "subs %[N], %[N], #1 \n\t" + "blt end_n_%= \n\t" + "loop_n_%=: \n\t" + + "subs %[C], %[C], #1 \n\t" + "blt end_c_%= \n\t" + "loop_c_%=: \n\t" + + "vld1.32 {q9}, [%[new_scale_ptr]]! \n\t" + "vld1.32 {q10}, [%[new_bias_ptr]]! \n\t" + + "mov r6, %[HXW] \n\t" + + "subs r6, r6, #32 \n\t" + "blt end_hw_%= \n\t" + "loop_hw_%=: \n\t" + + "vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t" + + "vmul.f32 q1, q1, q9 \n\t" + "vmul.f32 q2, q2, q9 \n\t" + "vmul.f32 q3, q3, q9 \n\t" + "vmul.f32 q4, q4, q9 \n\t" + + "vmul.f32 q5, q5, q9 \n\t" + "vmul.f32 q6, q6, q9 \n\t" + "vmul.f32 q7, q7, q9 \n\t" + "vmul.f32 q8, q8, q9 \n\t" + + "vadd.f32 q1, q1, q10 \n\t" + "vadd.f32 q2, q2, q10 \n\t" + "vadd.f32 q3, q3, q10 \n\t" + "vadd.f32 q4, q4, q10 \n\t" + "vadd.f32 q5, q5, q10 \n\t" + "vadd.f32 q6, q6, q10 \n\t" + "vadd.f32 q7, q7, q10 \n\t" + "vadd.f32 q8, q8, q10 \n\t" + + "vst1.32 {q1, q2}, [%[out_ptr]]! \n\t" + "vst1.32 {q3, q4}, [%[out_ptr]]! \n\t" + "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t" + "vst1.32 {q7, q8}, [%[out_ptr]]! \n\t" + + "subs r6, r6, #32 \n\t" + "bge loop_hw_%= \n\t" + "end_hw_%=: \n\t" + + "cmp r6, #0 \n\t" + "bge end_remainder_%= \n\t" + "mov r5, #4 \n\t" + "mul r6, r6, r5 \n\t" + "add %[input_x_ptr], %[input_x_ptr], r6 \n\t" + + "vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q7, q8}, [%[input_x_ptr]]! 
\n\t" + + "vmul.f32 q1, q1, q9 \n\t" + "vmul.f32 q2, q2, q9 \n\t" + "vmul.f32 q3, q3, q9 \n\t" + "vmul.f32 q4, q4, q9 \n\t" + "vmul.f32 q5, q5, q9 \n\t" + "vmul.f32 q6, q6, q9 \n\t" + "vmul.f32 q7, q7, q9 \n\t" + "vmul.f32 q8, q8, q9 \n\t" + "vadd.f32 q1, q1, q10 \n\t" + "vadd.f32 q2, q2, q10 \n\t" + "vadd.f32 q3, q3, q10 \n\t" + "vadd.f32 q4, q4, q10 \n\t" + "vadd.f32 q5, q5, q10 \n\t" + "vadd.f32 q6, q6, q10 \n\t" + "vadd.f32 q7, q7, q10 \n\t" + "vadd.f32 q8, q8, q10 \n\t" + + "add %[out_ptr], %[out_ptr], r6 \n\t" + "vst1.32 {q1, q2}, [%[out_ptr]]! \n\t" + "vst1.32 {q3, q4}, [%[out_ptr]]! \n\t" + "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t" + "vst1.32 {q7, q8}, [%[out_ptr]]! \n\t" + + "end_remainder_%=: \n\t" + + "subs %[C], %[C], #1 \n\t" + "bge loop_c_%= \n\t" + "end_c_%=: \n\t" + + "subs %[N], %[N], #1 \n\t" + "bge loop_n_%= \n\t" + "end_n_%=: \n\t" + : + : [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr), + [new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr), + [N] "r"(N), [C] "r"(C), [HXW] "r"(HXW) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "r5", "r6"); + + delete[] inv_std_ptr; + delete[] new_scale_ptr; + delete[] new_bias_ptr; + + } else { + float *inv_std_ptr = new float[C]; + for (int i = 0; i < C; i++) { + inv_std_ptr[i] = + 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); + } + + Tensor new_scale; + auto new_scale_ptr = + new_scale.mutable_data(framework::make_ddim({C})); + Tensor new_bias; + auto new_bias_ptr = new_bias.mutable_data(framework::make_ddim({C})); + + /// ((x - est_mean) * (inv_var) * scale + bias equal to + /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + for (int i = 0; i < C; i++) { + new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; + new_bias_ptr[i] = + bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; + { + for (int n = 0; n < N; n++) { + for (int h = 0; h < H; h++) { + int tmp_index = n * stride0 + i * stride1 + h * stride2; + for (int w = 0; 
w < W; w++) { + int index = tmp_index + w; + out_ptr[index] = + input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i]; + } + } + } + } + } + + delete[] inv_std_ptr; + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..6aadbab95c591d4286fdbb3c3f01a291cdd90429 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVADD_RELU_OP + +#pragma once +#include +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void ConvAddReluCompute(const FusionConvAddReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + int axis = param.Axis(); + Tensor *output = param.Output(); + math::expand_bias(bias, axis, output->dims()); + output->ShareDataWith(bias); + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = 
static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(1), true); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..d08eebe5493bd9026073c3349631a42024579b95 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_arm_func.h @@ -0,0 +1,115 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CONV_OP + +#pragma once +#include +#include "operators/math/conv_func.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void ConvCompute(const ConvParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor *output = param.Output(); + output->mutable_data(); + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = 
{filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(0)); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..e43e3664cb005bab4d3c5ec8b5b35bd6925c982d --- /dev/null +++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef DEPTHWISECONV_OP + +#pragma once +#include +#include "operators/math/conv_func.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void DepthwiseConvCompute(const ConvParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor *output = param.Output(); + output->mutable_data(); + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + // DLOG << " compute end get Attrs " << strides[0]; + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + 
col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::matmul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(0)); + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/concat_kernel.h b/src/operators/kernel/concat_kernel.h index d91fb84f015851074e317980f1fe9ff930e9e399..6a7b7c6005b6e85e5b1ccfee713672b6e333b98a 100644 --- a/src/operators/kernel/concat_kernel.h +++ b/src/operators/kernel/concat_kernel.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef CONCAT_OP + #pragma once #include "framework/operator.h" #include "operators/op_param.h" @@ -25,7 +27,10 @@ template class ConcatKernel : public framework::OpKernelBase { public: void Compute(const ConcatParam ¶m) const; + bool Init(const ConcatParam ¶) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_add_kernel.h b/src/operators/kernel/conv_add_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8f733f245dc26664ce38413a09fc5404029cdd2f --- /dev/null +++ b/src/operators/kernel/conv_add_kernel.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVADD_OP + +#pragma once + +#include +#if __ARM_NEON +#include +#endif +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvAddKernel : public OpKernelBase { + public: + void Compute(const FusionConvAddParam ¶m) const; + bool Init(const FusionConvAddParam ¶) const; +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/framework/data_transform.h b/src/operators/kernel/conv_add_relu_kernel.h similarity index 51% rename from src/framework/data_transform.h rename to src/operators/kernel/conv_add_relu_kernel.h index b3947985d8b09e183c690b4d51093c2ae96e7d80..9b86cd22e82e641ee6cb0a15bd25c8a1c6cbe8cb 100644 --- a/src/framework/data_transform.h +++ b/src/operators/kernel/conv_add_relu_kernel.h @@ -14,24 +14,32 @@ limitations under the License. 
*/ #pragma once -#include -#include -#include +#ifdef FUSION_CONVADD_RELU_OP -#include "framework/op_kernel_type.h" -#include "framework/selected_rows.h" -#include "framework/tensor.h" -#include "framework/variable.h" +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" namespace paddle_mobile { -namespace framework { +namespace operators { -void DataTransform(const OpKernelType &expected_kernel_type, - const OpKernelType &kernel_type_for_var, - const Tensor &input_tensor, Tensor *out); +using framework::DDim; +using framework::OpKernelBase; -void CopyVariableWithTensor(const Variable &in_var, const Tensor &tensor, - Variable *out_var); +template +class ConvAddReluKernel + : public OpKernelBase { + public: + void Compute(const FusionConvAddReluParam ¶m) const; + bool Init(const FusionConvAddReluParam ¶) const; +}; -} // namespace framework +} // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/conv_kernel.h b/src/operators/kernel/conv_kernel.h index d43a174ffdbf0ca6dbb39e463b8e97652c7b0daf..812ddd5a441f3a24c557546c1780248a557a6eb0 100644 --- a/src/operators/kernel/conv_kernel.h +++ b/src/operators/kernel/conv_kernel.h @@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef CONV_OP + +#pragma once + #include #include "framework/operator.h" #include "operators/math/im2col.h" @@ -19,8 +23,6 @@ limitations under the License. 
*/ #include "operators/math/vol2col.h" #include "operators/op_param.h" -#pragma once; - namespace paddle_mobile { namespace operators { @@ -30,22 +32,10 @@ template class ConvKernel : public OpKernelBase { public: void Compute(const ConvParam ¶m) const; + bool Init(const ConvParam ¶) const; }; -inline bool IsExpand(const std::vector &filter_dim, - const std::vector &strides, - const std::vector &paddings, - const std::vector &dilations) { - bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; - for (size_t j = 0; j < strides.size(); ++j) { - filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); - strides_1 = strides_1 && (strides[j] == 1); - padding_0 = padding_0 && (paddings[j] == 0); - dilation_1 = dilation_1 && (dilations[j] == 1); - } - - return !(filter_1 && strides_1 && padding_0 && dilation_1); -} - } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/depthwise_conv_kernel.h b/src/operators/kernel/depthwise_conv_kernel.h index 43ddfb25cd859a7e937577221215d8352b846bff..a8a8fb338620477670477703018bf9e6e9a8a604 100644 --- a/src/operators/kernel/depthwise_conv_kernel.h +++ b/src/operators/kernel/depthwise_conv_kernel.h @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef DEPTHWISECONV_OP + +#pragma once + #include "framework/operator.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" #include "operators/math/vol2col.h" #include "operators/op_param.h" -#pragma once; - namespace paddle_mobile { namespace operators { @@ -29,6 +31,9 @@ template class DepthwiseConvKernel : public OpKernelBase { public: void Compute(const ConvParam ¶m) const; + bool Init(const ConvParam ¶) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/elementwise_add_kernel.h b/src/operators/kernel/elementwise_add_kernel.h index 28b3bc29e593561d18512cbf1af947dd64cd9d87..fe6a0238dcd5249e822de3b5930438df808bf853 100644 --- a/src/operators/kernel/elementwise_add_kernel.h +++ b/src/operators/kernel/elementwise_add_kernel.h @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once; +#ifdef ELEMENTWISEADD_OP + +#pragma once #include "framework/operator.h" #include "operators/math/elementwise_op_function.h" @@ -28,6 +30,9 @@ class ElementwiseAddKernel : public framework::OpKernelBase { public: void Compute(const ElementwiseAddParam ¶m) const; + bool Init(const ElementwiseAddParam ¶) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/conv_kernel.cpp b/src/operators/kernel/fpga/conv_kernel.cpp index a50a5c59bdaaa3829602049bf88bf41fa02af53c..30dd64fd1466902036a72faa4be5d359d2bdb0bf 100644 --- a/src/operators/kernel/fpga/conv_kernel.cpp +++ b/src/operators/kernel/fpga/conv_kernel.cpp @@ -12,13 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef CONV_OP + +#include "operators/kernel/conv_kernel.h" + namespace paddle_mobile { namespace operators { -// template<> -// void ConvKernel::Compute(const ConvParam ¶m) const -// {} -// -// template class ConvKernel; +template <> +bool ConvKernel::Init(const ConvParam ¶) const { + return true; } + +template <> +void ConvKernel::Compute(const ConvParam ¶m) const {} +template class ConvKernel; + +} // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fushion_fc_kernel.h b/src/operators/kernel/fusion_fc_kernel.h similarity index 79% rename from src/operators/kernel/fushion_fc_kernel.h rename to src/operators/kernel/fusion_fc_kernel.h index 7597a7120d1840128810730ad3fab11fd01b10fa..c4e2b30176fb904d7fb906c5efc5137a5dcb8d59 100644 --- a/src/operators/kernel/fushion_fc_kernel.h +++ b/src/operators/kernel/fusion_fc_kernel.h @@ -12,20 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef FUSION_FC_OP + +#pragma once + #include "framework/operator.h" #include "operators/math/math_function.h" #include "operators/op_param.h" -#pragma once; - namespace paddle_mobile { namespace operators { template -class FushionFcKernel - : public framework::OpKernelBase { +class FusionFcKernel + : public framework::OpKernelBase { public: - void Compute(const FushionFcParam& param) const; + void Compute(const FusionFcParam& param) const; + bool Init(const FusionFcParam& para) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/lrn_kernel.h b/src/operators/kernel/lrn_kernel.h index f5fd8313482a92aad0c01d3e0acc9dcfcc83f2d8..40c48b3663c6825e03028439725c428ce048d254 100644 --- a/src/operators/kernel/lrn_kernel.h +++ b/src/operators/kernel/lrn_kernel.h @@ -12,9 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#ifdef LRN_OP + #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; + +#include + +#ifdef __ARM_NEON +#include "arm_neon.h" +#include "operators/math/math_func_neon.h" +#endif namespace paddle_mobile { namespace operators { @@ -24,42 +32,137 @@ using namespace framework; template struct LRNFunctor { void operator()(const framework::Tensor &input, framework::Tensor *out, int N, - int C, int H, int W, int n, T k, T alpha, T beta) { - auto input_ptr = input.data(); + int C, int H, int W, int n, float k, float alpha, + float beta) { + const float *input_ptr = input.data(); const int start = -(n - 1) / 2; const int end = start + n; + auto out_ptr = out->data(); const int stride0 = C * H * W; const int stride1 = H * W; const int stride2 = W; - const int stride3 = 1; - framework::Tensor sqr_buffer; - auto sqr_buffer_ptr = sqr_buffer.mutable_data(input.dims()); - std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), k); + auto sqr_buffer_ptr = sqr_buffer.mutable_data(input.dims()); + std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0); + for (int a = 0; a < N; a++) { for (int b = 0; b < C; b++) { for (int index = start; index < end; index++) { int channel = b + index; if (channel >= 0 && channel < C) { - int tmp_u = a * stride0 + b * stride1; - int tmp_i = a * stride0 + channel * stride1; - for (int c = 0; c < H; c++) { - for (int d = 0; d < W; d++) { - int tmp = c * stride2 + d; - int u = tmp_u + tmp; - int i = tmp_i + tmp; - sqr_buffer_ptr[u] += alpha * input_ptr[i] * input_ptr[i]; - } + int tmp_s = a * stride0 + b * stride1; + int tmp_c = a * stride0 + channel * stride1; +#ifdef __ARM_NEON + int n4 = stride1 / 4; + int m4 = stride1 % 4; + float32x4_t sqr0; + float32x4_t in0; + float32x4_t res0; + for (int i = 0; i < n4; i++) { + sqr0 = vld1q_f32(sqr_buffer_ptr + tmp_s); + in0 = vld1q_f32(input_ptr + tmp_c); + + res0 = 
vmlaq_f32(sqr0, in0, in0); + vst1q_f32(sqr_buffer_ptr + tmp_s, res0); + + tmp_s += 4; + tmp_c += 4; } + + for (int i = 0; i < m4; i++) { + int s_i = tmp_s + i; + int c_i = tmp_c + i; + sqr_buffer_ptr[s_i] += input_ptr[c_i] * input_ptr[c_i]; + } + +#else + for (int tmp = 0; tmp < stride1; tmp++) { + int s_i = tmp_s + tmp; + int c_i = tmp_c + tmp; + sqr_buffer_ptr[s_i] += input_ptr[c_i] * input_ptr[c_i]; + } +#endif } } } } - auto out_ptr = out->data(); + +#ifdef __ARM_NEON + + float32x4_t sqr1, sqr2, sqr3, sqr4; + float32x4_t alpha4; + float32x4_t k4; + float32x4_t beta4; + float32x4_t res1, res2, res3, res4; + float32x4_t in1, in2, in3, in4; + + beta4 = vdupq_n_f32(beta); + alpha4 = vdupq_n_f32(alpha); + k4 = vdupq_n_f32(k); + auto out_tmp_ptr = out_ptr; + + int n16 = input.numel() / 16; + int m16 = input.numel() % 16; + int m16n4 = m16 / 4; + int m16m4 = m16 % 4; + + for (int i = 0; i < n16; i++) { + sqr1 = vld1q_f32(sqr_buffer_ptr); + sqr2 = vld1q_f32(sqr_buffer_ptr + 4); + sqr3 = vld1q_f32(sqr_buffer_ptr + 8); + sqr4 = vld1q_f32(sqr_buffer_ptr + 12); + + in1 = vld1q_f32(input_ptr); + in2 = vld1q_f32(input_ptr + 4); + in3 = vld1q_f32(input_ptr + 8); + in4 = vld1q_f32(input_ptr + 12); + + sqr1 = vmlaq_f32(k4, sqr1, alpha4); + sqr2 = vmlaq_f32(k4, sqr2, alpha4); + sqr3 = vmlaq_f32(k4, sqr3, alpha4); + sqr4 = vmlaq_f32(k4, sqr4, alpha4); + + sqr1 = pow_ps(sqr1, -beta4); + sqr2 = pow_ps(sqr2, -beta4); + sqr3 = pow_ps(sqr3, -beta4); + sqr4 = pow_ps(sqr4, -beta4); + + sqr1 = vmulq_f32(sqr1, in1); + sqr2 = vmulq_f32(sqr2, in2); + sqr3 = vmulq_f32(sqr3, in3); + sqr4 = vmulq_f32(sqr4, in4); + + vst1q_f32(out_tmp_ptr, sqr1); + vst1q_f32(out_tmp_ptr + 4, sqr2); + vst1q_f32(out_tmp_ptr + 8, sqr3); + vst1q_f32(out_tmp_ptr + 12, sqr4); + + sqr_buffer_ptr += 4 * 4; + input_ptr += 4 * 4; + out_tmp_ptr += 4 * 4; + } + for (int i = 0; i < m16n4; i++) { + sqr4 = vld1q_f32(sqr_buffer_ptr); + in4 = vld1q_f32(input_ptr); + sqr4 = vmlaq_f32(k4, sqr4, alpha4); + sqr4 = pow_ps(sqr4, 
-beta4); + sqr4 = vmulq_f32(sqr4, in4); + vst1q_f32(out_tmp_ptr, sqr4); + sqr_buffer_ptr += 4; + input_ptr += 4; + out_tmp_ptr += 4; + } + + for (int i = 0; i < m16m4; i++) { + out_tmp_ptr[i] = input_ptr[i] / pow(k + alpha * sqr_buffer_ptr[i], beta); + } + +#else for (int i = 0; i < input.numel(); i++) { - out_ptr[i] = input_ptr[i] / pow(sqr_buffer_ptr[i], beta); + out_ptr[i] = input_ptr[i] / pow(k + alpha * sqr_buffer_ptr[i], beta); } +#endif } }; @@ -67,6 +170,9 @@ template class LrnKernel : public framework::OpKernelBase { public: void Compute(const LrnParam ¶m) const; + bool Init(const LrnParam ¶) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/ACL_Android b/src/operators/kernel/mali/ACL_Android new file mode 160000 index 0000000000000000000000000000000000000000..591027fcffea084100c756e48356e0f8a48e35e5 --- /dev/null +++ b/src/operators/kernel/mali/ACL_Android @@ -0,0 +1 @@ +Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5 diff --git a/src/operators/kernel/mali/acl_operator.cc b/src/operators/kernel/mali/acl_operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..562d2fe1c46aa7a30b6418c7a3fcb21daafffa0f --- /dev/null +++ b/src/operators/kernel/mali/acl_operator.cc @@ -0,0 +1,220 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#if USE_ACL == 1 +#include "acl_operator.h" +unsigned int bypass_acl_class_layer = + (0 | FLAGS_ENABLE_ACL_CONCAT | + /*0xffffffff |*/ /*FLAGS_ENABLE_ACL_FC |*/ /*FLAGS_ENABLE_ACL_LRN + |*/ + 0); + +int enable_schedule = 0; + +#ifdef USE_PROFILING + +#include "arm_neon.h" + +unsigned int acl_log_flags = + (0 | MASK_LOG_APP_TIME | /*MASK_LOG_ALLOCATE | */ /*MASK_LOG_ALLOCATE | */ + /*MASK_LOG_RUN | */ /*MASK_LOG_CONFIG | */ /*MASK_LOG_COPY | */ + MASK_LOG_ABSVAL | MASK_LOG_BNLL | MASK_LOG_CONV | MASK_LOG_FC | + MASK_LOG_LRN | MASK_LOG_POOLING | MASK_LOG_RELU | MASK_LOG_SIGMOID | + MASK_LOG_SOFTMAX | MASK_LOG_TANH | MASK_LOG_LC | MASK_LOG_BN | + MASK_LOG_CONCAT | 0); +#include /* printf */ +#include /* getenv */ +#endif // USE_PROFILING + +static bool force_enable_gpu = false; +bool AclEnableSchedule(int enable) { + enable_schedule = enable; + if (enable) { + force_enable_gpu = true; + } + return true; +} +int isScheduleEnable() { return enable_schedule; } + +namespace paddle_mobile { +namespace operators { +namespace acl { + +bool ACLOperator::init_gpu_env = true; +#ifdef USE_OPENCL +bool ACLOperator::support_opencl_ = false; +bool opencl_is_available() { return arm_compute::opencl_is_available(); } +#elif defined(USE_OPENGLES) +bool ACLOperator::support_opengles_ = false; +#endif +ACLOperator::ACLOperator(bool is_gpu) + : operator_state_(operator_not_init), + force_bypass_acl_path_(false), + target_hint_(TargetHint::DONT_CARE), + convolution_method_hint_(ConvolutionMethodHint::GEMM), + _group(1), + name_(""), + input_idx_(0), + output_idx_(0), + is_gpu_(is_gpu) { + const char* pBypassACL; + if (init_gpu_env) { +#ifdef USE_OPENCL + try { + if (opencl_is_available()) { + arm_compute::CLScheduler::get().default_init(); + support_opencl_ = true; + } + } catch (std::exception& e) { + support_opencl_ = false; + } +#elif defined(USE_OPENGLES) + try { + arm_compute::GCScheduler::get().default_init(); + support_opengles_ = true; + } catch (std::exception& e) { + 
support_opengles_ = false; + } +#endif + init_gpu_env = false; + } + if (force_enable_gpu) is_gpu_ = true; + pBypassACL = getenv("BYPASSACL"); + if (pBypassACL) { + unsigned int bacl; + sscanf(pBypassACL, "%i", &bacl); + if (bacl != bypass_acl_class_layer) { + bypass_acl_class_layer = bacl; + printf("BYPASSACL<%s>\n", pBypassACL); + printf("BYPASSACL: %x\n", bypass_acl_class_layer); + } + } + +#ifdef USE_PROFILING + const char* pLogACL; + pLogACL = getenv("LOGACL"); + if (pLogACL) { + unsigned int alf; + sscanf(pLogACL, "%i", &alf); + if (alf != acl_log_flags) { + acl_log_flags = alf; + printf("LOGACL<%s>\n", pLogACL); + printf("LOGACL: %x\n", acl_log_flags); + } + } +#endif // USE_PROFILING + const char* pEnableSchedule; + pEnableSchedule = getenv("ENABLESCHEDULE"); + if (pEnableSchedule) { + int bshedule; + sscanf(pEnableSchedule, "%i", &bshedule); + if (bshedule != enable_schedule) { + enable_schedule = bshedule; + printf("ENABLESCHEDULE<%s>\n", pEnableSchedule); + printf("ENABLESCHEDULE: %x\n", enable_schedule); + } + if (enable_schedule) { + AclEnableSchedule(1); + } + } +} +ACLOperator::~ACLOperator() {} + +bool ACLOperator::new_tensor(std::unique_ptr& tensor, + arm_compute::TensorShape& shape, void* mem, + bool commit) { + auto acl_tensor = + new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32)); + acl_tensor->set_target(getTargetHint()); + acl_tensor->bindmem(mem); + if (commit) acl_tensor->commit(); + tensor = (std::unique_ptr)std::move(acl_tensor); + return true; +} +bool ACLOperator::new_tensor(std::unique_ptr& tensor, + std::unique_ptr& parent, + arm_compute::TensorShape& shape, + arm_compute::Coordinates& coord) { + auto acl_tensor = new ACLSubTensor(parent, shape, coord); + acl_tensor->set_target(getTargetHint()); + tensor = (std::unique_ptr)std::move(acl_tensor); + return true; +} + +void ACLTensor::commit(TensorType type) { + settensortype(type); + if (mem_) { + if (!allocate_) { +#ifdef USE_PROFILING + logtime_util 
log_time(ACL_ALLOCATE_INFO); +#endif // USE_PROFILING + allocate(); + allocate_ = true; + } + if (type_ != tensor_output) { + tensor_copy(mem_); + } + mem_ = nullptr; + } +} + +int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor, void* mem, + bool toTensor) { +#ifdef USE_PROFILING + logtime_util log_time(ACL_COPY_INFO); +#endif // USE_PROFILING + arm_compute::Window window; + // Iterate through the rows (not each element) + window.use_tensor_dimensions(tensor->info()->tensor_shape(), + /* first_dimension =*/arm_compute::Window::DimY); + + int width = tensor->info()->tensor_shape()[0]; + int height = tensor->info()->tensor_shape()[1]; + int deepth = tensor->info()->tensor_shape()[2]; + map(); + // Create an iterator: + arm_compute::Iterator it(tensor, window); + // Except it works for an arbitrary number of dimensions + if (toTensor) { // mem->tensor + arm_compute::execute_window_loop( + window, + [&](const arm_compute::Coordinates& id) { + memcpy(it.ptr(), + ((char*)mem) + + ((id[3] * (width * height * deepth) + + id.z() * (width * height) + id.y() * width + id.x()) * + tensor->info()->element_size()), + width * tensor->info()->element_size()); + }, + it); + } else { // tensor-->mem + arm_compute::execute_window_loop( + window, + [&](const arm_compute::Coordinates& id) { + memcpy(((char*)mem) + ((id[3] * (width * height * deepth) + + id.z() * (width * height) + id.y() * width) * + tensor->info()->element_size()), + it.ptr(), width * tensor->info()->element_size()); + }, + it); + } + unmap(); + + return 0; +} + +} // namespace acl +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/mali/acl_operator.h b/src/operators/kernel/mali/acl_operator.h new file mode 100644 index 0000000000000000000000000000000000000000..c2e13283b1c679d6dfc8972af5ace5e579d568e6 --- /dev/null +++ b/src/operators/kernel/mali/acl_operator.h @@ -0,0 +1,1144 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef ACL_OPERATOR_H_ +#define ACL_OPERATOR_H_ +#include +#include + +#if USE_ACL == 1 +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h" +#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h" +#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" +#include "arm_compute/runtime/Tensor.h" + +#ifdef PADDLE_MOBILE_MALI_GPU +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLActivationLayer.h" +#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h" +#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" +#include 
"arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h" +#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h" +#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h" +#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h" +#endif + +#ifdef USE_OPENGLES +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h" +#endif + +#include "acl_tensor.h" +#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001 +#define FLAGS_ENABLE_ACL_BNLL 0x00000002 +#define FLAGS_ENABLE_ACL_CONV 0x00000004 +#define FLAGS_ENABLE_ACL_FC 0x00000008 +#define FLAGS_ENABLE_ACL_LRN 0x00000010 +#define FLAGS_ENABLE_ACL_POOLING 0x00000020 +#define FLAGS_ENABLE_ACL_RELU 0x00000040 +#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080 +#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100 +#define FLAGS_ENABLE_ACL_TANH 0x00000200 +#define FLAGS_ENABLE_ACL_LC 0x00000400 +#define FLAGS_ENABLE_ACL_BN 0x00000800 +#define FLAGS_ENABLE_ACL_CONCAT 0x00001000 +extern unsigned int bypass_acl_class_layer; + +#ifdef USE_PROFILING +#include +#define NANO_SEC_CONV 1000000 + +#define MASK_LOG_APP_TIME 0x00000001 +#define MASK_LOG_ALLOCATE 0x00000002 +#define MASK_LOG_RUN 0x00000004 +#define MASK_LOG_CONFIG 0x00000008 +#define MASK_LOG_COPY 0x00000010 +#define MASK_LOG_ABSVAL 
0x00000020 +#define MASK_LOG_BNLL 0x00000040 +#define MASK_LOG_CONV 0x00000080 +#define MASK_LOG_FC 0x00000100 +#define MASK_LOG_LRN 0x00000200 +#define MASK_LOG_POOLING 0x00000400 +#define MASK_LOG_RELU 0x00000800 +#define MASK_LOG_SIGMOID 0x00001000 +#define MASK_LOG_SOFTMAX 0x00002000 +#define MASK_LOG_TANH 0x00004000 +#define MASK_LOG_LC 0x00008000 +#define MASK_LOG_BN 0x00010000 +#define MASK_LOG_CONCAT 0x00020000 +#define APP_TIME_INFO MASK_LOG_APP_TIME, "time: \t" +#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE, "allocate: \t\t" +#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t" +#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t" +#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t" +#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t" +#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t" +#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t" +#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t" +#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t" +#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_SIGMOID_INFO \ + MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_SOFTMAX_INFO \ + MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_TANH_INFO \ + MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_BN_INFO \ + MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +#define ACL_CONCAT_INFO \ + MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +extern unsigned int acl_log_flags; + +class logtime_util { + public: + logtime_util() { mask = 0; } + logtime_util(int mask_, const char *information_) { + setlogtime_info(mask_, information_); + } + void setlogtime_info(int mask_, const char *information_) { + mask = 
mask_; + if (acl_log_flags & mask) { + strncpy(information, information_, 255); + gettimeofday(&tv[0], NULL); + } + } + ~logtime_util() { + if (acl_log_flags & mask) { + int time[2]; + gettimeofday(&tv[1], NULL); + time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec; + time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec; + printf("%s %.6lf\n", information, + (((double)time[1] - time[0]) / NANO_SEC_CONV)); + } + } + void log_time(bool start) { + if (acl_log_flags & mask) { + if (start) { + gettimeofday(&tv[0], NULL); + } else { + int time[2]; + gettimeofday(&tv[1], NULL); + time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec; + time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec; + printf("%s %.6lf\n", information, + (((double)time[1] - time[0]) / NANO_SEC_CONV)); + } + } + } + + private: + struct timeval tv[2]; + int mask; + char information[256]; +}; + +#endif // USE_PROFILING + +namespace paddle_mobile { +namespace operators { +namespace acl { + +class AclParameters { + public: + AclParameters() { + dilated = false; + dim = 2; + num_group = 1; + } + int batch; + int in_depth; + int in_rows; + int in_cols; + + int out_depth; + int out_rows; + int out_cols; + int out_num; + + int filter_rows; + int filter_cols; + + int stride_rows; + int stride_cols; + + int pad_rows; + int pad_cols; + + int dilation_rows; + int dilation_cols; + + int num_group; + bool dilated; + int dim; + int epsilon; + + int nsize; + float alpha; + float beta; + float knorm; + + void *input_data; + void *output_data; + void *weight_data; + void *biases_data; + void *mean_data; + void *var_data; + + std::string pool_type; + std::string act_type; + std::string data_layout; + + bool is_global_pool; + bool is_channel_concat; + + std::vector in_tensor; +}; + +enum TensorType { + tensor_input, + tensor_output, + tensor_weights, + tensor_biases, + tensor_mean, + tensor_var, + tensor_beta, + tensor_gamma, + tensor_concat, + tensor_data, +}; +enum OperatorState { + operator_not_init, + 
operator_init_done, + operator_reinit, +}; +enum OperateType { + operate_type_pooling, + operate_type_activation, + operate_type_lrn, + operate_type_conv, + operate_type_lc, + operate_type_fc, + operate_type_bn, + operate_type_softmax, + operate_type_concat, +}; + +class BaseACLTensor { + public: + BaseACLTensor() : type_(tensor_input), allocate_(false) {} + virtual ~BaseACLTensor() {} + virtual void bindmem(void *mem) { mem_ = mem; } + virtual void settensortype(TensorType type) { type_ = type; } + virtual void map(bool blocking = true) {} + virtual void unmap() {} + virtual void commit(TensorType type = tensor_data) {} + int tensor_copy(arm_compute::ITensor *tensor, void *mem, + bool toTensor = true); + + protected: + void *mem_; + TensorType type_; + bool allocate_; +}; +class ACLTensor : public BaseACLTensor, public Tensor { + public: + explicit ACLTensor(arm_compute::TensorInfo &&info) : Tensor(info) {} + virtual void map(bool blocking = true) { + if (!allocate_) { + Tensor::allocate(); + allocate_ = true; + } + Tensor::map(blocking); + } + virtual int tensor_copy(void *mem, bool toTensor = true) { + auto acl_tensor = this; + arm_compute::ITensor *tensor = acl_tensor->tensor(); + BaseACLTensor::tensor_copy(tensor, mem, toTensor); + return 0; + } + virtual void unmap() { Tensor::unmap(); } + virtual void commit(TensorType type = tensor_data); +}; +class ACLSubTensor : public BaseACLTensor, public SubTensor { + public: + ACLSubTensor(std::unique_ptr &parent, + arm_compute::TensorShape &shape, arm_compute::Coordinates &coord) + : SubTensor(parent.get(), shape, coord) {} + virtual int tensor_copy(void *mem, bool toTensor = true) { return 0; } +}; + +template +class TensorPair { + public: + TensorPair() {} + ~TensorPair() {} + TensorType type; + std::unique_ptr tensor; +}; +template +std::unique_ptr &tensor_item( + std::vector>> &pool, TensorType type, + int idx) { + int count = 0; + for (auto &item : pool) { + if (item.get()->type == type) { + ++count; + } + if 
(item.get()->type == type && idx == count - 1) { + return item.get()->tensor; + } + } + pool.push_back((std::unique_ptr>)std::move(new TensorPair)); + auto item = pool[pool.size() - 1].get(); + item->type = type; + item->tensor = NULL; + return item->tensor; +} +class ACLOperator { + public: + virtual void commit() { + for (auto &item : tensor_pool_) { + if (item.get()->tensor) item.get()->tensor->commit(item.get()->type); + } + } + inline void run() { + commit(); +#ifdef USE_PROFILING + logtime_util log_time(ACL_RUN_INFO); +#endif // USE_PROFILING + for (auto &c : funcs_) { + c->run(); + } + } + + inline std::vector> &funcs() { + return funcs_; + } + inline std::unique_ptr &sinput(int idx = 0) { + return tensor_item(subtensor_pool_, tensor_input, idx); + } + inline std::unique_ptr &soutput(int idx = 0) { + return tensor_item(subtensor_pool_, tensor_output, idx); + } + inline std::unique_ptr &sweights(int idx = 0) { + return tensor_item(subtensor_pool_, tensor_weights, idx); + } + inline std::unique_ptr &sbiases(int idx = 0) { + return tensor_item(subtensor_pool_, tensor_biases, idx); + } + inline std::unique_ptr &cinput(int idx = 0) { + return tensor_item(tensor_pool_, tensor_concat, idx); + } + inline std::unique_ptr &input(int idx = 0) { + return tensor_item(tensor_pool_, tensor_input, idx); + } + inline std::unique_ptr &output(int idx = 0) { + return tensor_item(tensor_pool_, tensor_output, idx); + } + inline std::unique_ptr &weights(int idx = 0) { + return tensor_item(tensor_pool_, tensor_weights, idx); + } + inline std::unique_ptr &biases(int idx = 0) { + return tensor_item(tensor_pool_, tensor_biases, idx); + } + inline std::unique_ptr &mean(int idx = 0) { + return tensor_item(tensor_pool_, tensor_mean, idx); + } + inline std::unique_ptr &var(int idx = 0) { + return tensor_item(tensor_pool_, tensor_var, idx); + } + inline std::unique_ptr &beta(int idx = 0) { + return tensor_item(tensor_pool_, tensor_beta, idx); + } + inline std::unique_ptr &gamma(int idx = 
0) { + return tensor_item(tensor_pool_, tensor_gamma, idx); + } + inline std::unique_ptr &tensor(TensorType type) { + switch (type) { + case tensor_biases: + return biases(); + break; + case tensor_weights: + return weights(); + break; + case tensor_output: + return output(); + break; + default: + case tensor_input: + return input(); + break; + } + return input(); + } + + explicit ACLOperator(bool is_gpu = false); + virtual ~ACLOperator(); + inline TargetHint getTargetHint() { +#ifdef USE_OPENCL + if (target_hint_ == TargetHint::DONT_CARE) { + if (is_gpu_) { + return TargetHint::OPENCL; + } + return TargetHint::NEON; + } + return target_hint_; +#elif defined(USE_OPENGLES) + if (target_hint_ == TargetHint::DONT_CARE) { + if (is_gpu_) { + return TargetHint::OPENGLES; + } + return TargetHint::NEON; + } + return target_hint_; +#else + return TargetHint::NEON; +#endif + } + inline void setTargetHint(TargetHint hint) { target_hint_ = hint; } + inline ConvolutionMethodHint &getConvMethod() { + return convolution_method_hint_; + } + inline void setConvMethod() { + convolution_method_hint_ = ConvolutionMethodHint::DIRECT; + } + inline bool tensor_mem(std::unique_ptr &tensor, void *mem) { + tensor->bindmem(mem); + return true; + } + inline bool tensor_mem(void *mem, std::unique_ptr &tensor) { + tensor->tensor_copy(mem, false); + return true; + } + bool new_tensor(std::unique_ptr &tensor, + arm_compute::TensorShape &shape, void *mem = nullptr, + bool commit = false); + bool new_tensor(std::unique_ptr &tensor, + std::unique_ptr &parent, + arm_compute::TensorShape &shape, + arm_compute::Coordinates &coord); + inline int &group() { return _group; } + inline void set_operator_property(OperateType type, const char *name) { + name_ = name; + type_ = type; + } + inline void acl_run(void *input_data, void *output_data) { + if (input_data) tensor_mem(input(), input_data); + run(); + tensor_mem(output_data, output()); + } + inline int &input_idx() { return input_idx_; } + inline int 
&output_idx() { return output_idx_; } + + protected: + inline bool isGPUMode() { +#ifdef USE_OPENCL + if (!support_opencl_) return false; + return getTargetHint() == TargetHint::OPENCL; +#elif defined(USE_OPENGLES) + if (!support_opengles_) return false; + return getTargetHint() == TargetHint::OPENGLES; +#endif + return false; + } + inline OperatorState &opstate() { return operator_state_; } + inline bool is_operator_init_done(arm_compute::TensorShape shape, + TensorType type = tensor_input) { + checkreshape(shape, type); + return operator_state_ == operator_init_done; + } + inline void set_operator_init_done() { + opstate() = operator_init_done; + set_bypass_state(false); + } + inline void set_bypass_state(bool state = false) { + force_bypass_acl_path_ = state; + } + inline OperatorState checkreshape(arm_compute::TensorShape shape, + TensorType type = tensor_input) { + opstate() = reshape(shape, type); + if (opstate() == operator_reinit) { + freeres(); + } + return opstate(); + } + inline OperatorState reshape(arm_compute::TensorShape &shape, + TensorType type) { + arm_compute::TensorShape _shape; + std::unique_ptr &acl_tensor = tensor(type); + if (!acl_tensor.get()) return operator_not_init; + _shape = acl_tensor->info().tensor_shape(); + if (_shape.total_size() == shape.total_size() && _shape[0] == shape[0] && + _shape[1] == shape[1]) { + return operator_init_done; + } + return operator_reinit; + } + inline void freeres() { + tensor_pool_.clear(); + subtensor_pool_.clear(); + funcs_.clear(); + } + inline const char *&name() { return name_; } + inline void set_in_out_index(int indata_idx, int outdata_idx) { + input_idx() = indata_idx; + output_idx() = outdata_idx; + } + + protected: + std::vector>> tensor_pool_; + std::vector>> subtensor_pool_; + std::vector> funcs_; + OperatorState operator_state_; + bool force_bypass_acl_path_; + TargetHint target_hint_; + ConvolutionMethodHint convolution_method_hint_; + static bool support_opengles_; + static bool 
support_opencl_; + static bool init_gpu_env; + int _group; + const char *name_; + OperateType type_; + int input_idx_, output_idx_; + bool is_gpu_; +}; + +int isScheduleEnable(); + +template +std::unique_ptr instantiate_function( + arm_compute::ITensor *input, arm_compute::ITensor *output) { + auto op = cpp14::make_unique(); + op->configure(dynamic_cast(input), + dynamic_cast(output)); + + return std::move(op); +} + +template +std::unique_ptr instantiate( + arm_compute::ITensor *input, arm_compute::ITensor *output) { + return instantiate_function(input, output); +} + +template +std::unique_ptr instantiate_op_func( + std::unique_ptr &input, std::unique_ptr &output, + TargetHint &hint) { + std::unique_ptr func; + func = instantiate(input->tensor(), output->tensor()); + return func; +} + +template +std::unique_ptr instantiate_function( + VectorTensor inputs, arm_compute::ITensor *output) { + auto op = cpp14::make_unique(); + op->configure(inputs, dynamic_cast(output)); + + return std::move(op); +} + +template +std::unique_ptr instantiate( + VectorTensor inputs, arm_compute::ITensor *output) { + return instantiate_function(inputs, + output); +} + +template +std::unique_ptr instantiate_op_func_lists( + ACLOperator *&acl_op, std::unique_ptr &output, int num, + TargetHint &hint) { + std::unique_ptr func; + static std::vector tensors; + tensors.clear(); + for (int i = 0; i < num; ++i) { + tensors.push_back( + dynamic_cast(acl_op->cinput(i).get()->tensor())); + } + func = instantiate>( + tensors, output->tensor()); + return func; +} + +template +std::unique_ptr instantiate_function( + arm_compute::ITensor *input, arm_compute::ITensor *output, + const OperatorInfo &info) { + auto op = cpp14::make_unique(); + op->configure(dynamic_cast(input), + dynamic_cast(output), info); + + return std::move(op); +} + +template +std::unique_ptr instantiate( + arm_compute::ITensor *input, arm_compute::ITensor *output, + const OperatorInfo &info) { + return instantiate_function( + input, 
output, info); +} + +template +std::unique_ptr instantiate_op_func( + std::unique_ptr &input, std::unique_ptr &output, + const OperatorInfo &info, TargetHint &hint) { + std::unique_ptr func; + func = instantiate(input->tensor(), + output->tensor(), info); + return func; +} + +template +std::unique_ptr instantiate_function( + arm_compute::ITensor *input, arm_compute::ITensor *weights, + arm_compute::ITensor *biases, arm_compute::ITensor *output, + const OperatorInfo &info) { + auto op = cpp14::make_unique(); + op->configure(dynamic_cast(input), + dynamic_cast(weights), + dynamic_cast(biases), + dynamic_cast(output), info); + return std::move(op); +} + +template +std::unique_ptr instantiate( + arm_compute::ITensor *input, arm_compute::ITensor *weights, + arm_compute::ITensor *biases, arm_compute::ITensor *output, + const OperatorInfo &info) { + return instantiate_function( + input, weights, biases, output, info); +} + +template +std::unique_ptr instantiate_op_func( + std::unique_ptr &input, std::unique_ptr &weights, + std::unique_ptr &biases, std::unique_ptr &output, + const OperatorInfo &info, TargetHint &hint) { + std::unique_ptr func; + arm_compute::ITensor *biases_tensor = NULL; + + if (biases.get()) { + biases_tensor = biases->tensor(); + } + func = instantiate( + input->tensor(), weights->tensor(), biases_tensor, output->tensor(), + info); + return func; +} + +template +std::unique_ptr instantiate_function( + arm_compute::ITensor *input, arm_compute::ITensor *output, + arm_compute::ITensor *mean, arm_compute::ITensor *var, + arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype &eps) { + auto op = cpp14::make_unique(); + op->configure( + dynamic_cast(input), dynamic_cast(output), + dynamic_cast(mean), dynamic_cast(var), + dynamic_cast(beta), dynamic_cast(gamma), eps); + + return std::move(op); +} + +template +std::unique_ptr instantiate( + arm_compute::ITensor *input, arm_compute::ITensor *output, + arm_compute::ITensor *mean, arm_compute::ITensor 
*var, + arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype eps) { + return instantiate_function( + input, output, mean, var, beta, gamma, eps); +} + +template +std::unique_ptr instantiate_op_func( + std::unique_ptr &input, std::unique_ptr &output, + std::unique_ptr &mean, std::unique_ptr &var, + std::unique_ptr &beta, std::unique_ptr &gamma, + Dtype eps, TargetHint hint) { + std::unique_ptr func; + func = instantiate( + input->tensor(), output->tensor(), mean->tensor(), var->tensor(), + beta->tensor(), gamma->tensor(), eps); + return func; +} + +template +bool instantiate_op_pooling( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func(input, output, info, + hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func(input, output, info, + hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func(input, output, info, + hint)); + } + return true; +} +template +bool instantiate_op_activation( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back(instantiate_op_func( + input, output, info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back(instantiate_op_func( + input, output, info, hint)); + return true; + } +#endif + { + func.push_back(instantiate_op_func( + input, output, info, hint)); + } + return true; +} +template +bool instantiate_op_lrn( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + 
func.push_back(instantiate_op_func( + input, output, info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back(instantiate_op_func( + input, output, info, hint)); + return true; + } +#endif + { + func.push_back(instantiate_op_func( + input, output, info, hint)); + } + return true; +} +template +bool instantiate_op_conv( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { + std::unique_ptr &weights = acl_op->weights(); + std::unique_ptr &biases = acl_op->biases(); + ConvolutionMethodHint &conv_method = acl_op->getConvMethod(); + bool has_biases = biases.get() ? true : false; + int &groups = acl_op->group(); + arm_compute::TensorShape input_shape = input->info().tensor_shape(); + arm_compute::TensorShape weights_shape = weights->info().tensor_shape(); + arm_compute::TensorShape biases_shape; + if (has_biases) { + biases_shape = biases->info().tensor_shape(); + } + arm_compute::TensorShape output_shape = output->info().tensor_shape(); + + if (groups == 1) { + if (conv_method == ConvolutionMethodHint::GEMM) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back(instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back(instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + return true; + } +#endif + { + func.push_back(instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + } + } else { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == 
TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func( + acl_op->input(), acl_op->weights(), acl_op->biases(), + acl_op->output(), info, hint)); + } + } + return true; + } + + // Calculate sub-tensor splits + const int input_split = input_shape.z() / groups; + const int output_split = output_shape.z() / groups; + const int weights_split = weights_shape[3] / groups; + const int biases_split = biases_shape.x() / groups; + + // Calculate sub-tensor shapes + input_shape.set(2, input_split); + output_shape.set(2, output_split); + weights_shape.set(3, weights_split); + biases_shape.set(0, biases_split); + + for (auto i = 0; i < groups; ++i) { + // Calculate sub-tensors starting coordinates + arm_compute::Coordinates input_coord(0, 0, input_split * i); + arm_compute::Coordinates output_coord(0, 0, output_split * i); + arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i); + arm_compute::Coordinates biases_coord(biases_split * i); + + // Create sub-tensors for input, output, weights and bias + acl_op->new_tensor(acl_op->sinput(i), acl_op->input(), input_shape, + input_coord); + acl_op->new_tensor(acl_op->soutput(i), acl_op->output(), output_shape, + output_coord); + acl_op->new_tensor(acl_op->sweights(i), acl_op->weights(), weights_shape, + weights_coord); + if (has_biases) { + acl_op->new_tensor(acl_op->sbiases(i), acl_op->biases(), biases_shape, + biases_coord); + } + + bool use_opencl = false; + if (conv_method == ConvolutionMethodHint::GEMM) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + use_opencl = true; + func.push_back( + instantiate_op_func( + acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), + acl_op->soutput(i), info, hint)); + } +#endif + if (!use_opencl) { + func.push_back( + instantiate_op_func( + acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), 
+ acl_op->soutput(i), info, hint)); + } + } else { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + use_opencl = true; + func.push_back( + instantiate_op_func( + acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), + acl_op->soutput(i), info, hint)); + } +#endif + if (!use_opencl) { + func.push_back( + instantiate_op_func( + acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), + acl_op->soutput(i), info, hint)); + } + } + } + return true; +} +template +bool instantiate_op_lc( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { + std::unique_ptr &weights = acl_op->weights(); + std::unique_ptr &biases = acl_op->biases(); +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func( + input, weights, biases, output, info, hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func( + input, weights, biases, output, info, hint)); + } + return true; +} +template +bool instantiate_op_fc( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, const OperatorInfo &info) { + std::unique_ptr &weights = acl_op->weights(); + std::unique_ptr &biases = acl_op->biases(); +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back(instantiate_op_func( + input, weights, biases, output, info, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back(instantiate_op_func( + input, weights, biases, output, info, hint)); + return true; + } +#endif + { + func.push_back(instantiate_op_func( + input, weights, biases, output, info, hint)); + } + return true; +} +template +bool instantiate_op_bn( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, Dtype eps) { + std::unique_ptr &mean = acl_op->mean(); + std::unique_ptr &var = acl_op->var(); + 
std::unique_ptr &beta = acl_op->beta(); + std::unique_ptr &gamma = acl_op->gamma(); +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func(input, output, mean, var, + beta, gamma, eps, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func(input, output, mean, var, + beta, gamma, eps, hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func(input, output, mean, var, + beta, gamma, eps, hint)); + } + return true; +} +inline bool instantiate_op_softmax( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, void *data) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func(input, output, hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func(input, output, hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func( + input, output, hint)); + } + return true; +} +inline bool instantiate_op_concat( + ACLOperator *acl_op, + std::vector> &func, + std::unique_ptr &input, std::unique_ptr &output, + TargetHint hint, int num) { +#ifdef USE_OPENCL + if (hint == TargetHint::OPENCL) { + func.push_back( + instantiate_op_func_lists(acl_op, output, num, + hint)); + return true; + } +#elif defined(USE_OPENGLES) + if (hint == TargetHint::OPENGLES) { + func.push_back( + instantiate_op_func_lists(acl_op, output, num, + hint)); + return true; + } +#endif + { + func.push_back( + instantiate_op_func_lists(acl_op, output, num, + hint)); + } + return true; +} +template +void *InputdataPtr(ACLOperator *op, + const std::vector &input_data, + Dtype type, int index = -1) { + if (index == -1) index = 0; + return (void *)(input_data[index]->mutable_data()); +} + +template +void acl_run(ACLOperator *op, + const std::vector &in_data, void *out_data, + Dtype 
type, bool multi_input_run = true) { + for (int i = 0; i < in_data.size(); ++i) { + op->tensor_mem(op->cinput(i), InputdataPtr(op, in_data, type, i)); + } + op->acl_run(NULL, out_data); +} +} // namespace acl +} // namespace operators +} // namespace paddle_mobile + +#ifdef USE_PROFILING +#define acl_configure(opname, acl_op, args...) \ + { \ + set_operator_property(acl::operate_type_##opname, #opname); \ + logtime_util log_time(ACL_CONFIG_INFO); \ + instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \ + acl_op->output(), acl_op->getTargetHint(), args); \ + } +#else +#define acl_configure(opname, acl_op, args...) \ + { \ + set_operator_property(acl::operate_type_##opname, #opname); \ + instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \ + acl_op->output(), acl_op->getTargetHint(), args); \ + } +#endif + +#define ACLOp_Ptr(a) dynamic_cast(a) + +#endif // USE_ACL + +#endif // ACL_OPERATOR_H_ diff --git a/src/operators/kernel/mali/acl_tensor.cc b/src/operators/kernel/mali/acl_tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..97a6add20a7ca1b9a6b4f9c9a7e6d1ba1f4e2e0a --- /dev/null +++ b/src/operators/kernel/mali/acl_tensor.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "acl_tensor.h" + +namespace paddle_mobile { +namespace operators { +namespace acl { + +#ifdef USE_ACL +template +std::unique_ptr initialise_tensor( + arm_compute::TensorInfo &info) { + auto tensor = cpp14::make_unique(); + tensor->allocator()->init(info); + return std::move(tensor); +} + +template +void tensor_allocate(arm_compute::ITensor &tensor) { + auto itensor = dynamic_cast(&tensor); + itensor->allocator()->allocate(); +} + +Tensor::Tensor(arm_compute::TensorInfo &info) noexcept + : _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr) {} + +Tensor::Tensor(Tensor &&src) noexcept + : _target(src._target), + _info(std::move(src._info)), + _tensor(std::move(src._tensor)) {} + +arm_compute::ITensor *Tensor::set_target(TargetHint target) { + switch (target) { +#ifdef USE_OPENCL + case TargetHint::OPENCL: + _tensor = initialise_tensor(_info); + break; +#elif defined(USE_OPENGLES) + case TargetHint::OPENGLES: + _tensor = initialise_tensor(_info); + break; +#endif + case TargetHint::NEON: + _tensor = initialise_tensor(_info); + break; + default: + break; + } + _target = target; + return _tensor.get(); +} + +void Tensor::allocate() { + switch (_target) { +#ifdef USE_OPENCL + case TargetHint::OPENCL: + tensor_allocate(*_tensor); + break; +#elif defined(USE_OPENGLES) + case TargetHint::OPENGLES: + tensor_allocate(*_tensor); + break; +#endif + case TargetHint::NEON: + tensor_allocate(*_tensor); + break; + default: + break; + } +} +void Tensor::map(bool blocking) { +#ifdef USE_OPENCL + if (_target == TargetHint::OPENCL) + dynamic_cast(tensor())->map(blocking); +#elif defined(USE_OPENGLES) + if (_target == TargetHint::OPENGLES) + dynamic_cast(tensor())->map(blocking); +#endif +} +void Tensor::unmap() { +#ifdef USE_OPENCL + if (_target == TargetHint::OPENCL) + dynamic_cast(tensor())->unmap(); +#elif defined(USE_OPENGLES) + if (_target == TargetHint::OPENGLES) + dynamic_cast(tensor())->unmap(); +#endif +} + +template +std::unique_ptr 
initialise_subtensor( + arm_compute::ITensor *parent, arm_compute::TensorShape shape, + arm_compute::Coordinates coords) { + auto ptensor = dynamic_cast(parent); + auto subtensor = cpp14::make_unique(ptensor, shape, coords); + return std::move(subtensor); +} +SubTensor::SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape, + arm_compute::Coordinates &coords) noexcept + : _target(TargetHint::DONT_CARE), + _tensor_shape(tensor_shape), + _coords(coords), + _parent(nullptr), + _subtensor(nullptr) { + _parent = parent->tensor(); + _target = parent->target(); + + instantiate_subtensor(); +} +arm_compute::ITensor *SubTensor::set_target(TargetHint target) { + return (target == _target) ? _subtensor.get() : nullptr; +} + +arm_compute::ITensor *SubTensor::tensor() { return _subtensor.get(); } + +const arm_compute::ITensor *SubTensor::tensor() const { + return _subtensor.get(); +} + +TargetHint SubTensor::target() const { return _target; } + +void SubTensor::allocate() { + // NOP for sub-tensors +} + +void SubTensor::instantiate_subtensor() { + switch (_target) { +#ifdef USE_OPENCL + case TargetHint::OPENCL: + _subtensor = initialise_subtensor( + _parent, _tensor_shape, _coords); + break; +#endif + default: + case TargetHint::NEON: + _subtensor = + initialise_subtensor( + _parent, _tensor_shape, _coords); + break; + } +} + +#endif + +} // namespace acl +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/mali/acl_tensor.h b/src/operators/kernel/mali/acl_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..1d4f59371e355ddd2e89a709eec0b5451c1c3502 --- /dev/null +++ b/src/operators/kernel/mali/acl_tensor.h @@ -0,0 +1,128 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef ACL_TENSOR_H_ +#define ACL_TENSOR_H_ + +#ifdef USE_ACL +#ifdef USE_OPENCL +#include "arm_compute/runtime/CL/CLSubTensor.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#elif defined(USE_OPENGLES) +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#endif +#include "arm_compute/runtime/SubTensor.h" +#include "arm_compute/runtime/Tensor.h" + +#include + +namespace paddle_mobile { +namespace operators { +namespace acl { +enum class TargetHint { + DONT_CARE, + OPENCL, + OPENGLES, + NEON, +}; + +enum class ConvolutionMethodHint { + GEMM, + DIRECT, +}; +namespace cpp14 { +template +struct _Unique_if { + typedef std::unique_ptr _Single_object; +}; + +template +struct _Unique_if { + typedef std::unique_ptr _Unknown_bound; +}; + +template +struct _Unique_if { + typedef void _Known_bound; +}; + +template +typename _Unique_if::_Single_object make_unique(Args &&... 
args) { + return std::unique_ptr(new T(std::forward(args)...)); +} + +template +typename _Unique_if::_Unknown_bound make_unique(size_t n) { + typedef typename std::remove_extent::type U; + return std::unique_ptr(new U[n]()); +} + +template +typename _Unique_if::_Known_bound make_unique(Args &&...); +} // namespace cpp14 + +class Tensor { + public: + explicit Tensor(arm_compute::TensorInfo &info) noexcept; + virtual ~Tensor() {} + Tensor(Tensor &&src) noexcept; + void set_info(arm_compute::TensorInfo &&info) { _info = info; } + arm_compute::ITensor *set_target(TargetHint target); + const arm_compute::TensorInfo &info() const { return _info; } + arm_compute::ITensor *tensor() { return _tensor.get(); } + void allocate(); + void init() {} + TargetHint target() const { return _target; } + virtual void map(bool blocking = true); + virtual void unmap(); + + private: + TargetHint _target; + arm_compute::TensorInfo _info; + std::unique_ptr _tensor; +}; + +class SubTensor { + public: + SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape, + arm_compute::Coordinates &coords) noexcept; + ~SubTensor() {} + arm_compute::ITensor *tensor(); + const arm_compute::ITensor *tensor() const; + TargetHint target() const; + void allocate(); + arm_compute::ITensor *set_target(TargetHint target); + + private: + /** Instantiates a sub-tensor */ + void instantiate_subtensor(); + + private: + /**< Target that this tensor is pinned on */ + TargetHint _target; + /**< SubTensor shape */ + arm_compute::TensorShape _tensor_shape; + /**< SubTensor Coordinates */ + arm_compute::Coordinates _coords; + /**< Parent tensor */ + arm_compute::ITensor *_parent; + /**< SubTensor */ + std::unique_ptr _subtensor; +}; + +} // namespace acl +} // namespace operators +} // namespace paddle_mobile +#endif +#endif // ACL_TENSOR_H_ diff --git a/src/operators/kernel/mali/batchnorm_kernel.cpp b/src/operators/kernel/mali/batchnorm_kernel.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..ff27afc71c42ed1c2b7e67eefbdadd86e92cc0fc --- /dev/null +++ b/src/operators/kernel/mali/batchnorm_kernel.cpp @@ -0,0 +1,166 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef BATCHNORM_OP + +#include "operators/kernel/batchnorm_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclBatchNormOp : public acl::ACLOperator { + public: + AclBatchNormOp() { + this->force_bypass_acl_path_ = bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN; + } + ~AclBatchNormOp() = default; + AclBatchNormOp(const AclBatchNormOp&) = delete; + AclBatchNormOp& operator=(const AclBatchNormOp&) = delete; + AclBatchNormOp(AclBatchNormOp&&) = delete; + AclBatchNormOp& operator=(AclBatchNormOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const BatchNormParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, + args.in_depth, args.batch); + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth, args.out_num); + + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + arm_compute::TensorShape mean_shape(args.in_depth); + arm_compute::TensorShape var_shape = 
mean_shape; + arm_compute::TensorShape beta_shape = mean_shape; + arm_compute::TensorShape gamma_shape = mean_shape; + + //[width, height, IFM] + new_tensor(input(), input_shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + new_tensor(mean(), mean_shape, args.mean_data); + new_tensor(var(), var_shape, args.var_data); + new_tensor(beta(), beta_shape, args.biases_data); + new_tensor(gamma(), gamma_shape, args.weight_data); + + acl_configure(bn, this, args.epsilon); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const BatchNormParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_) { + bypass_acl = true; + } + + return bypass_acl; + } + + private: + void AclParametersByContext(const BatchNormParam& param) { + const Tensor* in_x = param.InputX(); + Tensor* out = param.OutputY(); + const Tensor* scale = param.InputScale(); + const Tensor* bias = param.InputBias(); + const Tensor* saved_mean = param.InputMean(); + const Tensor* saved_variance = param.InputVariance(); + + const T* input_data = in_x->data(); + T* output_data = out->mutable_data(); + const T* weight_data = scale->data(); + const T* bias_data = bias->data(); + const T* mean_data = saved_mean->data(); + const T* var_data = saved_variance->data(); + + float epsilon = param.Epsilon(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + // args.weight_data = (void*)weight_data; + // args.biases_data = (void*)bias_data; + args.mean_data = (void*)mean_data; + args.var_data = (void*)var_data; + args.epsilon = epsilon; + + args.dim = in_x->dims().size(); + + args.batch = in_x->dims()[0]; + args.in_depth = in_x->dims()[1]; + args.in_rows = in_x->dims()[2]; + args.in_cols = in_x->dims()[3]; + + args.out_num = out->dims()[0]; + args.out_depth = 
out->dims()[1]; + args.out_rows = out->dims()[2]; + args.out_cols = out->dims()[3]; + + args.weight_data = (void*)weight_data; + args.biases_data = (void*)bias_data; + + // std::cout + // << "Out C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + } + acl::AclParameters args; +}; + +template <> +bool BatchNormKernel::Init(const BatchNormParam& param) const { + AclBatchNormOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclBatchNormOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + return true; +} + +template <> +void BatchNormKernel::Compute( + const BatchNormParam& param) const { + std::cout << "init acl" << std::endl; + AclBatchNormOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << std::endl; + return; + } + acl::AclParameters& args = acl_op->getargs(); + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + acl_op->RunAcl((void*)input_data, (void*)output_data); +} + +template class BatchNormKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/concat_kernel.cpp b/src/operators/kernel/mali/concat_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..08ee58d41577dfb5fd3a99755d66b5677b7b7ed2 --- /dev/null +++ b/src/operators/kernel/mali/concat_kernel.cpp @@ -0,0 +1,140 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CONCAT_OP + +#include "operators/kernel/concat_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclConcatOp : public acl::ACLOperator { + public: + AclConcatOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT; + } + ~AclConcatOp() = default; + AclConcatOp(const AclConcatOp&) = delete; + AclConcatOp& operator=(const AclConcatOp&) = delete; + AclConcatOp(AclConcatOp&&) = delete; + AclConcatOp& operator=(AclConcatOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + + void InitAclLayer(const ConcatParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + const std::vector* input_data = &args.in_tensor; + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth, args.batch); + + if (is_operator_init_done(output_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + T type; + + for (int i = 0; i < input_data->size(); i++) { + const T* idata = (*input_data)[i]->data(); + const T* pdata = (*input_data)[i]->data(); + int in_batch = (*input_data)[i]->dims()[0]; + int in_channels = (*input_data)[i]->dims()[1]; + int in_width = (*input_data)[i]->dims()[2]; + int in_height = (*input_data)[i]->dims()[3]; + arm_compute::TensorShape in_shape(in_width, in_height, in_channels); + + new_tensor(cinput(i), in_shape, + acl::InputdataPtr(this, args.in_tensor, type, i)); + } + + //[width, height, OFM] + 
new_tensor(output(), output_shape, args.output_data); + + acl_configure(concat, this, input_data->size()); + } + + void RunAcl(const std::vector& input, void* output) { + T type; + acl::acl_run(this, input, output, type); + } + bool Bypass_acl(const ConcatParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_ || !args.is_channel_concat) { + bypass_acl = true; + } + return bypass_acl; + } + + private: + void AclParametersByContext(const ConcatParam& param) { + auto inputs = param.Inputs(); + auto* output = param.Out(); + int64_t axis = param.Axis(); + + T* output_data = output->mutable_data(); + + args.is_channel_concat = (axis == 1); + args.in_tensor = inputs; + args.output_data = (void*)output_data; + + args.batch = output->dims()[0]; + args.out_depth = output->dims()[1]; + args.out_rows = output->dims()[2]; + args.out_cols = output->dims()[3]; + } + acl::AclParameters args; +}; + +template <> +bool ConcatKernel::Init(const ConcatParam& param) const { + AclConcatOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclConcatOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + return true; +} + +template <> +void ConcatKernel::Compute(const ConcatParam& param) const { + std::cout << "init acl" << std::endl; + AclConcatOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << std::endl; + return; + } + acl::AclParameters& args = acl_op->getargs(); + std::vector temp_data = args.in_tensor; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + acl_op->RunAcl(temp_data, (void*)output_data); +} + +template class ConcatKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/conv_add_kernel.cpp 
b/src/operators/kernel/mali/conv_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1d34910231c086673c58d8dba2c1e44992b5d593 --- /dev/null +++ b/src/operators/kernel/mali/conv_add_kernel.cpp @@ -0,0 +1,235 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADD_OP + +#include "operators/kernel/conv_add_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclConvAddOp : public acl::ACLOperator { + public: + AclConvAddOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV; + } + ~AclConvAddOp() = default; + AclConvAddOp(const AclConvAddOp&) = delete; + AclConvAddOp& operator=(const AclConvAddOp&) = delete; + AclConvAddOp(AclConvAddOp&&) = delete; + AclConvAddOp& operator=(AclConvAddOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const FusionConvAddParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, + args.in_depth, args.batch); + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth, args.out_num); + arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows, + args.in_depth / args.num_group, + args.out_depth); + 
arm_compute::TensorShape biases_shape(args.out_depth); + arm_compute::PadStrideInfo conv_info( + args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows, + arm_compute::DimensionRoundingType::FLOOR); + + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + check_direct_conv(); + //[kernel_x, kernel_y, IFM, OFM] + new_tensor(weights(), weights_shape, args.weight_data); + //[OFM] + if (args.biases_data) { + new_tensor(biases(), biases_shape, args.biases_data); + } + + group() = args.num_group; + + //[width, height, IFM] + new_tensor(input(), input_shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + acl_configure(conv, this, conv_info); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const FusionConvAddParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_ || args.num_group >= 5) { + bypass_acl = true; + } + if (args.dim > 2) { + bypass_acl = true; + } + if (args.dilated) { + bypass_acl = true; + } + return bypass_acl; + } + + private: + void check_direct_conv() { + bool use_direct_conv = false; + const char* pDirectConv; + pDirectConv = getenv("DIRECTCONV"); + if (pDirectConv) { + unsigned int bdirectconv; + sscanf(pDirectConv, "%i", &bdirectconv); + if (bdirectconv != use_direct_conv) { + use_direct_conv = bdirectconv; + printf("DIRECTCONV<%s>\n", pDirectConv); + printf("DIRECTCONV: %x\n", use_direct_conv); + } + } + int pad_data[2], kernel[2]; + pad_data[1] = args.pad_rows; + pad_data[0] = args.pad_cols; + kernel[1] = args.filter_rows; + kernel[0] = args.filter_cols; + if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 && + pad_data[0] == 0 && pad_data[1] == 0) || + (kernel[0] == 3 && kernel[1] == 3 && + pad_data[0] <= 1 && pad_data[1] <= 1))) { + 
setConvMethod(); // NEDirectConvolutionLayer only for 1x1 and 3x3 + } + } + + void AclParametersByContext(const FusionConvAddParam& param) { + const Tensor* input = param.Input(); + Tensor filter = *param.Filter(); + Tensor* output = param.Output(); + Tensor* bias; + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(); + const T* weight_data = filter.data(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + args.weight_data = (void*)weight_data; + args.biases_data = nullptr; + + try { + bias = param.Bias(); + } catch (const std::exception& e) { + } + if (bias) { + const T* biases_data = bias->data(); + args.biases_data = (void*)biases_data; + } + + args.num_group = groups; + + args.dilation_rows = dilations[0]; + args.dilation_cols = dilations[1]; + if (dilations[0] != 1 || dilations[1] != 1) { + args.dilated = true; + } + + // NCHW + // std::cout << "In dims: " << (input->dims()).size() << std::endl; + args.batch = input->dims()[0]; + args.in_depth = input->dims()[1]; + args.in_rows = input->dims()[2]; + args.in_cols = input->dims()[3]; + // std::cout <<"In N: " << args.batch << " C: " << args.in_depth + // << " H: " << args.in_rows << " W: " << args.in_cols << "\n"; + // NCHW + // std::cout << "Out dims: " << (output->dims()).size() << std::endl; + args.out_num = output->dims()[0]; + args.out_depth = output->dims()[1]; + args.out_rows = output->dims()[2]; + args.out_cols = output->dims()[3]; + // std::cout <<"Out N: " << static_cast(output->dims()[0]) + // << " C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + // MCHW = OIHW + args.filter_rows = filter.dims()[2]; + args.filter_cols = filter.dims()[3]; + // std::cout <<"Filter O: " << static_cast(filter.dims()[0]) + // << " I: " << 
static_cast(filter.dims()[1]) + // << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n"; + + // strides(h_stride, w_stride) + args.stride_rows = strides[0]; + args.stride_cols = strides[1]; + // std::cout <<"Stride H: " << args.stride_rows << " W: " << + // args.stride_cols << "\n"; + + // paddings(h_pad, w_pad) + args.pad_rows = paddings[0]; + args.pad_cols = paddings[1]; + // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols << + // "\n"; + } + acl::AclParameters args; +}; + +template <> +bool ConvAddKernel::Init( + const FusionConvAddParam& param) const { + AclConvAddOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclConvAddOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + return true; +} + +template <> +void ConvAddKernel::Compute( + const FusionConvAddParam& param) const { + std::cout << "init acl" << std::endl; + AclConvAddOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << std::endl; + return; + } + acl::AclParameters& args = acl_op->getargs(); + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + acl_op->RunAcl((void*)input_data, (void*)output_data); +} + +template class ConvAddKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/conv_kernel.cpp b/src/operators/kernel/mali/conv_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f3212cae970b2a554412f59cf48a6e5156463969 --- /dev/null +++ b/src/operators/kernel/mali/conv_kernel.cpp @@ -0,0 +1,232 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CONV_OP + +#include "operators/kernel/conv_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclConvOp : public acl::ACLOperator { + public: + AclConvOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV; + } + ~AclConvOp() = default; + AclConvOp(const AclConvOp&) = delete; + AclConvOp& operator=(const AclConvOp&) = delete; + AclConvOp(AclConvOp&&) = delete; + AclConvOp& operator=(AclConvOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const ConvParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, + args.in_depth, args.batch); + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth, args.out_num); + arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows, + args.in_depth / args.num_group, + args.out_depth); + // arm_compute::TensorShape biases_shape(args.out_depth); + arm_compute::PadStrideInfo conv_info( + args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows, + arm_compute::DimensionRoundingType::FLOOR); + + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + check_direct_conv(); + //[kernel_x, kernel_y, IFM, OFM] + new_tensor(weights(), weights_shape, args.weight_data); + //[OFM] + // if (args.biases_data) { + // 
new_tensor(biases(),biases_shape,args.biases_data); + //} + + group() = args.num_group; + + //[width, height, IFM] + new_tensor(input(), input_shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + acl_configure(conv, this, conv_info); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const ConvParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_ || args.num_group >= 5) { + bypass_acl = true; + } + if (args.dim > 2) { + bypass_acl = true; + } + if (args.dilated) { + bypass_acl = true; + } + return bypass_acl; + } + + private: + void check_direct_conv() { + bool use_direct_conv = false; + const char* pDirectConv; + pDirectConv = getenv("DIRECTCONV"); + if (pDirectConv) { + unsigned int bdirectconv; + sscanf(pDirectConv, "%i", &bdirectconv); + if (bdirectconv != use_direct_conv) { + use_direct_conv = bdirectconv; + printf("DIRECTCONV<%s>\n", pDirectConv); + printf("DIRECTCONV: %x\n", use_direct_conv); + } + } + int pad_data[2], kernel[2]; + pad_data[1] = args.pad_rows; + pad_data[0] = args.pad_cols; + kernel[1] = args.filter_rows; + kernel[0] = args.filter_cols; + if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 && + pad_data[0] == 0 && pad_data[1] == 0) || + (kernel[0] == 3 && kernel[1] == 3 && + pad_data[0] <= 1 && pad_data[1] <= 1))) { + setConvMethod(); // NEDirectConvolutionLayer only for 1x1 and 3x3 + } + } + + void AclParametersByContext(const ConvParam& param) { + const Tensor* input = param.Input(); + Tensor filter = *param.Filter(); + Tensor* output = param.Output(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(); + const 
T* weight_data = filter.data(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + args.weight_data = (void*)weight_data; + args.biases_data = nullptr; + + // try { + // bias = context.Input("Bias"); + // } catch (const std::exception& e) { + // } + // if (bias) { + // const T* biases_data = bias->data(); + // args.biases_data = (void*)biases_data; + // } + + args.num_group = groups; + + args.dilation_rows = dilations[0]; + args.dilation_cols = dilations[1]; + if (dilations[0] != 1 || dilations[1] != 1) { + args.dilated = true; + } + + // NCHW + // std::cout << "In dims: " << (input->dims()).size() << std::endl; + args.batch = input->dims()[0]; + args.in_depth = input->dims()[1]; + args.in_rows = input->dims()[2]; + args.in_cols = input->dims()[3]; + std::cout << "In N: " << args.batch << " C: " << args.in_depth + << " H: " << args.in_rows << " W: " << args.in_cols << "\n"; + // NCHW + // std::cout << "Out dims: " << (output->dims()).size() << std::endl; + args.out_num = output->dims()[0]; + args.out_depth = output->dims()[1]; + args.out_rows = output->dims()[2]; + args.out_cols = output->dims()[3]; + // std::cout <<"Out N: " << static_cast(output->dims()[0]) + // << " C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + // MCHW = OIHW + args.filter_rows = filter.dims()[2]; + args.filter_cols = filter.dims()[3]; + // std::cout <<"Filter O: " << static_cast(filter.dims()[0]) + // << " I: " << static_cast(filter.dims()[1]) + // << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n"; + + // strides(h_stride, w_stride) + args.stride_rows = strides[0]; + args.stride_cols = strides[1]; + // std::cout <<"Stride H: " << args.stride_rows << " W: " << + // args.stride_cols << "\n"; + + // paddings(h_pad, w_pad) + args.pad_rows = paddings[0]; + args.pad_cols = paddings[1]; + // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols << + // "\n"; + } + acl::AclParameters args; +}; 
+ +template <> +bool ConvKernel::Init(const ConvParam& param) const { + AclConvOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclConvOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + return true; +} + +template <> +void ConvKernel::Compute(const ConvParam& param) const { + std::cout << "init acl" << std::endl; + AclConvOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << std::endl; + return; + } + acl::AclParameters& args = acl_op->getargs(); + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + acl_op->RunAcl((void*)input_data, (void*)output_data); +} + +template class ConvKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/elementwise_add_kernel.cpp b/src/operators/kernel/mali/elementwise_add_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..43d33b3fd2b2cc747ae8c943437e675c84a4cdc6 --- /dev/null +++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ELEMENTWISEADD_OP + +#pragma once + +#include "operators/kernel/elementwise_add_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template +struct AddFunctor { + inline T operator()(T a, T b) const { return a + b; } +}; + +template <> +bool ElementwiseAddKernel::Init( + const ElementwiseAddParam ¶) const { + return true; +} + +template <> +void ElementwiseAddKernel::Compute( + const ElementwiseAddParam ¶m) const { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + Tensor *Out = param.Out(); + Out->mutable_data(); + int axis = param.Axis(); + ElementwiseComputeEx, float>(input_x, input_y, axis, + AddFunctor(), Out); +} + +template class ElementwiseAddKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..64ab07a9b955893c01e2684cba0a14fa25d032ed --- /dev/null +++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_FC_OP + +#pragma once + +#include "operators/kernel/fusion_fc_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FusionFcKernel::Init(const FusionFcParam ¶) const { + return true; +} + +template <> +void FusionFcKernel::Compute( + const FusionFcParam ¶m) const { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + const Tensor *input_z = param.InputZ(); + auto *input_z_data = input_z->data(); + int axis = param.Axis(); + Tensor *out = param.Out(); + auto *out_data = out->mutable_data(); + const Tensor x_matrix = + input_x->dims().size() > 2 + ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) + : *input_x; + const Tensor y_matrix = + input_y->dims().size() > 2 + ? framework::ReshapeToMatrix(*input_y, param.YNumColDims()) + : *input_y; + auto out_dim = out->dims(); + if (out_dim.size() != 2) { + out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); + PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "inpu_z size must be 1"); + PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0], + " out_dim.size must be 2."); + axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis); + PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. 
") + + int64_t classes = input_z->numel(); + for (int i = 0; i < out_dim[0]; i++) { + memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes); + } + + for (int i = 0; i < out->numel(); i++) { + DLOG << out_data[i]; + } + math::matmul(x_matrix, false, y_matrix, false, static_cast(1), + out, static_cast(1)); + PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); + // if (out_dim.size() != 2) { + // out->Resize(out_dim); + // } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/lrn_kernel.cpp b/src/operators/kernel/mali/lrn_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c063ec8783382ccef79086368df8a97320010c23 --- /dev/null +++ b/src/operators/kernel/mali/lrn_kernel.cpp @@ -0,0 +1,148 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef LRN_OP + +#pragma once + +#include "operators/kernel/lrn_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclLrnOp : public acl::ACLOperator { + public: + AclLrnOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN; + } + ~AclLrnOp() = default; + AclLrnOp(const AclLrnOp&) = delete; + AclLrnOp& operator=(const AclLrnOp&) = delete; + AclLrnOp(AclLrnOp&&) = delete; + AclLrnOp& operator=(AclLrnOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const LrnParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape shape(args.in_cols, args.in_rows, args.in_depth); + + if (is_operator_init_done(shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + arm_compute::NormalizationLayerInfo norm_info( + arm_compute::NormType::CROSS_MAP, args.nsize, args.alpha, args.beta, + args.knorm); + + //[width, height, IFM] + new_tensor(input(), shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), shape, args.output_data); + + acl_configure(lrn, this, norm_info); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const LrnParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_) { + bypass_acl = true; + } + + return bypass_acl; + } + + private: + void AclParametersByContext(const LrnParam& param) { + const Tensor* in_x = param.InputX(); + Tensor* out = param.Out(); + + int n = param.N(); + T alpha = param.Alpha(); + T beta = param.Beta(); + T k = param.K(); + + const T* input_data = in_x->data(); + T* output_data = out->mutable_data(); + + args.input_data = (void*)input_data; + args.output_data = 
(void*)output_data; + + args.nsize = n; + args.alpha = alpha; + args.beta = beta; + args.knorm = k; + + // NCHW + args.batch = in_x->dims()[0]; + args.in_depth = in_x->dims()[1]; + args.in_rows = in_x->dims()[2]; + args.in_cols = in_x->dims()[3]; + // std::cout + // << "Out C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + } + acl::AclParameters args; +}; + +template <> +bool LrnKernel::Init(const LrnParam& param) const { + AclLrnOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclLrnOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + return true; +} + +template <> +void LrnKernel::Compute(const LrnParam& param) const { + std::cout << "init acl" << std::endl; + AclLrnOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << std::endl; + return; + } + acl::AclParameters& args = acl_op->getargs(); + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + for (int n = 0; n < args.batch; ++n) { + acl_op->RunAcl((void*)input_data, (void*)output_data); + input_data += args.in_depth * args.in_cols * args.in_rows; + output_data += args.in_depth * args.in_cols * args.in_rows; + } +} + +template class LrnKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/mul_kernel.cpp b/src/operators/kernel/mali/mul_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f2a84deaa1de999e94e335de6d4f40981bded5a8 --- /dev/null +++ b/src/operators/kernel/mali/mul_kernel.cpp @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef MUL_OP + +#pragma once + +#include "operators/kernel/mul_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool MulKernel::Init(const MulParam ¶) const { + return true; +} + +template <> +void MulKernel::Compute(const MulParam ¶m) const { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + Tensor *out = param.Out(); + out->mutable_data(); + const Tensor x_matrix = + input_x->dims().size() > 2 + ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) + : *input_x; + const Tensor y_matrix = + input_y->dims().size() > 2 + ? framework::ReshapeToMatrix(*input_y, param.YNumColDims()) + : *input_y; + auto out_dim = out->dims(); + if (out_dim.size() != 2) { + out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + math::matmul(x_matrix, false, y_matrix, false, static_cast(1), + out, static_cast(0)); + if (out_dim.size() != 2) { + out->Resize(out_dim); + } +} + +template class MulKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/pool_kernel.cpp b/src/operators/kernel/mali/pool_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9de90deebca05ef50cf94fa958f37bbcf1a08c4b --- /dev/null +++ b/src/operators/kernel/mali/pool_kernel.cpp @@ -0,0 +1,220 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef POOL_OP + +#pragma once + +#include "operators/kernel/pool_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclPoolOp : public acl::ACLOperator { + public: + AclPoolOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING; + } + ~AclPoolOp() = default; + AclPoolOp(const AclPoolOp&) = delete; + AclPoolOp& operator=(const AclPoolOp&) = delete; + AclPoolOp(AclPoolOp&&) = delete; + AclPoolOp& operator=(AclPoolOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const PoolParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape input_shape(args.in_cols, args.in_rows, + args.in_depth); + arm_compute::TensorShape output_shape(args.out_cols, args.out_rows, + args.out_depth); + // arm_compute::TensorShape weights_shape( + // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth); + // arm_compute::TensorShape biases_shape(args.out_depth); + arm_compute::PoolingLayerInfo pool_info; + + if (args.pool_type == "max") { + pool_info = arm_compute::PoolingLayerInfo( + arm_compute::PoolingType::MAX, args.filter_rows, + arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows, + args.pad_cols, args.pad_rows, + arm_compute::DimensionRoundingType::CEIL)); + } else { + pool_info = arm_compute::PoolingLayerInfo( + arm_compute::PoolingType::AVG, args.filter_rows, + arm_compute::PadStrideInfo(args.stride_cols, 
args.stride_rows, + args.pad_cols, args.pad_rows, + arm_compute::DimensionRoundingType::CEIL)); + } + + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + //[width, height, IFM] + new_tensor(input(), input_shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + acl_configure(pooling, this, pool_info); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const PoolParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_) { + bypass_acl = true; + } + if (args.pool_type != "max" && args.pool_type != "avg") { + bypass_acl = true; + } + if (args.filter_rows != args.filter_cols) { + bypass_acl = true; + } + // if (args.filter_rows!=2 && args.filter_rows!=3) { + // bypass_acl = true; + // } + return bypass_acl; + } + + private: + void AclParametersByContext(const PoolParam& param) { + const Tensor* in_x = param.Input(); + Tensor* out = param.Output(); + std::string pooling_type = param.PoolingType(); + + std::vector ksize = param.Ksize(); + + std::vector strides = param.Strides(); + + std::vector paddings = param.Paddings(); + + bool is_global_pooling = param.isGlobalPooling(); + + const T* input_data = in_x->data(); + T* output_data = out->mutable_data(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + + args.is_global_pool = is_global_pooling; + args.pool_type = pooling_type; + + args.filter_rows = ksize[0]; + args.filter_cols = ksize[1]; + args.dim = ksize.size(); + + // NCHW + args.batch = in_x->dims()[0]; + args.in_depth = in_x->dims()[1]; + args.in_rows = in_x->dims()[2]; + args.in_cols = in_x->dims()[3]; + // std::cout <<"In N: " << args.batch << " C: " << args.in_depth + // << " H: " << args.in_rows << " W: " << args.in_cols << "\n"; + // 
NCHW + // std::cout <<"Out N: " << static_cast(output->dims()[0]) + // << " C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + // MCHW = OIHW + // std::cout <<"Filter O: " << static_cast(filter->dims()[0]) + // << " I: " << static_cast(filter->dims()[1]) + // << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n"; + + // strides(h_stride, w_stride) + args.stride_rows = strides[0]; + args.stride_cols = strides[1]; + // std::cout <<"PoolingType: " << args.pool_type << "\n"; + // std::cout <<"Stride H: " << args.stride_rows << " W: " << + // args.stride_cols << "\n"; + + // paddings(h_pad, w_pad) + args.pad_rows = paddings[0]; + args.pad_cols = paddings[1]; + // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols << + // "\n"; + + args.out_depth = args.in_depth; + // args.out_rows = out->dims()[2]; + // args.out_cols = out->dims()[3]; + args.out_rows = static_cast(ceil(static_cast(args.in_rows + + 2 * args.pad_rows - + args.filter_rows) / + args.stride_rows)) + + 1; + args.out_cols = static_cast(ceil(static_cast(args.in_cols + + 2 * args.pad_cols - + args.filter_cols) / + args.stride_cols)) + + 1; + + if (is_global_pooling) { + args.filter_rows = args.in_rows; + args.filter_cols = args.in_cols; + args.pad_rows = 0; + args.pad_cols = 0; + } + } + acl::AclParameters args; +}; + +template <> +bool PoolKernel::Init(const PoolParam& param) const { + AclPoolOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclPoolOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + return true; +} + +template <> +void PoolKernel::Compute(const PoolParam& param) const { + std::cout << "init acl" << std::endl; + AclPoolOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << std::endl; + return; + } + acl::AclParameters& args = acl_op->getargs(); + const float* 
input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + for (int n = 0; n < args.batch; ++n) { + acl_op->RunAcl((void*)input_data, (void*)output_data); + input_data += args.in_depth * args.in_cols * args.in_rows; + output_data += args.in_depth * args.out_cols * args.out_rows; + } +} + +template class PoolKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/relu_kernel.cpp b/src/operators/kernel/mali/relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3deebc9d2f1a9f652813362f4947f744f0541482 --- /dev/null +++ b/src/operators/kernel/mali/relu_kernel.cpp @@ -0,0 +1,136 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef RELU_OP + +#pragma once + +#include "operators/kernel/relu_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclReluOp : public acl::ACLOperator { + public: + AclReluOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU; + } + ~AclReluOp() = default; + AclReluOp(const AclReluOp&) = delete; + AclReluOp& operator=(const AclReluOp&) = delete; + AclReluOp(AclReluOp&&) = delete; + AclReluOp& operator=(AclReluOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const ReluParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape input_shape(args.in_cols * args.in_rows * + args.in_depth * args.batch); + arm_compute::TensorShape output_shape(args.in_cols * args.in_rows * + args.in_depth * args.out_num); + // arm_compute::TensorShape weights_shape( + // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth); + // arm_compute::TensorShape biases_shape(args.out_depth); + arm_compute::ActivationLayerInfo::ActivationFunction type; + type = arm_compute::ActivationLayerInfo::ActivationFunction::RELU; + + arm_compute::ActivationLayerInfo act_info(type); + + if (is_operator_init_done(input_shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + //[width, height, IFM] + new_tensor(input(), input_shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), output_shape, args.output_data); + + acl_configure(activation, this, act_info); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const ReluParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_) { + bypass_acl = true; + } + return bypass_acl; 
+ } + + private: + void AclParametersByContext(const ReluParam& param) { + const auto* input_x = param.InputX(); + auto* out = param.Out(); + + const T* input_data = input_x->data(); + T* output_data = out->mutable_data(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + + args.batch = input_x->dims()[0]; + args.in_depth = input_x->dims()[1]; + args.in_rows = input_x->dims()[2]; + args.in_cols = input_x->dims()[3]; + args.out_num = out->dims()[0]; + } + acl::AclParameters args; +}; + +template <> +bool ReluKernel::Init(const ReluParam& param) const { + AclReluOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclReluOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + return true; +} + +template <> +void ReluKernel::Compute(const ReluParam& param) const { + std::cout << "init acl" << std::endl; + AclReluOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << std::endl; + return; + } + acl::AclParameters& args = acl_op->getargs(); + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + acl_op->RunAcl((void*)input_data, (void*)output_data); +} + +template class ReluKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mali/reshape_kernel.cpp b/src/operators/kernel/mali/reshape_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d7521454d46dfc82064930971d2b996b542af54a --- /dev/null +++ b/src/operators/kernel/mali/reshape_kernel.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RESHAPE_OP + +#pragma once + +#include "operators/kernel/reshape_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ReshapeKernel::Init(const ReshapeParam ¶) const { + return true; +} + +template <> +void ReshapeKernel::Compute(const ReshapeParam ¶m) const { + const auto *input_x = param.InputX(); + const auto &input_x_dims = input_x->dims(); + auto *out = param.Out(); + framework::DDim out_dims = out->dims(); + const auto *input_shape = param.InputShape(); + + if (input_shape) { + auto *shape_data = input_shape->data(); + framework::Tensor cpu_shape_tensor; + auto shape = + std::vector(shape_data, shape_data + input_shape->numel()); + out_dims = ValidateShape(shape, input_x->dims()); + } + + bool inplace = param.Inplace(); + out->Resize(out_dims); + if (!inplace) { + out->mutable_data(); + framework::TensorCopy(*input_x, out); + out->Resize(out_dims); + } else { + out->ShareDataWith(*input_x); + out->Resize(out_dims); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/mali/softmax_kernel.cpp b/src/operators/kernel/mali/softmax_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..36edb3724600ada43606c23b1989615183ff21e8 --- /dev/null +++ b/src/operators/kernel/mali/softmax_kernel.cpp @@ -0,0 +1,137 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SOFTMAX_OP + +#pragma once + +#include "operators/kernel/softmax_kernel.h" +#ifdef PADDLE_MOBILE_MALI_GPU +#include "acl_operator.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class AclSoftmaxOp : public acl::ACLOperator { + public: + AclSoftmaxOp() { + this->force_bypass_acl_path_ = + bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX; + } + ~AclSoftmaxOp() = default; + AclSoftmaxOp(const AclSoftmaxOp&) = delete; + AclSoftmaxOp& operator=(const AclSoftmaxOp&) = delete; + AclSoftmaxOp(AclSoftmaxOp&&) = delete; + AclSoftmaxOp& operator=(AclSoftmaxOp&&) = delete; + + acl::AclParameters& getargs() { return args; } + void InitAclLayer(const SoftmaxParam& param) { + setTargetHint(acl::TargetHint::OPENCL); + arm_compute::TensorShape shape(args.in_depth, args.batch); + + if (is_operator_init_done(shape)) return; + set_operator_init_done(); + this->force_bypass_acl_path_ = false; + + //[width, height, IFM] + new_tensor(input(), shape, args.input_data); + //[width, height, OFM] + new_tensor(output(), shape, args.output_data); + + acl_configure(softmax, this, NULL); + } + + void RunAcl(void* input, void* output) { + acl::ACLOperator::acl_run(input, output); + } + bool Bypass_acl(const SoftmaxParam& param) { + bool bypass_acl = false; + AclParametersByContext(param); + // for performance, more groups impact GPU performance + if (this->force_bypass_acl_path_) { + bypass_acl = true; + } + + return bypass_acl; + } + + private: + void AclParametersByContext(const SoftmaxParam& param) { + 
const framework::Tensor* in_x = param.InputX(); + framework::Tensor* out = param.Out(); + auto x_dims = in_x->dims(); + out->Resize(x_dims); + + const T* input_data = in_x->data(); + T* output_data = out->data(); + + args.input_data = (void*)input_data; + args.output_data = (void*)output_data; + + // NCHW + args.batch = in_x->dims()[0]; + args.in_depth = in_x->dims()[1]; + + args.out_num = out->dims()[0]; + + // std::cout + // << "Out C: " << args.out_depth + // << " H: " << args.out_rows << " W: " << args.out_cols << "\n"; + } + acl::AclParameters args; +}; + +template <> +bool SoftmaxKernel::Init(const SoftmaxParam& param) const { + AclSoftmaxOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + acl_op = new AclSoftmaxOp(); + this->SetAclOp((void*)acl_op, (void*)this); + } + return true; +} + +template <> +void SoftmaxKernel::Compute(const SoftmaxParam& param) const { + std::cout << "init acl" << std::endl; + AclSoftmaxOp* acl_op = + reinterpret_cast*>(this->GetAclOp()); + if (acl_op == nullptr) { + return; + } + if (acl_op->Bypass_acl(param)) { + std::cout << "init acl failed" << std::endl; + return; + } + acl::AclParameters& args = acl_op->getargs(); + const float* input_data = (const float*)args.input_data; + const float* output_data = (const float*)args.output_data; + acl_op->InitAclLayer(param); + for (int n = 0; n < args.out_num; ++n) { + acl_op->RunAcl((void*)input_data, (void*)output_data); + input_data += args.in_depth; + output_data += args.in_depth; + } +} + +template class SoftmaxKernel; +} // namespace operators +} // namespace paddle_mobile + +#endif +#endif diff --git a/src/operators/kernel/mul_kernel.h b/src/operators/kernel/mul_kernel.h index 809c9b80b5ba0d610827d8fa5ff00d5ad7183ab9..81db202c2d26fae9abb971a2cafe32f9b20dfe22 100644 --- a/src/operators/kernel/mul_kernel.h +++ b/src/operators/kernel/mul_kernel.h @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#ifdef MUL_OP + +#pragma once + #include "framework/operator.h" #include "operators/math/math_function.h" #include "operators/op_param.h" -#pragma once; namespace paddle_mobile { namespace operators { @@ -26,6 +29,9 @@ template class MulKernel : public framework::OpKernelBase { public: void Compute(const MulParam ¶m) const; + bool Init(const MulParam ¶) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/multiclass_nms_kernel.h b/src/operators/kernel/multiclass_nms_kernel.h index 4453197e5c866398bc6f8807ec921ff5638fbb71..ca86604f2c6e550c219e54b6533c1500fb2912c4 100644 --- a/src/operators/kernel/multiclass_nms_kernel.h +++ b/src/operators/kernel/multiclass_nms_kernel.h @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef MULTICLASSNMS_OP + +#pragma once + #include "framework/operator.h" -#include "operators/op_param.h" -#pragma once; +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -25,6 +28,9 @@ class MultiClassNMSKernel : public framework::OpKernelBase { public: void Compute(const MultiClassNMSParam& param) const; + bool Init(const MultiClassNMSParam& para) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/pool_kernel.h b/src/operators/kernel/pool_kernel.h index 5cb185dea6eaed0bbb50c5fd5d3450d4e92f18e7..3285f56cc01fad554bff7e6a4d25769f8ef56d24 100644 --- a/src/operators/kernel/pool_kernel.h +++ b/src/operators/kernel/pool_kernel.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef POOL_OP + #pragma once #include "framework/operator.h" @@ -26,6 +28,9 @@ template class PoolKernel : public OpKernelBase { public: void Compute(const PoolParam ¶m) const override; + bool Init(const PoolParam ¶) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/prior_box_kernel.h b/src/operators/kernel/prior_box_kernel.h index c3cd399bfe9fad86b45c33d947dbbb3e4f99bade..79fc630b8efb50dec1ff336d2b66d5094eaeb5a5 100644 --- a/src/operators/kernel/prior_box_kernel.h +++ b/src/operators/kernel/prior_box_kernel.h @@ -12,14 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#ifdef PRIORBOX_OP + +#pragma once +#include +#include +#include #include "framework/operator.h" #include "operators/math/transform.h" #include "operators/op_param.h" -#pragma once; - namespace paddle_mobile { namespace operators { @@ -52,6 +55,9 @@ class PriorBoxKernel : public framework::OpKernelBase { public: void Compute(const PriorBoxParam& param) const; + bool Init(const PriorBoxParam& para) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/relu_kernel.h b/src/operators/kernel/relu_kernel.h index 83b4548f3e5421657ae6f79bd226e16e1aba7ffb..2155c33811f553435e4a89b5b23533e2bd42db5d 100644 --- a/src/operators/kernel/relu_kernel.h +++ b/src/operators/kernel/relu_kernel.h @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef RELU_OP + +#pragma once + #include "framework/operator.h" -#include "operators/op_param.h" -#pragma once; +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -24,6 +27,9 @@ template class ReluKernel : public framework::OpKernelBase { public: void Compute(const ReluParam& param) const; + bool Init(const ReluParam& para) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/reshape_kernel.h b/src/operators/kernel/reshape_kernel.h index 7d5dcdf71de232b1c72180231731fcf76483b9e4..364f5b0902c2661017f2e72520849836f64dd0bb 100644 --- a/src/operators/kernel/reshape_kernel.h +++ b/src/operators/kernel/reshape_kernel.h @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#ifdef RESHAPE_OP + +#pragma once +#include #include "framework/operator.h" -#include "operators/op_param.h" -#pragma once; +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -69,6 +71,9 @@ template class ReshapeKernel : public framework::OpKernelBase { public: void Compute(const ReshapeParam& param) const; + bool Init(const ReshapeParam& para) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/sigmoid_kernel.h b/src/operators/kernel/sigmoid_kernel.h index 8f5c787f3ff009ed1e334e61657d00454d6e4c0b..e9eaae5ad867c6880db7346f9632ff37a92aaf66 100644 --- a/src/operators/kernel/sigmoid_kernel.h +++ b/src/operators/kernel/sigmoid_kernel.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef SIGMOID_OP + #pragma once #include "framework/operator.h" @@ -24,6 +26,9 @@ template class SigmoidKernel : public OpKernelBase { public: void Compute(const SigmoidParam& param) const override; + bool Init(const SigmoidParam& para) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/softmax_kernel.h b/src/operators/kernel/softmax_kernel.h index 5bdae46d288adef3c07c6b2735bdfe5e6ec0c1c3..a7a7666e32ef1923a47d71d94c93e813a23028c5 100644 --- a/src/operators/kernel/softmax_kernel.h +++ b/src/operators/kernel/softmax_kernel.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef SOFTMAX_OP + #pragma once #include "framework/operator.h" @@ -27,6 +29,9 @@ template class SoftmaxKernel : public OpKernelBase { public: void Compute(const SoftmaxParam ¶m) const override; + bool Init(const SoftmaxParam ¶) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/transpose_kernel.h b/src/operators/kernel/transpose_kernel.h index aa7d8902097df441eaa28ea8a74b5e9234f7daea..6526d97df9863392f783841a784cb5df4e45f218 100644 --- a/src/operators/kernel/transpose_kernel.h +++ b/src/operators/kernel/transpose_kernel.h @@ -12,13 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef TRANSPOSE_OP + +#pragma once + #include #include "framework/operator.h" #include "operators/op_param.h" -#pragma once; - namespace paddle_mobile { namespace operators { @@ -27,6 +29,9 @@ class TransposeKernel : public framework::OpKernelBase { public: void Compute(const TransposeParam& param) const; + bool Init(const TransposeParam& para) const; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/lrn_op.cpp b/src/operators/lrn_op.cpp index cc89a034b4c43bcee7778cad0c16c614e74bb5fb..dc43cb022ac9d7435654cbc565c81c57ba80b350 100644 --- a/src/operators/lrn_op.cpp +++ b/src/operators/lrn_op.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef LRN_OP + #include "lrn_op.h" namespace paddle_mobile { @@ -19,13 +21,23 @@ namespace operators { template void LrnOp::InferShape() const { - auto x_dims = param_.InputX()->dims(); - param_.Out()->Resize(x_dims); + auto x_dims = this->param_.InputX()->dims(); + this->param_.Out()->Resize(x_dims); } template class LrnOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(lrn); -REGISTER_OPERATOR(lrn, ops::LrnOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(lrn); +REGISTER_OPERATOR_CPU(lrn, ops::LrnOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(lrn); +REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/lrn_op.h b/src/operators/lrn_op.h index e5d98e1bb103307e1fae9c2460be19fe9d0f01a0..d67b9f6be741581918b09d19a8a8b26c28ceed1c 100644 --- a/src/operators/lrn_op.h +++ b/src/operators/lrn_op.h @@ -11,6 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ + +#ifdef LRN_OP + #pragma once #include @@ -22,26 +25,25 @@ namespace paddle_mobile { namespace operators { using std::string; template -class LrnOp : public framework::OperatorWithKernel { +class LrnOp : public framework::OperatorWithKernel< + DeviceType, LrnParam, operators::LrnKernel> { public: LrnOp(const string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap attrs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} - void RunImpl() const { - operators::LrnKernel kernel; - kernel.Compute(param_); - } - - using framework::OperatorWithKernel::OperatorWithKernel; + using framework::OperatorWithKernel< + DeviceType, LrnParam, + operators::LrnKernel>::OperatorWithKernel; void InferShape() const override; protected: - LrnParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/math/conv_func.h b/src/operators/math/conv_func.h new file mode 100644 index 0000000000000000000000000000000000000000..3d23f6c8a24be7f52e1b322e07addb47ccd8b056 --- /dev/null +++ b/src/operators/math/conv_func.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if __ARM_NEON +#include +#endif + +#include "framework/ddim.h" +#include "framework/tensor.h" + +namespace paddle_mobile { +namespace operators { +namespace math { + +using framework::DDim; +using framework::Tensor; + +inline int ConvOutputSize(int input_size, int filter_size, int dilation, + int padding, int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + return output_size; +} + +inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) { + auto bias_ptr = bias.data(); + const DDim bias_ddim = bias.dims(); + PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1, + "the bias tensor's dims size != 1") + DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1); + DDim inner_ddim = + paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size()); + int outer_size = paddle_mobile::framework::product(outer_ddim); + int inner_size = paddle_mobile::framework::product(inner_ddim); + bias.Resize(dDim); + auto new_ptr = bias.mutable_data(); + int axis_size = dDim[axis]; + +#if __ARM_NEON + for (int i = 0; i < outer_size; ++i) { + int inner_num = inner_size >> 4; + int remain = inner_size - (inner_num << 4); + float v_bias = bias_ptr[i * axis_size / outer_size]; + for (; inner_num > 0; inner_num--) { + float32x4_t v_newptr1 = vdupq_n_f32(v_bias); + float32x4_t v_newptr2 = vdupq_n_f32(v_bias); + float32x4_t v_newptr3 = vdupq_n_f32(v_bias); + float32x4_t v_newptr4 = vdupq_n_f32(v_bias); + vst1q_f32(new_ptr, v_newptr1); + new_ptr += 4; + vst1q_f32(new_ptr, v_newptr2); + new_ptr += 4; + vst1q_f32(new_ptr, v_newptr3); + new_ptr += 4; + vst1q_f32(new_ptr, v_newptr4); + new_ptr += 4; + } + for (; remain > 0; remain--) { + *new_ptr = v_bias; + new_ptr++; + } + } +#else + for (int i = 0; i < outer_size; ++i) { + float v_bias = bias_ptr[i * axis_size / 
outer_size]; + for (int j = 0; j < inner_size; ++j) { + new_ptr[i * inner_size + j] = v_bias; + } + } +#endif +} + +inline bool IsExpand(const std::vector &filter_dim, + const std::vector &strides, + const std::vector &paddings, + const std::vector &dilations) { + bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; + for (size_t j = 0; j < strides.size(); ++j) { + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); + strides_1 = strides_1 && (strides[j] == 1); + padding_0 = padding_0 && (paddings[j] == 0); + dilation_1 = dilation_1 && (dilations[j] == 1); + } + + return !(filter_1 && strides_1 && padding_0 && dilation_1); +} + +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/depthwiseconv3x3s1p1.cpp b/src/operators/math/depthwiseconv3x3s1p1.cpp new file mode 100644 index 0000000000000000000000000000000000000000..88cac515201c114e83cb9e85b39a51fb3f8e7955 --- /dev/null +++ b/src/operators/math/depthwiseconv3x3s1p1.cpp @@ -0,0 +1,288 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "operators/math/depthwiseconv3x3s1p1.h" +#include + +namespace paddle_mobile { +namespace operators { +namespace math { + +using framework::Tensor; + +void DepthwiseConv3x3s1p1(const Tensor *input, Tensor filter, Tensor *output, + Tensor bias, bool if_bias) { + const float *input_data = input->data(); + const float *filter_data = filter.data(); + float *output_data = output->data(); + const float *bias_data = bias.data(); + + const int h = static_cast(input->dims()[2]); + const int w = static_cast(input->dims()[3]); + const int l = h; + + const int batch_size = static_cast(input->dims()[0]); + const int c = static_cast(input->dims()[1]); + const int hxw = h * w; + float32x4_t vbias = vdupq_n_f32(0.0); + for (int b = 0; b < batch_size; ++b) { + const float *filter_data_tmp = filter_data; + + for (int j = 0; j < c; ++j) { + if (if_bias) { + vbias = vdupq_n_f32(bias_data[j]); + } + + int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0 + float w00 = filter_data_tmp[0]; + float w01 = filter_data_tmp[1]; + float w02 = filter_data_tmp[2]; + float w10 = filter_data_tmp[3]; + float w11 = filter_data_tmp[4]; + float w12 = filter_data_tmp[5]; + float w20 = filter_data_tmp[6]; + float w21 = filter_data_tmp[7]; + float w22 = filter_data_tmp[8]; + + output_data[0] = w11 * input_data[0] + w12 * input_data[1] + + w21 * input_data[l] + w22 * input_data[l + 1] + + bias_data[j]; + output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] + + w20 * input_data[2 * l - 2] + + w21 * input_data[2 * l - 1] + bias_data[j]; + output_data[(l - 1) * l] = + w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] + + w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1] + + bias_data[j]; + output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] + + w01 * input_data[(l - 2) * (l + 1) + 1] + + w10 * input_data[l * l - 2] + + w11 * input_data[l * l - 1] + bias_data[j]; + + for (int i = 1; i < l - 1; ++i) { + output_data[i * l] = + w01 * 
input_data[i * l - l] + w02 * input_data[i * l - l + 1] + + w11 * input_data[i * l] + w12 * input_data[i * l + 1] + + w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1] + + bias_data[j]; + output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] + + w01 * input_data[i * l + l - 1 - l] + + w10 * input_data[i * l + l - 1 - 1] + + w11 * input_data[i * l + l - 1] + + w20 * input_data[i * l + l - 1 + l - 1] + + w21 * input_data[i * l + l - 1 + l] + + bias_data[j]; + } + + // top 1 row and bottom 1 row + const float *input_tmp = input_data; + + float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2, + tmp3, tmp4, tmp5, out0; + in0 = vld1q_f32(input_tmp); + in2 = vld1q_f32(input_tmp + l); + const float *input_tmp_end = input_tmp + (l - 2) * l; + in4 = vld1q_f32(input_tmp_end); + in6 = vld1q_f32(input_tmp_end + l); + int c_mid = l_mid; + auto output_ptr = output_data + 1; + for (; c_mid > 3; c_mid -= 4) { + in1 = vld1q_f32(input_tmp + 4); + in3 = vld1q_f32(input_tmp + l + 4); + + tmp0 = vextq_f32(in0, in1, 1); + tmp1 = vextq_f32(in0, in1, 2); + + tmp2 = vextq_f32(in2, in3, 1); + tmp3 = vextq_f32(in2, in3, 2); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vaddq_f32(out0, vbias); + + vst1q_f32(output_ptr, out0); + + in5 = vld1q_f32(input_tmp_end + 4); + in7 = vld1q_f32(input_tmp_end + l + 4); + + tmp0 = vextq_f32(in4, in5, 1); + tmp1 = vextq_f32(in4, in5, 2); + tmp2 = vextq_f32(in6, in7, 1); + tmp3 = vextq_f32(in6, in7, 2); + + out0 = vmulq_n_f32(in4, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vaddq_f32(out0, vbias); + + vst1q_f32(output_ptr + (l - 1) * l, out0); + + // can optimize to 
each 8 stride. + input_tmp += 4; + input_tmp_end += 4; + output_ptr += 4; + in0 = in1; + in2 = in3; + in4 = in5; + in6 = in7; + } + + // top right pad + float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]); + + tmp0 = vextq_f32(in0, pad0, 1); + tmp1 = vextq_f32(in0, pad0, 2); + tmp2 = vextq_f32(in2, pad1, 1); + tmp3 = vextq_f32(in2, pad1, 2); + + out0 = vmulq_n_f32(in0, w10); + out0 = vmlaq_n_f32(out0, tmp0, w11); + out0 = vmlaq_n_f32(out0, tmp1, w12); + out0 = vmlaq_n_f32(out0, in2, w20); + out0 = vmlaq_n_f32(out0, tmp2, w21); + out0 = vmlaq_n_f32(out0, tmp3, w22); + out0 = vaddq_f32(out0, vbias); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + + // bottom right pad + float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]); + float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]); + + tmp0 = vextq_f32(in4, pad2, 1); + tmp1 = vextq_f32(in4, pad2, 2); + tmp2 = vextq_f32(in6, pad3, 1); + tmp3 = vextq_f32(in6, pad3, 2); + + out0 = vmulq_n_f32(in4, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in6, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vaddq_f32(out0, vbias); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2); + } + } + // mid + + for (int i = 0; i < l - 2; ++i) { + auto output_ptr = output_data + (i + 1) * l + 1; + input_tmp = input_data + i * l; + auto in0_tmp = vld1q_f32(input_tmp); + auto in2_tmp = vld1q_f32(input_tmp + l); + auto in4_tmp = vld1q_f32(input_tmp + l + l); + c_mid = l_mid; + for (; c_mid > 
3; c_mid -= 4) { + auto in1_tmp = vld1q_f32(input_tmp + 4); + auto in3_tmp = vld1q_f32(input_tmp + l + 4); + auto in5_tmp = vld1q_f32(input_tmp + l + l + 4); + + tmp0 = vextq_f32(in0_tmp, in1_tmp, 1); + tmp1 = vextq_f32(in0_tmp, in1_tmp, 2); + tmp2 = vextq_f32(in2_tmp, in3_tmp, 1); + tmp3 = vextq_f32(in2_tmp, in3_tmp, 2); + tmp4 = vextq_f32(in4_tmp, in5_tmp, 1); + tmp5 = vextq_f32(in4_tmp, in5_tmp, 2); + + out0 = vmulq_n_f32(in0_tmp, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2_tmp, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4_tmp, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vaddq_f32(out0, vbias); + + vst1q_f32(output_ptr, out0); + + output_ptr += 4; + input_tmp += 4; + in0_tmp = in1_tmp; + in2_tmp = in3_tmp; + in4_tmp = in5_tmp; + } + + float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]); + float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]); + float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]); + + tmp0 = vextq_f32(in0_tmp, pad0, 1); + tmp1 = vextq_f32(in0_tmp, pad0, 2); + tmp2 = vextq_f32(in2_tmp, pad1, 1); + tmp3 = vextq_f32(in2_tmp, pad1, 2); + tmp4 = vextq_f32(in4_tmp, pad2, 1); + tmp5 = vextq_f32(in4_tmp, pad2, 2); + + out0 = vmulq_n_f32(in0_tmp, w00); + out0 = vmlaq_n_f32(out0, tmp0, w01); + out0 = vmlaq_n_f32(out0, tmp1, w02); + out0 = vmlaq_n_f32(out0, in2_tmp, w10); + out0 = vmlaq_n_f32(out0, tmp2, w11); + out0 = vmlaq_n_f32(out0, tmp3, w12); + out0 = vmlaq_n_f32(out0, in4_tmp, w20); + out0 = vmlaq_n_f32(out0, tmp4, w21); + out0 = vmlaq_n_f32(out0, tmp5, w22); + out0 = vaddq_f32(out0, vbias); + + for (int i = 0; i < c_mid; ++i) { + if (i == 0) { + vst1q_lane_f32(output_ptr + i, out0, 0); + } + if (i == 1) { + vst1q_lane_f32(output_ptr + i, out0, 1); + } + if (i == 2) { + vst1q_lane_f32(output_ptr + i, out0, 2); + } + } + } + output_data 
+= hxw; + input_data += hxw; + filter_data_tmp += 9; + } + } +} +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/framework/paddle_mobile_object.h b/src/operators/math/depthwiseconv3x3s1p1.h similarity index 70% rename from src/framework/paddle_mobile_object.h rename to src/operators/math/depthwiseconv3x3s1p1.h index aff4b6c1f178ba70c756c49721ac9b34de82c71c..019237a43192f30dfb70fe85e6b16a835cba4eba 100644 --- a/src/framework/paddle_mobile_object.h +++ b/src/operators/math/depthwiseconv3x3s1p1.h @@ -13,20 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - -#include -#include "stdio.h" +#include "framework/tensor.h" namespace paddle_mobile { - -class PaddleMobileObject { - public: - virtual std::string ToString() { - char address[128] = {0}; - sprintf(address, "%p", this); - return std::string(address); - } - - private: -}; +namespace operators { +namespace math { +using framework::Tensor; + +void DepthwiseConv3x3s1p1(const Tensor *input, Tensor filter, Tensor *output, + Tensor bias, bool if_bias); +} // namespace math +} // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index 0c0ae8e3dd84f38218d03a761c58a664b927f161..c35a14bf508835b120e1a4108cba0945208867dc 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -13,17 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "operators/math/gemm.h" +#include "common/log.h" +#include "memory/t_malloc.h" +#ifndef X86 +#include +#endif namespace paddle_mobile { namespace operators { namespace math { +alignas(64) float packedA[MC * KC]; +alignas(64) float packedB[KC * NC]; +alignas(64) float ab[MR * NR]; // 将A矩阵分块复制到连续内存(ColMajor) -void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, +void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, float *buffer) { int i, j; const float *Aij; - for (i = 0; i < m - paddingM; i += MR) { - for (int j = 0; j < k; ++j) { + for (i = 0; i < m - m_tail; i += MR) { + for (j = 0; j < k; ++j) { Aij = &A(i, j); *buffer++ = *Aij; *buffer++ = *(Aij + 1); @@ -31,13 +39,13 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, *buffer++ = *(Aij + 3); } } - if (paddingM != 0) { + if (m_tail != 0) { for (j = 0; j < k; ++j) { - Aij = &A(m - paddingM, j); - for (i = 0; i < paddingM; ++i) { + Aij = &A(m - m_tail, j); + for (i = 0; i < m_tail; ++i) { *buffer++ = *(Aij + i); } - for (i = paddingM; i < MR; ++i) { + for (i = m_tail; i < MR; ++i) { *buffer++ = 0; } } @@ -45,11 +53,11 @@ void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, } // 将A矩阵分块复制到连续内存(RowMajor) -void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda, +void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, float *buffer) { int i, j; const float *Ai, *Ai1, *Ai2, *Ai3; - for (i = 0; i < m - paddingM; i += MR) { + for (i = 0; i < m - m_tail; i += MR) { Ai = &A(i, 0); Ai1 = &A(i + 1, 0); Ai2 = &A(i + 2, 0); @@ -61,12 +69,12 @@ void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda, *buffer++ = *Ai3++; } } - if (paddingM != 0) { + if (m_tail != 0) { for (j = 0; j < k; ++j) { - for (i = m - paddingM; i < m; ++i) { + for (i = m - m_tail; i < m; ++i) { *buffer++ = A(i, j); } - for (i = m; i < m + (MR - paddingM); ++i) { + for (i = m; i < m + (MR - m_tail); ++i) { *buffer++ = 0; } } @@ -74,11 
+82,11 @@ void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda, } // 将B矩阵分块复制到连续内存(ColMajor) -void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, +void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, float *buffer) { int i, j; const float *Bj, *Bj1, *Bj2, *Bj3; - for (j = 0; j < n - paddingN; j += NR) { + for (j = 0; j < n - n_tail; j += NR) { Bj = &B(0, j); Bj1 = &B(0, j + 1); Bj2 = &B(0, j + 2); @@ -90,12 +98,12 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, *buffer++ = *Bj3++; } } - if (paddingN != 0) { + if (n_tail != 0) { for (i = 0; i < k; ++i) { - for (int j = n - paddingN; j < n; ++j) { + for (int j = n - n_tail; j < n; ++j) { *buffer++ = B(i, j); } - for (int j = n; j < n + (NR - paddingN); ++j) { + for (int j = n; j < n + (NR - n_tail); ++j) { *buffer++ = 0; } } @@ -103,26 +111,28 @@ void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, } // 将B矩阵分块复制到连续内存(RowMajor) -void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb, +void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, float *buffer) { int i, j; const float *Bij; - for (j = 0; j < n - paddingN; j += NR) { + for (j = 0; j < n - n_tail; j += NR) { for (i = 0; i < k; ++i) { Bij = &B(i, j); - *buffer++ = *Bij; - *buffer++ = *(Bij + 1); - *buffer++ = *(Bij + 2); - *buffer++ = *(Bij + 3); + asm volatile( + "vld1.32 {q0}, [%[Bij]] \n\t" + "vst1.32 {q0}, [%[buffer]]! 
\n\t" + : [buffer] "+r"(buffer) + : [Bij] "r"(Bij) + : "memory", "q0"); } } - if (paddingN != 0) { + if (n_tail != 0) { for (i = 0; i < k; ++i) { - Bij = &B(i, n - paddingN); - for (int j = n - paddingN; j < n; ++j) { + Bij = &B(i, n - n_tail); + for (int j = n - n_tail; j < n; ++j) { *buffer++ = *Bij++; } - for (int j = n; j < n + (NR - paddingN); ++j) { + for (int j = n; j < n + (NR - n_tail); ++j) { *buffer++ = 0; } } @@ -133,53 +143,545 @@ void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb, void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, int first_time) { - int Buff_A_M = m; - int Buff_B_N = n; + int m_block = (m + MR - 1) / MR * MR; + int n_block = (n + NR - 1) / NR * NR; - int _mc = m % MR; - int _nc = n % NR; + int m_tail = m % MR; + int n_tail = n % NR; - if (_mc != 0) { - Buff_A_M = m + (MR - _mc); + if (first_time) { + PackMatrixB_(k, n, n_tail, B, ldb, packedB); } + PackMatrixA_(m, k, m_tail, A, lda, packedA); + + int i, j, mc, nc; - if (_nc != 0) { - Buff_B_N = n + (NR - _nc); + // B 取 4 列, 打包预热 + for (j = 0; j < n_block; j += NR) { + nc = (n - j) < NR ? n_tail : NR; + // A 取 4 行,打包预热 + for (i = 0; i < m_block; i += MR) { + mc = (m - i) < MR ? 
m_tail : MR; + AddDot4x4(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta, + &C(i, j), ldc, mc, nc); + } } +} - float packedA[MC * KC]; - static float packedB[KC * NC]; +// 分块矩阵乘法 +void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + int first_time, bool relu = false) { + int m_block = (m + MR - 1) / MR * MR; + int n_block = (n + NR - 1) / NR * NR; + + int m_tail = m % MR; + int n_tail = n % NR; if (first_time) { - PackMatrixB_(k, n, _nc, B, ldb, packedB); + PackMatrixB_(k, n, n_tail, B, ldb, packedB); } - PackMatrixA_(m, k, _mc, A, lda, packedA); + PackMatrixA_(m, k, m_tail, A, lda, packedA); int i, j, mc, nc; // B 取 4 列, 打包预热 - for (j = 0; j < Buff_B_N; j += NR) { - nc = (n - j) < NR ? _nc : NR; + for (j = 0; j < n_block; j += NR) { + nc = (n - j) < NR ? n_tail : NR; // A 取 4 行,打包预热 - for (i = 0; i < Buff_A_M; i += MR) { - mc = (m - i) < MR ? _mc : MR; - AddDot4x4(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta, - &C(i, j), ldc, mc, nc); + for (i = 0; i < m_block; i += MR) { + mc = (m - i) < MR ? 
m_tail : MR; + AddDot4x4_relu(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta, + &C(i, j), ldc, mc, nc, relu); } } } // 计算一个更小的 4 * 4 的 C 矩阵分块 +#if defined(IOS) +void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, + int ldb, float beta, float *C, int ldc, int mc, int nc) { + // init C + float32x4_t cv0 = vdupq_n_f32(0.0); + float32x4_t cv1 = vdupq_n_f32(0.0); + float32x4_t cv2 = vdupq_n_f32(0.0); + float32x4_t cv3 = vdupq_n_f32(0.0); + + float32x4_t av; + float32x4_t bv; + + float32x2_t av01; + float32x2_t av23; + + for (int p = 0; p < k; p += 1) { + av = vld1q_f32(a); + bv = vld1q_f32(b); + + av01 = vget_low_f32(av); + cv0 = vmlaq_lane_f32(cv0, bv, av01, 0); + cv1 = vmlaq_lane_f32(cv1, bv, av01, 1); + av23 = vget_high_f32(av); + cv2 = vmlaq_lane_f32(cv2, bv, av23, 0); + cv3 = vmlaq_lane_f32(cv3, bv, av23, 1); + + a += MR; + b += NR; + } + float32x4x4_t cv = {cv0, cv1, cv2, cv3}; + int i, j; + for (i = 0; i < mc; ++i) { + for (j = 0; j < nc; ++j) { + if (beta == 0.0) { + C(i, j) = 0.0; + } else if (beta != 1.0) { + C(i, j) *= beta; + } + if (j == 0) { + C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0); + } else if (j == 1) { + C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1); + } else if (j == 2) { + C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2); + } else if (j == 3) { + C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); + } + } + } +} + +void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, + int ldb, float beta, float *C, int ldc, int mc, int nc, + bool relu = false) { + // init C + float32x4_t cv0 = vdupq_n_f32(0.0); + float32x4_t cv1 = vdupq_n_f32(0.0); + float32x4_t cv2 = vdupq_n_f32(0.0); + float32x4_t cv3 = vdupq_n_f32(0.0); + + float32x4_t av; + float32x4_t bv; + + float32x2_t av01; + float32x2_t av23; + + for (int p = 0; p < k; p += 1) { + av = vld1q_f32(a); + bv = vld1q_f32(b); + + av01 = vget_low_f32(av); + cv0 = vmlaq_lane_f32(cv0, bv, av01, 0); + cv1 = vmlaq_lane_f32(cv1, bv, av01, 1); + av23 = 
vget_high_f32(av); + cv2 = vmlaq_lane_f32(cv2, bv, av23, 0); + cv3 = vmlaq_lane_f32(cv3, bv, av23, 1); + + a += MR; + b += NR; + } + float32x4x4_t cv = {cv0, cv1, cv2, cv3}; + int i, j; + for (i = 0; i < mc; ++i) { + for (j = 0; j < nc; ++j) { + if (beta == 0.0) { + C(i, j) = 0.0; + } else if (beta != 1.0) { + C(i, j) *= beta; + } + if (j == 0) { + C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0); + } else if (j == 1) { + C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1); + } else if (j == 2) { + C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2); + } else if (j == 3) { + C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); + } + if (C(i, j) < 0) { + C(i, j) = 0; + } + } + } +} + +#elif defined(ARMV7) +void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, + int ldb, float beta, float *C, int ldc, int mc, int nc) { + int kc1 = k / 4, kc2 = k % 4; + int bytes_ldc = 4 * ldc; + int flag_alpha = (alpha == 1.0) ? 1 : 2; + int flag_beta; + if (beta == 0.0) { + flag_beta = 0; + } else if (beta == 1.0) { + flag_beta = 1; + } else { + flag_beta = 2; + } + asm volatile( + "pld [%[a]] \n\t" + "pld [%[b]] \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + "pld [%[a], #64] \n\t" + "pld [%[b], #64] \n\t" + "vld1.32 {q0, q1}, [%[a]]! \n\t" + "vld1.32 {q2, q3}, [%[b]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q2, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q2, d1[1] \n\t" + "vmla.f32 q10, q3, d2[0] \n\t" + "vmla.f32 q11, q3, d2[1] \n\t" + "vmla.f32 q12, q3, d3[0] \n\t" + "vmla.f32 q13, q3, d3[1] \n\t" + "vld1.32 {q0, q1}, [%[a]]! \n\t" + "vld1.32 {q2, q3}, [%[b]]! 
\n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q2, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q2, d1[1] \n\t" + "vmla.f32 q10, q3, d2[0] \n\t" + "vmla.f32 q11, q3, d2[1] \n\t" + "vmla.f32 q12, q3, d3[0] \n\t" + "vmla.f32 q13, q3, d3[1] \n\t" + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "blt end_kc2_%= \n\t" + "loop_kc2_%=: \n\t" + "vld1.32 {q0}, [%[a]]! \n\t" + "vld1.32 {q1}, [%[b]]! \n\t" + "vmla.f32 q10, q1, d0[0] \n\t" + "vmla.f32 q11, q1, d0[1] \n\t" + "vmla.f32 q12, q1, d1[0] \n\t" + "vmla.f32 q13, q1, d1[1] \n\t" + "subs %[kc2], %[kc2], #1 \n\t" + "bge loop_kc2_%= \n\t" + "end_kc2_%=: \n\t" + + "cmp %[mc], #4 \n\t" + "bne temp_%= \n\t" + "cmp %[nc], #4 \n\t" + "bne temp_%= \n\t" + + "vmov.f32 d8[0], %[alpha] \n\t" + "vmov.f32 d8[1], %[beta] \n\t" + + "cmp %[flag_alpha], #1 \n\t" + "bne alpha_%= \n\t" + + "alpha_%=: \n\t" + "vmul.f32 q10, q10, d8[0] \n\t" + "vmul.f32 q11, q11, d8[0] \n\t" + "vmul.f32 q12, q12, d8[0] \n\t" + "vmul.f32 q13, q13, d8[0] \n\t" + + "beta_%=: \n\t" + "cmp %[flag_beta], #0 \n\t" + "beq memory_%= \n\t" + + "mov r4, %[C] \n\t" + "mov r6, %[bytes_ldc]\n\t" + "vld1.32 {q0}, [r4], r6 \n\t" + "vld1.32 {q1}, [r4], r6 \n\t" + "vld1.32 {q2}, [r4], r6 \n\t" + "vld1.32 {q3}, [r4] \n\t" + "cmp %[flag_beta], #1 \n\t" + "beq beta_eq1_%= \n\t" + "bne beta_ne1_%= \n\t" + + "beta_eq1_%=: \n\t" + "vadd.f32 q10, q10, q0 \n\t" + "vadd.f32 q11, q11, q1 \n\t" + "vadd.f32 q12, q12, q2 \n\t" + "vadd.f32 q13, q13, q3 \n\t" + "b memory_%= \n\t" + + "beta_ne1_%=: \n\t" + "vmla.f32 q10, q0, d8[1] \n\t" + "vmla.f32 q11, q1, d8[1] \n\t" + "vmla.f32 q12, q2, d8[1] \n\t" + "vmla.f32 q13, q3, d8[1] \n\t" + + "memory_%=: \n\t" + "mov r5, %[C] \n\t" + "mov r6, %[bytes_ldc]\n\t" + "vst1.32 {q10}, [r5], r6 \n\t" + "vst1.32 {q11}, [r5], r6 \n\t" + "vst1.32 {q12}, [r5], r6 \n\t" + "vst1.32 {q13}, [r5] \n\t" + "b end_%= \n\t" + + "temp_%=: \n\t" + "vst1.32 {q10, q11}, 
[%[ab]]!\n\t" + "vst1.32 {q12, q13}, [%[ab]] \n\t" + "end_%=: \n\t" + : + : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1), + [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha), + [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc), + [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta) + : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13"); + + if (mc != MR || nc != NR) { + int i, j; + for (i = 0; i < mc; ++i) { + for (j = 0; j < nc; ++j) { + if (beta == 0.0) { + if (alpha != 1.0) { + C(i, j) = alpha * ab[i * MR + j]; + } else { + C(i, j) = ab[i * MR + j]; + } + } else { + if (beta != 1.0) { + C(i, j) *= beta; + } + if (alpha != 1.0) { + C(i, j) += alpha * ab[i * MR + j]; + } else { + C(i, j) += ab[i * MR + j]; + } + } + } + } + } +} + +void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, + int ldb, float beta, float *C, int ldc, int mc, int nc, + bool relu = false) { + int kc1 = k / 4, kc2 = k % 4; + int bytes_ldc = 4 * ldc; + int flag_alpha = (alpha == 1.0) ? 1 : 2; + int flag_beta; + if (beta == 0.0) { + flag_beta = 0; + } else if (beta == 1.0) { + flag_beta = 1; + } else { + flag_beta = 2; + } + asm volatile( + "pld [%[a]] \n\t" + "pld [%[b]] \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + "pld [%[a], #64] \n\t" + "pld [%[b], #64] \n\t" + "vld1.32 {q0, q1}, [%[a]]! \n\t" + "vld1.32 {q2, q3}, [%[b]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q2, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q2, d1[1] \n\t" + "vmla.f32 q10, q3, d2[0] \n\t" + "vmla.f32 q11, q3, d2[1] \n\t" + "vmla.f32 q12, q3, d3[0] \n\t" + "vmla.f32 q13, q3, d3[1] \n\t" + "vld1.32 {q0, q1}, [%[a]]! \n\t" + "vld1.32 {q2, q3}, [%[b]]! 
\n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q2, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q2, d1[1] \n\t" + "vmla.f32 q10, q3, d2[0] \n\t" + "vmla.f32 q11, q3, d2[1] \n\t" + "vmla.f32 q12, q3, d3[0] \n\t" + "vmla.f32 q13, q3, d3[1] \n\t" + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "blt end_kc2_%= \n\t" + "loop_kc2_%=: \n\t" + "vld1.32 {q0}, [%[a]]! \n\t" + "vld1.32 {q1}, [%[b]]! \n\t" + "vmla.f32 q10, q1, d0[0] \n\t" + "vmla.f32 q11, q1, d0[1] \n\t" + "vmla.f32 q12, q1, d1[0] \n\t" + "vmla.f32 q13, q1, d1[1] \n\t" + "subs %[kc2], %[kc2], #1 \n\t" + "bge loop_kc2_%= \n\t" + "end_kc2_%=: \n\t" + + "cmp %[mc], #4 \n\t" + "bne temp_%= \n\t" + "cmp %[nc], #4 \n\t" + "bne temp_%= \n\t" + + "vmov.f32 d8[0], %[alpha] \n\t" + "vmov.f32 d8[1], %[beta] \n\t" + + "cmp %[flag_alpha], #1 \n\t" + "bne alpha_%= \n\t" + + "alpha_%=: \n\t" + "vmul.f32 q10, q10, d8[0] \n\t" + "vmul.f32 q11, q11, d8[0] \n\t" + "vmul.f32 q12, q12, d8[0] \n\t" + "vmul.f32 q13, q13, d8[0] \n\t" + + "beta_%=: \n\t" + "cmp %[flag_beta], #0 \n\t" + "beq memory_%= \n\t" + + "mov r4, %[C] \n\t" + "mov r6, %[bytes_ldc]\n\t" + "vld1.32 {q0}, [r4], r6 \n\t" + "vld1.32 {q1}, [r4], r6 \n\t" + "vld1.32 {q2}, [r4], r6 \n\t" + "vld1.32 {q3}, [r4] \n\t" + "cmp %[flag_beta], #1 \n\t" + "beq beta_eq1_%= \n\t" + "bne beta_ne1_%= \n\t" + + "beta_eq1_%=: \n\t" + "vadd.f32 q10, q10, q0 \n\t" + "vadd.f32 q11, q11, q1 \n\t" + "vadd.f32 q12, q12, q2 \n\t" + "vadd.f32 q13, q13, q3 \n\t" + "b memory_%= \n\t" + + "beta_ne1_%=: \n\t" + "vmla.f32 q10, q0, d8[1] \n\t" + "vmla.f32 q11, q1, d8[1] \n\t" + "vmla.f32 q12, q2, d8[1] \n\t" + "vmla.f32 q13, q3, d8[1] \n\t" + + "memory_%=: \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "mov r5, %[C] \n\t" + "mov r6, %[bytes_ldc]\n\t" + "vst1.32 {q10}, [r5], r6 \n\t" + "vst1.32 {q11}, [r5], r6 \n\t" + 
"vst1.32 {q12}, [r5], r6 \n\t" + "vst1.32 {q13}, [r5] \n\t" + "b end_%= \n\t" + + "temp_%=: \n\t" + "vst1.32 {q10, q11}, [%[ab]]!\n\t" + "vst1.32 {q12, q13}, [%[ab]] \n\t" + "end_%=: \n\t" + : + : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1), + [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha), + [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc), + [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta) + : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13", + "q14"); + + if (mc != MR || nc != NR) { + int i, j; + for (i = 0; i < mc; ++i) { + for (j = 0; j < nc; ++j) { + if (beta == 0.0) { + if (alpha != 1.0) { + C(i, j) = alpha * ab[i * MR + j]; + } else { + C(i, j) = ab[i * MR + j]; + } + } else { + if (beta != 1.0) { + C(i, j) *= beta; + } + if (alpha != 1.0) { + C(i, j) += alpha * ab[i * MR + j]; + } else { + C(i, j) += ab[i * MR + j]; + } + } + if (relu) { + if (C(i, j) < 0) { + C(i, j) = 0; + } + } + } + } + } +} + +#else void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, int ldb, float beta, float *C, int ldc, int mc, int nc) { float c[16] = {0}; float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3; - // // init C - // float32x4_t cv0 = vdup_n_f32(0.0); - // float32x4_t cv1 = vdup_n_f32(0.0); - // float32x4_t cv2 = vdup_n_f32(0.0); - // float32x4_t cv3 = vdup_n_f32(0.0); + for (int p = 0; p < k; p += 1) { + reg_b0 = *b++; + reg_b1 = *b++; + reg_b2 = *b++; + reg_b3 = *b++; + + reg_a0 = *a++; + reg_a1 = *a++; + reg_a2 = *a++; + reg_a3 = *a++; + + // first row + c[0] += reg_a0 * reg_b0; + c[1] += reg_a0 * reg_b1; + c[2] += reg_a0 * reg_b2; + c[3] += reg_a0 * reg_b3; + + // second row + c[4] += reg_a1 * reg_b0; + c[5] += reg_a1 * reg_b1; + c[6] += reg_a1 * reg_b2; + c[7] += reg_a1 * reg_b3; + + // third row + c[8] += reg_a2 * reg_b0; + c[9] += reg_a2 * reg_b1; + c[10] += reg_a2 * reg_b2; + c[11] += reg_a2 * reg_b3; + + // fourth row + c[12] += reg_a3 * reg_b0; + c[13] += reg_a3 
* reg_b1; + c[14] += reg_a3 * reg_b2; + c[15] += reg_a3 * reg_b3; + } + int i, j; + for (i = 0; i < mc; ++i) { + for (j = 0; j < nc; ++j) { + if (beta == 0.0) { + C(i, j) = 0.0; + } else if (beta != 1.0) { + C(i, j) *= beta; + } + if (alpha != 1.0) { + C(i, j) += alpha * c[i * MR + j]; + } else { + C(i, j) += c[i * MR + j]; + } + } + } +} + +void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, + int ldb, float beta, float *C, int ldc, int mc, int nc, + bool relu) { + float c[16] = {0}; + float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3; for (int p = 0; p < k; p += 1) { reg_b0 = *b++; @@ -229,15 +731,26 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, } else { C(i, j) += c[i * MR + j]; } + if (relu) { + if (C(i, j) < 0) { + C(i, j) = 0; + } + } } } } +#endif + // 32位 float 矩阵乘法 void sgemm(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc) { int i, j, p, mc, nc, kc; float beta_; + if (m == 1) { + VectorKernel(1, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return; + } for (j = 0; j < n; j += NC) { nc = s_min(n - j, NC); for (p = 0; p < k; p += KC) { @@ -256,6 +769,248 @@ void sgemm(int m, int n, int k, float alpha, const float *A, int lda, } } +void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc) { + int i, j, p, mc, nc, kc; + float beta_; + for (j = 0; j < n; j += NC) { + nc = s_min(n - j, NC); + for (p = 0; p < k; p += KC) { + kc = s_min(k - p, KC); + for (i = 0; i < m; i += MC) { + mc = s_min(m - i, MC); + if (p != 0) { + beta_ = 1.0; + } else { + beta_ = beta; + } + + if (p + KC >= k) { + InnerKernel_relu(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, + beta_, &C(i, j), ldc, i == 0, true); + } else { + InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_, + &C(i, j), ldc, i == 0); + } + } + } + } +} + +void VectorKernel(int m, 
int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc) { + float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); + + const float *a0, *b0, *b1, *b2, *b3; + float *c0, *C0; + + int volatile kc1 = k / 4; + int volatile kc2 = k % 4; + int volatile nc1 = n / 16; + int _nc1 = n % 16; + int volatile nc2 = _nc1 / 4; + int volatile nc3 = _nc1 % 4; + for (int i = 0; i < kc1; i++) { + a0 = A + i * 4; + b0 = B + i * 4 * ldb; + b1 = b0 + ldb; + b2 = b1 + ldb; + b3 = b2 + ldb; + c0 = bufferC; + asm volatile( + "pld [%[a0], #16] \n\t" + "vld1.32 {q0}, [%[a0]] \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "cmp %[i], #0 \n\t" + "beq i_eq0_%= \n\t" + "bne i_ne0_%= \n\t" + + "i_eq0_%=: \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "b gemm_nc1_%= \n\t" + + "i_ne0_%=: \n\t" + "pld [%[c0], #64] \n\t" + "vld1.32 {q10, q11}, [%[c0]]! \n\t" + "vld1.32 {q12, q13}, [%[c0]] \n\t" + "sub %[c0], %[c0], #32 \n\t" + + "gemm_nc1_%=: \n\t" + "pld [%[b0], #64] \n\t" + "vld1.32 {q2, q3}, [%[b0]]! \n\t" + "vld1.32 {q4, q5}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q3, d0[0] \n\t" + "vmla.f32 q12, q4, d0[0] \n\t" + "vmla.f32 q13, q5, d0[0] \n\t" + + "pld [%[b1], #64] \n\t" + "vld1.32 {q2, q3}, [%[b1]]! \n\t" + "vld1.32 {q4, q5}, [%[b1]]! \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q4, d0[1] \n\t" + "vmla.f32 q13, q5, d0[1] \n\t" + + "pld [%[b2], #64] \n\t" + "vld1.32 {q2, q3}, [%[b2]]! \n\t" + "vld1.32 {q4, q5}, [%[b2]]! \n\t" + "vmla.f32 q10, q2, d1[0] \n\t" + "vmla.f32 q11, q3, d1[0] \n\t" + "vmla.f32 q12, q4, d1[0] \n\t" + "vmla.f32 q13, q5, d1[0] \n\t" + + "pld [%[b3], #64] \n\t" + "vld1.32 {q2, q3}, [%[b3]]! \n\t" + "vld1.32 {q4, q5}, [%[b3]]! 
\n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q4, d1[1] \n\t" + "vmla.f32 q13, q5, d1[1] \n\t" + + "vst1.32 {q10, q11}, [%[c0]]! \n\t" + "vst1.32 {q12, q13}, [%[c0]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "cmp %[i], #0 \n\t" + "beq ii_eq0_%= \n\t" + "bne ii_ne0_%= \n\t" + + "ii_eq0_%=: \n\t" + "vmov.f32 q10, #0.0 \n\t" + "b gemm_nc2_%= \n\t" + + "ii_ne0_%=: \n\t" + "pld [%[c0], #16] \n\t" + "vld1.32 {q10}, [%[c0]] \n\t" + + "gemm_nc2_%=: \n\t" + "pld [%[b0], #16] \n\t" + "vld1.32 {q2}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + + "pld [%[b1], #16] \n\t" + "vld1.32 {q3}, [%[b1]]! \n\t" + "vmla.f32 q10, q3, d0[1] \n\t" + + "pld [%[b2], #16] \n\t" + "vld1.32 {q4}, [%[b2]]! \n\t" + "vmla.f32 q10, q4, d1[0] \n\t" + + "pld [%[b3], #16] \n\t" + "vld1.32 {q5}, [%[b3]]! \n\t" + "vmla.f32 q10, q5, d1[1] \n\t" + + "vst1.32 {q10}, [%[c0]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), + [c0] "+r"(c0) + : [a0] "r"(a0), [i] "r"(i), [nc1] "r"(nc1), [nc2] "r"(nc2) + : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); + + for (int j = 0; j < nc3; j++) { + if (i == 0) { + *c0 = (*a0) * (*b0++); + } else { + *c0 += (*a0) * (*b0++); + } + *c0 += (*(a0 + 1)) * (*b1++); + *c0 += (*(a0 + 2)) * (*b2++); + *c0 += (*(a0 + 3)) * (*b3++); + c0++; + } + } + + for (int i = 0; i < kc2; ++i) { + a0 = A + 4 * kc1 + i; + b0 = B + (4 * kc1 + i) * ldb; + c0 = bufferC; + asm volatile( + "pld [%[a0], #16] \n\t" + "vld1.32 {d0}, [%[a0]] \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "pld [%[c0], #64] \n\t" + "vld1.32 {q10, q11}, [%[c0]]! 
\n\t" + "vld1.32 {q12, q13}, [%[c0]] \n\t" + "sub %[c0], %[c0], #32 \n\t" + + "gemm_nc1_%=: \n\t" + "pld [%[b0], #64] \n\t" + "vld1.32 {q2, q3}, [%[b0]]! \n\t" + "vld1.32 {q4, q5}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q3, d0[0] \n\t" + "vmla.f32 q12, q4, d0[0] \n\t" + "vmla.f32 q13, q5, d0[0] \n\t" + + "vst1.32 {q10, q11}, [%[c0]]! \n\t" + "vst1.32 {q12, q13}, [%[c0]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "pld [%[c0], #16] \n\t" + "vld1.32 {q10}, [%[c0]] \n\t" + + "gemm_nc2_%=: \n\t" + "vld1.32 {q2}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + + "vst1.32 {q10}, [%[c0]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), + [c0] "+r"(c0) + : [a0] "r"(a0), [nc1] "r"(nc1), [nc2] "r"(nc2) + : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); + + for (int j = 0; j < nc3; j++) { + *c0 += (*a0) * (*b0++); + c0++; + } + } + + c0 = bufferC; + C0 = C; + for (int i = 0; i < n; i++) { + if (beta == 1.0) { + *C0++ += *c0++; + } else { + *C0++ = *c0++; + } + } +} + } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index 87d65bdd28a42c4510668345ad7ce7058eb2cdf8..6d7ae6d2bcdbd7e24cb3c2389dd3cdf09a807892 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -20,9 +20,9 @@ limitations under the License. 
*/ #define C(i, j) C[(i)*ldc + (j)] // 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k -#define MC 384 -#define KC 384 -#define NC 4096 +#define MC 128 +#define KC 128 +#define NC 1024 #define MR 4 #define NR 4 @@ -33,19 +33,19 @@ namespace operators { namespace math { // 将 A 矩阵分块复制到连续内存(ColMajor) -void PackMatrixA(int m, int k, int paddingM, const float *A, int lda, +void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, float *buffer); // 将 B 矩阵分块复制到连续内存(ColMajor) -void PackMatrixB(int k, int n, int paddingN, const float *B, int ldb, +void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, float *buffer); // 将 A 矩阵分块复制到连续内存(RowMajor) -void PackMatrixA_(int m, int k, int paddingM, const float *A, int lda, +void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, float *buffer); // 将 B 矩阵分块复制到连续内存(RowMajor) -void PackMatrixB_(int k, int n, int paddingN, const float *B, int ldb, +void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, float *buffer); // 分块矩阵乘法 @@ -53,14 +53,25 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, int first_time); +// 向量矩阵乘法 (M = 1) +void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc); + // 计算一个更小的 4 * 4 的 C 矩阵分块 void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, int mc, int nc); +void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, + int ldb, float beta, float *C, int ldc, int mc, int nc, + bool relu); + // 32位 float 矩阵乘法 void sgemm(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc); +void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc); + // 64位 double 矩阵乘法 void dgemm(int m, int n, int k, float alpha, 
const double *A, int lda, const double *B, int ldb, float beta, double *C, int ldc); diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp index 46c5b7a90fdad30301c0b9b21e37a3078df4a821..625d120705aab8fcc3ea8d232b4077e213941ec4 100644 --- a/src/operators/math/im2col.cpp +++ b/src/operators/math/im2col.cpp @@ -14,8 +14,10 @@ limitations under the License. */ #include "operators/math/im2col.h" #include +#ifdef __ARM_NEON +#include "arm_neon.h" +#endif #include "common/types.h" - namespace paddle_mobile { namespace operators { namespace math { @@ -65,9 +67,350 @@ class Im2ColFunctor { // are " "inconsistent."); int channels_col = im_channels * filter_height * filter_width; - const T *im_data = im.data(); T *col_data = col->data(); +#ifdef __ARM_NEON + const int osize = col_height; + const int isize = im_height; + bool pad1 = padding[0] > 0; + bool pad2 = + (pad1 && + (((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 1 : 0)); + int fill = isize % 2; + if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 && + dilation[0] == 1) { + for (int c = 0; c < im_channels; ++c) { + int oosize = osize * osize; + int nk4 = osize / 4; + int mk4 = osize % 4; + + float *col0 = col_data + 0 * oosize + 2 * osize + 2; + float *col1 = col_data + 1 * oosize + 2 * osize + 1; + float *col2 = col_data + 2 * oosize + 2 * osize; + + float *col3 = col_data + 3 * oosize + osize + 2; + float *col4 = col_data + 4 * oosize + osize + 1; + float *col5 = col_data + 5 * oosize + osize; + + float *col6 = col_data + 6 * oosize + 2; + float *col7 = col_data + 7 * oosize + 1; + float *col8 = col_data + 8 * oosize; + + float32x4_t im1; + const float *im_tmp_data = im_data + osize + 1; + + int rrsize = oosize - osize - 1; + int nr4 = rrsize / 4; + int mr4 = rrsize % 4; + for (int i = 0; i < nr4; ++i) { + im1 = vld1q_f32(im_tmp_data); + vst1q_f32(col0, im1); + vst1q_f32(col1, im1); + vst1q_f32(col2, im1); + vst1q_f32(col3, im1); + vst1q_f32(col4, im1); + vst1q_f32(col5, 
im1); + vst1q_f32(col6, im1); + vst1q_f32(col7, im1); + vst1q_f32(col8, im1); + + col0 += 4; + col1 += 4; + col2 += 4; + col3 += 4; + col4 += 4; + col5 += 4; + col6 += 4; + col7 += 4; + col8 += 4; + + im_tmp_data += 4; + } + for (int i = 0; i < mr4; ++i) { + *col0 = *im_tmp_data; + *col1 = *im_tmp_data; + *col2 = *im_tmp_data; + *col3 = *im_tmp_data; + *col4 = *im_tmp_data; + *col5 = *im_tmp_data; + *col6 = *im_tmp_data; + *col7 = *im_tmp_data; + *col8 = *im_tmp_data; + + col0++; + col1++; + col2++; + col3++; + col4++; + col5++; + col6++; + col7++; + col8++; + + im_tmp_data++; + } + + im_tmp_data = im_data + 1; + col0 = col_data + 0 * oosize + osize + 2; + col1 = col_data + 1 * oosize + osize + 1; + col2 = col_data + 2 * oosize + osize; + + col3 = col_data + 3 * oosize + 2; + col4 = col_data + 4 * oosize + 1; + col5 = col_data + 5 * oosize; + + for (int i = 0; i < nk4; i++) { + im1 = vld1q_f32(im_tmp_data); + vst1q_f32(col0, im1); + vst1q_f32(col1, im1); + vst1q_f32(col2, im1); + vst1q_f32(col3, im1); + vst1q_f32(col4, im1); + vst1q_f32(col5, im1); + + col0 += 4; + col1 += 4; + col2 += 4; + col3 += 4; + col4 += 4; + col5 += 4; + im_tmp_data += 4; + } + + for (int i = 0; i < mk4; i++) { + *col0 = *im_tmp_data; + *col1 = *im_tmp_data; + *col2 = *im_tmp_data; + *col3 = *im_tmp_data; + *col4 = *im_tmp_data; + *col5 = *im_tmp_data; + col0++; + col1++; + col2++; + col3++; + col4++; + col5++; + + im_tmp_data++; + } + + // fill 0 1 11; + for (int i = 0; i < osize; ++i) { + col_data[0 * oosize + i * osize] = 0.0; + col_data[3 * oosize + i * osize] = 0.0; + col_data[6 * oosize + i * osize] = 0.0; + + col_data[2 * oosize + osize - 1 + i * osize] = 0.0; + col_data[5 * oosize + osize - 1 + i * osize] = 0.0; + col_data[8 * oosize + osize - 1 + i * osize] = 0.0; + } + + col_data[0 * oosize + osize + 1] = im_data[0]; + col_data[3 * oosize + 1] = im_data[0]; + col_data[6 * oosize + 1] = im_data[osize]; + + col_data[1 * oosize + osize] = im_data[0]; + col_data[4 * oosize] = 
im_data[0]; + col_data[7 * oosize] = im_data[osize]; + + float32x4_t zero4; + zero4 = vdupq_n_f32(0.0); + auto col_z0 = col_data; + auto col_z1 = col_data + oosize; + auto col_z2 = col_data + 2 * oosize; + auto col_z6 = col_data + 6 * oosize + osize * (osize - 1); + auto col_z7 = col_data + 7 * oosize + osize * (osize - 1); + auto col_z8 = col_data + 8 * oosize + osize * (osize - 1); + + for (int i = 0; i < nk4; ++i) { + vst1q_f32(col_z0, zero4); + vst1q_f32(col_z1, zero4); + vst1q_f32(col_z2, zero4); + vst1q_f32(col_z6, zero4); + vst1q_f32(col_z7, zero4); + vst1q_f32(col_z8, zero4); + + col_z0 += 4; + col_z1 += 4; + col_z2 += 4; + col_z6 += 4; + col_z7 += 4; + col_z8 += 4; + } + + for (int i = 0; i < mk4; ++i) { + col_z0[i] = 0.0; + col_z1[i] = 0.0; + col_z2[i] = 0.0; + col_z6[i] = 0.0; + col_z7[i] = 0.0; + col_z8[i] = 0.0; + } + col_data += 9 * oosize; + im_data += isize * isize; + } + } else if (stride[0] == 2 && filter_height == 3 && pad1 && + dilation[0] == 1) { + for (int c = 0; c < im_channels; ++c) { + int oosize = osize * osize; + int nk4 = osize / 4; + int mk4 = osize % 4; + + // 3 2 3 1 0 1 3 2 3 + float *col0 = col_data + 0 * oosize + osize + 1; + float *col1 = col_data + 1 * oosize + osize; + float *col2 = col_data + 2 * oosize + osize; + + float *col3 = col_data + 3 * oosize + 1; + float *col4 = col_data + 4 * oosize; + float *col5 = col_data + 5 * oosize; + + float *col6 = col_data + 6 * oosize + 1; + float *col7 = col_data + 7 * oosize; + float *col8 = col_data + 8 * oosize; + + float32x4x2_t im01; + float32x4x2_t im23; + const float *im_tmp_data0 = im_data; + const float *im_tmp_data2 = im_data + isize; + + for (int j = 0; j < osize; ++j) { + for (int i = 0; i < nk4; ++i) { + im01 = vld2q_f32(im_tmp_data0); + im23 = vld2q_f32(im_tmp_data2); + vst1q_f32(col0, im23.val[1]); + vst1q_f32(col1, im23.val[0]); + vst1q_f32(col2, im23.val[1]); + vst1q_f32(col3, im01.val[1]); + vst1q_f32(col4, im01.val[0]); + vst1q_f32(col5, im01.val[1]); + vst1q_f32(col6, 
im23.val[1]); + vst1q_f32(col7, im23.val[0]); + vst1q_f32(col8, im23.val[1]); + + col0 += 4; + col1 += 4; + col2 += 4; + col3 += 4; + col4 += 4; + col5 += 4; + col6 += 4; + col7 += 4; + col8 += 4; + + im_tmp_data0 += 8; + im_tmp_data2 += 8; + } + const float *im_tmp_data1 = im_tmp_data0 + 1; + const float *im_tmp_data3 = im_tmp_data2 + 1; + for (int i = 0; i < mk4; ++i) { + *col0 = *im_tmp_data3; + *col1 = *im_tmp_data2; + *col2 = *im_tmp_data3; + *col3 = *im_tmp_data1; + *col4 = *im_tmp_data0; + *col5 = *im_tmp_data1; + *col6 = *im_tmp_data3; + *col7 = *im_tmp_data2; + *col8 = *im_tmp_data3; + + col0++; + col1++; + col2++; + col3++; + col4++; + col5++; + col6++; + col7++; + col8++; + im_tmp_data0 += 2; + im_tmp_data1 += 2; + im_tmp_data2 += 2; + im_tmp_data3 += 2; + } + im_tmp_data0 += (isize - fill); + im_tmp_data2 += (isize - fill); + } + for (int i = 0; i < osize; ++i) { + col_data[0 * oosize + i * osize] = 0.0; + col_data[3 * oosize + i * osize] = 0.0; + col_data[6 * oosize + i * osize] = 0.0; + if (pad2) { + col_data[2 * oosize + osize - 1 + i * osize] = 0.0; + col_data[5 * oosize + osize - 1 + i * osize] = 0.0; + col_data[8 * oosize + osize - 1 + i * osize] = 0.0; + } + } + float32x4_t zero4; + zero4 = vdupq_n_f32(0.0); + auto col_z0 = col_data; + auto col_z1 = col_data + oosize; + auto col_z2 = col_data + 2 * oosize; + auto col_z6 = col_data + 6 * oosize + osize * (osize - 1); + auto col_z7 = col_data + 7 * oosize + osize * (osize - 1); + auto col_z8 = col_data + 8 * oosize + osize * (osize - 1); + + for (int i = 0; i < nk4; ++i) { + vst1q_f32(col_z0, zero4); + vst1q_f32(col_z1, zero4); + vst1q_f32(col_z2, zero4); + if (pad2) { + vst1q_f32(col_z6, zero4); + vst1q_f32(col_z7, zero4); + vst1q_f32(col_z8, zero4); + } + col_z0 += 4; + col_z1 += 4; + col_z2 += 4; + col_z6 += 4; + col_z7 += 4; + col_z8 += 4; + } + + for (int i = 0; i < mk4; ++i) { + col_z0[i] = 0.0; + col_z1[i] = 0.0; + col_z2[i] = 0.0; + if (pad2) { + col_z6[i] = 0.0; + col_z7[i] = 0.0; + 
col_z8[i] = 0.0; + } + } + + col_data[1 * oosize + osize] = im_data[isize]; + for (int i = 1; i < osize; ++i) { + col_data[3 * oosize + i] = im_data[(i - 1) * stride[0] + 1]; + } + col_data[4 * oosize] = im_data[0]; + col_data[7 * oosize] = im_data[isize]; + + col_data += 9 * oosize; + im_data += isize * isize; + } + } else { + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < col_width; ++w) { + int im_col_idx = + w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * col_height + h) * col_width + w; + int im_idx = + (im_row_idx + c_im * im_height) * im_width + im_col_idx; + + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? static_cast(0) + : im_data[im_idx]; + } + } + } + } +#else for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; @@ -86,6 +429,7 @@ class Im2ColFunctor { } } } +#endif } }; @@ -158,7 +502,7 @@ class Col2ImFunctor { }; template class Im2ColFunctor; -template class Im2ColFunctor; +// template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp index 59dd3e82d98334fd8aa86caa8f552936a6983900..fd4106038c7446e659736c6b3c61b5aa05127e72 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ -22,7 +22,7 @@ namespace math { template <> void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, float alpha, - framework::Tensor *matrix_out, float beta) { + framework::Tensor *matrix_out, float beta, bool relu) { auto dim_a = 
matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); @@ -41,14 +41,20 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, int N = dim_out[1]; int K = (trans_a == false) ? dim_a[1] : dim_a[0]; - sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, - beta, matrix_out->data(), N); + if (relu) { + sgemm_relu(M, N, K, alpha, matrix_a.data(), K, + matrix_b.data(), N, beta, matrix_out->data(), N); + } else { + sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, + beta, matrix_out->data(), N); + } } template <> void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, - double alpha, framework::Tensor *matrix_out, double beta) { + double alpha, framework::Tensor *matrix_out, double beta, + bool relu) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h index bf81fc88a03b5ea91647c3086bc2b870fc9bc321..0b953ec6a3b2a03a94a91884b9daf3ed88523a22 100644 --- a/src/operators/math/math_function.h +++ b/src/operators/math/math_function.h @@ -25,7 +25,7 @@ namespace math { template void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, T alpha, - framework::Tensor *matrix_out, T beta); + framework::Tensor *matrix_out, T beta, bool relu = false); } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..96d277c136b4656dbb1fd682489bd7dee5c3af0e --- /dev/null +++ b/src/operators/math/pool_2x2.cpp @@ -0,0 +1,176 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef POOL_OP +#include "pool_2x2.h" + +namespace paddle_mobile { +namespace operators { +namespace math { + +void Pool2x2Max(vector strides, vector paddings, const Tensor *input, + Tensor *output) { +#if __ARM_NEON + const int batch_size = input->dims()[0]; + + const int input_height = input->dims()[2]; + + const int input_width = input->dims()[3]; + + const int output_channels = output->dims()[1]; + + int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = 2; + const int ksize_width = 2; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const int input_channel_stride = input_height * input_width; + const int output_channel_stride = output_height * output_width; + + const float *input_data = input->data(); + float *output_data = output->mutable_data(); + + int out_w_num = output_width >> 2; + const int in_h_num = output_height >> 1; + const int input_batch_stride = output_channels * input_channel_stride; + const int output_batch_stride = output_channels * output_channel_stride; + int remain = output_width - out_w_num << 2; + for (int i = 0; i < batch_size; ++i) { + for (int c = 0; c < output_channels; ++c) { + const float *input_data_chanel_row_next = input_data + input_width; + for (; output_height > 0; output_height--) { + if (out_w_num > 0) { + asm volatile( + "max_loop: \n\t" + "vld1.f32 {q0,q1}, [%[in_ptr1]]! \n\t" + "vld1.f32 {q2,q3}, [%[in_ptr2]]! 
\n\t" + "vmax.f32 q0, q0, q2 \n\t" + "vmax.f32 q1, q1, q3 \n\t" + "vpmax.f32 d4, d0, d1 \n\t" + "vpmax.f32 d5, d2, d3 \n\t" + "subs %[out_w_num], #1 \n\t" + "vst1.32 {q2}, [%[out_ptr]]! \n\t" + "bne max_loop \n\t" + : [in_ptr1] "+r"(input_data), + [in_ptr2] "+r"(input_data_chanel_row_next), + [out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num) + : + : "memory", "q0", "q1", "q2", "q3"); + } + + for (; remain > 0; remain--) { + float max_row1 = std::max(input_data[0], input_data[1]); + float max_row2 = std::max(input_data_chanel_row_next[0], + input_data_chanel_row_next[1]); + *output_data = std::max(max_row1, max_row2); + input_data += 2; + input_data_chanel_row_next += 2; + output_data++; + } + } + input_data += input_channel_stride; + output_data += output_channel_stride; + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +#endif +} + +void Pool2x2Avg(vector strides, vector paddings, const Tensor *input, + Tensor *output) { +#if __ARM_NEON + const int batch_size = input->dims()[0]; + + const int input_height = input->dims()[2]; + + const int input_width = input->dims()[3]; + + const int output_channels = output->dims()[1]; + + int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int ksize_height = 2; + const int ksize_width = 2; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const int input_channel_stride = input_height * input_width; + const int output_channel_stride = output_height * output_width; + + const float *input_data = input->data(); + float *output_data = output->mutable_data(); + + int out_w_num = output_width >> 2; + const int input_batch_stride = output_channels * input_channel_stride; + const int output_batch_stride = output_channels * output_channel_stride; + float vqua[] = {0.25f, 0.25f, 0.25f, 0.25f}; + int remain = output_width - out_w_num << 2; + for 
(int i = 0; i < batch_size; ++i) { + for (int c = 0; c < output_channels; ++c) { + const float *input_data_chanel_row_next = input_data + input_width; + for (; output_height > 0; output_height--) { + if (out_w_num > 0) { + asm volatile( + "avg_loop: \n\t" + "vld1.32 {q0,q1}, [%[in_ptr1]]! \n\t" + "vld1.32 {q2,q3}, [%[in_ptr2]]! \n\t" + "vadd.f32 q0, q0, q2 \n\t" + "vadd.f32 q1, q1, q3 \n\t" + "vpadd.f32 d4, d0, d1 \n\t" + "vpadd.f32 d5, d2, d3 \n\t" + "vld1.32 {q4}, [%[vqua]]! \n\t" + "vmul.f32 q2, q2, q4 \n\t" + "subs %[out_w_num], #1 \n\t" + "vst1.32 {q2}, [%[out_ptr]]! \n\t" + "bne avg_loop \n\t" + : [in_ptr1] "+r"(input_data), + [in_ptr2] "+r"(input_data_chanel_row_next), + [out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num) + : [vqua] "r"(vqua) + : "memory", "q0", "q1", "q2", "q3", "q4"); + } + + for (; remain > 0; remain--) { + float max_row1 = std::max(input_data[0], input_data[1]); + float max_row2 = std::max(input_data_chanel_row_next[0], + input_data_chanel_row_next[1]); + *output_data = std::max(max_row1, max_row2); + input_data += 2; + input_data_chanel_row_next += 2; + output_data++; + } + } + input_data += input_channel_stride; + output_data += output_channel_stride; + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +#endif +} + +//} +} // namespace math + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/math/pool_2x2.h b/src/operators/math/pool_2x2.h index 0ed7f4e6abd4f7c78a9f14652fcf662a99d1e549..3fb0d24ba2ce854e8e63c066222e355e2c84dabb 100644 --- a/src/operators/math/pool_2x2.h +++ b/src/operators/math/pool_2x2.h @@ -12,16 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef POOL_OP + #pragma once +#include "framework/tensor.h" #if __ARM_NEON #include #endif // __ARM_NEON +namespace paddle_mobile { +namespace operators { +namespace math { +using framework::Tensor; +using std::vector; -static void Pool2x2Max() { - // todo impl with neon -} +void Pool2x2Max(vector strides, vector paddings, const Tensor *input, + Tensor *output); -static void Pool2x2Avg() { - // todo impl with neon -} +void Pool2x2Avg(vector strides, vector paddings, const Tensor *in_x, + Tensor *out); +} // namespace math +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0259565377386a1415d27b0794580a6a223a88d4 --- /dev/null +++ b/src/operators/math/pool_3x3.cpp @@ -0,0 +1,232 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef POOL_OP +#define __ARM_NEON true +#include "pool_3x3.h" +#include "framework/tensor.h" +#if __ARM_NEON +#include +#endif // __ARM_NEON +#include +namespace paddle_mobile { +namespace operators { +namespace math { +using framework::Tensor; +using std::max; +using std::min; +using std::vector; + +void Pool3x3Max(vector strides, vector paddings, const Tensor *input, + Tensor *output) { +#if __ARM_NEON + const int batch_size = input->dims()[0]; + + const int input_height = input->dims()[2]; + + const int input_width = input->dims()[3]; + + const int output_channels = output->dims()[1]; + + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const int _kernel_size = 3; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + const float negative_max = -INT_MAX; + const int input_channel_stride = input_height * input_width; + const int output_channel_stride = output_height * output_width; + + const float *input_data = input->data(); + float *output_data = output->mutable_data(); + + const int input_batch_stride = output_channels * input_channel_stride; + const int output_batch_stride = output_channels * output_channel_stride; + const float *pos1, *pos2, *pos3, *output_ptr; + int hstart, wstart, hend, wend; + for (int i = 0; i < batch_size; ++i) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ph++) { + for (int pw = 0; pw < output_width; pw++) { + hstart = ph * stride_height - padding_height; + wstart = pw * stride_width - padding_width; + hend = min(hstart + _kernel_size, input_height + padding_height); + wend = min(wstart + _kernel_size, input_width + padding_width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, input_height); + wend = min(wend, input_width); + pos1 = input_data + hstart * input_width + wstart; + pos2 = input_data + (hstart + 1) * 
input_width + wstart; + pos3 = input_data + (hstart + 2) * input_width + wstart; + output_ptr = output_data + ph * output_width + pw; + + if (hend - hstart != 3 || wend - wstart != 3) { + float max_value = -INT_MAX; + for (int h = hstart; h < hend; h++) { + for (int w = wstart; w < wend; w++) { + float value = input_data[h * input_width + w]; + if (value > max_value) { + max_value = value; + } + } + } + output_data[ph * output_width + pw] = max_value; + } else { +#if defined(ARMV7) + asm volatile( + "vld1.32 {q1}, [%[pos1]] \n\t" + "vld1.32 {q2}, [%[pos2]] \n\t" + "vld1.32 {q3}, [%[pos3]] \n\t" + "vmax.f32 q1, q1, q2 \n\t" + "vmax.f32 q2, q1, q3 \n\t" + "vmov.f32 d5[1], %[negative_max] \n\t" + "vpmax.f32 d6, d4, d5 \n\t" + "vpmax.f32 d7, d6, d6 \n\t" + "vst1.32 {d7[0]},[%[output_ptr]] \n\t" + : + : [input_data] "r"(input_data), [pos1] "r"(pos1), + [pos2] "r"(pos2), [pos3] "r"(pos3), + [output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max) + : "memory", "q1", "q2", "q3", "q4"); +#else + const float32x4_t data1 = vld1q_f32(pos1); + const float32x4_t data2 = vld1q_f32(pos2); + const float32x4_t data3 = vld1q_f32(pos3); + const float32x4_t max_data = + vmaxq_f32(vmaxq_f32(data1, data3), data2); + float32x2_t res = + vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)), + vget_low_f32(max_data)); + res = vpmax_f32(res, res); + output_data[ph * output_width + pw] = vget_lane_f32(res, 0); +#endif + } + } + } + input_data += input_channel_stride; + output_data += output_channel_stride; + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +#endif +} + +void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, + Tensor *output) { +#if __ARM_NEON + const int batch_size = input->dims()[0]; + + const int input_height = input->dims()[2]; + + const int input_width = input->dims()[3]; + + const int output_channels = output->dims()[1]; + + const int output_height = output->dims()[2]; + const int output_width = 
output->dims()[3]; + const int _kernel_size = 3; + const int stride_height = strides[0]; + const int stride_width = strides[1]; + const int padding_height = paddings[0]; + const int padding_width = paddings[1]; + + const int input_channel_stride = input_height * input_width; + const int output_channel_stride = output_height * output_width; + + const float *input_data = input->data(); + float *output_data = output->mutable_data(); + const float zero = 0; + const float nine = 1.0 / 9.0; + const float nine_ptr[] = {nine, nine}; + + const int input_batch_stride = output_channels * input_channel_stride; + const int output_batch_stride = output_channels * output_channel_stride; + for (int i = 0; i < batch_size; ++i) { + for (int c = 0; c < output_channels; ++c) { + for (int ph = 0; ph < output_height; ph++) { + for (int pw = 0; pw < output_width; pw++) { + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int hend = min(hstart + _kernel_size, input_height + padding_height); + int wend = min(wstart + _kernel_size, input_width + padding_width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, input_height); + wend = min(wend, input_width); + const float *pos1 = input_data + hstart * input_width + wstart; + const float *pos2 = input_data + (hstart + 1) * input_width + wstart; + const float *pos3 = input_data + (hstart + 2) * input_width + wstart; + const float *output_ptr = output_data + ph * output_width + pw; + + if (hend - hstart != 3 || wend - wstart != 3) { + float sum = 0; + for (int h = hstart; h < hend; h++) { + for (int w = wstart; w < wend; w++) { + sum += input_data[h * input_width + w]; + } + } + output_data[ph * output_width + pw] = sum / 9.0; + } else { +#if defined(ARMV7) + + asm volatile( + "vld1.32 {q1}, [%[pos1]] \n\t" + "vld1.32 {q2}, [%[pos2]] \n\t" + "vld1.32 {q3}, [%[pos3]] \n\t" + "vadd.f32 q1, q1, q2 \n\t" + "vadd.f32 q2, q1, q3 \n\t" + "vmov.f32 d5[1], %[zero] \n\t" + 
"vpadd.f32 d6, d4, d5 \n\t" + "vpadd.f32 d6, d6, d6 \n\t" + "vld1.f32 d7, [%[nine_ptr]]! \n\t" + "vmul.f32 d6,d7 \n\t" + "vst1.32 {d6[0]},[%[output_ptr]] \n\t" + : + : [input_data] "r"(input_data), [pos1] "r"(pos1), + [pos2] "r"(pos2), [pos3] "r"(pos3), + [output_ptr] "r"(output_ptr), [zero] "r"(zero), + [nine_ptr] "r"(nine_ptr) + : "memory", "r6", "q1", "q2", "q3", "q4"); +#else + const float32x4_t data1 = vld1q_f32(pos1); + const float32x4_t data2 = vld1q_f32(pos2); + const float32x4_t data3 = vld1q_f32(pos3); + const float32x4_t sum_data = + vaddq_f32(vaddq_f32(data1, data3), data2); + float32x2_t res = + vpadd_f32(vget_high_f32(vsetq_lane_f32(0, sum_data, 3)), + vget_low_f32(sum_data)); + res = vpadd_f32(res, res); + output_data[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0; +#endif + } + } + } + input_data += input_channel_stride; + output_data += output_channel_stride; + } + input_data += input_batch_stride; + output_data += output_batch_stride; + } +#endif +} +} // namespace math +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/math/pool3x3.h b/src/operators/math/pool_3x3.h similarity index 59% rename from src/operators/math/pool3x3.h rename to src/operators/math/pool_3x3.h index 3852b901871eb4cdcff0497a1ad2854abf93b7b6..22a398084390701aefc8815c9aa93b82b4c4ec7b 100644 --- a/src/operators/math/pool3x3.h +++ b/src/operators/math/pool_3x3.h @@ -12,16 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef POOL_OP + #pragma once +#include "framework/tensor.h" #if __ARM_NEON #include #endif // __ARM_NEON -static void Pool3x3Max() { - // todo impl with neon -} +namespace paddle_mobile { +namespace operators { +namespace math { +using framework::Tensor; +using std::vector; + +void Pool3x3Max(vector strides, vector paddings, const Tensor *input, + Tensor *output); + +void Pool3x3Avg(vector strides, vector paddings, const Tensor *in_x, + Tensor *out); +} // namespace math +} // namespace operators +} // namespace paddle_mobile -static void Pool3x3Avg() { - // todo impl with neon -} +#endif diff --git a/src/operators/math/pooling.cpp b/src/operators/math/pooling.cpp index 07afdb7d14a7260e547e072cc67bd1613e812944..4287408394f1a7f407154938f3e83e9fac3543a2 100644 --- a/src/operators/math/pooling.cpp +++ b/src/operators/math/pooling.cpp @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef POOL_OP + #include "pooling.h" -#include +#include "common/types.h" namespace paddle_mobile { namespace operators { @@ -36,9 +38,7 @@ class PoolFunctor { const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; - if (output == nullptr) { - DLOG << "output tensor is null"; - } + const int output_channels = output->dims()[1]; const int output_height = output->dims()[2]; @@ -57,7 +57,7 @@ class PoolFunctor { T *output_data = output->mutable_data(); for (int i = 0; i < batch_size; i++) { -#pragma omp parallel for + #pragma omp parallel for for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { int hstart = ph * stride_height - padding_height; @@ -91,3 +91,5 @@ template class PoolFunctor, float>; } // namespace math } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/math/pooling.h b/src/operators/math/pooling.h index e511fc0518cb755d481b347df449d0e242a58e14..bc2ecf41d224c2b0fd518d44fecc3f688d98ee19 100644 --- a/src/operators/math/pooling.h +++ b/src/operators/math/pooling.h @@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef POOL_OP + #pragma once #include "common/log.h" #include "framework/tensor.h" +#include "pool_2x2.h" +#include "pool_3x3.h" namespace paddle_mobile { namespace operators { @@ -64,3 +68,5 @@ class PoolFunctor { } } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/math/softmax.cpp b/src/operators/math/softmax.cpp index 224382eb2b78b1653da0cbbd9327cabb4fd9b3d1..a1eb4f13d82376d86da258101b15e6ae5e8bdc97 100644 --- a/src/operators/math/softmax.cpp +++ b/src/operators/math/softmax.cpp @@ -11,6 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ + +#ifdef SOFTMAX_OP + #include "operators/math/softmax.h" #include "common/types.h" #if __ARM_NEON @@ -153,3 +156,4 @@ template class SoftmaxFuntor; } // namespace math } // namespace operators } // namespace paddle_mobile +#endif diff --git a/src/operators/math/softmax.h b/src/operators/math/softmax.h index 232497da531a44c14772916fa26328c4b3a1f130..e2ca8f30b067e9262a0e87f4ba5807df07949e73 100644 --- a/src/operators/math/softmax.h +++ b/src/operators/math/softmax.h @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef SOFTMAX_OP #pragma once #include "framework/tensor.h" namespace paddle_mobile { @@ -26,3 +27,4 @@ class SoftmaxFuntor { } // namespace math } // namespace operators } // namespace paddle_mobile +#endif diff --git a/src/operators/mul_op.cpp b/src/operators/mul_op.cpp index 80c20122f4b04a3de13a95bc8ed26d48f7464f44..49ae3a5e8484cb2f6628eb53cabd9321ae5705b8 100644 --- a/src/operators/mul_op.cpp +++ b/src/operators/mul_op.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef MUL_OP + #include "mul_op.h" namespace paddle_mobile { @@ -19,10 +21,10 @@ namespace operators { template void MulOp::InferShape() const { - auto x_dims = param_.InputX()->dims(); - auto y_dims = param_.InputY()->dims(); - int x_num_col_dims = param_.XNumColDims(); - int y_num_col_dims = param_.YNumColDims(); + auto x_dims = this->param_.InputX()->dims(); + auto y_dims = this->param_.InputY()->dims(); + int x_num_col_dims = this->param_.XNumColDims(); + int y_num_col_dims = this->param_.YNumColDims(); assert(x_dims.size() > x_num_col_dims); assert(y_dims.size() > y_num_col_dims); @@ -46,12 +48,22 @@ void MulOp::InferShape() const { } framework::DDim ddim = framework::make_ddim(output_dims); - param_.Out()->Resize(ddim); + this->param_.Out()->Resize(ddim); } template class MulOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(mul); -REGISTER_OPERATOR(mul, ops::MulOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(mul); +REGISTER_OPERATOR_CPU(mul, ops::MulOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(mul); +REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/mul_op.h b/src/operators/mul_op.h index ded618551fca682daea0bacc3635776eeb81301c..ad5c9a3702348455cb559c28453df82d81e1c4c8 100644 --- a/src/operators/mul_op.h +++ b/src/operators/mul_op.h @@ -11,6 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + +#ifdef MUL_OP + #pragma once #include @@ -22,26 +25,25 @@ namespace paddle_mobile { namespace operators { template -class MulOp : public framework::OperatorWithKernel { +class MulOp : public framework::OperatorWithKernel< + DeviceType, MulParam, operators::MulKernel> { public: MulOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap attrs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} - void RunImpl() const { - operators::MulKernel kernel; - kernel.Compute(param_); - } - - using framework::OperatorWithKernel::OperatorWithKernel; + using framework::OperatorWithKernel< + DeviceType, MulParam, + operators::MulKernel>::OperatorWithKernel; void InferShape() const override; protected: - MulParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/multiclass_nms_op.cpp b/src/operators/multiclass_nms_op.cpp index bc796010b231929b3f0c017b68f33b861a84262d..52adf6cc627d76b18b3b48928c344545327ca99e 100644 --- a/src/operators/multiclass_nms_op.cpp +++ b/src/operators/multiclass_nms_op.cpp @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef MULTICLASSNMS_OP + #include "operators/multiclass_nms_op.h" namespace paddle_mobile { namespace operators { template void MultiClassNMSOp::InferShape() const { - auto input_bboxes_dims = param_.InputBBoxes()->dims(); - auto input_scores_dims = param_.InputScores()->dims(); + auto input_bboxes_dims = this->param_.InputBBoxes()->dims(); + auto input_scores_dims = this->param_.InputScores()->dims(); if (input_scores_dims.size() != 3) { LOG(kLOG_ERROR) << "Input Scores size must be 3"; } @@ -30,12 +32,20 @@ void MultiClassNMSOp::InferShape() const { LOG(kLOG_ERROR) << "Predict bboxes must be equal"; } // pre size, will change in Compute. - param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6})); + this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6})); } template class MultiClassNMSOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(multiclass_nms); -REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(multiclass_nms); +REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/multiclass_nms_op.h b/src/operators/multiclass_nms_op.h index c424856b8cdc09b365a7ece28df39a911b6d3af8..30cf8f67942f7888599e8f0057baff1ddd5d6cea 100644 --- a/src/operators/multiclass_nms_op.h +++ b/src/operators/multiclass_nms_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef MULTICLASSNMS_OP + #pragma once #include @@ -26,27 +28,28 @@ namespace operators { using paddle_mobile::framework::Tensor; template -class MultiClassNMSOp : public framework::OperatorWithKernel { +class MultiClassNMSOp : public framework::OperatorWithKernel< + DeviceType, MultiClassNMSParam, + operators::MultiClassNMSKernel> { public: MultiClassNMSOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap attrs, + const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - - void RunImpl() const { - operators::MultiClassNMSKernel kernel; - kernel.Compute(param_); - } - - using framework::OperatorWithKernel::OperatorWithKernel; + : framework::OperatorWithKernel< + DeviceType, MultiClassNMSParam, + operators::MultiClassNMSKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, MultiClassNMSParam, + operators::MultiClassNMSKernel>::OperatorWithKernel; void InferShape() const override; protected: - MultiClassNMSParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/op_param.cpp b/src/operators/op_param.cpp index ac6ae4cdef77af623097bf6a6d1e73f55339a71a..4ad7685731bdb44794c235c639be7eed4a0c812b 100644 --- a/src/operators/op_param.cpp +++ b/src/operators/op_param.cpp @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "op_param.h" - namespace paddle_mobile { namespace operators { + +#ifdef CONV_OP Print &operator<<(Print &printer, const ConvParam &conv_param) { printer << "parameter of conv: " << "\n"; @@ -36,5 +37,33 @@ Print &operator<<(Print &printer, const ConvParam &conv_param) { printer << " output dims: " << conv_param.Output()->dims(); return printer; } +#endif + +#ifdef FUSION_CONVADD_OP + +Print &operator<<(Print &printer, const FusionConvAddParam &conv_param) { + printer << "parameter of conv_add: " + << "\n"; + printer << " stride: " + << " (" << conv_param.Strides()[0] << conv_param.Strides()[1] << ") " + << "\n"; + printer << " paddings: " + << " (" << conv_param.Paddings()[0] << conv_param.Paddings()[1] + << ") " + << "\n"; + printer << " dilations: " + << " (" << conv_param.Dilations()[0] << conv_param.Dilations()[1] + << ") " + << "\n"; + printer << " groups: " << conv_param.Groups() << "\n"; + printer << " input dims: " << conv_param.Input()->dims() << "\n"; + printer << " filter dims: " << conv_param.Filter()->dims() << "\n"; + printer << " bias dims: " << conv_param.Bias()->dims() << "\n"; + printer << " output dims: " << conv_param.Output()->dims(); + return printer; +} + +#endif + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/op_param.h b/src/operators/op_param.h index e0efa046ca82091d4d6d4eaa1dd7024e2ba1629b..4e51e412f78049b6f5a83aae5ffc600af611de1a 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -34,7 +34,7 @@ using framework::Tensor; using std::string; using std::vector; -class OpParam : PaddleMobileObject { +class OpParam { protected: template static T *InputFrom(const VariableNameMap &inputs, const Scope &scope) { @@ -165,10 +165,10 @@ class OpParam : PaddleMobileObject { template static T *GetVarValue(const string &key, const VariableNameMap &var_map, const Scope &scope) { + PADDLE_MOBILE_ENFORCE(var_map.count(key) > 0, + "%s is not contained in var_map", key.c_str()) auto var_vec 
= var_map.at(key); if (!var_vec.empty()) { - // std::cout << " get var value -- " << var_vec[0] << - // std::endl; auto var = scope.FindVar(var_vec[0]); return var->GetMutable(); } else { @@ -191,6 +191,7 @@ class OpParam : PaddleMobileObject { } }; +#ifdef CONV_OP class ConvParam : OpParam { public: ConvParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -230,7 +231,9 @@ class ConvParam : OpParam { }; Print &operator<<(Print &printer, const ConvParam &conv_param); +#endif +#ifdef ELEMENTWISEADD_OP class ElementwiseAddParam : OpParam { public: ElementwiseAddParam(const VariableNameMap &inputs, @@ -258,6 +261,9 @@ class ElementwiseAddParam : OpParam { int axis_; }; +#endif + +#ifdef MUL_OP class MulParam : OpParam { public: MulParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -287,7 +293,9 @@ class MulParam : OpParam { int x_num_col_dims_; int y_num_col_dims_; }; +#endif +#ifdef CONCAT_OP class ConcatParam : public OpParam { public: ConcatParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -309,7 +317,9 @@ class ConcatParam : public OpParam { Tensor *out_; int axis_; }; +#endif +#ifdef LRN_OP class LrnParam : public OpParam { public: LrnParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -351,6 +361,9 @@ class LrnParam : public OpParam { float k_; string data_format_; }; +#endif + +#ifdef BATCHNORM_OP class BatchNormParam : OpParam { public: BatchNormParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -399,6 +412,9 @@ class BatchNormParam : OpParam { bool is_test_; string data_format_; }; +#endif + +#ifdef POOL_OP class PoolParam : public OpParam { public: PoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -442,6 +458,9 @@ class PoolParam : public OpParam { bool gloabal_pooling_ = false; }; +#endif + +#ifdef PRIORBOX_OP class PriorBoxParam : public OpParam { public: PriorBoxParam(const VariableNameMap &inputs, const VariableNameMap &outputs, 
@@ -503,7 +522,9 @@ class PriorBoxParam : public OpParam { float step_h_; float offset_; }; +#endif +#ifdef BOXCODER_OP class BoxCoderParam : public OpParam { public: BoxCoderParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -533,7 +554,9 @@ class BoxCoderParam : public OpParam { Tensor *output_box_; std::string code_type_; }; +#endif +#ifdef SOFTMAX_OP class SoftmaxParam : public OpParam { public: SoftmaxParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -549,7 +572,9 @@ class SoftmaxParam : public OpParam { Tensor *input_x_; Tensor *out_; }; +#endif +#ifdef SIGMOID_OP class SigmoidParam : public OpParam { public: SigmoidParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -565,6 +590,9 @@ class SigmoidParam : public OpParam { Tensor *input_x_; Tensor *out_; }; +#endif + +#ifdef MULTICLASSNMS_OP class MultiClassNMSParam : public OpParam { public: MultiClassNMSParam(const VariableNameMap &inputs, @@ -610,6 +638,7 @@ class MultiClassNMSParam : public OpParam { float nms_eta_; float score_threshold_; }; +#endif class FeedParam : public OpParam { public: @@ -646,6 +675,7 @@ class FetchParam : public OpParam { Tensor *out_; }; +#ifdef TRANSPOSE_OP class TransposeParam : public OpParam { public: TransposeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -666,7 +696,9 @@ class TransposeParam : public OpParam { Tensor *out_; vector axis_; }; +#endif +#ifdef RESHAPE_OP class ReshapeParam : public OpParam { public: ReshapeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -695,7 +727,9 @@ class ReshapeParam : public OpParam { vector shape_; bool inplace_; }; +#endif +#ifdef RELU_OP /* * @b op 层实例化好这个 param 传递给 kernel 层使用 * */ @@ -715,11 +749,13 @@ class ReluParam : public OpParam { Tensor *input_x_; Tensor *out_; }; +#endif -class FushionFcParam : public OpParam { +#ifdef FUSION_FC_OP +class FusionFcParam : public OpParam { public: - FushionFcParam(const VariableNameMap 
&inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { + FusionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { input_x_ = InputXFrom(inputs, scope); input_y_ = InputYFrom(inputs, scope); input_z_ = InputZFrom(inputs, scope); @@ -751,6 +787,66 @@ class FushionFcParam : public OpParam { int y_num_col_dims_; int axis_; }; +#endif + +#ifdef FUSION_CONVADD_OP +class FusionConvAddParam : public OpParam { + public: + FusionConvAddParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + bias_ = InputYFrom(inputs, scope); + axis_ = GetAttr("axis", attrs); + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_ = OutFrom(outputs, scope); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + } + Tensor *Bias() const { return bias_; } + + const int &Axis() const { return axis_; } + + const Tensor *Input() const { return input_; } + + const Tensor *Filter() const { return filter_; } + + Tensor *Output() const { return output_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() const { return groups; } + + protected: + Tensor *bias_; + int axis_; + Tensor *input_; + Tensor *output_; + Tensor *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; +}; + +Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); +#endif + +#ifdef FUSION_CONVADD_RELU_OP +class FusionConvAddReluParam : public FusionConvAddParam { + public: + FusionConvAddReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : 
FusionConvAddParam(inputs, outputs, attrs, scope) {} +}; +#endif class Im2SequenceParam : public OpParam { public: diff --git a/src/operators/pool_op.cpp b/src/operators/pool_op.cpp index 3096199dc3e3157f9fa0048ad35f796e24113f28..62eaf6b5f8105c4d2ab63f2f883445705b815860 100644 --- a/src/operators/pool_op.cpp +++ b/src/operators/pool_op.cpp @@ -12,7 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef POOL_OP + #include "pool_op.h" +#include "framework/op_proto_maker.h" +#include "framework/op_registry.h" namespace paddle_mobile { namespace operators { @@ -30,13 +34,13 @@ int PoolOutputSize(int input_size, int filter_size, int padding, int stride, } template void PoolOp::InferShape() const { - auto in_x_dims = param_.Input()->dims(); - std::vector ksize = param_.Ksize(); - std::vector paddings = param_.Paddings(); - std::vector strides = param_.Strides(); - bool ceil_mode = param_.isCeilMode(); + auto in_x_dims = this->param_.Input()->dims(); + std::vector ksize = this->param_.Ksize(); + std::vector paddings = this->param_.Paddings(); + std::vector strides = this->param_.Strides(); + bool ceil_mode = this->param_.isCeilMode(); - if (param_.isGlobalPooling()) { + if (this->param_.isGlobalPooling()) { ksize.resize(static_cast(in_x_dims.size()) - 2); for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -48,12 +52,22 @@ void PoolOp::InferShape() const { output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i], ceil_mode)); } - param_.Output()->Resize(framework::make_ddim(output_shape)); + this->param_.Output()->Resize(framework::make_ddim(output_shape)); } template class PoolOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(pool2d); -REGISTER_OPERATOR(pool2d, ops::PoolOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(pool2d); 
+REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(pool2d); +REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/pool_op.h b/src/operators/pool_op.h index ff44771c56151acf699b017ddf834a2d32e07761..5b436fb18bdc055add21acd37e5a1a9c7b6e5b02 100644 --- a/src/operators/pool_op.h +++ b/src/operators/pool_op.h @@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef POOL_OP + #pragma once -#include -#include -#include #include +#include "framework/operator.h" +#include "operators/kernel/pool_kernel.h" +#include "operators/op_param.h" + namespace paddle_mobile { namespace operators { using framework::AttributeMap; @@ -26,24 +29,23 @@ using framework::OperatorWithKernel; using framework::Scope; using std::string; template -class PoolOp : public OperatorWithKernel { +class PoolOp : public OperatorWithKernel> { public: PoolOp(const string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope) - : OperatorWithKernel(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, *scope) {} - using OperatorWithKernel::OperatorWithKernel; + : OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} + using OperatorWithKernel< + DeviceType, PoolParam, + operators::PoolKernel>::OperatorWithKernel; void InferShape() const override; - void RunImpl() const { - operators::PoolKernel kernel; - kernel.Compute(param_); - this->ClearVariables({"X"}); - } - private: - PoolParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/prior_box_op.cpp b/src/operators/prior_box_op.cpp index 3928c3db53414dbb3ef9a6ae4ebe5527dc5eeeca..44e1741b66f301aee55f1f4d33b9bb1173e6004d 100644 --- 
a/src/operators/prior_box_op.cpp +++ b/src/operators/prior_box_op.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef PRIORBOX_OP + #include "operators/prior_box_op.h" #include namespace paddle_mobile { @@ -19,13 +21,13 @@ namespace operators { template void PriorBoxOp::InferShape() const { - auto input_dims = param_.Input()->dims(); - auto input_image_dims = param_.InputImage()->dims(); - auto min_sizes = param_.MinSizes(); - auto max_sizes = param_.MaxSizes(); - auto variances = param_.Variances(); - auto aspect_ratios = param_.AspectRatios(); - bool flip = param_.Flip(); + auto input_dims = this->param_.Input()->dims(); + auto input_image_dims = this->param_.InputImage()->dims(); + auto min_sizes = this->param_.MinSizes(); + auto max_sizes = this->param_.MaxSizes(); + auto variances = this->param_.Variances(); + auto aspect_ratios = this->param_.AspectRatios(); + bool flip = this->param_.Flip(); std::vector aspect_ratios_vec; ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec); @@ -39,13 +41,21 @@ void PriorBoxOp::InferShape() const { dim_vec[1] = input_dims[3]; dim_vec[2] = num_priors; dim_vec[3] = 4; - param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec)); - param_.OutputVariances()->Resize(framework::make_ddim(dim_vec)); + this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec)); + this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec)); } template class PriorBoxOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(prior_box); -REGISTER_OPERATOR(prior_box, ops::PriorBoxOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(prior_box); +REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/prior_box_op.h 
b/src/operators/prior_box_op.h index 84481e602a6cb4143a50760e66b0d430b8a1c719..5b3e3fffd6787360b69ff3af2d19bc8e05549c04 100644 --- a/src/operators/prior_box_op.h +++ b/src/operators/prior_box_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef PRIORBOX_OP + #pragma once #include @@ -26,27 +28,27 @@ namespace operators { using paddle_mobile::framework::Tensor; template -class PriorBoxOp : public framework::OperatorWithKernel { +class PriorBoxOp + : public framework::OperatorWithKernel< + DeviceType, PriorBoxParam, operators::PriorBoxKernel> { public: PriorBoxOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap attrs, + const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - - void RunImpl() const { - operators::PriorBoxKernel kernel; - kernel.Compute(param_); - } + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel::OperatorWithKernel; + using framework::OperatorWithKernel< + DeviceType, PriorBoxParam, + operators::PriorBoxKernel>::OperatorWithKernel; void InferShape() const override; protected: - PriorBoxParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp index 21bcc605282ffc590025e87b609cccc855a631d1..877dcee1a7f4a5a75d013031235d3a216c35f854 100644 --- a/src/operators/relu_op.cpp +++ b/src/operators/relu_op.cpp @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef RELU_OP + #include "operators/relu_op.h" namespace paddle_mobile { namespace operators { template void ReluOp::InferShape() const { - auto input_dims = param_.InputX()->dims(); - param_.Out()->Resize(input_dims); + auto input_dims = this->param_.InputX()->dims(); + this->param_.Out()->Resize(input_dims); } template class ReluOp; } // namespace operators @@ -31,5 +33,15 @@ template class ReluOp; * 都是需要和model中类型对应起来的 * */ namespace ops = paddle_mobile::operators; -USE_OP(relu); -REGISTER_OPERATOR(relu, ops::ReluOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(relu); +REGISTER_OPERATOR_CPU(relu, ops::ReluOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(relu); +REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/relu_op.h b/src/operators/relu_op.h index 7be8cd249cb22255dff237da6c8653e6237bbc3f..8f9e55cf8a2d5bb58e85c21cd2cee3647b00fa22 100644 --- a/src/operators/relu_op.h +++ b/src/operators/relu_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef RELU_OP + #pragma once #include @@ -26,36 +28,29 @@ namespace operators { using paddle_mobile::framework::Tensor; template -class ReluOp : public framework::OperatorWithKernel { +class ReluOp + : public framework::OperatorWithKernel< + DeviceType, ReluParam, operators::ReluKernel> { public: /* * @b op 的实例化方法, 需要调用父类的实例化方法, 以及实例化自己的参数结构体 * */ ReluOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap attrs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - - /* - * @b op 进行运算, 调用相应的 kernel 进行运算 - * */ - void RunImpl() const { - operators::ReluKernel kernel; - kernel.Compute(param_); - } + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel::OperatorWithKernel; + using framework::OperatorWithKernel< + DeviceType, ReluParam, + operators::ReluKernel>::OperatorWithKernel; void InferShape() const override; protected: - /* - * @b Relu kernel 进行运算时所需要用到参数的结构体, - * 结构体定义在: paddle-mobile/src/operators/op_param.h - * */ - ReluParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/reshape_op.cpp b/src/operators/reshape_op.cpp index 6562b7a5eb491a7e69e9bd9481251b8aaf9f3f4b..c7294079b26250770006aeb1b79c15469489b988 100644 --- a/src/operators/reshape_op.cpp +++ b/src/operators/reshape_op.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef RESHAPE_OP + #include "operators/reshape_op.h" #include namespace paddle_mobile { @@ -20,15 +22,25 @@ namespace operators { template void ReshapeOp::InferShape() const { /// todo: add InputShape() detection. 
- auto &shape = param_.Shape(); - auto input_x_dims = param_.InputX()->dims(); + auto &shape = this->param_.Shape(); + auto input_x_dims = this->param_.InputX()->dims(); auto out_dims = ValidateShape(shape, input_x_dims); - param_.Out()->Resize(out_dims); + this->param_.Out()->Resize(out_dims); } template class ReshapeOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(reshape); -REGISTER_OPERATOR(reshape, ops::ReshapeOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(reshape); +REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(reshape); +REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/reshape_op.h b/src/operators/reshape_op.h index b244e62a930a0e6a98d56fe06a4e4a7e37f7d5e1..90d31153135f629585d56eb89ae12830215900d8 100644 --- a/src/operators/reshape_op.h +++ b/src/operators/reshape_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef RESHAPE_OP + #pragma once #include @@ -26,26 +28,27 @@ namespace operators { using paddle_mobile::framework::Tensor; template -class ReshapeOp : public framework::OperatorWithKernel { +class ReshapeOp + : public framework::OperatorWithKernel< + DeviceType, ReshapeParam, operators::ReshapeKernel> { public: ReshapeOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, const framework::AttributeMap attrs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - - void RunImpl() const { - operators::ReshapeKernel kernel; - kernel.Compute(param_); - } + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel::OperatorWithKernel; + using framework::OperatorWithKernel< + DeviceType, ReshapeParam, + operators::ReshapeKernel>::OperatorWithKernel; void InferShape() const override; protected: - ReshapeParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/sigmoid_op.cpp b/src/operators/sigmoid_op.cpp index 6bff80a35aa019a7b05f6e9b58c49e13fb8f1bc8..79190e6c3368b9d375770062d948580779393f04 100644 --- a/src/operators/sigmoid_op.cpp +++ b/src/operators/sigmoid_op.cpp @@ -12,18 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef SIGMOID_OP + #include "operators/sigmoid_op.h" namespace paddle_mobile { namespace operators { template void SigmoidOp::InferShape() const { - param_.Out()->Resize(param_.InputX()->dims()); + this->param_.Out()->Resize(this->param_.InputX()->dims()); } template class SigmoidOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(sigmoid); -REGISTER_OPERATOR(sigmoid, ops::SigmoidOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(sigmoid); +REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/sigmoid_op.h b/src/operators/sigmoid_op.h index f631ba51759ea31f91ddcdf7c90a0dc874e86b20..bd914a63783f65c7b55d783f2bbcdf19c303c00f 100644 --- a/src/operators/sigmoid_op.h +++ b/src/operators/sigmoid_op.h @@ -12,38 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef SIGMOID_OP + #pragma once -#include -#include #include + +#include "framework/operator.h" #include "operators/kernel/sigmoid_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { template -class SigmoidOp : public framework::OperatorWithKernel { +class SigmoidOp + : public framework::OperatorWithKernel< + DeviceType, SigmoidParam, operators::SigmoidKernel> { public: SigmoidOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel::OperatorWithKernel; + using framework::OperatorWithKernel< + DeviceType, SigmoidParam, + operators::SigmoidKernel>::OperatorWithKernel; void InferShape() const override; - - void RunImpl() const { - operators::SigmoidKernel kernel; - kernel.Compute(param_); - this->ClearVariables({"X"}); - } - - private: - SigmoidParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp index c353d0b882cb8f0682f9e4710ff05c32ca68e685..296e3ef30f7c0260cca169bcfe2f6b445493792a 100644 --- a/src/operators/softmax_op.cpp +++ b/src/operators/softmax_op.cpp @@ -12,18 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef SOFTMAX_OP + #include "operators/softmax_op.h" namespace paddle_mobile { namespace operators { template void SoftmaxOp::InferShape() const { - param_.Out()->Resize(param_.InputX()->dims()); + this->param_.Out()->Resize(this->param_.InputX()->dims()); } template class SoftmaxOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(softmax); -REGISTER_OPERATOR(softmax, ops::SoftmaxOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(softmax); +REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +USE_OP_MALI_GPU(softmax); +REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h index 07fd9b945cb29cecd6f4d629b6be58035f971ce4..1445ca055ea0472cdaa02d7496ff895feb9174bc 100644 --- a/src/operators/softmax_op.h +++ b/src/operators/softmax_op.h @@ -12,38 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef SOFTMAX_OP + #pragma once -#include -#include #include + +#include "framework/operator.h" #include "operators/kernel/softmax_kernel.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { template -class SoftmaxOp : public framework::OperatorWithKernel { +class SoftmaxOp + : public framework::OperatorWithKernel< + DeviceType, SoftmaxParam, operators::SoftmaxKernel> { public: SoftmaxOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} + : framework::OperatorWithKernel>( + type, inputs, outputs, attrs, scope) {} - using framework::OperatorWithKernel::OperatorWithKernel; + using framework::OperatorWithKernel< + DeviceType, SoftmaxParam, + operators::SoftmaxKernel>::OperatorWithKernel; void InferShape() const override; - void RunImpl() const { - operators::SoftmaxKernel kernel; - kernel.Compute(param_); - this->ClearVariables({"X"}); - } - private: - SoftmaxParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/transpose_op.cpp b/src/operators/transpose_op.cpp index e21338bf1b59981e914ca4a8e1781e02254bc00c..989b277b9d58a8c029e041a89a1982f8994bae44 100644 --- a/src/operators/transpose_op.cpp +++ b/src/operators/transpose_op.cpp @@ -12,16 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "operators/transpose_op.h" -#include +#ifdef TRANSPOSE_OP + #include + +#include "common/enforce.h" +#include "operators/transpose_op.h" namespace paddle_mobile { namespace operators { template void TransposeOp::InferShape() const { - auto input_x_dims = param_.InputX()->dims(); - auto axis = param_.Axis(); + auto input_x_dims = this->param_.InputX()->dims(); + auto axis = this->param_.Axis(); size_t x_dims_size = input_x_dims.size(); size_t axis_size = axis.size(); @@ -42,12 +45,20 @@ void TransposeOp::InferShape() const { for (size_t i = 0; i < axis_size; i++) { out_dims[i] = input_x_dims[axis[i]]; } - param_.Out()->Resize(out_dims); + this->param_.Out()->Resize(out_dims); } template class TransposeOp; } // namespace operators } // namespace paddle_mobile namespace ops = paddle_mobile::operators; -USE_OP(transpose); -REGISTER_OPERATOR(transpose, ops::TransposeOp); +#ifdef PADDLE_MOBILE_CPU +USE_OP_CPU(transpose); +REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/transpose_op.h b/src/operators/transpose_op.h index 0f67339533261f98374c6257494278306f3a7208..349220b58ff3e0daec8c7dc2e2dec969ced8b289 100644 --- a/src/operators/transpose_op.h +++ b/src/operators/transpose_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef TRANSPOSE_OP + #pragma once #include @@ -26,27 +28,26 @@ namespace operators { using paddle_mobile::framework::Tensor; template -class TransposeOp : public framework::OperatorWithKernel { +class TransposeOp : public framework::OperatorWithKernel< + DeviceType, TransposeParam, + operators::TransposeKernel> { public: TransposeOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, - const framework::AttributeMap attrs, + const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel(type, inputs, outputs, attrs, - scope), - param_(inputs, outputs, attrs, *scope) {} - - void RunImpl() const { - operators::TransposeKernel kernel; - kernel.Compute(param_); - } - - using framework::OperatorWithKernel::OperatorWithKernel; + : framework::OperatorWithKernel< + DeviceType, TransposeParam, + operators::TransposeKernel>(type, inputs, outputs, + attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, TransposeParam, + operators::TransposeKernel>::OperatorWithKernel; void InferShape() const override; - - protected: - TransposeParam param_; }; } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/platform/data_type.h b/src/platform/data_type.h deleted file mode 100644 index 44e0158a7cd7f912689f8514c9c8cfddae5654a1..0000000000000000000000000000000000000000 --- a/src/platform/data_type.h +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "framework/program/tensor_desc.h" - -namespace paddle_mobile { -namespace framework { - -inline VarType_Type ToDataType(std::type_index type) { - /*if (typeid(platform::float16).hash_code() == type.hash_code()) { - return proto::VarType::FP16; - } else */ - if (typeid(const float).hash_code() == type.hash_code()) { - // CPPLint complains Using C-style cast. Use - // static_cast() instead - // One fix to this is to replace float with const float because - // typeid(T) == typeid(const T) - // http://en.cppreference.com/w/cpp/language/typeid - return VARTYPE_TYPE_FP32; - } else if (typeid(const double).hash_code() == type.hash_code()) { - return VARTYPE_TYPE_FP64; - } else if (typeid(const int).hash_code() == type.hash_code()) { - return VARTYPE_TYPE_INT32; - } else if (typeid(const int64_t).hash_code() == type.hash_code()) { - return VARTYPE_TYPE_INT64; - } else if (typeid(const bool).hash_code() == type.hash_code()) { - return VARTYPE_TYPE_BOOL; - } else { - // PADDLE_THROW("Not supported"); - // std::cout << "Not supported"; - } -} - -inline std::type_index ToTypeIndex(VarType_Type type) { - switch (type) { - // case proto::VarType::FP16: - // return typeid(platform::float16); - case VARTYPE_TYPE_FP32: - return typeid(float); - case VARTYPE_TYPE_FP64: - return typeid(double); - case VARTYPE_TYPE_INT32: - return typeid(int); - case VARTYPE_TYPE_INT64: - return typeid(int64_t); - case VARTYPE_TYPE_BOOL: - return typeid(bool); - default: - // PADDLE_THROW("Not support type %d", type); - printf("Not support type %d", type); - } -} - -template -inline void VisitDataType(VarType_Type type, Visitor visitor) { - switch (type) { - // case proto::VarType::FP16: - // visitor.template operator()(); - // break; - case VARTYPE_TYPE_FP32: - visitor.template operator()(); - break; - case VARTYPE_TYPE_FP64: - visitor.template 
operator()(); - break; - case VARTYPE_TYPE_INT32: - visitor.template operator()(); - break; - case VARTYPE_TYPE_INT64: - visitor.template operator()(); - break; - case VARTYPE_TYPE_BOOL: - visitor.template operator()(); - break; - default: - // PADDLE_THROW("Not supported"); - printf("Not supported"); - } -} - -inline std::string DataTypeToString(const VarType_Type type) { - switch (type) { - case VARTYPE_TYPE_FP16: - return "float16"; - case VARTYPE_TYPE_FP32: - return "float32"; - case VARTYPE_TYPE_FP64: - return "float64"; - case VARTYPE_TYPE_INT16: - return "int16"; - case VARTYPE_TYPE_INT32: - return "int32"; - case VARTYPE_TYPE_INT64: - return "int64"; - case VARTYPE_TYPE_BOOL: - return "bool"; - default: - // PADDLE_THROW("Not support type %d", type); - printf("Not support type %d", type); - } -} - -inline std::ostream &operator<<(std::ostream &out, const VarType_Type &type) { - out << DataTypeToString(type); - return out; -} - -} // namespace framework -} // namespace paddle_mobile diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c71306281e3354cd1856ecaa7278266b031b665c..9bfc55c93daa2f69200941bfb49a8a6312fa9eb1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,116 +1,150 @@ - set(dir ${CMAKE_CURRENT_SOURCE_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build") -# gen test -ADD_EXECUTABLE(test-conv-op operators/test_cov_op.cpp test_helper.h test_include.h executor_for_test.h) -target_link_libraries(test-conv-op paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h) -target_link_libraries(test-mul-op paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h) -target_link_libraries(test-elementwiseadd-op paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h) -target_link_libraries(test-concat-op paddle-mobile) - -# gen test 
-ADD_EXECUTABLE(test-lrn-op operators/test_lrn_op.cpp test_helper.h test_include.h) -target_link_libraries(test-lrn-op paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-batchnorm-op operators/test_batchnorm_op.cpp test_helper.h test_include.h) -target_link_libraries(test-batchnorm-op paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-priorbox-op operators/test_prior_box_op.cpp test_helper.h test_include.h) -target_link_libraries(test-priorbox-op paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-boxcoder-op operators/test_box_coder_op.cpp test_helper.h test_include.h) -target_link_libraries(test-boxcoder-op paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h) -target_link_libraries(test-transpose-op paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h) -target_link_libraries(test-multiclassnms-op paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h) -target_link_libraries(test-reshape-op paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h) -target_link_libraries(test-relu-op paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-fc-op operators/test_fushion_fc_op.cpp test_helper.h test_include.h) -target_link_libraries(test-fc-op paddle-mobile) - -# gen test log -ADD_EXECUTABLE(test-log common/test_log.cpp) -target_link_libraries(test-log paddle-mobile) - -# gen test log -ADD_EXECUTABLE(test-load framework/test_load.cpp) -target_link_libraries(test-load paddle-mobile) - -# gen test log -# gen test -ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp) -target_link_libraries(test-optimize paddle-mobile) +if (googlenet) + # gen test + ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-googlenet paddle-mobile) 
+elseif (mobilenet) + # gen test + ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-mobilenet paddle-mobile) +elseif (yolo) + # gen test + ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-yolo paddle-mobile) +elseif (squeezenet) + # gen test + ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-squeezenet paddle-mobile) +elseif(resnet) + # gen test + ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-resnet paddle-mobile) +else () + + # gen test + ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-resnet paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-squeezenet paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-yolo paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-googlenet paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-conv-op operators/test_cov_op.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-conv-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h) + target_link_libraries(test-mul-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h) + target_link_libraries(test-elementwiseadd-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-concat-op 
operators/test_concat_op.cpp test_helper.h test_include.h) + target_link_libraries(test-concat-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-lrn-op operators/test_lrn_op.cpp test_helper.h test_include.h) + target_link_libraries(test-lrn-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-batchnorm-op operators/test_batchnorm_op.cpp test_helper.h test_include.h) + target_link_libraries(test-batchnorm-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-priorbox-op operators/test_prior_box_op.cpp test_helper.h test_include.h) + target_link_libraries(test-priorbox-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-boxcoder-op operators/test_box_coder_op.cpp test_helper.h test_include.h) + target_link_libraries(test-boxcoder-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h) + target_link_libraries(test-transpose-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h) + target_link_libraries(test-multiclassnms-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h) + target_link_libraries(test-reshape-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h) + target_link_libraries(test-relu-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h) + target_link_libraries(test-fc-op paddle-mobile) + + # gen test log + ADD_EXECUTABLE(test-log common/test_log.cpp) + target_link_libraries(test-log paddle-mobile) + + # gen test log + ADD_EXECUTABLE(test-load framework/test_load.cpp) + target_link_libraries(test-load paddle-mobile) + + # gen test log + # gen test + ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp) + target_link_libraries(test-optimize paddle-mobile) + + + #gen test + 
ADD_EXECUTABLE(test-pool operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-pool paddle-mobile) + + #gen test + ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-softmax paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-gemm common/test_gemm.cpp) + target_link_libraries(test-gemm paddle-mobile) + # gen test + ADD_EXECUTABLE(test-enforce common/test_enforce.cpp) + target_link_libraries(test-enforce paddle-mobile) -#gen test -ADD_EXECUTABLE(test-pool operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h) -target_link_libraries(test-pool paddle-mobile) + # gen test - test if openmp works + ADD_EXECUTABLE(test-openmp common/test_openmp.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-openmp paddle-mobile) -#gen test -ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h) -target_link_libraries(test-softmax paddle-mobile) + # gen test + ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-mobilenetssd paddle-mobile) -# gen test -ADD_EXECUTABLE(test-gemm common/test_gemm.cpp) -target_link_libraries(test-gemm paddle-mobile) + # gen test + ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h) + target_link_libraries(test-sigmoid paddle-mobile) -# gen test -ADD_EXECUTABLE(test-enforce common/test_enforce.cpp) -target_link_libraries(test-enforce paddle-mobile) + # gen test + ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-depthwise-conv-op paddle-mobile) -# gen test -ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) -target_link_libraries(test-yolo paddle-mobile) + # gen test 
+ ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-mobilenet paddle-mobile) -# gen test -ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) -target_link_libraries(test-googlenet paddle-mobile) + # gen test + ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-conv-add-relu-op paddle-mobile) -# gen test -ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) -target_link_libraries(test-mobilenet paddle-mobile) + #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) -# gen test -ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) -target_link_libraries(test-resnet paddle-mobile) -# gen test -ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h) -target_link_libraries(test-mobilenetssd paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) -target_link_libraries(test-squeezenet paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h) -target_link_libraries(test-sigmoid paddle-mobile) - -# gen test -ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h) -target_link_libraries(test-depthwise-conv-op paddle-mobile) +endif() diff --git a/test/common/test_gemm.cpp.cpp b/test/common/test_gemm.cpp similarity index 75% rename from test/common/test_gemm.cpp.cpp rename to test/common/test_gemm.cpp index f385bf960e266df1ddfd317c3281904fea1a21ee..aaf3c183f3e125f09695fad8a41cfb5360e9da13 100644 --- a/test/common/test_gemm.cpp.cpp +++ b/test/common/test_gemm.cpp @@ -13,7 +13,9 @@ See 
the License for the specific language governing permissions and limitations under the License. */ #include +#include "../test_helper.h" #include "common/log.h" +#include "memory/t_malloc.h" #include "operators/math/gemm.h" #define a(i, j) a[(i)*lda + (j)] @@ -29,10 +31,15 @@ int main() { int ldb = n; int ldc = n; - float a[62 * 74]; - float b[74 * 63]; - float c[62 * 63] = {0}; - float c1[62 * 63] = {0}; + float *a = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * k)); + float *b = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * k * n)); + float *c = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); + float *c1 = + static_cast(paddle_mobile::memory::Alloc(sizeof(float) * m * n)); + for (int i = 0; i < m * k; ++i) { a[i] = 2; } @@ -44,8 +51,11 @@ int main() { c1[i] = 2; } + auto time1 = time(); paddle_mobile::operators::math::sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc); + auto time2 = time(); + DLOG << "gemm cost :" << time_diff(time1, time2) << "ms\n"; for (int i = 0; i < m * n; ++i) { std::cout << c[i] << " | "; if (i % n == (n - 1)) { diff --git a/src/framework/program/tensor_desc.cpp b/test/common/test_lib_size.cpp similarity index 86% rename from src/framework/program/tensor_desc.cpp rename to test/common/test_lib_size.cpp index 1b4bd93f6f19426407868052e5366ebeeaedda69..805668f359f0e0959ea7122f25cdaa0ad2d7ec77 100644 --- a/src/framework/program/tensor_desc.cpp +++ b/test/common/test_lib_size.cpp @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ // -// Created by liuRuiLong on 2018/5/26. +// Created by liuRuiLong on 2018/6/6. 
// -#include "tensor_desc.h" +#include "test_lib_size.h" + +static test_lib_size t; diff --git a/test/common/test_lib_size.h b/test/common/test_lib_size.h new file mode 100644 index 0000000000000000000000000000000000000000..a00a5afe12f952a7bc47ab62ba1d07a7879cebec --- /dev/null +++ b/test/common/test_lib_size.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// +// Created by liuRuiLong on 2018/6/6. +// + +#ifndef PADDLE_MOBILE_TEST_LIB_SIZE_H +#define PADDLE_MOBILE_TEST_LIB_SIZE_H + +#include +#include +#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include +//#include + +//#include +//#include +//#include +//#include +//#include + +void foo() { + // char *str = "1234"; + // char dst[10]; + // strcpy(dst, str); + + // std::cout << "12345" << std::endl; + std::vector vec = {1, 2, 3, 4, 5}; + vec.push_back(2); + + pthread_mutex_init(NULL, NULL); + pthread_attr_destroy(NULL); + // std::find(vec.begin(), vec.end(), 1); + + // std::list l; + // std::mutex mutex_; + + // std::map m; + // std::unordered_map u_m; + // std::unordered_set u_s; + // std::string ss = "12345"; + // printf("%f", ss.c_str()); + + // std::initializer_list init_list = {1, 2}; + // std::tuple t = {1, 2}; + + // std::tuple_element>::type + + // std::tuple<> + + // int i; + // int j; + // if (typeid(i) == typeid(j)){ + // int z = 10; + // } + + // std::shared_ptr s1 = 
std::make_shared(); + + // std::stringstream ss; + // ss << "12345"; +} + +class test_lib_size { + public: + test_lib_size() {} + // std::shared_ptr Test(){ + // std::vector vec = {1, 2, 3}; + // std::shared_ptr si = std::make_shared(); + // return si; + // } + + // void test(){ + // int i = 9; + // } +}; + +#endif // PADDLE_MOBILE_TEST_LIB_SIZE_H diff --git a/src/platform/hostdevice.h b/test/common/test_openmp.cpp similarity index 63% rename from src/platform/hostdevice.h rename to test/common/test_openmp.cpp index 6139fb94b998b2a9b261064d1b0428e0c65cf69e..790c434101e20478853b7079533403d65dc829ba 100644 --- a/src/platform/hostdevice.h +++ b/test/common/test_openmp.cpp @@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +//#include +#include -#ifdef __CUDACC__ -#define HOSTDEVICE __host__ __device__ -#define DEVICE __device__ -#define HOST __host__ -#else -#define HOSTDEVICE -#define DEVICE -#define HOST +int main(void) { +#ifdef PADDLE_MOBILE_USE_OPENMP + #pragma omp parallel num_threads(2) + { + // int thread_id = omp_get_thread_num(); + // int nthreads = omp_get_num_threads(); + // std::cout << "Hello, OMP " << thread_id << "/" << nthreads << + // "\n"; + } #endif + return 0; +} diff --git a/test/executor_for_test.h b/test/executor_for_test.h index ce3c84e986eb7ef5e9602209cedb3dbabbf06e85..0d3051327a57202e2b8d1dcbdda571fd244de108 100644 --- a/test/executor_for_test.h +++ b/test/executor_for_test.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "common/log.h" #include "framework/op_registry.h" -#include "io.h" +#include "io/io.h" #include "operators/conv_op.h" #include "operators/elementwise_add_op.h" #include "operators/pool_op.h" @@ -42,8 +42,10 @@ using std::vector; template class Executor4Test : public Executor { public: - Executor4Test(Program p, string op_type) + Executor4Test(Program p, string op_type, + bool use_optimize = false) : Executor() { + this->use_optimize_ = use_optimize; this->program_ = p; if (this->use_optimize_) { this->to_predict_program_ = this->program_.optimizeProgram; @@ -61,13 +63,14 @@ class Executor4Test : public Executor { std::vector> ops = block_desc->Ops(); for (std::shared_ptr op : ops) { if (op->Type() == op_type) { + DLOG << "匹配到: " << op->Type(); + /// test first meeting op in program std::shared_ptr> - op_ptr = paddle_mobile::framework::OpRegistry< - paddle_mobile::CPU>::CreateOp(op->Type(), op->GetInputs(), - op->GetOutputs(), - op->GetAttrMap(), - this->program_.scope); + op_ptr = + paddle_mobile::framework::OpRegistry::CreateOp( + op->Type(), op->GetInputs(), op->GetOutputs(), + op->GetAttrMap(), this->program_.scope); this->ops_of_block_[*block_desc.get()].push_back(op_ptr); break; } diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp index 95357547e1b93d3060481b55eaf46c919496785d..8c76eb1dde3ef39a342d19e7f3d4e26fc1be2b2f 100644 --- a/test/framework/test_load.cpp +++ b/test/framework/test_load.cpp @@ -13,13 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../test_helper.h" -#include "io.h" +#include "io/io.h" int main() { paddle_mobile::Loader loader; // ../../../test/models/googlenet // ../../../test/models/mobilenet - auto program = loader.Load(g_googlenet); - program.optimizeProgram->Description("program desc: "); + auto program = loader.Load(g_mobilenet_ssd, false, false); + // auto program = loader.Load(g_googlenet_combine + "/model", + // g_googlenet_combine + + // "/params", true); + + // program.originProgram->Description("program desc: "); return 0; } diff --git a/test/framework/test_optimize.cpp b/test/framework/test_optimize.cpp index f0392cfec02c8ea764cd3d6dc9f50b2415c39e2c..32574764e1ba538ab0bea31d1e238096e7098dfc 100644 --- a/test/framework/test_optimize.cpp +++ b/test/framework/test_optimize.cpp @@ -15,17 +15,17 @@ limitations under the License. */ #include "../test_helper.h" #include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/program_optimize.h" -#include "io.h" +#include "io/io.h" int main() { paddle_mobile::Loader loader; // "../../../test/models/googlenet" - auto program = loader.Load(g_googlenet); + auto program = loader.Load(g_mobilenet_ssd, true); paddle_mobile::framework::ProgramOptimize optimize; // program.originProgram->Description("origin"); - auto optimize_program = optimize.FushionOptimize(program.originProgram); + auto optimize_program = optimize.FusionOptimize(program.originProgram); if (optimize_program != nullptr) { - optimize_program->Description("optimize"); + // optimize_program->Description("optimize"); } else { LOG(paddle_mobile::kLOG_ERROR) << "optimize_program is null"; } diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index 302cd3e726eeb99c50c3adf7e3b9117a05cf0560..1695995a8d60d20e0d6c5f8911c39a948426a82a 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -20,7 +20,9 @@ int main() { paddle_mobile::Loader loader; bool optimize = true; auto time1 = time(); - auto program 
= loader.Load(g_googlenet, optimize); + // auto program = loader.Load(g_googlenet, optimize); + auto program = loader.Load(g_googlenet_combine + "/model", + g_googlenet_combine + "/params", optimize); auto time2 = time(); DLOG << "load cost :" << time_diff(time1, time2) << "ms\n"; paddle_mobile::Executor executor(program, 1, optimize); @@ -28,7 +30,11 @@ int main() { std::vector dims{1, 3, 224, 224}; GetInput(g_test_image_1x3x224x224, &input, dims); auto time3 = time(); - executor.Predict(input, dims); + + for (int i = 0; i < 10; ++i) { + executor.Predict(input, dims); + } + auto time4 = time(); DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n"; return 0; diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp index e9d92e7a51b9f7abe2c451df4073428bd2bd6d5f..097d03ad710468a881050ff729e8352f029d664f 100644 --- a/test/net/test_mobilenet+ssd.cpp +++ b/test/net/test_mobilenet+ssd.cpp @@ -19,10 +19,10 @@ limitations under the License. */ int main() { paddle_mobile::Loader loader; auto time1 = time(); - auto program = loader.Load(g_mobilenet_ssd, false); + auto program = loader.Load(g_mobilenet_ssd, true); auto time2 = time(); DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor executor(program, 1, false); + paddle_mobile::Executor executor(program, 1, true); std::vector dims{1, 3, 300, 300}; Tensor input_tensor; diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp index 7ed9a3566e3be8d5baa7e47611fc713772e94327..8400b08f2292bb5655e2d85298acce603e1ce603 100644 --- a/test/net/test_mobilenet.cpp +++ b/test/net/test_mobilenet.cpp @@ -19,14 +19,14 @@ limitations under the License. 
*/ int main() { paddle_mobile::Loader loader; auto time1 = time(); - auto program = loader.Load(g_mobilenet, false); + auto program = loader.Load(g_mobilenet, true); auto time2 = time(); DLOG << "load cost :" << time_diff(time1, time1) << "ms"; - paddle_mobile::Executor executor(program, 2, false); + paddle_mobile::Executor executor(program, 1, true); - std::vector dims{2, 3, 224, 224}; + std::vector dims{1, 3, 224, 224}; Tensor input_tensor; - SetupTensor(&input_tensor, {2, 3, 224, 224}, static_cast(0), + SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(0), static_cast(1)); std::vector input(input_tensor.data(), diff --git a/test/operators/test_batchnorm_op.cpp b/test/operators/test_batchnorm_op.cpp index 38d9f624909fd645c78ae56a5d9efff9fa961795..4ccad8c1512036c2400a09575b3775e75b26acce 100644 --- a/test/operators/test_batchnorm_op.cpp +++ b/test/operators/test_batchnorm_op.cpp @@ -41,7 +41,7 @@ class TestBatchNormOp { for (int j = 0; j < ops.size(); ++j) { std::shared_ptr op = ops[j]; if (op->Type() == "batch_norm" && - op->Input("X")[0] == "conv2d_0.tmp_0") { + op->Input("X")[0] == "conv2d_5.tmp_0") { DLOG << " mul attr size: " << op->GetAttrMap().size(); DLOG << " inputs size: " << op->GetInputs().size(); DLOG << " outputs size: " << op->GetOutputs().size(); @@ -67,29 +67,29 @@ class TestBatchNormOp { const Tensor &t5) { // feed auto scope = program_.scope; - Variable *x1_feed_value = scope->Var("conv2d_0.tmp_0"); + Variable *x1_feed_value = scope->Var("conv2d_5.tmp_0"); auto tensor_x1 = x1_feed_value->GetMutable(); tensor_x1->ShareDataWith(t1); - Variable *mean_feed_value = scope->Var("batch_norm_0.w_1"); + Variable *mean_feed_value = scope->Var("batch_norm_10.w_1"); auto tensor_mean = mean_feed_value->GetMutable(); tensor_mean->ShareDataWith(t2); - Variable *scale_feed_value = scope->Var("batch_norm_0.w_0"); + Variable *scale_feed_value = scope->Var("batch_norm_10.w_0"); auto tensor_scale = scale_feed_value->GetMutable(); 
tensor_scale->ShareDataWith(t3); - Variable *variance_feed_value = scope->Var("batch_norm_0.w_2"); + Variable *variance_feed_value = scope->Var("batch_norm_10.w_2"); auto tensor_variance = variance_feed_value->GetMutable(); tensor_variance->ShareDataWith(t4); - Variable *bias_feed_value = scope->Var("batch_norm_0.b_0"); + Variable *bias_feed_value = scope->Var("batch_norm_10.b_0"); auto tensor_bias = bias_feed_value->GetMutable(); tensor_bias->ShareDataWith(t5); - Variable *output = scope->Var("batch_norm_0.tmp_2"); + Variable *output = scope->Var("batch_norm_10.tmp_2"); auto *output_tensor = output->GetMutable(); - output_tensor->mutable_data({4, 10, 2, 2}); + output_tensor->mutable_data({1, 256, 38, 38}); // DLOG << typeid(output_tensor).name(); // DLOG << "output_tensor dims: " << output_tensor->dims(); @@ -128,30 +128,32 @@ int main() { DLOG << "----------**********----------"; DLOG << "begin to run BatchNormOp Test"; paddle_mobile::Loader loader; - auto program = loader.Load(std::string(g_resnet)); + auto program = loader.Load(std::string(g_mobilenet_ssd)); /// input x (4,10,2,2) paddle_mobile::framework::Tensor inputx1; - SetupTensor(&inputx1, {4, 10, 2, 2}, static_cast(0), + SetupTensor(&inputx1, {1, 256, 38, 38}, static_cast(0), static_cast(1)); auto *inputx1_ptr = inputx1.data(); paddle_mobile::framework::Tensor mean; - SetupTensor(&mean, {10}, static_cast(0), static_cast(1)); + SetupTensor(&mean, {256}, static_cast(0), + static_cast(1)); auto *mean_ptr = mean.data(); paddle_mobile::framework::Tensor scale; - SetupTensor(&scale, {10}, static_cast(0), + SetupTensor(&scale, {256}, static_cast(0), static_cast(1)); auto *scale_ptr = scale.data(); paddle_mobile::framework::Tensor variance; - SetupTensor(&variance, {10}, static_cast(0), + SetupTensor(&variance, {256}, static_cast(0), static_cast(1)); auto *variance_ptr = variance.data(); paddle_mobile::framework::Tensor bias; - SetupTensor(&bias, {10}, static_cast(0), static_cast(1)); + SetupTensor(&bias, {256}, 
static_cast(0), + static_cast(1)); auto *bias_ptr = bias.data(); paddle_mobile::framework::TestBatchNormOp testBatchNormOp( @@ -161,11 +163,13 @@ int main() { testBatchNormOp.predict_bn(inputx1, mean, scale, variance, bias); auto *output_bn_ptr = output_bn->data(); - /// [2, 5, 1, 0] - DLOG << " (" << inputx1_ptr[102] << " - " << mean_ptr[5] << ")/((" - << variance_ptr[5] << " + 0.00001" - << ")^0.5)* " << scale_ptr[5] << " + " << bias_ptr[5] << " = "; - DLOG << output_bn_ptr[102]; + DLOG << " (" << inputx1_ptr[0] << " - " << mean_ptr[0] << ")/((" + << variance_ptr[0] << " + 0.00001" + << ")^0.5)* " << scale_ptr[0] << " + " << bias_ptr[0] << " = "; + DLOG << output_bn_ptr[0]; + + DLOG << "input_ptr 0 : " << inputx1_ptr[0]; + DLOG << "output_ptr 0 : " << output_bn_ptr[0]; return 0; } diff --git a/test/operators/test_concat_op.cpp b/test/operators/test_concat_op.cpp index 7a106b03c44c57fa7ef0f9282434717efd602b5c..edaa4ce1ddba251886c90262895333b0a56c3a07 100644 --- a/test/operators/test_concat_op.cpp +++ b/test/operators/test_concat_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "../executor_for_test.h" #include "../test_include.h" #include "operators/concat_op.h" diff --git a/test/operators/test_conv_add_relu_op.cpp b/test/operators/test_conv_add_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..987f52cd62f91b3bc00cc1ef49bd21913e288d75 --- /dev/null +++ b/test/operators/test_conv_add_relu_op.cpp @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "../test_include.h" +#include "operators/fusion_conv_add_relu_op.h" + +int main() { + paddle_mobile::Loader loader; + // ../models/image_classification_resnet.inference.model + auto program = loader.Load(g_googlenet, true); + + PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, + "program file read fail"); + + Executor4Test< + paddle_mobile::CPU, + paddle_mobile::operators::FusionConvAddReluOp> + executor(program, "fusion_conv_add_relu", true); + + paddle_mobile::framework::Tensor input; + GetInput(g_test_image_1x3x224x224, &input, {1, 3, 224, 224}); + // // use SetupTensor if not has local input image . + // SetupTensor(&input, {1, 3, 224, 224}, static_cast(0), + // static_cast(1)); + + auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112}); + auto output = executor.Predict(input, "data", "conv2d_0.tmp_2", out_ddim); + + auto output_ptr = output->data(); + for (int j = 0; j < 25; ++j) { + DLOG << " value of output: " << output_ptr[j]; + } + return 0; +} diff --git a/test/operators/test_cov_op.cpp b/test/operators/test_cov_op.cpp index ba6a9b4800f8b2acb3a5c3b0992128bd4ea0e619..a85ad9edba5d3e2256b8d7ee7d7d3c5b7200888d 100644 --- a/test/operators/test_cov_op.cpp +++ b/test/operators/test_cov_op.cpp @@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "../executor_for_test.h" #include "../test_include.h" +#include "operators/conv_op.h" int main() { - paddle_mobile::Loader loader; + paddle_mobile::Loader loader; // ../models/image_classification_resnet.inference.model auto program = loader.Load(g_googlenet); PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, "program file read fail"); - Executor4Test> + Executor4Test> executor(program, "conv2d"); paddle_mobile::framework::Tensor input; @@ -37,7 +37,7 @@ int main() { auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim); auto output_ptr = output->data(); - for (int j = 0; j < output->numel(); ++j) { + for (int j = 0; j < 20; ++j) { DLOG << " value of output: " << output_ptr[j]; } return 0; diff --git a/test/operators/test_depthwise_conv_op.cpp b/test/operators/test_depthwise_conv_op.cpp index 648b4c5db9970804a2ca140eef13e2560e36f935..bd2aad19eda896bad3da8a47f5b70b1a923dc1a7 100644 --- a/test/operators/test_depthwise_conv_op.cpp +++ b/test/operators/test_depthwise_conv_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "../executor_for_test.h" #include "../test_include.h" #include "operators/depthwise_conv_op.h" diff --git a/test/operators/test_elementwise_add_op.cpp b/test/operators/test_elementwise_add_op.cpp index c4997f2eb37730e1af38fbe8aac927e7ee2b6ee0..0a5e9f7e92701e748df51078b21eb46eec90599d 100644 --- a/test/operators/test_elementwise_add_op.cpp +++ b/test/operators/test_elementwise_add_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "../executor_for_test.h" #include "../test_include.h" int main() { diff --git a/test/operators/test_fushion_fc_op.cpp b/test/operators/test_fusion_fc_op.cpp similarity index 96% rename from test/operators/test_fushion_fc_op.cpp rename to test/operators/test_fusion_fc_op.cpp index 8dc1b02bec403d13b0b18f3fad58d7686ce403d0..a23bde45cb74f0f75e655821b15e66b1cef4c081 100644 --- a/test/operators/test_fushion_fc_op.cpp +++ b/test/operators/test_fusion_fc_op.cpp @@ -49,8 +49,8 @@ class TestFcOp { DLOG << " Input Y is : " << op->Input("Y")[0]; DLOG << " Input Y is : " << op->Input("Z")[0]; DLOG << " Output Out is : " << op->Output("Out")[0]; - std::shared_ptr> testOp = - std::make_shared>( + std::shared_ptr> testOp = + std::make_shared>( op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), program_.scope); ops_of_block_[*block_desc.get()].push_back(testOp); @@ -119,7 +119,7 @@ int main() { auto program = loader.Load(g_googlenet); paddle_mobile::framework::ProgramOptimize optimize; // program.originProgram->Description("origin"); - auto optimize_program = optimize.FushionOptimize(program.originProgram); + auto optimize_program = optimize.FusionOptimize(program.originProgram); program.optimizeProgram = optimize_program; diff --git a/test/operators/test_lrn_op.cpp b/test/operators/test_lrn_op.cpp index cf5fd4bdf2d45abcf63eb865f1cf333eeb14eafc..d4d9f8da802fc0f5f885a3b2e81cba695776c29e 100644 --- a/test/operators/test_lrn_op.cpp +++ b/test/operators/test_lrn_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "../executor_for_test.h" #include "../test_include.h" #include "operators/lrn_op.h" diff --git a/test/operators/test_mul_op.cpp b/test/operators/test_mul_op.cpp index 5412e6905b7c12782555c7271c5da17713561469..8ebf0926890497c0ed622b69f163a9f6f5c8612b 100644 --- a/test/operators/test_mul_op.cpp +++ b/test/operators/test_mul_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "../executor_for_test.h" #include "../test_include.h" #include "operators/mul_op.h" diff --git a/test/operators/test_pool_op.cpp b/test/operators/test_pool_op.cpp index 62dfc20dc12006f86b16997cb6de96123e10ee9c..2daecd7b4c1a50c612bc784c801208d2e6f31482 100644 --- a/test/operators/test_pool_op.cpp +++ b/test/operators/test_pool_op.cpp @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "../executor_for_test.h" -#include "../test_helper.h" -#include "io.h" +#include "../test_include.h" +#include "operators/pool_op.h" int main() { paddle_mobile::Loader loader; diff --git a/test/operators/test_relu_op.cpp b/test/operators/test_relu_op.cpp index 50f3b6a20b6244fcb39975c80cc6a6e14dc88d1c..fad0d0c30a126cc2730e4aa8b87364eee9fc8209 100644 --- a/test/operators/test_relu_op.cpp +++ b/test/operators/test_relu_op.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "../executor_for_test.h" #include "../test_include.h" #include "operators/relu_op.h" diff --git a/test/operators/test_reshape_op.cpp b/test/operators/test_reshape_op.cpp index 5448aac87c23ea90f5b8beec24aee9cc6f437330..3541151d8a1a286527e715f402df381d2efc094c 100644 --- a/test/operators/test_reshape_op.cpp +++ b/test/operators/test_reshape_op.cpp @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "../executor_for_test.h" -#include "../test_helper.h" -#include "io.h" +#include "../test_include.h" +#include "operators/reshape_op.h" int main() { paddle_mobile::Loader loader; diff --git a/test/operators/test_sigmoid_op.cpp b/test/operators/test_sigmoid_op.cpp index 289eac149fa2d3e05f65624f8a9e5f93e85c6fff..4ed3efaf28aa986f0b679729c46cb386150583e3 100644 --- a/test/operators/test_sigmoid_op.cpp +++ b/test/operators/test_sigmoid_op.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "../../src/operators/kernel/sigmoid_kernel.h" #include "../test_helper.h" -#include "io.h" +#include "io/io.h" int main() { paddle_mobile::framework::Tensor input; diff --git a/test/operators/test_softmax_op.cpp b/test/operators/test_softmax_op.cpp index 58de5300cca0bf367652066851bc4e7e9f75389c..a0184729a8bc5e6b0ba952923eecd5242cfe36d4 100644 --- a/test/operators/test_softmax_op.cpp +++ b/test/operators/test_softmax_op.cpp @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "../executor_for_test.h" -#include "../test_helper.h" -#include "io.h" +#include "../test_include.h" + +#include "operators/softmax_op.h" int main() { paddle_mobile::Loader loader; diff --git a/test/operators/test_transpose_op.cpp b/test/operators/test_transpose_op.cpp index 4c88df2d83dcfbc44915ced815b50f90ddb33b38..f83ee23c25d8f2588e0fe40d5fabc6114129b995 100644 --- a/test/operators/test_transpose_op.cpp +++ b/test/operators/test_transpose_op.cpp @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "../executor_for_test.h" #include "../test_helper.h" -#include "io.h" - +#include "../test_include.h" +#include "operators/transpose_op.h" int main() { paddle_mobile::Loader loader; auto program = loader.Load(std::string(g_mobilenet_ssd)); diff --git a/test/test_helper.h b/test/test_helper.h index fc4ed6c91dc9b03c1f4dadfd8a4bc94efe3a724e..fe720ded8270f2bc02a4f1e72625954962184069 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -22,12 +22,13 @@ limitations under the License. 
*/ #include "framework/ddim.h" #include "framework/tensor.h" -static const std::string g_googlenet = "../models/googlenet"; -static const std::string g_mobilenet = "../models/mobilenet"; static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd"; static const std::string g_squeezenet = "../models/squeezenet"; -static const std::string g_resnet = - "../models/image_classification_resnet.inference.model"; +static const std::string g_googlenet = "../models/googlenet"; +static const std::string g_mobilenet = "../models/mobilenet"; +static const std::string g_resnet_50 = "../models/resnet_50"; +static const std::string g_resnet = "../models/resnet"; +static const std::string g_googlenet_combine = "../models/googlenet_combine"; static const std::string g_yolo = "../models/yolo"; static const std::string g_test_image_1x3x224x224 = "../images/test_image_1x3x224x224_float"; diff --git a/test/test_include.h b/test/test_include.h index 25efbb9f4c00921495a5ab054acdde329c4ef58a..2d89dc8c9ed1de1ad49ebca07724b6649e2a12a7 100644 --- a/test/test_include.h +++ b/test/test_include.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "./test_helper.h" #include "common/enforce.h" #include "common/log.h" +#include "executor_for_test.h" #include "framework/lod_tensor.h" #include "framework/operator.h" #include "framework/program/block_desc.h" @@ -29,4 +30,4 @@ limitations under the License. 
*/ #include "framework/scope.h" #include "framework/tensor.h" #include "framework/variable.h" -#include "io.h" +#include "io/io.h" diff --git a/tools/android-debug-script/push2android.sh b/tools/android-debug-script/push2android.sh new file mode 100644 index 0000000000000000000000000000000000000000..fae1a856123bd16cf3f7a115f61b3e4473ff58a3 --- /dev/null +++ b/tools/android-debug-script/push2android.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env sh + +push_fn () { +MODELS_PATH="../../test/models/*" +MODELS_SRC="../../test/models" +IMAGE_PATH="../../test/images/*" +EXE_FILE="../../test/build/*" +EXE_DIR="data/local/tmp/bin" +adb shell mkdir ${EXE_DIR} +MODELS_DIR="data/local/tmp/models" +adb shell mkdir ${MODELS_DIR} +for file in `ls ${MODELS_SRC}` +do + adb shell mkdir ${MODELS_DIR}"/"${file} +done + +if [[ -d "../../src/operators/kernel/mali/ACL_Android/build" ]]; then +ACL_BUILD_PATH="../../src/operators/kernel/mali/ACL_Android/build/*" +adb push ${ACL_BUILD_PATH} ${EXE_DIR} +fi + +IMAGES_DIR="data/local/tmp/images" +adb shell mkdir ${IMAGES_DIR} +LIB_PATH="../../build/release/arm-v7a/build/*" +adb push ${EXE_FILE} ${EXE_DIR} +adb push ${LIB_PATH} ${EXE_DIR} +if [[ $1 != "npm" ]]; then +adb push ${IMAGE_PATH} ${IMAGES_DIR} +adb push ${MODELS_PATH} ${MODELS_DIR} +fi +} + +if [[ $1 == "npm" ]]; then +push_fn $1 +else +push_fn +fi diff --git a/tools/android-debug-script/run_on_android.sh b/tools/android-debug-script/run_on_android.sh new file mode 100644 index 0000000000000000000000000000000000000000..e9f6388cd6e9a9b3aeaf72691a8724a898aa4e44 --- /dev/null +++ b/tools/android-debug-script/run_on_android.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env sh + +push_fn () { +MODELS_PATH="../../test/models/*" +MODELS_SRC="../../test/models" +IMAGE_PATH="../../test/images/*" +EXE_FILE="../../test/build/*" +EXE_DIR="data/local/tmp/bin" +adb shell mkdir ${EXE_DIR} +MODELS_DIR="data/local/tmp/models" +adb shell mkdir ${MODELS_DIR} +for file in `ls ${MODELS_SRC}` +do + adb shell mkdir 
${MODELS_DIR}"/"${file} +done + +IMAGES_DIR="data/local/tmp/images" +adb shell mkdir ${IMAGES_DIR} +LIB_PATH="../../build/release/arm-v7a/build/*" +adb push ${EXE_FILE} ${EXE_DIR} +adb push ${LIB_PATH} ${EXE_DIR} +if [[ $1 != "npm" ]]; then +adb push ${IMAGE_PATH} ${IMAGES_DIR} +adb push ${MODELS_PATH} ${MODELS_DIR} +fi +echo "test-op or test-net below : " +adb shell ls /data/local/tmp/bin +echo "**** choose OP or NET to test ****" +read -p "which to test : " test_name +adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./${test_name}" +} + +if [[ $1 == "npm" ]]; then +push_fn $1 +else +push_fn +fi \ No newline at end of file diff --git a/tools/arm-platform.cmake b/tools/arm-platform.cmake new file mode 100644 index 0000000000000000000000000000000000000000..9f2b6d5e89d92255848af54321ea09ebdb058691 --- /dev/null +++ b/tools/arm-platform.cmake @@ -0,0 +1,9 @@ + +set(ARCH "armv7-a") + +set(FLOAT_ABI "softfp" CACHE STRING "-mfloat-api chosen") +set_property(CACHE FLOAT_ABI PROPERTY STRINGS "softfp" "soft" "hard") + +set(FPU "neon") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${ARCH} -mfloat-abi=${FLOAT_ABI} -mfpu=${FPU}") diff --git a/tools/build.sh b/tools/build.sh new file mode 100755 index 0000000000000000000000000000000000000000..42e872c580cffef3bd904dc9cc575e9961ef4257 --- /dev/null +++ b/tools/build.sh @@ -0,0 +1,174 @@ +#!/usr/bin/env bash + +build_for_mac() { + if [ ! `which brew` ]; then + echo "building failed! homebrew not found, please install homebrew." + return + fi + if [ ! `which cmake` ]; then + echo "installing cmake." + brew install cmake + if [ ! $? ]; then + echo "cmake install failed." + return + fi + fi + PLATFORM="x86" + MODE="Release" + BUILD_DIR=../build/release/"${PLATFORM}" + mkdir -p ${BUILD_DIR}/build + + mkdir -p ${BUILD_DIR}/test + cp -r ../test/models ${BUILD_DIR}/test/models + + cmake .. 
\ + -B"${BUILD_DIR}" \ + -DCMAKE_BUILD_TYPE="${MODE}" \ + -DIS_MAC=true + + cd ${BUILD_DIR} + make -j 8 +} + +build_for_android() { + rm -rf "../build" + if [ -z "${ANDROID_NDK}" ]; then + echo "ANDROID_NDK not found!" + exit -1 + fi + + if [ -z "$PLATFORM" ]; then + PLATFORM="arm-v7a" # Users could choose "arm-v8a" or other platforms from the command line. + fi + + if [ "${PLATFORM}" = "arm-v7a" ]; then + ABI="armeabi-v7a with NEON" + ARM_PLATFORM="V7" + CXX_FLAGS="-march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security" + elif [ "${PLATFORM}" = "arm-v8a" ]; then + ABI="arm64-v8a" + ARM_PLATFORM="V8" + CXX_FLAGS="-march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog" + else + echo "unknown platform!" + exit -1 + fi + + + MODE="Release" + ANDROID_PLATFORM_VERSION="android-22" + TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" + ANDROID_ARM_MODE="arm" + if [ $# -eq 1 ]; then + NET=$1 + cmake .. \ + -B"../build/release/${PLATFORM}" \ + -DANDROID_ABI="${ABI}" \ + -DCMAKE_BUILD_TYPE="${MODE}" \ + -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ + -DANDROID_PLATFORM="${ANDROID_PLATFORM_VERSION}" \ + -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ + -DANDROID_STL=c++_static \ + -DANDROID=true \ + -D"${NET}=true" \ + -D"${ARM_PLATFORM}"=true + else + + cmake .. 
\ + -B"../build/release/${PLATFORM}" \ + -DANDROID_ABI="${ABI}" \ + -DCMAKE_BUILD_TYPE="${MODE}" \ + -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ + -DANDROID_PLATFORM="${ANDROID_PLATFORM_VERSION}" \ + -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ + -DANDROID_STL=c++_static \ + -DANDROID=true \ + -D"${ARM_PLATFORM}"=true + fi + cd "../build/release/${PLATFORM}" + make -j 8 +} + +build_for_ios() { + rm -rf "../build" + PLATFORM="ios" + MODE="Release" + BUILD_DIR=../build/release/"${PLATFORM}" + TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" + C_FLAGS="-fobjc-abi-version=2 -fobjc-arc -isysroot ${CMAKE_OSX_SYSROOT}" + CXX_FLAGS="-fobjc-abi-version=2 -fobjc-arc -std=gnu++14 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT}" + mkdir -p "${BUILD_DIR}" + if [ $# -eq 1 ]; then + NET=$1 + cmake .. \ + -B"${BUILD_DIR}" \ + -DCMAKE_BUILD_TYPE="${MODE}" \ + -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ + -DIOS_PLATFORM=OS \ + -DCMAKE_C_FLAGS="${C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ + -D"${NET}"=true \ + -DIS_IOS="true" + else + cmake .. \ + -B"${BUILD_DIR}" \ + -DCMAKE_BUILD_TYPE="${MODE}" \ + -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ + -DIOS_PLATFORM=OS \ + -DCMAKE_C_FLAGS="${C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ + -DIS_IOS="true" + fi + cd "${BUILD_DIR}" + make -j 8 +} + +build_error() { + echo "unknown argument" +} + +if [ $# -lt 1 ]; then + echo "error: target missing!" 
+ echo "available targets: mac|linux|ios|android" + echo "sample usage: ./build.sh mac" +else + if [ $# -eq 2 ]; then + if [ $2 != "googlenet" -a $2 != "mobilenet" -a $2 != "yolo" -a $2 != "squeezenet" -a $2 != "resnet" ]; then + if [ $1 = "mac" ]; then + build_for_mac + elif [ $1 = "linux" ]; then + build_for_linux + elif [ $1 = "android" ]; then + build_for_android + elif [ $1 = "ios" ]; then + build_for_ios + else + build_error + fi + else + if [ $1 = "mac" ]; then + build_for_mac $2 + elif [ $1 = "linux" ]; then + build_for_linux $2 + elif [ $1 = "android" ]; then + build_for_android $2 + elif [ $1 = "ios" ]; then + build_for_ios $2 + else + build_error + fi + fi + else + if [ $1 = "mac" ]; then + build_for_mac + elif [ $1 = "linux" ]; then + build_for_linux + elif [ $1 = "android" ]; then + build_for_android + elif [ $1 = "ios" ]; then + build_for_ios + else + build_error + fi + fi +fi diff --git a/tools/ios-cmake/ios.toolchain.cmake b/tools/ios-cmake/ios.toolchain.cmake index 5d34c892e146da89a286188f8493d16530844505..a8735adc8d853a5825a23f1ddf129d0a95199275 100644 --- a/tools/ios-cmake/ios.toolchain.cmake +++ b/tools/ios-cmake/ios.toolchain.cmake @@ -1,392 +1,210 @@ -# This file is part of the ios-cmake project. It was retrieved from -# https://github.com/cristeab/ios-cmake.git, which is a fork of -# https://code.google.com/p/ios-cmake/. Which in turn is based off of -# the Platform/Darwin.cmake and Platform/UnixPaths.cmake files which -# are included with CMake 2.8.4 -# -# The ios-cmake project is licensed under the new BSD license. -# -# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software, -# Kitware, Inc., Insight Software Consortium. All rights reserved. -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. 
Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# 3. Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# -# This file is based off of the Platform/Darwin.cmake and -# Platform/UnixPaths.cmake files which are included with CMake 2.8.4 -# It has been altered for iOS development. -# -# Updated by Alex Stewart (alexs.mac@gmail.com) -# -# ***************************************************************************** -# Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com) -# under the BSD-Clause-3 licence -# ***************************************************************************** -# -# INFORMATION / HELP -# -# The following variables control the behaviour of this toolchain: -# -# IOS_PLATFORM: OS (default) or SIMULATOR or SIMULATOR64 or TVOS or SIMULATOR_TVOS -# OS = Build for iPhoneOS. -# SIMULATOR = Build for x86 i386 iPhone Simulator. 
-# SIMULATOR64 = Build for x86_64 iPhone Simulator. -# TVOS = Build for AppleTVOS. -# SIMULATOR_TVOS = Build for x86_64 AppleTV Simulator. -# CMAKE_OSX_SYSROOT: Path to the iOS SDK to use. By default this is -# automatically determined from IOS_PLATFORM and xcodebuild, but -# can also be manually specified (although this should not be required). -# CMAKE_IOS_DEVELOPER_ROOT: Path to the Developer directory for the iOS platform -# being compiled for. By default this is automatically determined from -# CMAKE_OSX_SYSROOT, but can also be manually specified (although this should -# not be required). -# ENABLE_BITCODE: (1|0) Enables or disables bitcode support. Default 1 (true) -# ENABLE_ARC: (1|0) Enables or disables ARC support. Default 1 (true, ARC enabled by default) -# IOS_ARCH: (armv7 armv7s arm64 i386 x86_64) If specified, will override the default architectures for the given IOS_PLATFORM -# OS = armv7 armv7s arm64 -# SIMULATOR = i386 -# SIMULATOR64 = x86_64 -# TVOS = arm64 -# SIMULATOR_TVOS = x86_64 +# This file is based off of the Platform/Darwin.cmake and Platform/UnixPaths.cmake +# files which are included with CMake 2.8.4 +# It has been altered for iOS development + +# Options: # -# This toolchain defines the following variables for use externally: +# IOS_PLATFORM = OS (default) or SIMULATOR or SIMULATOR64 +# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders +# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch. +# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch. # -# XCODE_VERSION: Version number (not including Build version) of Xcode detected. -# IOS_SDK_VERSION: Version of iOS SDK being used. -# CMAKE_OSX_ARCHITECTURES: Architectures being compiled for (generated from -# IOS_PLATFORM). 
+# CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder +# By default this location is automatcially chosen based on the IOS_PLATFORM value above. +# If set manually, it will override the default location and force the user of a particular Developer Platform # -# This toolchain defines the following macros for use externally: +# CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder +# By default this location is automatcially chosen based on the CMAKE_IOS_DEVELOPER_ROOT value. +# In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path. +# If set manually, this will force the use of a specific SDK version + +# Macros: # -# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) -# A convenience macro for setting xcode specific properties on targets. -# Available variants are: All, Release, RelWithDebInfo, Debug, MinSizeRel -# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1" "all"). +# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE) +# A convenience macro for setting xcode specific properties on targets +# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1") # # find_host_package (PROGRAM ARGS) -# A macro used to find executable programs on the host system, not within the -# iOS environment. Thanks to the android-cmake project for providing the -# command. - -# Fix for PThread library not in path -set(CMAKE_THREAD_LIBS_INIT "-lpthread") -set(CMAKE_HAVE_THREADS_LIBRARY 1) -set(CMAKE_USE_WIN32_THREADS_INIT 0) -set(CMAKE_USE_PTHREADS_INIT 1) - -# Get the Xcode version being used. 
-execute_process(COMMAND xcodebuild -version - OUTPUT_VARIABLE XCODE_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") -string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") -message(STATUS "Building with Xcode version: ${XCODE_VERSION}") -# Default to building for iPhoneOS if not specified otherwise, and we cannot -# determine the platform from the CMAKE_OSX_ARCHITECTURES variable. The use -# of CMAKE_OSX_ARCHITECTURES is such that try_compile() projects can correctly -# determine the value of IOS_PLATFORM from the root project, as -# CMAKE_OSX_ARCHITECTURES is propagated to them by CMake. -if (NOT DEFINED IOS_PLATFORM) - if (CMAKE_OSX_ARCHITECTURES) - if (CMAKE_OSX_ARCHITECTURES MATCHES ".*arm.*") - set(IOS_PLATFORM "OS") - elseif (CMAKE_OSX_ARCHITECTURES MATCHES "i386") - set(IOS_PLATFORM "SIMULATOR") - elseif (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64") - set(IOS_PLATFORM "SIMULATOR64") - endif() - endif() - if (NOT IOS_PLATFORM) - set(IOS_PLATFORM "OS") - endif() -endif() -set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING - "Type of iOS platform for which to build.") -# Determine the platform name and architectures for use in xcodebuild commands -# from the specified IOS_PLATFORM name. 
-if (IOS_PLATFORM STREQUAL "OS") - set(XCODE_IOS_PLATFORM iphoneos) - if(NOT IOS_ARCH) - set(IOS_ARCH armv7 armv7s arm64) - endif() -elseif (IOS_PLATFORM STREQUAL "SIMULATOR") - set(XCODE_IOS_PLATFORM iphonesimulator) - if(NOT IOS_ARCH) - set(IOS_ARCH i386) - endif() -elseif(IOS_PLATFORM STREQUAL "SIMULATOR64") - set(XCODE_IOS_PLATFORM iphonesimulator) - if(NOT IOS_ARCH) - set(IOS_ARCH x86_64) - endif() -elseif (IOS_PLATFORM STREQUAL "TVOS") - set(XCODE_IOS_PLATFORM appletvos) - if(NOT IOS_ARCH) - set(IOS_ARCH arm64) - endif() -elseif (IOS_PLATFORM STREQUAL "SIMULATOR_TVOS") - set(XCODE_IOS_PLATFORM appletvsimulator) - if(NOT IOS_ARCH) - set(IOS_ARCH x86_64) - endif() -else() - message(FATAL_ERROR "Invalid IOS_PLATFORM: ${IOS_PLATFORM}") -endif() -message(STATUS "Configuring iOS build for platform: ${IOS_PLATFORM}, " - "architecture(s): ${IOS_ARCH}") -# If user did not specify the SDK root to use, then query xcodebuild for it. -if (NOT CMAKE_OSX_SYSROOT) - execute_process(COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path - OUTPUT_VARIABLE CMAKE_OSX_SYSROOT - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}") -endif() -if (NOT EXISTS ${CMAKE_OSX_SYSROOT}) - message(FATAL_ERROR "Invalid CMAKE_OSX_SYSROOT: ${CMAKE_OSX_SYSROOT} " - "does not exist.") -endif() -# Specify minimum version of deployment target. -if (NOT DEFINED IOS_DEPLOYMENT_TARGET) - # Unless specified, SDK version 8.0 is used by default as minimum target version. - set(IOS_DEPLOYMENT_TARGET "8.0" - CACHE STRING "Minimum iOS version to build for." ) - message(STATUS "Using the default min-version since IOS_DEPLOYMENT_TARGET not provided!") -endif() -# Use bitcode or not -if (NOT DEFINED ENABLE_BITCODE) - # Unless specified, enable bitcode support by default - set(ENABLE_BITCODE TRUE CACHE BOOL "Wheter or not to enable bitcode") - message(STATUS "Enabling bitcode support by default. 
ENABLE_BITCODE not provided!") -endif() -# Use ARC or not -if (NOT DEFINED ENABLE_ARC) - # Unless specified, enable ARC support by default - set(ENABLE_ARC TRUE CACHE BOOL "Wheter or not to enable ARC") - message(STATUS "Enabling ARC support by default. ENABLE_ARC not provided!") -endif() -# Get the SDK version information. -execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion - OUTPUT_VARIABLE IOS_SDK_VERSION - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -# Find the Developer root for the specific iOS platform being compiled for -# from CMAKE_OSX_SYSROOT. Should be ../../ from SDK specified in -# CMAKE_OSX_SYSROOT. There does not appear to be a direct way to obtain -# this information from xcrun or xcodebuild. -if (NOT CMAKE_IOS_DEVELOPER_ROOT) - get_filename_component(IOS_PLATFORM_SDK_DIR ${CMAKE_OSX_SYSROOT} PATH) - get_filename_component(CMAKE_IOS_DEVELOPER_ROOT ${IOS_PLATFORM_SDK_DIR} PATH) -endif() -if (NOT EXISTS ${CMAKE_IOS_DEVELOPER_ROOT}) - message(FATAL_ERROR "Invalid CMAKE_IOS_DEVELOPER_ROOT: " - "${CMAKE_IOS_DEVELOPER_ROOT} does not exist.") -endif() -# Find the C & C++ compilers for the specified SDK. -if (NOT CMAKE_C_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang - OUTPUT_VARIABLE CMAKE_C_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - message(STATUS "Using C compiler: ${CMAKE_C_COMPILER}") -endif() -if (NOT CMAKE_CXX_COMPILER) - execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++ - OUTPUT_VARIABLE CMAKE_CXX_COMPILER - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) - message(STATUS "Using CXX compiler: ${CMAKE_CXX_COMPILER}") -endif() -# Find (Apple's) libtool. -execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find libtool - OUTPUT_VARIABLE IOS_LIBTOOL - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE) -message(STATUS "Using libtool: ${IOS_LIBTOOL}") -# Configure libtool to be used instead of ar + ranlib to build static libraries. 
# ---- tools/ios-cmake/ios.toolchain.cmake (resulting state of this hunk) ----
# A macro used to find executable programs on the host system, not within the
# iOS environment.  Thanks to the android-cmake project for providing the
# command.

# Standard settings
set(CMAKE_SYSTEM_NAME Darwin)
set(CMAKE_SYSTEM_VERSION 1)
set(UNIX True)
set(APPLE True)
set(IOS True)

# Required as of cmake 2.8.10: the OS X deployment target would otherwise be
# auto-populated and break the iOS build, hence the deliberate FORCE.
set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)

# Determine the cmake host system version so we know where to find the iOS SDKs
find_program(CMAKE_UNAME uname /bin /usr/bin /usr/local/bin)
if(CMAKE_UNAME)
  # FIX: exec_program() has been deprecated since CMake 3.0; use
  # execute_process(), which also strips the trailing newline for us.
  execute_process(COMMAND uname -r
                  OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION
                  OUTPUT_STRIP_TRAILING_WHITESPACE)
  string(REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\1" DARWIN_MAJOR_VERSION "${CMAKE_HOST_SYSTEM_VERSION}")
endif()

# Force the compilers for iOS.
# NOTE(review): hardcoding the host /usr/bin/gcc for an iOS cross build looks
# suspicious — confirm Xcode's clang is not required here.
set(CMAKE_C_COMPILER /usr/bin/gcc)
set(CMAKE_CXX_COMPILER /usr/bin/g++)
set(CMAKE_AR ar CACHE FILEPATH "" FORCE)

# Skip the platform compiler checks for cross compiling
set(CMAKE_CXX_COMPILER_WORKS TRUE)
set(CMAKE_C_COMPILER_WORKS TRUE)

# All iOS/Darwin specific settings - some may be redundant
set(CMAKE_SHARED_LIBRARY_PREFIX "lib")
set(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib")
set(CMAKE_SHARED_MODULE_PREFIX "lib")
set(CMAKE_SHARED_MODULE_SUFFIX ".so")
set(CMAKE_MODULE_EXISTS 1)
set(CMAKE_DL_LIBS "")

set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")

# Hidden visibility is required for C++ on iOS
set(CMAKE_C_FLAGS_INIT "")
set(CMAKE_CXX_FLAGS_INIT "-fvisibility=hidden -fvisibility-inlines-hidden")

set(CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
set(CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")

set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")

# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old
# build tree (where install_name_tool was hardcoded) and CMAKE_INSTALL_NAME_TOOL
# isn't in the cache, hardcode it here so it behaves as before, Alex
if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
  find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
endif()

# Setup iOS platform unless specified manually with IOS_PLATFORM
if(NOT DEFINED IOS_PLATFORM)
  set(IOS_PLATFORM "OS")
endif()
set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")

# Setup building for arm64 or not
if(NOT DEFINED BUILD_ARM64)
  set(BUILD_ARM64 true)
endif()
set(BUILD_ARM64 ${BUILD_ARM64} CACHE STRING "Build arm64 arch or not")

# Check the platform selection and setup for developer root.
# Comparisons are quoted so an empty/odd value cannot be re-dereferenced.
if("${IOS_PLATFORM}" STREQUAL "OS")
  set(IOS_PLATFORM_LOCATION "iPhoneOS.platform")
  # This causes the installers to properly locate the output libraries
  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
elseif("${IOS_PLATFORM}" STREQUAL "SIMULATOR")
  set(SIMULATOR true)
  set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
elseif("${IOS_PLATFORM}" STREQUAL "SIMULATOR64")
  set(SIMULATOR true)
  set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
else()
  message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please choose OS or SIMULATOR")
endif()

# Setup iOS developer location unless specified manually with
# CMAKE_IOS_DEVELOPER_ROOT.  Note Xcode 4.3 changed the installation location;
# choose the most recent one available.
# FIX: exec_program() is deprecated — use execute_process().
execute_process(COMMAND /usr/bin/xcode-select -print-path
                OUTPUT_VARIABLE CMAKE_XCODE_DEVELOPER_DIR
                OUTPUT_STRIP_TRAILING_WHITESPACE)
set(XCODE_POST_43_ROOT "${CMAKE_XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
set(XCODE_PRE_43_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
if(NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT)
  if(EXISTS ${XCODE_POST_43_ROOT})
    set(CMAKE_IOS_DEVELOPER_ROOT ${XCODE_POST_43_ROOT})
  elseif(EXISTS ${XCODE_PRE_43_ROOT})
    set(CMAKE_IOS_DEVELOPER_ROOT ${XCODE_PRE_43_ROOT})
  endif()
endif()
set(CMAKE_IOS_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")

# Find and use the most recent iOS sdk unless specified manually with
# CMAKE_IOS_SDK_ROOT.
if(NOT DEFINED CMAKE_IOS_SDK_ROOT)
  file(GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*")
  if(_CMAKE_IOS_SDKS)
    # lexical sort + reverse puts the highest SDK version first
    list(SORT _CMAKE_IOS_SDKS)
    list(REVERSE _CMAKE_IOS_SDKS)
    list(GET _CMAKE_IOS_SDKS 0 CMAKE_IOS_SDK_ROOT)
  else()
    message(FATAL_ERROR "No iOS SDK's found in default search path ${CMAKE_IOS_DEVELOPER_ROOT}. Manually set CMAKE_IOS_SDK_ROOT or install the iOS SDK.")
  endif()
  message(STATUS "Toolchain using default iOS SDK: ${CMAKE_IOS_SDK_ROOT}")
endif()
set(CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")

# Set the sysroot default to the most recent SDK
set(CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")

# set the architecture for iOS
if("${IOS_PLATFORM}" STREQUAL "OS")
  set(IOS_ARCH armv7 armv7s arm64)
elseif("${IOS_PLATFORM}" STREQUAL "SIMULATOR")
  set(IOS_ARCH i386)
elseif("${IOS_PLATFORM}" STREQUAL "SIMULATOR64")
  set(IOS_ARCH x86_64)
endif()

# FIX: cache type must be the keyword STRING, not lowercase "string"
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")

# Set the find root to the iOS developer roots and to user defined paths
set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE STRING "iOS find search path root")

# default to searching for frameworks first
set(CMAKE_FIND_FRAMEWORK FIRST)

# set up the default search directories for frameworks
set(CMAKE_SYSTEM_FRAMEWORK_PATH
    ${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
    ${CMAKE_IOS_SDK_ROOT}/System/Library/PrivateFrameworks
    ${CMAKE_IOS_SDK_ROOT}/Developer/Library/Frameworks
)

# only search the iOS sdks, not the remainder of the host filesystem
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

# This little macro lets you set any XCode specific property
macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
  set_property(TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
endmacro()

# This macro lets you find executable programs on the host system:
# temporarily lift the find-root restriction, run find_package, then restore.
macro(find_host_package)
  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) + set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) + set (IOS FALSE) + find_package(${ARGN}) - set(IOS TRUE) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -endmacro(find_host_package) + + set (IOS TRUE) + set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) + set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) + set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +endmacro (find_host_package) + diff --git a/tools/op.cmake b/tools/op.cmake new file mode 100644 index 0000000000000000000000000000000000000000..2eabac925f6021448243b3668c22cbcaebe2f1d9 --- /dev/null +++ b/tools/op.cmake @@ -0,0 +1,148 @@ +set(NET "googlenet" CACHE STRING "select net type") +set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet") + +if (NET EQUAL "googlenet") + set(CONCAT_OP ON) + set(CONV_OP ON) + set(LRN_OP ON) + set(MUL_OP ON) + set(ELEMENTWISEADD_OP ON) + set(FUSION_FC_OP ON) + set(POOL_OP ON) + set(RELU_OP ON) + set(FUSION_CONVADD_OP ON) + set(FUSION_CONVADD_RELU_OP ON) +elseif (NET EQUAL "mobilenet") + set(CONV_OP ON) + set(ELEMENTWISEADD_OP ON) + set(RELU_OP ON) + set(SOFTMAX_OP ON) + set(SOFTMAX_OP ON) + set(DEPTHWISECONV_OP ON) + set(BATCHNORM_OP ON) + set(POOL_OP ON) + set(RESHAPE_OP ON) +elseif (NET EQUAL "yolo") + set(BATCHNORM_OP ON) + set(CONV_OP ON) + set(RELU_OP ON) + set(ELEMENTWISEADD_OP ON) +elseif (NET EQUAL "squeezenet") + set(CONCAT_OP ON) + set(CONV_OP ON) + set(RELU_OP ON) + set(ELEMENTWISEADD_OP ON) + set(POOL_OP ON) + set(RESHAPE_OP ON) + set(SOFTMAX_OP ON) +elseif (NET EQUAL "resnet") + set(CONV_OP ON) + set(BATCHNORM_OP ON) + set(ELEMENTWISEADD_OP ON) + set(SOFTMAX_OP ON) + set(MUL_OP ON) + set(POOL_OP ON) + set(RELU_OP ON) +else () + set(BATCHNORM_OP ON) + set(BOXCODER_OP ON) + set(CONCAT_OP ON) + set(CONV_OP ON) + set(DEPTHWISECONV_OP ON) + set(ELEMENTWISEADD_OP ON) + set(FUSION_CONVADD_OP ON) + set(CONVADDRELU_OP 
ON) + set(FUSION_FC_OP ON) + set(LRN_OP ON) + set(MUL_OP ON) + set(MULTICLASSNMS_OP ON) + set(POOL_OP ON) + set(PRIORBOX_OP ON) + set(RELU_OP ON) + set(RESHAPE_OP ON) + set(SIGMOID_OP ON) + set(SOFTMAX_OP ON) + set(TRANSPOSE_OP ON) + set(FUSION_CONVADD_RELU_OP ON) + # option(BATCHNORM_OP "" ON) + # option(BOXCODER_OP "" ON) + # option(CONCAT_OP "" ON) + # option(CONV_OP "" ON) + # option(DEPTHWISECONV_OP "" ON) + # option(ELEMENTWISEADD_OP "" ON) + # option(FUSION_CONVADD_OP "" ON) + # option(CONVADDRELU_OP "" ON) + # option(FUSION_FC_OP "" ON) + # option(LRN_OP "" ON) + # option(MUL_OP "" ON) + # option(MULTICLASSNMS_OP "" ON) + # option(POOL_OP "" ON) + # option(PRIORBOX_OP "" ON) + # option(RELU_OP "" ON) + # option(RESHAPE_OP "" ON) + # option(SIGMOID_OP "" ON) + # option(SOFTMAX_OP "" ON) + # option(TRANSPOSE_OP "" ON) + # option(FUSION_CONVADD_RELU_OP "" ON) +endif () + +if (BATCHNORM_OP) + add_definitions(-DBATCHNORM_OP) +endif() +if (BOXCODER_OP) + add_definitions(-DBOXCODER_OP) +endif() +if (CONCAT_OP) + add_definitions(-DCONCAT_OP) +endif() +if (CONV_OP) + add_definitions(-DCONV_OP) +endif() +if (DEPTHWISECONV_OP) + add_definitions(-DDEPTHWISECONV_OP) +endif() +if (ELEMENTWISEADD_OP) + add_definitions(-DELEMENTWISEADD_OP) +endif() +if (FUSION_CONVADD_OP) + add_definitions(-DFUSION_CONVADD_OP) +endif() +if (CONVADDRELU_OP) + add_definitions(-DCONVADDRELU_OP) +endif() +if (FUSION_FC_OP) + add_definitions(-DFUSION_FC_OP) +endif() +if (LRN_OP) + add_definitions(-DLRN_OP) +endif() +if (MUL_OP) + add_definitions(-DMUL_OP) +endif() +if (MULTICLASSNMS_OP) + add_definitions(-DMULTICLASSNMS_OP) +endif() +if (POOL_OP) + add_definitions(-DPOOL_OP) +endif() +if (PRIORBOX_OP) + add_definitions(-DPRIORBOX_OP) +endif() +if (RELU_OP) + add_definitions(-DRELU_OP) +endif() +if (RESHAPE_OP) + add_definitions(-DRESHAPE_OP) +endif() +if (SIGMOID_OP) + add_definitions(-DSIGMOID_OP) +endif() +if (SOFTMAX_OP) + add_definitions(-DSOFTMAX_OP) +endif() +if (TRANSPOSE_OP) + 
add_definitions(-DTRANSPOSE_OP) +endif() +if (FUSION_CONVADD_RELU_OP) + add_definitions(-DFUSION_CONVADD_RELU_OP) +endif() \ No newline at end of file diff --git a/tools/pre-commit.hooks/clang-format.hook b/tools/pre-commit.hooks/clang-format.hook index 406850d1a2450b49463563c0034c6c969895bfe4..4fa4253bad78fe287fb92863a684a5d7def71061 100644 --- a/tools/pre-commit.hooks/clang-format.hook +++ b/tools/pre-commit.hooks/clang-format.hook @@ -12,4 +12,8 @@ if ! [[ $version == *"$VERSION"* ]]; then exit -1 fi -clang-format $@ +# https://medicineyeh.wordpress.com/2017/07/13/clang-format-with-pragma/ +shift +perl -i -pe 's|#pragma\s+omp|// #pragma omp|' "$@" +clang-format -i $@ +perl -i -pe 's|// ||' "$@" diff --git a/tools/profile_show.sh b/tools/profile_show.sh new file mode 100644 index 0000000000000000000000000000000000000000..9dae20faf343de68a7815988c4d1046d0438094b --- /dev/null +++ b/tools/profile_show.sh @@ -0,0 +1,139 @@ +#!/usr/bin/env sh +cat < + + + + +

+
    +EOF + +min=$(awk 'NR==1{min=$4} NR>1{if($4 < min) min=$4} END{print min}' $1) +max=$(awk 'NR==1{max=$5} NR>1{if($5 > max) max=$5} END{print max}' $1) +sort $1 -k1,1n | awk -v max="$max" -v min="$min" ' +BEGIN { + total = max - min +} +{ + opid = $1 + optype = $2 + tid = $3 + cb = $4 + ce = $5 + cl = $6 + sum += $4 - $3 + print "
  • " +} +' + +cat < +
+
+EOF
+
+echo "==================[ profile ]==================="
+cat $1 | awk '
+NR>1{
+    optype = $2
+    sum += $5 - $4
+    count[$2] += $6
+}
+END {
+for (t in count) {
+    msg = sprintf("%-16s\t%-10d\t%-.4f", t, count[t], count[t]*100 / sum);
+    print msg
+}
+}' | sort -k2,2nr
+cat $1 | awk '
+NR>1{
+    sum += $5 - $4
+}
+END {
+msg = sprintf("%-16s\t%-10d\t%-.4f", "total", sum, 100);
+print msg
+}'
+
+cat <
+
+
+
+EOF
+
diff --git a/tools/toolchains/arm-android-neon.cmake b/tools/toolchains/arm-android-neon.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..f2fa600b90fb54886838e953e61c1e940569dee6
--- /dev/null
+++ b/tools/toolchains/arm-android-neon.cmake
@@ -0,0 +1,2 @@
# Toolchain shim: force NEON support on, then chain to the bundled
# android-cmake toolchain.  (FIX: file previously lacked a trailing newline,
# as flagged by the diff marker.)
set(ANDROID_ARM_NEON ON)
include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
\ No newline at end of file
diff --git a/tools/toolchains/arm-linux-gnueabi.cmake b/tools/toolchains/arm-linux-gnueabi.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..ee3cb50796d184f9f4577e8aabb4cf0ca98c955f
--- /dev/null
+++ b/tools/toolchains/arm-linux-gnueabi.cmake
@@ -0,0 +1,16 @@
# CMake toolchain file for building ARM software on a Linux host with the
# distro-provided arm-linux-gnueabi cross toolchain.

set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_VERSION 1)

# Cross tools installed under /usr/bin by the arm-linux-gnueabi packages.
set(CMAKE_C_COMPILER   /usr/bin/arm-linux-gnueabi-gcc)
set(CMAKE_CXX_COMPILER /usr/bin/arm-linux-gnueabi-g++)
set(CMAKE_STRIP        /usr/bin/arm-linux-gnueabi-strip)

# Search the cross sysroot for libraries and headers only; programs (build
# tools) keep coming from the host PATH.
set(CMAKE_FIND_ROOT_PATH /usr/arm-linux-gnueabi)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

include("${CMAKE_CURRENT_LIST_DIR}/../arm-platform.cmake")