Commit abb4bb07 authored by Yao,kun

Merge remote-tracking branch 'upstream/develop' into develop

# Conflicts:
#	src/common/types.h
[submodule "src/operators/kernel/mali/ACL_Android"]
path = src/operators/kernel/mali/ACL_Android
url = https://github.com/halsay/ACL_Android.git
cmake_minimum_required(VERSION 3.0)
project(paddle-mobile)

option(DEBUGING "enable debug mode" ON)
option(USE_OPENMP "openmp support" OFF)
option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" ON)
# select the platform to build
option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" ON)
option(FPGA "fpga" OFF)
if (CPU)
add_definitions(-DPADDLE_MOBILE_CPU)
endif()
if (MALI_GPU)
add_definitions(-DPADDLE_MOBILE_MALI_GPU)
add_definitions(-DUSE_ACL=1)
add_definitions(-DUSE_OPENCL)
set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android)
include_directories(${ACL_ROOT} ${ACL_ROOT}/include)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
endif()
if(FPGA)
add_definitions(-DPADDLE_MOBILE_FPGA)
endif()
set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
if (DEBUGING)
message(STATUS "debug")
set(CMAKE_BUILD_TYPE Debug)
set(CMAKE_CXX_FLAGS_DEBUG "-g -DNDEBUG")
add_definitions(-DPADDLE_MOBILE_DEBUG)
if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
add_definitions(-DARMV7)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
endif ()
else ()
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif ()
if (USE_EXCEPTION)
message(STATUS "use exception")
add_definitions(-DENABLE_EXCEPTION)
add_definitions(-fexceptions)
else()
add_definitions(-fno-exceptions)
endif ()
if (LOG_PROFILE)
add_definitions(-DPADDLE_MOBILE_PROFILE)
endif()
if(USE_OPENMP)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
endif()
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)

if (NOT ANDROID_NDK_TOOLCHAIN_INCLUDED)
    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.cpp)
    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.h)
    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
endif ()

include_directories(src/)

set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)

include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")

# if (IS_IOS)
#     add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})

if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
    list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
    add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
else ()
    add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
endif ()

if(DEBUGING)
    add_subdirectory(test)
endif()
# Contributing code
We welcome contributions to the Paddle-Mobile project and sincerely thank you for them. This document describes how we work and what the workflow looks like. Paddle-Mobile lives under the PaddlePaddle org and follows essentially the same code conventions as the server-side Paddle project, so contributors can also consult the related Paddle documentation.
## Workflow
The models used during Paddle-Mobile development can be downloaded from this link: [click me](https://mms-mis.cdn.bcebos.com/paddle-mobile/models.zip).
The main steps for contributing code follow.
### Fork
* Paddle-Mobile accepts code through Pull Requests; pushing directly is not allowed, and all code must be reviewed by a person. First fork the Paddle-Mobile repository (see ["Fork" button](https://help.github.com/articles/fork-a-repo/)).
* Go to the [Paddle-Mobile](https://github.com/PaddlePaddle/paddle-mobile) GitHub home page and click the `Fork` button to create a repository under your own account, e.g. <https://github.com/your-username/paddle-mobile>
### Clone
Clone the remote repository to your machine:
```bash
➜ git clone https://github.com/your-username/paddle-mobile
➜ cd paddle-mobile
```
### Create a local branch
Like Paddle, Paddle-Mobile currently uses the [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/) for development, testing, releasing, and maintenance; see the [Paddle branching conventions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范) for details.
All feature and bug-fix work should be done on a new branch, normally created from the `develop` branch.
Use `git checkout -b` to create and switch to the new branch.
```bash
➜ git checkout -b my-cool-stuff
```
Note that the working tree must be clean before you check out, otherwise untracked files will be carried over to the new branch; check with `git status`.
### Use the `pre-commit` hook
Paddle developers use the [pre-commit](http://pre-commit.com/) tool to manage Git pre-commit hooks. It helps us format source code (C++, Python) and automatically checks some basics before each commit (for example, a single EOL per file, and no large files added to Git).
The `pre-commit` checks are part of the unit tests on Travis-CI; a PR that does not satisfy the hooks cannot be merged into Paddle. Install it first, then run it in the current directory:
```bash
pip install pre-commit
pre-commit run -a -v
```
Paddle-Mobile uses `clang-format` to format its C/C++ code, and different `clang-format` versions format code differently. Unlike Paddle, Paddle-Mobile developers use the newer 5.0 release of the LLVM toolset, so to avoid CI failures make sure your `clang-format` is version 5.0.
> Note: the `yapf` installed by `pip install pre-commit` differs slightly from the one installed by `conda install -c conda-forge pre-commit`; Paddle developers use `pip install pre-commit`.
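You can verify the version with the standard `--version` flag (a quick sanity check; the binary on your distribution may instead be named `clang-format-5.0`):
```bash
➜ clang-format --version   # should report a 5.0.x release
```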
## Start developing
In this example, I delete a line from README.md and create a new file.
Check the current state with `git status`, which lists the changes in the working tree, and inspect the exact modifications with `git diff`.
```bash
➜ git status
On branch test
Changes not staged for commit:
(use "git add <file>..." to update what will be committed)
(use "git checkout -- <file>..." to discard changes in working directory)
modified: README.md
Untracked files:
(use "git add <file>..." to include in what will be committed)
test
no changes added to commit (use "git add" and/or "git commit -a")
```
## Build
paddle-mobile is built for mobile, and mobile devices are mostly ARM based, so we cross-compile for the ARM platform. Taking the CPU build as an example:
1. Install the latest NDK
2. Set the ANDROID_NDK and NDK_ROOT environment variables
3. Develop, and write unit tests
4. Run `sh build.sh` (see the sketch after this list)
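For example, a minimal sketch of steps 2 and 4; the NDK install path below is an assumption, so substitute your own location:
```bash
# Hypothetical NDK location -- adjust to where you unpacked the NDK.
export ANDROID_NDK=/opt/android-ndk-r17b
export NDK_ROOT=$ANDROID_NDK
sh build.sh
```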
## Commit
Next we discard the change to README.md, then stage the newly added test file.
```bash
➜ git checkout -- README.md
➜ git status
On branch test
Untracked files:
(use "git add <file>..." to include in what will be committed)
test
nothing added to commit but untracked files present (use "git add" to track)
➜ git add test
```
Every Git commit needs a commit message so that others can tell what the commit changes; write it with `git commit`.
```bash
▶ pre-commit run -a -v
[remove-crlf] CRLF end-lines remover........................................Passed
[remove-tabs] Tabs remover..................................................Passed
[check-added-large-files] Check for added large files.......................Passed
[check-merge-conflict] Check for merge conflicts............................Passed
[check-symlinks] Check for broken symlinks..................................Passed
[detect-private-key] Detect Private Key.....................................Passed
[end-of-file-fixer] Fix End of Files........................................Passed
[trailing-whitespace] Trim Trailing Whitespace..............................Passed
[copyright] copyright.......................................................Passed
[clang-format] clang-format.................................................Passed
[cpplint] cpplint...........................................................Passed
hookid: cpplint
Ignoring build_bak.sh; not a valid file name (c, cc, h, hpp, c++, h++, cu, cpp, hxx, cxx, cuh)
Done processing build_bak.sh
Ignoring build_bak.sh; not a valid file name (c, cc, h, hpp, c++, h++, cu, cpp, hxx, cxx, cuh)
Done processing build_bak.sh
```
## Keep your local repository up to date
Before opening a Pull Request, sync with the latest code in the original repository (<https://github.com/PaddlePaddle/paddle-mobile>).
First check the names of the current remotes with `git remote`.
```bash
➜ git remote
origin
➜ git remote -v
origin https://github.com/USERNAME/paddle-mobile (fetch)
origin https://github.com/USERNAME/paddle-mobile (push)
```
Here origin is the name of the remote we cloned, i.e. the paddle-mobile under your own account. Next, add the original paddle-mobile repository as a remote named upstream.
```bash
➜ git remote add upstream https://github.com/PaddlePaddle/paddle-mobile
➜ git remote
origin
upstream
```
Fetch the latest code from upstream and update the current branch.
```bash
➜ git fetch upstream
➜ git pull upstream develop
```
## Push to the remote repository
Push your local changes to GitHub, i.e. https://github.com/USERNAME/paddle-mobile.
```bash
# push to the my-cool-stuff branch of the remote repository origin
➜ git push origin my-cool-stuff
```
## Open an Issue and complete the Pull Request
Open an Issue describing the problem and note its number.
Switch to the branch you created, then click `New pull request`.
In the PR description, write `resolve #issue-number`; the corresponding Issue is then closed automatically when the PR is merged.
> See <https://help.github.com/articles/closing-issues-via-commit-messages/> for details.
## Review
Once the PR is open, you can see CI running on the PR page. If a run fails, click Details to inspect it on the Travis platform.
![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294833030073.jpg)
Travis shows more detailed information.
![](http://otkwwi4x8.bkt.clouddn.com/2018-06-20-15294833651326.jpg)
Then wait for review; if changes are requested, update the corresponding branch in origin by following the steps above.
## Delete the remote branch
After the PR is merged into the main repository, you can delete the remote branch from the PR page.
<img width="775" alt="screen shot 2017-04-26 at 9 18 24 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436457/e4cdd472-2ac5-11e7-9272-badc76c4a23e.png">
You can also delete the remote branch with `git push origin :branch-name`, e.g.:
```bash
➜ git push origin :my-cool-stuff
```
## Delete the local branch
Finally, delete the local branch.
```bash
# switch to the develop branch
➜ git checkout develop
# delete the my-cool-stuff branch
➜ git branch -D my-cool-stuff
```
That completes one full contribution cycle.
## Conventions for submitting code
To help reviewers focus on the code itself, please follow these conventions every time you submit code:
1. Make sure the unit tests on Travis-CI pass. If they fail, the submitted code has a problem and reviewers will generally not review it.
2. Before opening a Pull Request:
   - Watch the number of commits:
     - Reason: if you modify only one file but submit a dozen commits, each with a tiny change, it is a burden on reviewers, who must inspect every commit to see what changed, and the commits may even overwrite one another.
     - Suggestion: keep as few commits as possible; fold follow-up fixes into the previous commit with `git commit --amend`. For commits already pushed to the remote, see [squash commits after push](http://stackoverflow.com/questions/5667884/how-to-squash-commits-in-git-after-they-have-been-pushed) (a sketch follows this list).
   - Watch each commit message: it should reflect the content of the commit, not be arbitrary.
3. If the Pull Request fixes an Issue, add `fix #issue_number` to the **first** comment of the PR, and the Issue will be closed automatically when the PR is merged. Valid keywords include: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved; pick the one that fits. See [Closing issues via commit messages](https://help.github.com/articles/closing-issues-via-commit-messages) for details.
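For example, a minimal sketch of the two approaches mentioned in the suggestion above; branch and remote names follow the earlier examples:
```bash
git commit --amend                   # fold a follow-up fix into the previous commit
git rebase -i upstream/develop       # mark extra commits as "squash" to combine them
git push --force-with-lease origin my-cool-stuff   # rewritten history needs a forced push
```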
In addition, when replying to reviewers, please follow these conventions:
1. Reply to every review comment (this is basic open-source etiquette; someone helped you, so say thanks):
   - If you agree with a comment and have addressed it, a simple `Done` is enough;
   - If you disagree, explain your reasoning.
2. If there are many review comments:
   - Summarize the overall changes you made.
   - Reply via [start a review](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/) rather than one-off replies; each individual reply sends an email and floods inboxes.
FROM ubuntu:16.04
RUN echo '\
deb <mirror> <version> main restricted universe multiverse\n\
deb <mirror> <version>-updates main restricted universe multiverse\n\
deb <mirror> <version>-backports main restricted universe multiverse\n\
deb <mirror> <version>-security main restricted universe multiverse\n'\
> /etc/apt/sources.list
RUN sed -ie 's|<mirror>|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|' /etc/apt/sources.list
RUN sed -ie 's|<version>|xenial|' /etc/apt/sources.list
RUN apt-get update && apt-get upgrade -y
RUN apt-get install -y --no-install-recommends \
curl \
unzip \
git \
make \
cmake \
cmake-curses-gui \
python \
python-pip \
python-setuptools \
clang-format-5.0 \
graphviz \
g++-arm-linux-gnueabi \
gcc-arm-linux-gnueabi
RUN apt-get autoremove -y && apt-get clean
RUN pip install --upgrade pip
RUN pip install wheel && pip install pre-commit
RUN ln -s clang-format-5.0 /usr/bin/clang-format
# RUN cd /tmp && curl -O http://mirrors.neusoft.edu.cn/android/repository/android-ndk-r17b-linux-x86_64.zip
# RUN cd /opt && unzip /tmp/android-ndk-r17b-linux-x86_64.zip
# ENV NDK_ROOT /opt/android-ndk-r17b
# Setting up the environment
## Using docker
### 1. Install docker
To install docker, follow the official documentation: [https://docs.docker.com/install/](https://docs.docker.com/install/)
### 2. Set up the build environment with docker
First cd into the paddle-mobile directory and run `docker build`.
Using Linux/Mac as an example (on Windows it is recommended to run this inside the 'Docker Quickstart Terminal'):
```
$ docker build -t paddle-mobile:dev - < Dockerfile
```
`docker images` now lists the image we just built:
```
$ docker images
REPOSITORY TAG IMAGE ID CREATED SIZE
paddle-mobile dev 33b146787711 45 hours ago 372MB
```
### 3. Build with docker
cd into the paddle-mobile directory and run docker run:
```
$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
root@5affd29d4fc5:/ # cd /paddle-mobile
# generate the Makefile for the android build
root@5affd29d4fc5:/ # rm CMakeCache.txt
root@5affd29d4fc5:/ # cmake . -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
# generate the Makefile for the linux build
root@5affd29d4fc5:/ # rm CMakeCache.txt
root@5affd29d4fc5:/ # cmake . -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
```
### 4. Set build options
Build options can be set with ccmake:
```
root@5affd29d4fc5:/ # ccmake .
Page 1 of 1
CMAKE_ASM_FLAGS
CMAKE_ASM_FLAGS_DEBUG
CMAKE_ASM_FLAGS_RELEASE
CMAKE_BUILD_TYPE
CMAKE_INSTALL_PREFIX /usr/local
CMAKE_TOOLCHAIN_FILE /paddle-mobile/tools/toolchains/arm-android-neon.cmake
CPU ON
DEBUGING ON
FPGA OFF
LOG_PROFILE ON
MALI_GPU OFF
NET googlenet
USE_EXCEPTION ON
USE_OPENMP OFF
```
After changing options, press `c`, then `g` to regenerate the Makefile.
### 5. Build
Build with the make command:
```
root@5affd29d4fc5:/ # make
```
### 6. Inspect the build output
The build output can be inspected from the host, under build and test/build in the paddle-mobile directory; use adb or scp to copy it to the device for execution.
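For example, a hedged sketch of pushing one test binary to a device over adb; the binary name is a placeholder for whatever you actually built (see also the push script below):
```
adb push test/build/test-googlenet /data/local/tmp/bin/   # hypothetical test binary
adb shell /data/local/tmp/bin/test-googlenet
```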
## Without docker
Without docker, you can simply generate a makefile with cmake and then build. Building the android library with the NDK requires NDK_ROOT to be set correctly. Building for linux requires arm-linux-gnueabi-gcc or a similar cross-compiler; you may need to set the CC and CXX environment variables, modify arm-linux-gnueabi.cmake under tools/toolchains/, or add your own toolchain file.
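A minimal sketch of such a build under the assumptions above; the paths are examples, not prescriptions:
```
export NDK_ROOT=/opt/android-ndk-r17b   # hypothetical install path (android build only)
rm -f CMakeCache.txt
cmake . -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
make
```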
#!/usr/bin/env sh
push_fn () {
MODELS_PATH="../test/models/*"
EXE_FILE="../test/build/*"
EXE_DIR="data/local/tmp/bin"
MODELS_DIR="data/local/tmp/models"
LIB_PATH="../build/release/arm-v7a/build/*"
adb push ${EXE_FILE} ${EXE_DIR}
adb push ${LIB_PATH} ${EXE_DIR}
adb push ${MODELS_PATH} ${MODELS_DIR}
echo "test files sync completed"
}
push_fn
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "framework/operator.h"
namespace paddle_mobile {
class depCore {
public:
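  // Builds the op dependency graph: deps[i] lists the ops whose outputs op i
  // consumes, and next[i] lists the ops that consume op i's outputs; the local
  // map "vars" records, for each variable name, the last op that produced it.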
template <typename Dtype>
void analysisDep(
const std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>& ops) {
std::unordered_map<std::string, int> vars;
size_t nop = ops.size();
deps.resize(nop);
next.resize(nop);
for (size_t i = 0; i < nop; i++) {
const auto& op = ops[i];
for (const auto& kv : op->Inputs()) {
for (const auto& v : kv.second) {
if (vars.find(v) == vars.end()) {
continue;
}
int di = vars[v];
if (di == i) {
continue;
}
if (std::find(deps[i].begin(), deps[i].end(), di) != deps[i].end()) {
continue;
}
deps[i].push_back(di);
next[di].push_back(i);
}
}
for (const auto& kv : op->Outputs()) {
for (const auto& v : kv.second) {
vars[v] = i;
}
}
}
}
const std::vector<int>& getNext(int i) { return next[i]; }
const std::vector<int>& getDeps(int i) { return deps[i]; }
std::vector<std::vector<int>> deps;
std::vector<std::vector<int>> next;
};
} // namespace paddle_mobile
#endif
@@ -17,8 +17,6 @@ limitations under the License. */
 #ifdef ENABLE_EXCEPTION
 #include <stdio.h>
 #include <exception>
-#include <sstream>
-#include <stdexcept>
 #include <string>
 #endif
@@ -32,12 +30,11 @@ struct PaddleMobileException : public std::exception {
   PaddleMobileException(const char *header, const char *detail,
                         const char *file, const int line) {
-    std::stringstream ss;
-    ss << exception_prefix << "| " << header << "\n";
-    ss << "| [in file] : " << file << " \n";
-    ss << "| [on line] : " << line << " \n";
-    ss << "| [detail] : " << detail;
-    message = ss.str();
+    char buffer[1500];
+    snprintf(buffer, sizeof(buffer),
+             "%s| %s \n| [in file] : %s\n| [on line] : %d\n| [detail] : %s\n",
+             exception_prefix.c_str(), header, file, line, detail);
+    message = std::string(buffer);
   }
   const char *what() const noexcept { return message.c_str(); }
 };
...
@@ -16,15 +16,43 @@ limitations under the License. */
 #include <vector>

 #ifdef PADDLE_MOBILE_DEBUG
+#include <cstring>
 #include <iostream>
 #include <sstream>
 #include <string>
 #endif

+#ifdef ANDROID
+#include <android/log.h>
+#endif
+
 namespace paddle_mobile {

 #ifdef PADDLE_MOBILE_DEBUG
+
+#ifdef ANDROID
+extern const char *ANDROID_LOG_TAG;
+
+#define ANDROIDLOGI(...)                                                  \
+  __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, __VA_ARGS__);    \
+  printf(__VA_ARGS__)
+#define ANDROIDLOGW(...)                                                  \
+  __android_log_print(ANDROID_LOG_WARNING, ANDROID_LOG_TAG, __VA_ARGS__); \
+  printf(__VA_ARGS__)
+#define ANDROIDLOGD(...)                                                  \
+  __android_log_print(ANDROID_LOG_DEBUG, ANDROID_LOG_TAG, __VA_ARGS__);   \
+  printf(__VA_ARGS__)
+#define ANDROIDLOGE(...)                                                  \
+  __android_log_print(ANDROID_LOG_ERROR, ANDROID_LOG_TAG, __VA_ARGS__);   \
+  printf(__VA_ARGS__)
+#else
+#define ANDROIDLOGI(...)
+#define ANDROIDLOGW(...)
+#define ANDROIDLOGD(...)
+#define ANDROIDLOGE(...)
+#endif
+
 enum LogLevel {
   kNO_LOG,
   kLOG_ERROR,
@@ -88,26 +116,29 @@ struct ToLog {
   Print printer_;
 };

-#define LOG(level)                                                             \
-  if (level > paddle_mobile::log_level) {                                      \
-  } else                                                                       \
-    paddle_mobile::ToLog(                                                      \
-        level,                                                                 \
-        (std::stringstream()                                                   \
-         << "[file: "                                                          \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
-         << "] [line: " << __LINE__ << "] ")                                   \
-            .str())
+#define LOG(level)                                                           \
+  if (level > paddle_mobile::log_level) {                                    \
+  } else                                                                     \
+    paddle_mobile::ToLog(                                                    \
+        level, static_cast<std::stringstream &>(                             \
+                   std::stringstream()                                       \
+                   << "[file: "                                              \
+                   << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \
+                                              : __FILE__)                    \
+                   << "] [line: " << __LINE__ << "] ")                       \
+            .str())

-#define DLOG                                                                   \
-  if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) {                  \
-  } else                                                                       \
-    paddle_mobile::ToLog(                                                      \
-        paddle_mobile::kLOG_DEBUG,                                             \
-        (std::stringstream()                                                   \
-         << "[file: "                                                          \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
-         << "] [line: " << __LINE__ << "] ")                                   \
-            .str())
+#define DLOG                                                           \
+  if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) {          \
+  } else                                                               \
+    paddle_mobile::ToLog(                                              \
+        paddle_mobile::kLOG_DEBUG,                                     \
+        static_cast<std::stringstream &>(                              \
+            std::stringstream()                                        \
+            << "[file: "                                               \
+            << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1)  \
+                                       : __FILE__)                     \
+            << "] [line: " << __LINE__ << "] ")                        \
+            .str())

 #define LOGF(level, format, ...) \
@@ -122,6 +153,11 @@ struct ToLog {

 #else

+#define ANDROIDLOGI(...)
+#define ANDROIDLOGW(...)
+#define ANDROIDLOGD(...)
+#define ANDROIDLOGE(...)
+
 enum LogLevel {
   kNO_LOG,
   kLOG_ERROR,
...
@@ -12,6 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "log.h"
-namespace paddle_mobile {}
+#pragma once
+#define EXPORT __attribute__((visibility("default")))
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once
-
-// Disable the copy and assignment operator for a class.
-#ifndef DISABLE_COPY_AND_ASSIGN
-#define DISABLE_COPY_AND_ASSIGN(classname)          \
- private:                                           \
-  classname(const classname &) = delete;            \
-  classname(classname &&) = delete;                 \
-  classname &operator=(const classname &) = delete; \
-  classname &operator=(classname &&) = delete
+#ifdef PADDLE_MOBILE_USE_OPENMP
+/**
+ * android-ndk-r17 has a problem when linking with openmp.
+ * if paddle-mobile enables -fopenmp, but didn't use those omp_* functions,
+ * after linking another binary with libpaddle-mobile.so, the omp_get_thread_num
+ * will not work. see test/common/test_openmp.cc the detailed reason is still
+ * unclear, but this trick will work. a better solution is hacking the linker,
+ * try some flags to make it link omp_* functions, but I didn't find out how to
+ * make it work.
+ */
+#include <omp.h>
+static int _ = omp_get_num_procs();
 #endif
@@ -711,47 +711,6 @@ static inline size_t uint32_pack(uint32_t value, uint8_t *out) {
   return rv;
 }
/**
* Pack a signed 32-bit integer and return the number of bytes written.
* Negative numbers are encoded as two's complement 64-bit integers.
*
* \param value
* Value to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t int32_pack(int32_t value, uint8_t *out) {
if (value < 0) {
out[0] = value | 0x80;
out[1] = (value >> 7) | 0x80;
out[2] = (value >> 14) | 0x80;
out[3] = (value >> 21) | 0x80;
out[4] = (value >> 28) | 0x80;
out[5] = out[6] = out[7] = out[8] = 0xff;
out[9] = 0x01;
return 10;
} else {
return uint32_pack(value, out);
}
}
/**
* Pack a signed 32-bit integer using ZigZag encoding and return the number of
* bytes written.
*
* \param value
* Value to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t sint32_pack(int32_t value, uint8_t *out) {
return uint32_pack(zigzag32(value), out);
}
/**
 * Pack a 64-bit unsigned integer using base-128 varint encoding and return the
 * number of bytes written.
@@ -789,116 +748,6 @@ static size_t uint64_pack(uint64_t value, uint8_t *out) {
   return rv;
 }
/**
* Pack a 64-bit signed integer in ZigZag encoding and return the number of
* bytes written.
*
* \param value
* Value to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t sint64_pack(int64_t value, uint8_t *out) {
return uint64_pack(zigzag64(value), out);
}
/**
* Pack a 32-bit quantity in little-endian byte order. Used for protobuf wire
* types fixed32, sfixed32, float. Similar to "htole32".
*
* \param value
* Value to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t fixed32_pack(uint32_t value, void *out) {
#if !defined(WORDS_BIGENDIAN)
memcpy(out, &value, 4);
#else
uint8_t *buf = out;
buf[0] = value;
buf[1] = value >> 8;
buf[2] = value >> 16;
buf[3] = value >> 24;
#endif
return 4;
}
/**
* Pack a 64-bit quantity in little-endian byte order. Used for protobuf wire
* types fixed64, sfixed64, double. Similar to "htole64".
*
* \todo The big-endian impl is really only good for 32-bit machines, a 64-bit
* version would be appreciated, plus a way to decide to use 64-bit math where
* convenient.
*
* \param value
* Value to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t fixed64_pack(uint64_t value, void *out) {
#if !defined(WORDS_BIGENDIAN)
memcpy(out, &value, 8);
#else
fixed32_pack(value, out);
fixed32_pack(value >> 32, ((char *)out) + 4);
#endif
return 8;
}
/**
* Pack a boolean value as an integer and return the number of bytes written.
*
* \todo Perhaps on some platforms *out = !!value would be a better impl, b/c
* that is idiomatic C++ in some STL implementations.
*
* \param value
* Value to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t boolean_pack(protobuf_c_boolean value, uint8_t *out) {
*out = value ? TRUE : FALSE;
return 1;
}
/**
* Pack a NUL-terminated C string and return the number of bytes written. The
* output includes a length delimiter.
*
* The NULL pointer is treated as an empty string. This isn't really necessary,
* but it allows people to leave required strings blank. (See Issue #13 in the
* bug tracker for a little more explanation).
*
* \param str
* String to encode.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static inline size_t string_pack(const char *str, uint8_t *out) {
if (str == NULL) {
out[0] = 0;
return 1;
} else {
size_t len = strlen(str);
size_t rv = uint32_pack(len, out);
memcpy(out + rv, str, len);
return rv + len;
}
}
/**
 * Pack a ProtobufCBinaryData and return the number of bytes written. The output
 * includes a length delimiter.
@@ -918,30 +767,6 @@ static inline size_t binary_data_pack(const ProtobufCBinaryData *bd,
   return rv + len;
 }
/**
* Pack a ProtobufCMessage and return the number of bytes written. The output
* includes a length delimiter.
*
* \param message
* ProtobufCMessage object to pack.
* \param[out] out
* Packed message.
* \return
* Number of bytes written to `out`.
*/
static inline size_t prefixed_message_pack(const ProtobufCMessage *message,
uint8_t *out) {
if (message == NULL) {
out[0] = 0;
return 1;
} else {
size_t rv = protobuf_c_message_pack(message, out + 1);
uint32_t rv_packed_size = uint32_size(rv);
if (rv_packed_size != 1) memmove(out + rv_packed_size, out + 1, rv);
return uint32_pack(rv, out) + rv;
}
}
/**
 * Pack a field tag.
 *
@@ -963,143 +788,6 @@ static size_t tag_pack(uint32_t id, uint8_t *out) {
   return uint64_pack(((uint64_t)id) << 3, out);
 }
/**
* Pack a required field and return the number of bytes written.
*
* \param field
* Field descriptor.
* \param member
* The field member.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static size_t required_field_pack(const ProtobufCFieldDescriptor *field,
const void *member, uint8_t *out) {
size_t rv = tag_pack(field->id, out);
switch (field->type) {
case PROTOBUF_C_TYPE_SINT32:
out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT;
return rv + sint32_pack(*(const int32_t *)member, out + rv);
case PROTOBUF_C_TYPE_ENUM:
case PROTOBUF_C_TYPE_INT32:
out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT;
return rv + int32_pack(*(const int32_t *)member, out + rv);
case PROTOBUF_C_TYPE_UINT32:
out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT;
return rv + uint32_pack(*(const uint32_t *)member, out + rv);
case PROTOBUF_C_TYPE_SINT64:
out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT;
return rv + sint64_pack(*(const int64_t *)member, out + rv);
case PROTOBUF_C_TYPE_INT64:
case PROTOBUF_C_TYPE_UINT64:
out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT;
return rv + uint64_pack(*(const uint64_t *)member, out + rv);
case PROTOBUF_C_TYPE_SFIXED32:
case PROTOBUF_C_TYPE_FIXED32:
case PROTOBUF_C_TYPE_FLOAT:
out[0] |= PROTOBUF_C_WIRE_TYPE_32BIT;
return rv + fixed32_pack(*(const uint32_t *)member, out + rv);
case PROTOBUF_C_TYPE_SFIXED64:
case PROTOBUF_C_TYPE_FIXED64:
case PROTOBUF_C_TYPE_DOUBLE:
out[0] |= PROTOBUF_C_WIRE_TYPE_64BIT;
return rv + fixed64_pack(*(const uint64_t *)member, out + rv);
case PROTOBUF_C_TYPE_BOOL:
out[0] |= PROTOBUF_C_WIRE_TYPE_VARINT;
return rv + boolean_pack(*(const protobuf_c_boolean *)member, out + rv);
case PROTOBUF_C_TYPE_STRING:
out[0] |= PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED;
return rv + string_pack(*(char *const *)member, out + rv);
case PROTOBUF_C_TYPE_BYTES:
out[0] |= PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED;
return rv +
binary_data_pack((const ProtobufCBinaryData *)member, out + rv);
case PROTOBUF_C_TYPE_MESSAGE:
out[0] |= PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED;
return rv + prefixed_message_pack(*(ProtobufCMessage *const *)member,
out + rv);
}
PROTOBUF_C__ASSERT_NOT_REACHED();
return 0;
}
/**
* Pack a oneof field and return the number of bytes written. Only packs the
* field that is selected by the case enum.
*
* \param field
* Field descriptor.
* \param oneof_case
* Enum value that selects the field in the oneof.
* \param member
* The field member.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static size_t oneof_field_pack(const ProtobufCFieldDescriptor *field,
uint32_t oneof_case, const void *member,
uint8_t *out) {
if (oneof_case != field->id) {
return 0;
}
if (field->type == PROTOBUF_C_TYPE_MESSAGE ||
field->type == PROTOBUF_C_TYPE_STRING) {
const void *ptr = *(const void *const *)member;
if (ptr == NULL || ptr == field->default_value) return 0;
}
return required_field_pack(field, member, out);
}
/**
* Pack an optional field and return the number of bytes written.
*
* \param field
* Field descriptor.
* \param has
* Whether the field is set.
* \param member
* The field member.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static size_t optional_field_pack(const ProtobufCFieldDescriptor *field,
const protobuf_c_boolean has,
const void *member, uint8_t *out) {
if (field->type == PROTOBUF_C_TYPE_MESSAGE ||
field->type == PROTOBUF_C_TYPE_STRING) {
const void *ptr = *(const void *const *)member;
if (ptr == NULL || ptr == field->default_value) return 0;
} else {
if (!has) return 0;
}
return required_field_pack(field, member, out);
}
/**
* Pack an unlabeled field and return the number of bytes written.
*
* \param field
* Field descriptor.
* \param member
* The field member.
* \param[out] out
* Packed value.
* \return
* Number of bytes written to `out`.
*/
static size_t unlabeled_field_pack(const ProtobufCFieldDescriptor *field,
const void *member, uint8_t *out) {
if (field_is_zeroish(field, member)) return 0;
return required_field_pack(field, member, out);
}
/**
 * Given a field type, return the in-memory size.
 *
@@ -1139,236 +827,6 @@ static inline size_t sizeof_elt_in_repeated_array(ProtobufCType type) {
   return 0;
 }
/**
* Pack an array of 32-bit quantities.
*
* \param[out] out
* Destination.
* \param[in] in
* Source.
* \param[in] n
* Number of elements in the source array.
*/
static void copy_to_little_endian_32(void *out, const void *in,
const unsigned n) {
#if !defined(WORDS_BIGENDIAN)
memcpy(out, in, n * 4);
#else
unsigned i;
const uint32_t *ini = in;
for (i = 0; i < n; i++) fixed32_pack(ini[i], (uint32_t *)out + i);
#endif
}
/**
* Pack an array of 64-bit quantities.
*
* \param[out] out
* Destination.
* \param[in] in
* Source.
* \param[in] n
* Number of elements in the source array.
*/
static void copy_to_little_endian_64(void *out, const void *in,
const unsigned n) {
#if !defined(WORDS_BIGENDIAN)
memcpy(out, in, n * 8);
#else
unsigned i;
const uint64_t *ini = in;
for (i = 0; i < n; i++) fixed64_pack(ini[i], (uint64_t *)out + i);
#endif
}
/**
* Get the minimum number of bytes required to pack a field value of a
* particular type.
*
* \param type
* Field type.
* \return
* Number of bytes.
*/
static unsigned get_type_min_size(ProtobufCType type) {
if (type == PROTOBUF_C_TYPE_SFIXED32 || type == PROTOBUF_C_TYPE_FIXED32 ||
type == PROTOBUF_C_TYPE_FLOAT) {
return 4;
}
if (type == PROTOBUF_C_TYPE_SFIXED64 || type == PROTOBUF_C_TYPE_FIXED64 ||
type == PROTOBUF_C_TYPE_DOUBLE) {
return 8;
}
return 1;
}
/**
* Get the packed size of an array of same field type.
*
* \param field
* Field descriptor.
* \param count
* Number of elements of this type.
* \param array
* The elements to get the size of.
* \return
* Number of bytes required.
*/
static size_t get_packed_payload_length(const ProtobufCFieldDescriptor *field,
unsigned count, const void *array) {
unsigned rv = 0;
unsigned i;
switch (field->type) {
case PROTOBUF_C_TYPE_SFIXED32:
case PROTOBUF_C_TYPE_FIXED32:
case PROTOBUF_C_TYPE_FLOAT:
return count * 4;
case PROTOBUF_C_TYPE_SFIXED64:
case PROTOBUF_C_TYPE_FIXED64:
case PROTOBUF_C_TYPE_DOUBLE:
return count * 8;
case PROTOBUF_C_TYPE_ENUM:
case PROTOBUF_C_TYPE_INT32: {
const int32_t *arr = (const int32_t *)array;
for (i = 0; i < count; i++) rv += int32_size(arr[i]);
break;
}
case PROTOBUF_C_TYPE_SINT32: {
const int32_t *arr = (const int32_t *)array;
for (i = 0; i < count; i++) rv += sint32_size(arr[i]);
break;
}
case PROTOBUF_C_TYPE_UINT32: {
const uint32_t *arr = (const uint32_t *)array;
for (i = 0; i < count; i++) rv += uint32_size(arr[i]);
break;
}
case PROTOBUF_C_TYPE_SINT64: {
const int64_t *arr = (const int64_t *)array;
for (i = 0; i < count; i++) rv += sint64_size(arr[i]);
break;
}
case PROTOBUF_C_TYPE_INT64:
case PROTOBUF_C_TYPE_UINT64: {
const uint64_t *arr = (const uint64_t *)array;
for (i = 0; i < count; i++) rv += uint64_size(arr[i]);
break;
}
case PROTOBUF_C_TYPE_BOOL:
return count;
default:
PROTOBUF_C__ASSERT_NOT_REACHED();
}
return rv;
}
/**
* Pack an array of same field type to a virtual buffer.
*
* \param field
* Field descriptor.
* \param count
* Number of elements of this type.
* \param array
* The elements to get the size of.
* \param[out] buffer
* Virtual buffer to append data to.
* \return
* Number of bytes packed.
*/
static size_t pack_buffer_packed_payload(const ProtobufCFieldDescriptor *field,
unsigned count, const void *array,
ProtobufCBuffer *buffer) {
uint8_t scratch[16];
size_t rv = 0;
unsigned i;
switch (field->type) {
case PROTOBUF_C_TYPE_SFIXED32:
case PROTOBUF_C_TYPE_FIXED32:
case PROTOBUF_C_TYPE_FLOAT:
#if !defined(WORDS_BIGENDIAN)
rv = count * 4;
goto no_packing_needed;
#else
for (i = 0; i < count; i++) {
unsigned len = fixed32_pack(((uint32_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
#endif
case PROTOBUF_C_TYPE_SFIXED64:
case PROTOBUF_C_TYPE_FIXED64:
case PROTOBUF_C_TYPE_DOUBLE:
#if !defined(WORDS_BIGENDIAN)
rv = count * 8;
goto no_packing_needed;
#else
for (i = 0; i < count; i++) {
unsigned len = fixed64_pack(((uint64_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
#endif
case PROTOBUF_C_TYPE_ENUM:
case PROTOBUF_C_TYPE_INT32:
for (i = 0; i < count; i++) {
unsigned len = int32_pack(((int32_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
case PROTOBUF_C_TYPE_SINT32:
for (i = 0; i < count; i++) {
unsigned len = sint32_pack(((int32_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
case PROTOBUF_C_TYPE_UINT32:
for (i = 0; i < count; i++) {
unsigned len = uint32_pack(((uint32_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
case PROTOBUF_C_TYPE_SINT64:
for (i = 0; i < count; i++) {
unsigned len = sint64_pack(((int64_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
case PROTOBUF_C_TYPE_INT64:
case PROTOBUF_C_TYPE_UINT64:
for (i = 0; i < count; i++) {
unsigned len = uint64_pack(((uint64_t *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
break;
case PROTOBUF_C_TYPE_BOOL:
for (i = 0; i < count; i++) {
unsigned len = boolean_pack(((protobuf_c_boolean *)array)[i], scratch);
buffer->append(buffer, len, scratch);
rv += len;
}
return count;
default:
PROTOBUF_C__ASSERT_NOT_REACHED();
}
return rv;
#if !defined(WORDS_BIGENDIAN)
no_packing_needed:
buffer->append(buffer, rv, array);
return rv;
#endif
}
static inline int int_range_lookup(unsigned n_ranges,
                                   const ProtobufCIntRange *ranges, int value) {
  unsigned n;
@@ -2638,147 +2096,3 @@ protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *message) {
 typedef void (*GenericHandler)(void *service, const ProtobufCMessage *input,
                                ProtobufCClosure closure, void *closure_data);
void protobuf_c_service_invoke_internal(ProtobufCService *service,
unsigned method_index,
const ProtobufCMessage *input,
ProtobufCClosure closure,
void *closure_data) {
GenericHandler *handlers;
GenericHandler handler;
/*
* Verify that method_index is within range. If this fails, you are
* likely invoking a newly added method on an old service. (Although
* other memory corruption bugs can cause this assertion too.)
*/
assert(method_index < service->descriptor->n_methods);
/*
* Get the array of virtual methods (which are enumerated by the
* generated code).
*/
handlers = (GenericHandler *)(service + 1);
/*
* Get our method and invoke it.
* \todo Seems like handler == NULL is a situation that needs handling.
*/
handler = handlers[method_index];
(*handler)(service, input, closure, closure_data);
}
void protobuf_c_service_generated_init(
ProtobufCService *service, const ProtobufCServiceDescriptor *descriptor,
ProtobufCServiceDestroy destroy) {
ASSERT_IS_SERVICE_DESCRIPTOR(descriptor);
service->descriptor = descriptor;
service->destroy = destroy;
service->invoke = protobuf_c_service_invoke_internal;
memset(service + 1, 0, descriptor->n_methods * sizeof(GenericHandler));
}
void protobuf_c_service_destroy(ProtobufCService *service) {
service->destroy(service);
}
/* --- querying the descriptors --- */
const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value_by_name(
const ProtobufCEnumDescriptor *desc, const char *name) {
unsigned start = 0;
unsigned count;
if (desc == NULL || desc->values_by_name == NULL) return NULL;
count = desc->n_value_names;
while (count > 1) {
unsigned mid = start + count / 2;
int rv = strcmp(desc->values_by_name[mid].name, name);
if (rv == 0)
return desc->values + desc->values_by_name[mid].index;
else if (rv < 0) {
count = start + count - (mid + 1);
start = mid + 1;
} else
count = mid - start;
}
if (count == 0) return NULL;
if (strcmp(desc->values_by_name[start].name, name) == 0)
return desc->values + desc->values_by_name[start].index;
return NULL;
}
const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value(
const ProtobufCEnumDescriptor *desc, int value) {
int rv = int_range_lookup(desc->n_value_ranges, desc->value_ranges, value);
if (rv < 0) return NULL;
return desc->values + rv;
}
const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field_by_name(
const ProtobufCMessageDescriptor *desc, const char *name) {
unsigned start = 0;
unsigned count;
const ProtobufCFieldDescriptor *field;
if (desc == NULL || desc->fields_sorted_by_name == NULL) return NULL;
count = desc->n_fields;
while (count > 1) {
unsigned mid = start + count / 2;
int rv;
field = desc->fields + desc->fields_sorted_by_name[mid];
rv = strcmp(field->name, name);
if (rv == 0)
return field;
else if (rv < 0) {
count = start + count - (mid + 1);
start = mid + 1;
} else
count = mid - start;
}
if (count == 0) return NULL;
field = desc->fields + desc->fields_sorted_by_name[start];
if (strcmp(field->name, name) == 0) return field;
return NULL;
}
const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field(
const ProtobufCMessageDescriptor *desc, unsigned value) {
int rv = int_range_lookup(desc->n_field_ranges, desc->field_ranges, value);
if (rv < 0) return NULL;
return desc->fields + rv;
}
const ProtobufCMethodDescriptor *
protobuf_c_service_descriptor_get_method_by_name(
const ProtobufCServiceDescriptor *desc, const char *name) {
unsigned start = 0;
unsigned count;
if (desc == NULL || desc->method_indices_by_name == NULL) return NULL;
count = desc->n_methods;
while (count > 1) {
unsigned mid = start + count / 2;
unsigned mid_index = desc->method_indices_by_name[mid];
const char *mid_name = desc->methods[mid_index].name;
int rv = strcmp(mid_name, name);
if (rv == 0) return desc->methods + desc->method_indices_by_name[mid];
if (rv < 0) {
count = start + count - (mid + 1);
start = mid + 1;
} else {
count = mid - start;
}
}
if (count == 0) return NULL;
if (strcmp(desc->methods[desc->method_indices_by_name[start]].name, name) ==
0)
return desc->methods + desc->method_indices_by_name[start];
return NULL;
}
@@ -798,76 +798,6 @@ uint32_t protobuf_c_version_number(void);
 */
 #define PROTOBUF_C_MIN_COMPILER_VERSION 1000000
/**
* Look up a `ProtobufCEnumValue` from a `ProtobufCEnumDescriptor` by name.
*
* \param desc
* The `ProtobufCEnumDescriptor` object.
* \param name
* The `name` field from the corresponding `ProtobufCEnumValue` object to
* match.
* \return
* A `ProtobufCEnumValue` object.
* \retval NULL
* If not found or if the optimize_for = CODE_SIZE option was set.
*/
PROTOBUF_C__API
const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value_by_name(
const ProtobufCEnumDescriptor *desc, const char *name);
/**
* Look up a `ProtobufCEnumValue` from a `ProtobufCEnumDescriptor` by numeric
* value.
*
* \param desc
* The `ProtobufCEnumDescriptor` object.
* \param value
* The `value` field from the corresponding `ProtobufCEnumValue` object to
* match.
*
* \return
* A `ProtobufCEnumValue` object.
* \retval NULL
* If not found.
*/
PROTOBUF_C__API
const ProtobufCEnumValue *protobuf_c_enum_descriptor_get_value(
const ProtobufCEnumDescriptor *desc, int value);
/**
* Look up a `ProtobufCFieldDescriptor` from a `ProtobufCMessageDescriptor` by
* the name of the field.
*
* \param desc
* The `ProtobufCMessageDescriptor` object.
* \param name
* The name of the field.
* \return
* A `ProtobufCFieldDescriptor` object.
* \retval NULL
* If not found or if the optimize_for = CODE_SIZE option was set.
*/
PROTOBUF_C__API
const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field_by_name(
const ProtobufCMessageDescriptor *desc, const char *name);
/**
* Look up a `ProtobufCFieldDescriptor` from a `ProtobufCMessageDescriptor` by
* the tag value of the field.
*
* \param desc
* The `ProtobufCMessageDescriptor` object.
* \param value
* The tag value of the field.
* \return
* A `ProtobufCFieldDescriptor` object.
* \retval NULL
* If not found.
*/
PROTOBUF_C__API
const ProtobufCFieldDescriptor *protobuf_c_message_descriptor_get_field(
const ProtobufCMessageDescriptor *desc, unsigned value);
/**
 * Determine the number of bytes required to store the serialised message.
 *
@@ -947,33 +877,6 @@ PROTOBUF_C__API
void protobuf_c_message_init(const ProtobufCMessageDescriptor *descriptor,
                             void *message);
/**
* Free a service.
*
* \param service
* The service object to free.
*/
PROTOBUF_C__API
void protobuf_c_service_destroy(ProtobufCService *service);
/**
* Look up a `ProtobufCMethodDescriptor` by name.
*
* \param desc
* Service descriptor.
* \param name
* Name of the method.
*
* \return
* A `ProtobufCMethodDescriptor` object.
* \retval NULL
* If not found or if the optimize_for = CODE_SIZE option was set.
*/
PROTOBUF_C__API
const ProtobufCMethodDescriptor *
protobuf_c_service_descriptor_get_method_by_name(
const ProtobufCServiceDescriptor *desc, const char *name);
/**
 * Initialise a `ProtobufCBufferSimple` object.
 */
@@ -1011,18 +914,6 @@ PROTOBUF_C__API
void protobuf_c_buffer_simple_append(ProtobufCBuffer *buffer, size_t len,
                                     const unsigned char *data);
PROTOBUF_C__API
void protobuf_c_service_generated_init(
ProtobufCService *service, const ProtobufCServiceDescriptor *descriptor,
ProtobufCServiceDestroy destroy);
PROTOBUF_C__API
void protobuf_c_service_invoke_internal(ProtobufCService *service,
unsigned method_index,
const ProtobufCMessage *input,
ProtobufCClosure closure,
void *closure_data);
/**@}*/

PROTOBUF_C__END_DECLS
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <condition_variable>
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include <queue>
#include <stdexcept>
#include <thread>
#include <vector>
namespace paddle_mobile {
class ThreadPool {
public:
static ThreadPool& getThreadPool();
static int getThreadPoolThreadId();
explicit ThreadPool(size_t);
template <class F, class... Args>
auto enqueue(F&& f, Args&&... args)
-> std::future<typename std::result_of<F(Args...)>::type>;
~ThreadPool();
int getTid(const std::thread::id& id) {
for (int i = 0; i < workers.size(); i++) {
if (workers[i].get_id() == id) {
return i;
}
}
return -1;
}
private:
// need to keep track of threads so we can join them
std::vector<std::thread> workers;
// the task queue
std::queue<std::function<void()>> tasks;
// synchronization
std::mutex queue_mutex;
std::condition_variable condition;
bool stop;
};
// the constructor just launches some amount of workers
inline ThreadPool::ThreadPool(size_t threads) : stop(false) {
for (size_t i = 0; i < threads; ++i)
workers.emplace_back([this] {
for (;;) {
std::function<void()> task;
{
std::unique_lock<std::mutex> lock(this->queue_mutex);
this->condition.wait(
lock, [this] { return this->stop || !this->tasks.empty(); });
// for (;;) {
// if (this->stop || !this->tasks.empty()) {
// break;
// }
// lock.unlock();
// lock.lock();
// }
if (this->stop && this->tasks.empty()) return;
task = std::move(this->tasks.front());
this->tasks.pop();
}
task();
}
});
}
// add new work item to the pool
template <class F, class... Args>
auto ThreadPool::enqueue(F&& f, Args&&... args)
-> std::future<typename std::result_of<F(Args...)>::type> {
using return_type = typename std::result_of<F(Args...)>::type;
auto task = std::make_shared<std::packaged_task<return_type()>>(
std::bind(std::forward<F>(f), std::forward<Args>(args)...));
std::future<return_type> res = task->get_future();
{
std::unique_lock<std::mutex> lock(queue_mutex);
// don't allow enqueueing after stopping the pool
// if(stop)
// throw std::runtime_error("enqueue on stopped ThreadPool");
tasks.emplace([task]() { (*task)(); });
}
condition.notify_one();
return res;
}
// the destructor joins all threads
inline ThreadPool::~ThreadPool() {
{
std::unique_lock<std::mutex> lock(queue_mutex);
stop = true;
}
condition.notify_all();
for (std::thread& worker : workers) worker.join();
}
inline ThreadPool& ThreadPool::getThreadPool() {
static ThreadPool threadPool(3);
return threadPool;
}
inline int ThreadPool::getThreadPoolThreadId() {
return getThreadPool().getTid(std::this_thread::get_id());
}
} // namespace paddle_mobile
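A minimal usage sketch, not part of the repository: submit a task to the singleton pool above and block on the returned future.
// Hypothetical example of driving the pool above (assumes this header is included).
#include <iostream>
int main() {
  auto &pool = paddle_mobile::ThreadPool::getThreadPool();
  // enqueue() binds the arguments and returns a std::future for the result.
  auto sum = pool.enqueue([](int a, int b) { return a + b; }, 2, 3);
  std::cout << sum.get() << std::endl;  // prints 5 once a worker runs the task
  return 0;
}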
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once;
+#pragma once

+#include <functional>
 #include <map>
 #include <string>
-#include <unordered_set>
 #include <vector>

 #include "framework/attribute.h"
 #include "framework/scope.h"
@@ -40,13 +40,6 @@ using OpCreator = std::function<framework::OperatorBase<Dtype> *(
     const framework::AttributeMap & /*attrs*/,
     std::shared_ptr<framework::Scope> /*scope*/)>;

-using GradOpMakerFN =
-    std::function<std::vector<std::unique_ptr<framework::OpDesc>>(
-        const framework::OpDesc &,
-        const std::unordered_set<std::string> & /*no_grad_set*/,
-        std::unordered_map<std::string, std::string> * /*grad_to_var*/,
-        const std::vector<framework::BlockDesc *> &grad_block)>;
-
 using InferVarTypeFN = std::function<void(const framework::OpDesc & /*op_desc*/,
                                           framework::BlockDesc * /*block*/)>;
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "common/types.h"
#include <vector>
namespace paddle_mobile {
const std::string G_OP_TYPE_CONV = "conv2d";
const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
const std::string G_OP_TYPE_BOX_CODER = "box_coder";
const std::string G_OP_TYPE_CONCAT = "concat";
const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const std::string G_OP_TYPE_FC = "fc";
const std::string G_OP_TYPE_CONV_ADD = "conv_add";
const std::string G_OP_TYPE_LRN = "lrn";
const std::string G_OP_TYPE_MUL = "mul";
const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
const std::string G_OP_TYPE_POOL2D = "pool2d";
const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
const std::string G_OP_TYPE_RELU = "relu";
const std::string G_OP_TYPE_RESHAPE = "reshape";
const std::string G_OP_TYPE_SIGMOID = "sigmoid";
const std::string G_OP_TYPE_SOFTMAX = "softmax";
const std::string G_OP_TYPE_TRANSPOSE = "transpose";
const std::string G_OP_TYPE_SPLIT = "split";
const std::string G_OP_TYPE_FEED = "feed";
const std::string G_OP_TYPE_FETCH = "fetch";
const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key = {
{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
{G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
{G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
{G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
{G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
{G_OP_TYPE_SPLIT, {{"X"}, {"Out"}}},
{G_OP_TYPE_FEED, {{"X"}, {"Out"}}},
{G_OP_TYPE_FETCH, {{"X"}, {"Out"}}},
{G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}},
{G_OP_TYPE_BOX_CODER,
{{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}},
{G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}}};
} // namespace paddle_mobile
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <string>
#include <unordered_map>
#include <vector>

namespace paddle_mobile {
enum class Precision : int { FP32 = 0 };

@@ -72,50 +72,32 @@ enum PMStatus {
  PMWrongDevice = 0x08 /*!< un-correct device. */
};

extern const std::string G_OP_TYPE_CONV;
extern const std::string G_OP_TYPE_BATCHNORM;
extern const std::string G_OP_TYPE_BOX_CODER;
extern const std::string G_OP_TYPE_CONCAT;
extern const std::string G_OP_TYPE_ELEMENTWISE_ADD;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const std::string G_OP_TYPE_FC;
extern const std::string G_OP_TYPE_CONV_ADD;
extern const std::string G_OP_TYPE_LRN;
extern const std::string G_OP_TYPE_MUL;
extern const std::string G_OP_TYPE_MULTICLASS_NMS;
extern const std::string G_OP_TYPE_POOL2D;
extern const std::string G_OP_TYPE_PRIOR_BOX;
extern const std::string G_OP_TYPE_RELU;
extern const std::string G_OP_TYPE_RESHAPE;
extern const std::string G_OP_TYPE_SIGMOID;
extern const std::string G_OP_TYPE_SOFTMAX;
extern const std::string G_OP_TYPE_TRANSPOSE;
extern const std::string G_OP_TYPE_SPLIT;
extern const std::string G_OP_TYPE_FEED;
extern const std::string G_OP_TYPE_FETCH;
extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
extern const std::string G_OP_TYPE_IM2SEQUENCE;

extern std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
    op_input_output_key;

} // namespace paddle_mobile
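// Side note on the static -> extern change above, as a minimal self-contained
// sketch (not part of the commit): `static const` strings in a header give
// every translation unit its own copy, while an extern declaration in the
// header plus a single definition in one .cpp yields one shared object.
// The _DEMO name below is hypothetical.
#include <iostream>
#include <string>

// What the header now carries: a declaration only.
extern const std::string G_OP_TYPE_CONV_DEMO;
// What the single .cpp now carries: the one program-wide definition.
const std::string G_OP_TYPE_CONV_DEMO = "conv2d";

int main() {
  std::cout << G_OP_TYPE_CONV_DEMO << std::endl;  // prints: conv2d
  return 0;
}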
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "common/enforce.h"
#include "common/log.h"

#pragma once

@@ -57,15 +56,11 @@ class RawData {
  char data[size];

  RawData() {}
  RawData(const RawData &raw_data) { strcpy(data, raw_data.data); }
};

template <typename... Ts>
struct Variant {
  Variant(const Variant &variant) {
    type_id = variant.type_id;
    data = variant.data;
  }

@@ -87,8 +82,7 @@ struct Variant {
    if (type_id == typeid(T).hash_code()) {
      return *const_cast<T *>(reinterpret_cast<const T *>(&data));
    } else {
      PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant ");
    }
  }

...
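// A standalone sketch (simplified, not the real paddle_mobile::Variant) of the
// pattern above: raw storage tagged with typeid().hash_code(), and a Get<T>()
// that refuses mismatched types. Restricted to trivially copyable types and a
// fixed 64-byte buffer to keep the demo short; the real code throws through
// PADDLE_MOBILE_THROW_EXCEPTION instead of std::bad_cast.
#include <cstddef>
#include <cstring>
#include <typeinfo>

template <typename... Ts>
struct MiniVariant {
  template <typename T>
  void Set(T value) {
    type_id = typeid(T).hash_code();
    std::memcpy(data, &value, sizeof(T));  // assumes T is trivially copyable
  }
  template <typename T>
  T Get() const {
    if (type_id != typeid(T).hash_code()) {
      throw std::bad_cast();  // mismatched type tag
    }
    T value;
    std::memcpy(&value, data, sizeof(T));
    return value;
  }
  std::size_t type_id = 0;
  alignas(8) char data[64];  // assumes every T fits in 64 bytes
};

int main() {
  MiniVariant<int, float> v;
  v.Set(3);
  return v.Get<int>() == 3 ? 0 : 1;
}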
@@ -17,14 +17,8 @@ limitations under the License. */

namespace paddle_mobile {
namespace framework {

struct PrintVistor : Vistor<Print &> {
  explicit PrintVistor(Print &printer) : printer_(printer) {}
  template <typename T>
  Print &operator()(const T &value) {
    printer_ << value;

...
@@ -14,7 +14,11 @@ limitations under the License. */

#pragma once

#include <string>
#include <typeinfo>
#include <unordered_map>
#include <vector>

#include "common/enforce.h"
#include "common/log.h"
#include "common/variant.h"
@@ -22,28 +26,15 @@ limitations under the License. */

namespace paddle_mobile {
namespace framework {

using std::string;
using std::vector;

class BlockDesc;

class Attribute {
 public:
  static Attribute GetAttrValue(
      PaddleMobile__Framework__Proto__OpDesc__Attr *attr_desc) {
    Attribute attr;
    switch (attr_desc->type) {
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN: {

@@ -63,35 +54,35 @@ class Attribute {
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS: {
        vector<bool> val(attr_desc->n_bools);
        for (int i = 0; i < attr_desc->n_bools; ++i) {
          val[i] = attr_desc->bools[i];
        }
        attr.Set<vector<bool>>(val);
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS: {
        vector<int> val(attr_desc->n_ints);
        for (int i = 0; i < attr_desc->n_ints; ++i) {
          val[i] = attr_desc->ints[i];
        }
        attr.Set<vector<int>>(val);
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS: {
        vector<float> val(attr_desc->n_floats);
        for (int i = 0; i < attr_desc->n_floats; ++i) {
          val[i] = attr_desc->floats[i];
        }
        attr.Set<vector<float>>(val);
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS: {
        vector<string> val(attr_desc->n_strings);
        for (int i = 0; i < attr_desc->n_strings; ++i) {
          val[i] = attr_desc->strings[i];
        }
        attr.Set<vector<string>>(val);
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG: {

@@ -122,47 +113,41 @@ class Attribute {
      return vistor(attr.variant_.Get<int>());
    } else if (attr.variant_.TypeId() == typeid(float).hash_code()) {
      return vistor(attr.variant_.Get<float>());
    } else if (attr.variant_.TypeId() == typeid(string).hash_code()) {
      return vistor(attr.variant_.Get<string>());
    } else if (attr.variant_.TypeId() == typeid(vector<int>).hash_code()) {
      return vistor(attr.variant_.Get<vector<int>>());
    } else if (attr.variant_.TypeId() == typeid(vector<float>).hash_code()) {
      return vistor(attr.variant_.Get<vector<float>>());
    } else if (attr.variant_.TypeId() == typeid(vector<string>).hash_code()) {
      return vistor(attr.variant_.Get<vector<string>>());
    } else if (attr.variant_.TypeId() == typeid(bool).hash_code()) {
      return vistor(attr.variant_.Get<bool>());
    } else if (attr.variant_.TypeId() == typeid(vector<bool>).hash_code()) {
      return vistor(attr.variant_.Get<vector<bool>>());
    } else if (attr.variant_.TypeId() == typeid(int64_t).hash_code()) {
      return vistor(attr.variant_.Get<int64_t>());
    } else {
      PADDLE_MOBILE_THROW_EXCEPTION("type not support");
    }
  }

 private:
  Variant<int, float, string, vector<int>, vector<float>, vector<string>, bool,
          vector<bool>, BlockDesc *, int64_t>
      variant_;
};

using AttributeMap = std::unordered_map<string, Attribute>;

class AttrReader {
 public:
  explicit AttrReader(const AttributeMap &attrs) : attrs_(attrs) {}

  template <typename T>
  inline T Get(const string &name) const {
    PADDLE_MOBILE_ENFORCE(attrs_.count(name) != 0,
                          "%s should be in AttributeMap", name);
    return ((Attribute)attrs_.at(name)).Get<T>();
  }

...
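// The point of the new PADDLE_MOBILE_ENFORCE in AttrReader::Get is that a
// missing key now fails loudly instead of relying on a commented-out check.
// A self-contained analogue with hypothetical Demo* names (the real
// Attribute/AttributeMap come from this header):
#include <stdexcept>
#include <string>
#include <unordered_map>

using DemoAttributeMap = std::unordered_map<std::string, int>;

class DemoAttrReader {
 public:
  explicit DemoAttrReader(const DemoAttributeMap &attrs) : attrs_(attrs) {}
  int Get(const std::string &name) const {
    if (attrs_.count(name) == 0) {  // mirrors PADDLE_MOBILE_ENFORCE(...)
      throw std::runtime_error(name + " should be in AttributeMap");
    }
    return attrs_.at(name);
  }

 private:
  const DemoAttributeMap &attrs_;
};

int main() {
  DemoAttributeMap attrs{{"axis", 1}};
  DemoAttrReader reader(attrs);
  return reader.Get("axis") == 1 ? 0 : 1;
}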
@@ -15,7 +15,6 @@ limitations under the License. */

#pragma once

#include <cctype>
#include <string>

namespace paddle_mobile {

@@ -40,7 +39,7 @@ inline DataLayout StringToDataLayout(const std::string &str) {
  } else if (s == "ANYLAYOUT") {
    return DataLayout::kAnyLayout;
  } else {
    PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
  }
}

@@ -54,14 +53,8 @@ inline std::string DataLayoutToString(const DataLayout &data_layout) {
      return "ANY_LAYOUT";
    default:
      break;
  }
}

} // namespace framework
} // namespace paddle_mobile
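// Standalone sketch of the StringToDataLayout dispatch above, including the
// new throw-on-unknown-string behavior. The Demo* names are illustrative and
// std::runtime_error stands in for PADDLE_MOBILE_THROW_EXCEPTION.
#include <algorithm>
#include <cctype>
#include <stdexcept>
#include <string>

enum class DemoDataLayout { kNHWC, kNCHW, kAnyLayout };

DemoDataLayout DemoStringToDataLayout(const std::string &str) {
  std::string s(str);
  std::transform(s.begin(), s.end(), s.begin(), ::toupper);
  if (s == "NHWC") return DemoDataLayout::kNHWC;
  if (s == "NCHW") return DemoDataLayout::kNCHW;
  if (s == "ANYLAYOUT") return DemoDataLayout::kAnyLayout;
  throw std::runtime_error("Unknown storage order string: " + s);
}

int main() {
  return DemoStringToDataLayout("nchw") == DemoDataLayout::kNCHW ? 0 : 1;
}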
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/data_transform.h"
namespace paddle_mobile {
namespace framework {
static void PassTensorData(Tensor *from, Tensor *to) {
to->ShareDataWith(*from);
*from = Tensor();
}
void DataTransform(const OpKernelType &expected_kernel_type,
const OpKernelType &kernel_type_for_var,
const Tensor &input_tensor, Tensor *output_tensor) {
bool transformed = false;
Tensor in;
in.ShareDataWith(input_tensor);
Tensor out;
// // do layout transform
// if (NeedTransformLayout(expected_kernel_type.data_layout_,
// kernel_type_for_var.data_layout_)) {
// TransDataLayout(kernel_type_for_var, expected_kernel_type, in,
// &out);
// transformed = true;
// PassTensorData(&out, &in);
// }
//
// // do data type transform
// if (expected_kernel_type.data_type_ !=
// kernel_type_for_var.data_type_) {
// TransDataType(kernel_type_for_var, expected_kernel_type, in,
// &out);
// transformed = true;
// PassTensorData(&out, &in);
// }
//
// // do device transform
// if (!platform::is_same_place(kernel_type_for_var.place_,
// expected_kernel_type.place_)) {
// TransDataDevice(in, expected_kernel_type.place_, &out);
// transformed = true;
// PassTensorData(&out, &in);
// }
//
// PADDLE_ENFORCE(transformed, "No transform is applied, please
// check!");
// get output data
output_tensor->ShareDataWith(in);
}
void CopyVariableWithTensor(const Variable &in_var, const Tensor &tensor,
Variable *out_var) {
// if (in_var.IsType<LoDTensor>()) {
// auto& in_lod_tensor = in_var.Get<LoDTensor>();
// auto* tran_lod_tensor = out_var.GetMutable<LoDTensor>();
// tran_lod_tensor->set_lod(in_lod_tensor.lod());
// tran_lod_tensor->set_layout(in_lod_tensor.layout());
// tran_lod_tensor->ShareDataWith(tensor);
// } else if (in_var.IsType<SelectedRows>()) {
// auto& in_selected_rows = in_var.Get<SelectedRows>();
// auto* trans_selected_rows =
// out_var.GetMutable<SelectedRows>();
// trans_selected_rows->set_height(in_selected_rows.height());
// trans_selected_rows->set_rows(in_selected_rows.rows());
// trans_selected_rows->mutable_value()->ShareDataWith(tensor);
// } else {
// PADDLE_THROW("unknown var type");
// }
}
} // namespace framework
} // namespace paddle_mobile
@@ -63,9 +63,6 @@ void make_ddim(DDim &ddim, const int64_t *dims, int n) {
      ddim = make_dim<9>(dims);
      break;
    default:
      break;
  }
}

@@ -133,9 +130,6 @@ int64_t DDim::operator[](int idx) const {
int DDim::size() const { return arity(*this); }

bool DDim::operator==(DDim d) const {
  std::vector<int64_t> v1 = vectorize(*this);
  std::vector<int64_t> v2 = vectorize(d);

@@ -157,7 +151,7 @@ DDim DDim::operator+(DDim d) const {
  std::vector<int64_t> v3;

  PADDLE_MOBILE_ENFORCE(v1.size() == v2.size(), "v1.size() != v2.size()");

  for (unsigned int i = 0; i < v1.size(); i++) {
    v3.push_back(v1[i] + v2[i]);

@@ -172,7 +166,7 @@ DDim DDim::operator*(DDim d) const {
  std::vector<int64_t> v3;

  PADDLE_MOBILE_ENFORCE(v1.size() == v2.size(), "v1.size() == v2.size()");

  for (unsigned int i = 0; i < v1.size(); i++) {
    v3.push_back(v1[i] * v2[i]);

@@ -183,7 +177,7 @@ DDim DDim::operator*(DDim d) const {
int64_t get(const DDim &ddim, int idx) { return ddim[idx]; }

void set(DDim *ddim, int idx, int value) { (*ddim)[idx] = value; }

/// @cond HIDDEN
struct VectorizeVisitor : Vistor<void> {

@@ -235,13 +229,10 @@ struct SliceVectorizeVisitor : Vistor<void> {
  SliceVectorizeVisitor(std::vector<int64_t> &v, int b, int e)
      : vector(v), begin(b), end(e) {
    PADDLE_MOBILE_ENFORCE(
        begin < end, "Begin index must be less than end index in ddim slice.");
    PADDLE_MOBILE_ENFORCE(begin >= 0,
                          "Begin index can't be less than zero in ddim slice.");
  }
  template <int S>

@@ -267,9 +258,7 @@ DDim slice_ddim(const DDim &ddim, int begin, int end) {
  std::vector<int64_t> vec;
  vec.reserve(end - begin);
  SliceVectorizeVisitor visitor(vec, begin, end);
  DDim::ApplyVistor(visitor, ddim);
  return make_ddim(vec);
}

@@ -287,31 +276,19 @@ struct ArityVisitor : Vistor<int> {
int arity(const DDim &d) {
  ArityVisitor arityVisitor = ArityVisitor();
  return DDim::ApplyVistor(arityVisitor, d);
}

#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const DDim &ddim) {
  for (int j = 0; j < ddim.size(); ++j) {
    printer << ddim[j] << " ";
  }
  return printer;
}
#endif

DDim::DDim(std::initializer_list<int64_t> init_list) {
  *this = make_ddim(init_list);
}

...
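// Sketch of the elementwise DDim::operator+ just shown, with std::vector as a
// stand-in for vectorize(DDim) (assumed simplification). The commit's change
// is the guard: a real enforce with a message instead of a bare assert().
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<int64_t> demo_ddim_plus(const std::vector<int64_t> &v1,
                                    const std::vector<int64_t> &v2) {
  if (v1.size() != v2.size()) {  // mirrors PADDLE_MOBILE_ENFORCE(...)
    throw std::runtime_error("v1.size() != v2.size()");
  }
  std::vector<int64_t> v3;
  v3.reserve(v1.size());
  for (std::size_t i = 0; i < v1.size(); ++i) v3.push_back(v1[i] + v2[i]);
  return v3;
}

int main() {
  auto d = demo_ddim_plus({1, 3, 224, 224}, {0, 0, 32, 32});
  return d[2] == 256 ? 0 : 1;
}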
@@ -14,10 +14,10 @@ limitations under the License. */

#pragma once

#include <initializer_list>
#include <typeinfo>
#include <vector>

#include "common/enforce.h"
#include "common/variant.h"
#include "dim.h"

@@ -58,9 +58,7 @@ struct DDim {
    } else if (d.var.TypeId() == typeid(Dim<9>).hash_code()) {
      return vistor(d.var.Get<Dim<9>>());
    } else {
      DLOG << " dim not support";
    }
  }

@@ -83,17 +81,6 @@ struct DDim {
  int64_t operator[](int idx) const;

  DDimVar getVar() { return var; }

  bool operator==(DDim d) const;

@@ -126,7 +113,7 @@ DDim make_ddim(std::initializer_list<int64_t> dims);
int64_t get(const DDim &dim, int idx);

void set(DDim *dim, int idx, int val);

std::vector<int64_t> vectorize(const DDim &ddim);

@@ -151,8 +138,6 @@ DDim slice_ddim(const DDim &dim, int begin, int end);
int arity(const DDim &ddim);

// Reshape a tensor to a matrix. The matrix's first dimension(column
// length)
// will be the product of tensor's first `num_col_dims` dimensions.

@@ -163,5 +148,9 @@ DDim flatten_to_1d(const DDim &src);
DDim stride(const DDim &ddim);

DDim stride_numel(const DDim &ddim);

#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const DDim &ddim);
#endif

} // namespace framework
} // namespace paddle_mobile
@@ -14,13 +14,7 @@ limitations under the License. */

#pragma once

#include "common/enforce.h"

namespace paddle_mobile {
namespace framework {

@@ -30,42 +24,35 @@ struct Dim {
  static constexpr int dimensions = i;

  template <typename... Args>
  Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) {
    static_assert(sizeof...(_tail) == i - 1,
                  "Dim initialized with the wrong number of parameters");
  }

  Dim(int64_t _head, const Dim<i - 1> &_tail) : head(_head), tail(_tail) {}

  Dim() : head(0), tail() {}

  /** Construct a Dim from a linear index and size. Uses Fortran
   * order
   * indexing. */
  Dim(int64_t idx, const Dim<i> &size)
      : head(idx % size.head), tail(idx / size.head, size.tail) {}

  /** Construct a Dim with each dimension set to the given index */
  Dim(int64_t idx) : head(idx), tail(idx) {}

  bool operator==(const Dim<i> &o) const {
    return (head == o.head) && (tail == o.tail);
  }

  bool operator!=(const Dim<i> &o) const { return !(*this == o); }

  int64_t &operator[](int idx);
  int64_t operator[](int idx) const;

  std::string to_string() const;

  int64_t head;
  Dim<i - 1> tail;

@@ -76,32 +63,22 @@ template <>
struct Dim<0> {
  static constexpr int dimensions = 0;

  Dim(int64_t _head) {}

  Dim() {}

  Dim(int idx, const Dim<0> &size) {
    if (idx > 0) {
      PADDLE_MOBILE_THROW_EXCEPTION("Index out of range.")
    }
  }

  bool operator==(const Dim<0> &o) const { return true; }

  bool operator!=(const Dim<0> &o) const { return false; }

  int64_t &operator[](int idx);
  int64_t operator[](int idx) const;
};

@@ -112,12 +89,12 @@ template <int i>
struct DimGetter {
  // Return a copy if Dim is const
  template <typename D>
  static int64_t impl(const D &d) {
    return DimGetter<i - 1>::impl(d.tail);
  }
  // Return a reference if Dim is mutable
  template <typename D>
  static int64_t &impl(D &d) {
    return DimGetter<i - 1>::impl(d.tail);
  }
};

@@ -127,25 +104,22 @@ template <>
struct DimGetter<0> {
  // Return a copy if Dim is const
  template <typename D>
  static int64_t impl(const D &d) {
    return d.head;
  }
  // Return a reference if Dim is mutable
  template <typename D>
  static int64_t &impl(D &d) {
    return d.head;
  }
};

template <int D>
int64_t &indexer(Dim<D> &dim, int idx) {
  if (idx < 0) {
    PADDLE_MOBILE_THROW_EXCEPTION("Tried to access a negative dimension")
  }
  if (idx == 0) {
    return dim.head;
  }

@@ -153,31 +127,15 @@ HOSTDEVICE int64_t &indexer(Dim<D> &dim, int idx) {
}

template <>
int64_t &indexer<0>(Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
}

template <int D>
int64_t indexer(const Dim<D> &dim, int idx) {
  if (idx < 0) {
    PADDLE_MOBILE_THROW_EXCEPTION("Tried to access a negative dimension")
  }
  if (idx == 0) {
    return dim.head;
  }

@@ -185,102 +143,84 @@ HOSTDEVICE int64_t indexer(const Dim<D> &dim, int idx) {
}

template <>
int64_t indexer<0>(const Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
}

} // namespace

// Static access to constant Dim
template <int i, int l>
int64_t get(const Dim<l> &d) {
  return DimGetter<i>::impl(d);
}

// Static access to mutable Dim
template <int i, int l>
int64_t &get(Dim<l> &d) {
  return DimGetter<i>::impl(d);
}

// Dynamic access to constant Dim
template <int l>
int64_t Dim<l>::operator[](int i) const {
  // std::cout << "l: " << l << std::endl;
  return indexer(*this, i);
}

// Dynamic access to mutable Dim
template <int l>
int64_t &Dim<l>::operator[](int i) {
  return indexer(*this, i);
}

// Dynamic access to constant Dim
inline int64_t Dim<0>::operator[](int i) const { return indexer(*this, i); }

// Dynamic access to mutable Dim
inline int64_t &Dim<0>::operator[](int i) { return indexer(*this, i); }

// Dynamic access to constant Dim
// without std::enable_if will try to instantiate this on get<0>(d)
template <int l>
typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l> &d, int i) {
  return d[i];
}

// Dynamic access to mutable Dim
template <int l>
typename std::enable_if<(l > 0), int64_t &>::type get(Dim<l> &d, int i) {
  return d[i];
}

// Dot product of two dims
template <int i>
int64_t linearize(const Dim<i> &a, const Dim<i> &b) {
  return a.head * b.head + linearize(a.tail, b.tail);
}

// Base case dot product of two Dims
// Notice it is inline because it is no longer a template
template <>
inline int64_t linearize(const Dim<0> &a, const Dim<0> &b) {
  return 0;
}

// Product of a Dim
template <int i>
int64_t product(const Dim<i> &a, int prod = 1) {
  return prod * a.head * product(a.tail);
}

// Base case product of a Dim
// Notice it is inline because it is no longer a template
template <>
inline int64_t product(const Dim<0> &a, int prod) {
  return prod;
}

// Is 0 <= idx_i < size_i for all i?
template <int i>
bool contained(const Dim<i> &idx, const Dim<i> &size) {
  return ((0 <= idx.head) && (idx.head < size.head) &&
          contained(idx.tail, size.tail));
}

@@ -288,7 +228,7 @@ HOSTDEVICE bool contained(const Dim<i> &idx, const Dim<i> &size) {
// Base case of is 0 <= idx_i < size_i ?
// Notice it is inline because it is no longer a template
template <>
inline bool contained(const Dim<0> &idx, const Dim<0> &size) {
  return true;
}

@@ -296,7 +236,7 @@ HOSTDEVICE inline bool contained(const Dim<0> &idx, const Dim<0> &size) {
 * \brief Compute exclusive prefix-multiply of a Dim.
 */
template <int i>
Dim<i> ex_prefix_mul(const Dim<i> &src, int mul = 1) {
  return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head));
}

@@ -304,7 +244,7 @@ HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i> &src, int mul = 1) {
// Base case of ex_prefix_mul
// Notice it is inline because it is no longer a template
template <>
inline Dim<0> ex_prefix_mul(const Dim<0> &src, int mul) {
  return Dim<0>();
}
///\endcond

@@ -313,18 +253,18 @@ HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0> &src, int mul) {
 * Add two dimensions together
 */
template <int i>
Dim<i> dim_plus(const Dim<i> &a, const Dim<i> &b) {
  return Dim<i>(a.head + b.head, dim_plus(a.tail, b.tail));
}

// Base case
template <>
inline Dim<0> dim_plus(const Dim<0> &a, const Dim<0> &b) {
  return Dim<0>();
}

template <int i>
Dim<i> operator+(const Dim<i> &lhs, const Dim<i> &rhs) {
  return dim_plus(lhs, rhs);
}

@@ -332,18 +272,18 @@ HOSTDEVICE Dim<i> operator+(const Dim<i> &lhs, const Dim<i> &rhs) {
 * Multiply two dimensions together
 */
template <int i>
Dim<i> dim_mult(const Dim<i> &a, const Dim<i> &b) {
  return Dim<i>(a.head * b.head, dim_mult(a.tail, b.tail));
}

// Base case
template <>
inline Dim<0> dim_mult(const Dim<0> &a, const Dim<0> &b) {
  return Dim<0>();
}

template <int i>
Dim<i> operator*(const Dim<i> &lhs, const Dim<i> &rhs) {
  return dim_mult(lhs, rhs);
}

@@ -358,7 +298,7 @@ HOSTDEVICE Dim<i> operator*(const Dim<i> &lhs, const Dim<i> &rhs) {
 */
template <int i>
Dim<i> normalize_strides(const Dim<i> &size, const Dim<i> &stride) {
  int norm_stride = size.head == 1 ? 0 : stride.head;
  return Dim<i>(norm_stride, normalize_strides(size.tail, stride.tail));
}

@@ -366,8 +306,7 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0> &size,
///\cond HIDDEN
template <>
inline Dim<0> normalize_strides(const Dim<0> &size, const Dim<0> &stride) {
  return Dim<0>();
}

@@ -382,54 +321,9 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0> &size,
 */
template <typename... Args>
Dim<sizeof...(Args)> make_dim(Args... idxes) {
  return Dim<sizeof...(Args)>(idxes...);
}

} // namespace framework
} // namespace paddle_mobile
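// The Dim<i> template above is a compile-time head/tail list; this standalone
// sketch (simplified, no bounds checks, hypothetical Demo* names) shows how a
// function like product() recurses down to the Dim<0> base case.
#include <cstdint>

template <int i>
struct DemoDim {
  int64_t head;
  DemoDim<i - 1> tail;
};
template <>
struct DemoDim<0> {};

inline int64_t demo_product(const DemoDim<0> &) { return 1; }  // base case
template <int i>
int64_t demo_product(const DemoDim<i> &d) {
  return d.head * demo_product(d.tail);  // recurse into the tail
}

int main() {
  DemoDim<3> d{2, {3, {4, {}}}};  // a 2 x 3 x 4 shape
  return demo_product(d) == 24 ? 0 : 1;
}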
@@ -13,72 +13,56 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "lod_tensor.h"
#include <algorithm>

namespace paddle_mobile {
namespace framework {

// std::ostream &operator<<(std::ostream &os, const LoD &lod) {
//  os << "{";
//  for (auto &v : lod) {
//    os << "{";
//    bool is_first = true;
//    for (auto &i : v) {
//      if (is_first) {
//        os << i;
//        is_first = false;
//      } else {
//        os << ", " << i;
//      }
//    }
//    os << "}";
//  }
//  os << "}";
//
//  return os;
//}
//
// std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
//  PADDLE_MOBILE_ENFORCE(t.type().hash_code() == typeid(float).hash_code(),
//                        "t.type() is not float");
//  os << "dim: " << t.dims() << "\n";
//  os << "lod: " << t.lod() << "\n";
//  // only print first ten elements
//  int64_t size = t.numel() < 10 ? t.numel() : 10;
//  for (int64_t i = 0; i < size; ++i) {
//    os << t.data<float>()[i] << " ";
//  }
//
//  return os;
//}

// std::string LoDToString(const LoD &lod) {
//  std::ostringstream stream;
//  stream << lod;
//  return stream.str();
//}

LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
                 size_t elem_end) {
  PADDLE_MOBILE_ENFORCE(level < in.size(), "level should >= in.size()");
  PADDLE_MOBILE_ENFORCE(elem_end < in[level].size(),
                        "elem_end >= in[level].size()");

  LoD res;
  res.resize(in.size() - level);
  // copy the first level

@@ -152,7 +136,7 @@ bool CheckLoD(const LoD &in, int tensor_height) {
        if (a < b) return true;
        return false;
      })) {
    PADDLE_MOBILE_THROW_EXCEPTION("ascending error")
    return false;
  }
}

@@ -211,8 +195,9 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
  LoD sub_lod;

  for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
    PADDLE_MOBILE_ENFORCE(start_idx <= end_idx, "start_idx > end_idx");
    PADDLE_MOBILE_ENFORCE(end_idx < lod[level_idx].size(),
                          "end_idx >= lod[level_idx].size()");
    std::vector<size_t> level_lens;
    for (size_t i = start_idx; i < end_idx; ++i) {
      level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);

@@ -226,10 +211,9 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
}

void AppendLoD(LoD *lod, const LoD &lod_length) {
  PADDLE_MOBILE_ENFORCE(
      lod->empty() || lod->size() == lod_length.size(),
      "The lod_length should has the same size with the appended lod.");
  if (lod->empty()) {
    for (size_t i = 0; i < lod_length.size(); ++i) {
      lod->emplace_back(1, 0);  // size = 1, value = 0;

...
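// Context for the LoD checks above: a LoD ("level of details") is a vector of
// monotonically increasing offset vectors. A standalone sketch of the
// ascending property that CheckLoD enforces (simplified, hypothetical Demo*
// names, not the real API):
#include <cstddef>
#include <vector>

using DemoLoD = std::vector<std::vector<std::size_t>>;

bool DemoIsAscending(const DemoLoD &lod) {
  for (const auto &level : lod) {
    for (std::size_t i = 1; i < level.size(); ++i) {
      if (level[i] < level[i - 1]) return false;  // the "ascending error" case
    }
  }
  return true;
}

int main() {
  DemoLoD lod{{0, 2, 5}, {0, 1, 3, 4, 5}};  // two nesting levels of offsets
  return DemoIsAscending(lod) ? 0 : 1;
}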
@@ -16,7 +16,6 @@ limitations under the License. */

#include <memory>
#include <string>
#include <vector>

#include "tensor.h"
#include "tensor_util.h"

...
@@ -25,9 +25,8 @@ template <typename Dtype>
struct OpInfo {
  OpCreator<Dtype> creator_;
  const OpCreator<Dtype> &Creator() const {
    PADDLE_MOBILE_ENFORCE(creator_ != nullptr,
                          "Operator Creator has not been registered");
    return creator_;
  }
};

@@ -48,17 +47,15 @@ class OpInfoMap {
  }

  void Insert(const std::string &type, const OpInfo<Dtype> &info) {
    PADDLE_MOBILE_ENFORCE(!Has(type), "Operator %s has been registered",
                          type.c_str());
    map_.insert({type, info});
  }

  const OpInfo<Dtype> &Get(const std::string &type) const {
    auto op_info_ptr = GetNullable(type);
    PADDLE_MOBILE_ENFORCE(op_info_ptr != nullptr,
                          "Operator %s has not been registered", type.c_str());
    return *op_info_ptr;
  }

...
@@ -96,24 +96,39 @@ class OpRegistry {
  }
};

#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type)     \
  template <typename Dtype, typename T>                                    \
  class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> {   \
   public:                                                                 \
    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class);   \
  };                                                                       \
  static paddle_mobile::framework::OperatorRegistrar<                      \
      device_type, _OpClass_##op_type##_##device_name<device_type, float>> \
      __op_registrar_##op_type##_##device_name(#op_type);                  \
  int TouchOpRegistrar_##op_type##_##device_name() {                       \
    __op_registrar_##op_type##_##device_name.Touch();                      \
    return 0;                                                              \
  }

#define REGISTER_OPERATOR_CPU(op_type, op_class) \
  REGISTER_OPERATOR(op_type, op_class, cpu, paddle_mobile::CPU);

#define REGISTER_OPERATOR_MALI_GPU(op_type, op_class) \
  REGISTER_OPERATOR(op_type, op_class, mali_gpu, paddle_mobile::GPU_MALI);

#define REGISTER_OPERATOR_FPGA(op_type, op_class) \
  REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);

#define USE_OP(op_type, device_name)                                          \
  extern int TouchOpRegistrar_##op_type##_##device_name();                    \
  static int use_op_itself_##op_type##_##device_name __attribute__((unused)) = \
      TouchOpRegistrar_##op_type##_##device_name()

#define USE_OP_CPU(op_type) USE_OP(op_type, cpu);
#define USE_OP_MALI_GPU(op_type) USE_OP(op_type, mali_gpu);
#define USE_OP_FPGA(op_type) USE_OP(op_type, fpga);

} // namespace framework
} // namespace paddle_mobile
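// Standalone sketch of the registration trick the macros above rely on:
// REGISTER_OPERATOR defines a Touch function next to a static registrar, and
// USE_OP references that function from another translation unit so the linker
// cannot drop the op's object file. The DEMO_* macros and the conv2d/cpu
// invocation below are illustrative, not the real framework API.
#include <cstdio>

#define DEMO_REGISTER_OPERATOR(op_type, device_name)              \
  int TouchOpRegistrar_##op_type##_##device_name() {              \
    std::printf("registered %s on %s\n", #op_type, #device_name); \
    return 0;                                                     \
  }

#define DEMO_USE_OP(op_type, device_name)                  \
  extern int TouchOpRegistrar_##op_type##_##device_name(); \
  static int use_op_itself_##op_type##_##device_name       \
      __attribute__((unused)) = TouchOpRegistrar_##op_type##_##device_name()

DEMO_REGISTER_OPERATOR(conv2d, cpu)
DEMO_USE_OP(conv2d, cpu);  // runs the Touch function during static init

int main() { return 0; }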
@@ -58,7 +58,8 @@ void OperatorBase<Dtype>::Run() const {
}

template class OperatorBase<CPU>;
template class OperatorBase<FPGA>;
template class OperatorBase<GPU_MALI>;

} // namespace framework
} // namespace paddle_mobile
@@ -16,7 +16,6 @@ limitations under the License. */

#include <map>
#include <string>
#include <vector>

#include "common/enforce.h"

@@ -27,7 +26,6 @@ limitations under the License. */
#include "framework/op_info.h"
#include "framework/op_kernel_type.h"
#include "framework/op_registry.h"
#include "framework/program/block_desc.h"
#include "framework/program/program-optimize/node.h"
#include "framework/scope.h"

@@ -52,7 +50,7 @@ static T *GetVarValue(const string &key, const VariableNameMap &var_map,
}

template <typename Dtype>
class OperatorBase {
 public:
  /*
   * @b Constructor of the op base class: the op receives its inputs,
   * attributes, and the pre-allocated output tensors.

@@ -65,6 +63,7 @@ class OperatorBase : PaddleMobileObject {
  std::vector<string> GetOutKeys() const;
  virtual void RunImpl() const = 0;
  virtual void Init() const = 0;
  /*
   * @b The inputs the op needs at run time, e.g. the previous layer's
   * outputs or the convolution kernels.
   * */

@@ -105,31 +104,55 @@ class OperatorBase : PaddleMobileObject {
/*
 * @b Parent class of every op that carries computation; it derives from
 * OperatorBase.
 * */
template <typename Dtype, typename ParamType, typename KernelType>
class OperatorWithKernel : public OperatorBase<Dtype> {
 public:
  OperatorWithKernel(const std::string &type, const VariableNameMap &inputs,
                     const VariableNameMap &outputs, const AttributeMap &attrs,
                     std::shared_ptr<Scope> scope)
      : OperatorBase<Dtype>(type, inputs, outputs, attrs, scope),
        param_(inputs, outputs, attrs, *scope) {}

  virtual void RunImpl() const { this->kernel_.Compute(this->param_); }

  virtual void InferShape() const = 0;

  void Init() const {
    PADDLE_MOBILE_ENFORCE(kernel_.Init(param_), " %s kernel init failed",
                          this->type_.c_str());
  }

 protected:
  KernelType kernel_;
  ParamType param_;
};
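// Standalone sketch of the new OperatorWithKernel shape above: the operator
// now owns param_ and kernel_, Init() forwards to kernel_.Init(param_), and
// RunImpl() forwards to kernel_.Compute(param_). The Demo* types are
// illustrative stand-ins, not the real framework classes.
#include <cstdlib>

struct DemoParam {
  int value = 0;
};

struct DemoKernel {
  bool Init(const DemoParam &) const { return true; }
  void Compute(const DemoParam &p) const {
    if (p.value != 42) std::abort();  // the "work" the kernel performs
  }
};

template <typename ParamType, typename KernelType>
class DemoOperatorWithKernel {
 public:
  explicit DemoOperatorWithKernel(ParamType param) : param_(param) {}
  void Init() const {
    if (!kernel_.Init(param_)) std::abort();  // mirrors PADDLE_MOBILE_ENFORCE
  }
  void RunImpl() const { kernel_.Compute(param_); }

 protected:
  KernelType kernel_;
  ParamType param_;
};

int main() {
  DemoOperatorWithKernel<DemoParam, DemoKernel> op(DemoParam{42});
  op.Init();
  op.RunImpl();
  return 0;
}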
/*
 * @b Parent class of all kernels.
 * */
template <typename Dtype, typename P>
class OpKernelBase {
 public:
  /*
   * @b Every kernel must implement the Compute method. @p para is a struct
   * bundling the parameters the kernel needs at run time; all of these
   * structs live in: paddle-mobile/src/operators/op_param.h
   * */
#ifdef PADDLE_MOBILE_MALI_GPU
  OpKernelBase() { acl_op_ = nullptr; }
  void *GetAclOp() const { return acl_op_; }
  void SetAclOp(void *op, void *ob) const {
    reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
  }
#endif
  virtual void Compute(const P &para) const = 0;
  virtual bool Init(const P &para) const { return true; };
  virtual ~OpKernelBase() = default;

 private:
#ifdef PADDLE_MOBILE_MALI_GPU
  void *acl_op_;
#endif
};

#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls) \

@@ -139,20 +162,23 @@ class OpKernelBase : PaddleMobileObject {
                     std::shared_ptr<::paddle_mobile::framework::Scope> scope) \
      : parent_cls<Dtype, T>(type, inputs, outputs, attrs, scope) {}

class FusionOpMatcher {
 public:
  FusionOpMatcher() {}

  virtual std::string Type() = 0;

  virtual void FolderNodes(
      Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
    node->Folder(node_.Depth(), Type(), {}, removed_nodes);
  }

  virtual Node &BeginNode() { return node_; }

  std::string BeginType() { return node_.Type(); }

 protected:
  Node node_;
  std::string type_;

...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle_mobile_object.h"
@@ -13,17 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "block_desc.h"

#include <algorithm>

namespace paddle_mobile {
namespace framework {

std::vector<std::shared_ptr<VarDesc>> BlockDesc::Vars() const { return vars_; }

std::vector<std::shared_ptr<OpDesc>> BlockDesc::Ops() const { return ops_; }

@@ -31,10 +26,14 @@ BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc)
    : index_(desc->idx), parent_index_(desc->idx) {
  for (int i = 0; i < desc->n_vars; ++i) {
    PaddleMobile__Framework__Proto__VarDesc *var_desc = desc->vars[i];
    vars_.emplace_back(std::shared_ptr<VarDesc>(new VarDesc(var_desc)));
  }
  std::sort(vars_.begin(), vars_.end(),
            [](std::shared_ptr<VarDesc> left, std::shared_ptr<VarDesc> right) {
              return left->Name() < right->Name();
            });

  for (int j = 0; j < desc->n_ops; ++j) {
    PaddleMobile__Framework__Proto__OpDesc *op_desc = desc->ops[j];
    ops_.emplace_back(new framework::OpDesc(op_desc));

...
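// Why vars_ changed from an unordered_map to a sorted vector (see the
// BlockDesc constructor above): map iteration order is unspecified, so var
// traversal could differ between runs; sorting by name makes it
// deterministic. A standalone sketch with a hypothetical stand-in VarDesc:
#include <algorithm>
#include <memory>
#include <string>
#include <vector>

struct DemoVarDesc {
  explicit DemoVarDesc(std::string name) : name_(std::move(name)) {}
  const std::string &Name() const { return name_; }
  std::string name_;
};

int main() {
  std::vector<std::shared_ptr<DemoVarDesc>> vars;
  vars.push_back(std::make_shared<DemoVarDesc>("fc_0.w_0"));
  vars.push_back(std::make_shared<DemoVarDesc>("batch_norm_0.scale"));
  std::sort(vars.begin(), vars.end(),
            [](const std::shared_ptr<DemoVarDesc> &l,
               const std::shared_ptr<DemoVarDesc> &r) {
              return l->Name() < r->Name();  // same comparator as above
            });
  return vars.front()->Name() == "batch_norm_0.scale" ? 0 : 1;
}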
...@@ -15,14 +15,13 @@ limitations under the License. */
#pragma once

#include "framework/framework.pb-c.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/op_desc.h"
#include "framework/program/var_desc.h"

namespace paddle_mobile {
namespace framework {

class BlockDesc {
 public:
  friend class Node;
  friend class ProgramOptimize;
...@@ -35,10 +34,9 @@ class BlockDesc : PaddleMobileObject {
      ops_.push_back(copy_op_desc);
    }

    for (int i = 0; i < block_desc.vars_.size(); ++i) {
      auto &var_desc = block_desc.vars_[i];
      vars_.emplace_back(std::make_shared<VarDesc>(*var_desc));
    }
  }
...@@ -64,7 +62,7 @@ class BlockDesc : PaddleMobileObject {
  bool multi_thread_;
  int parent_index_;
  std::vector<std::shared_ptr<OpDesc>> ops_;
  std::vector<std::shared_ptr<VarDesc>> vars_;
};

}  // namespace framework
......
...@@ -20,12 +20,11 @@ limitations under the License. */
#include "common/log.h"
#include "common/type_define.h"
#include "framework/framework.pb-c.h"
#include "framework/paddle_mobile_object.h"

namespace paddle_mobile {
namespace framework {

class OpDesc {
 public:
  friend class ProgramOptimize;
  friend class FusionOpMatcher;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fusion_op_register.h"
...@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <sstream>
#include "framework/operator.h"
#include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/node.h"
#include <algorithm>
#include "framework/operator.h"
namespace paddle_mobile {
...@@ -45,54 +44,13 @@ bool Node::operator==(const Node &in) {
  return true;
}

bool Node::CanSplit(std::unordered_set<std::string> complex_compute_set) {
bool split = false;
CanSplit(&split, false, 0, &complex_compute_set, this);
return split;
}
void Node::CanSplit(bool *split, bool spliting, int complex_count,
std::unordered_set<std::string> *complex_compute_set,
Node *pre_node) {
if (spliting) {
if (complex_compute_set->find(this->type_) != complex_compute_set->end()) {
complex_count++;
}
}
if (inputs_.size() > 1 && pre_node != inputs_.back()) {
return;
}
if (inputs_.size() > 1 && pre_node == inputs_.back()) {
if (complex_count > 1) {
*split = true;
return;
}
}
// multi output, to check
if (outputs_.size() > 1) {
spliting = true;
complex_compute_set = 0;
} else {
if (spliting == true && inputs_.size() > 0) {
spliting = false;
} else {
}
}
for (auto &output : outputs_) {
output->CanSplit(split, spliting, complex_count, complex_compute_set, this);
}
}
std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(int size) {
  std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
  OpDescs(size - 1, &op_descs);
  return op_descs;
}

void Node::OpDescs(int index,
                   std::vector<std::shared_ptr<framework::OpDesc>> *op_desc) {
  if (index == 0) {
    return;
...@@ -103,107 +61,6 @@ void Node::OpDescs(uint index,
  }
}
void Node::OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
Node *node, bool adding_thread, int thread_num) {
if (outputs_.size() > 1) {
adding_thread = false;
}
bool can_add_split = false;
  // Supported only when the current node has multiple outputs and its op_desc_ has exactly one output key.
if (outputs_.size() > 1 &&
op_input_output_key[op_desc_->type_].second.size() == 1) {
can_add_split = true;
    // Walk the current node's output nodes.
for (const auto &output : outputs_) {
      // Not supported when an output has outputs of its own.
if (output->outputs_.size() > 0) {
can_add_split = false;
break;
}
      // The OpDesc associated with this node.
      std::shared_ptr<framework::OpDesc> &op_desc = output->op_desc_;
      // Fetch this op's input and output keys.
      auto inputs_and_outputs = op_input_output_key[op_desc->type_];
      // Check that this op type is registered
      // and that its input key list and output key list each have size 1.
if (op_input_output_key.find(op_desc->type_) !=
op_input_output_key.end() &&
inputs_and_outputs.first.size() == 1 &&
inputs_and_outputs.second.size() == 1) {
auto inputs_of_output = op_desc->Input(inputs_and_outputs.first[0]);
auto outputs_of_output = op_desc->Output(inputs_and_outputs.second[0]);
        // Supported as long as no input name collides with an output name.
for (int i = 0; i < inputs_of_output.size(); ++i) {
std::string input_of_output = inputs_of_output[i];
for (int j = 0; j < outputs_of_output.size(); ++j) {
std::string output_of_output = outputs_of_output[j];
if (input_of_output == output_of_output) {
DLOG << "output的 output 包含 input" << input_of_output;
can_add_split = false;
break;
}
}
}
      } else {  // If the model contains an unregistered op, adding split is not supported.
        DLOG << "cannot find this op type: " << output->op_desc_->type_;
can_add_split = false;
}
}
}
if (inputs_.size() > 1 && node != inputs_.back()) {
return;
} else if (inputs_.size() > 1 && node == inputs_.back()) {
adding_thread = false;
op_desc->push_back(this->op_desc_);
} else {
op_desc->push_back(this->op_desc_);
}
if (adding_thread) {
Attribute attr;
attr.Set<int>(thread_num);
this->op_desc_->attrs_["thread"] = attr;
}
if (can_add_split) {
adding_thread = true;
std::shared_ptr<OpDesc> split_op_desc = std::make_shared<OpDesc>();
split_op_desc->type_ = G_OP_TYPE_SPLIT;
auto outputs = this->op_desc_->Output(
op_input_output_key[this->op_desc_->Type()].second[0]);
split_op_desc->inputs_ = {
{op_input_output_key[G_OP_TYPE_SPLIT].first[0], outputs}};
auto &split_outputs =
split_op_desc->outputs_[op_input_output_key[G_OP_TYPE_SPLIT].second[0]];
for (const auto &output : outputs_) {
split_outputs.push_back(outputs[0]);
}
DLOG << "add split";
op_desc->push_back(split_op_desc);
}
for (int i = 0; i < outputs_.size(); ++i) {
auto &output = outputs_[i];
if (can_add_split) {
output->OpDescs(op_desc, this, adding_thread, i);
} else {
output->OpDescs(op_desc, this, adding_thread, thread_num);
}
}
}
std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs() {
std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
OpDescs(&op_descs, this, false, 0);
return op_descs;
}
std::shared_ptr<Node> Node::To(int size) {
  std::shared_ptr<Node> node = std::make_shared<Node>();
  this->To(size - 1, node);
...@@ -224,24 +81,25 @@ void Node::To(int index, std::shared_ptr<Node> node) {
  }
}

int Node::Depth(int begin) {
  int depth = 0;
  begin++;
  for (int i = 0; i < outputs_.size(); ++i) {
    int output_depth = outputs_[i]->Depth(begin);
    depth = output_depth > depth ? output_depth : depth;
  }
  return begin > depth ? begin : depth;
}

Node &Node::Folder(
    int size, std::string type,
    std::map<std::string, std::pair<std::string, std::string>> change,
    std::vector<std::shared_ptr<Node>> *removed_nodes) {
  std::shared_ptr<framework::OpDesc> op_desc =
      std::make_shared<framework::OpDesc>();
  op_desc->inputs_ = this->op_desc_->inputs_;
  std::vector<std::shared_ptr<Node>> outputs;
  this->Folder(op_desc, &outputs, size - 1, &change, this, removed_nodes);
  this->outputs_ = outputs;
  this->type_ = type;
  this->op_desc_ = op_desc;
...@@ -251,9 +109,9 @@ Node &Node::Folder(
void Node::Folder(
    std::shared_ptr<framework::OpDesc> op_desc,
    std::vector<std::shared_ptr<Node>> *outputs, int index,
    std::map<std::string, std::pair<std::string, std::string>> *change,
    Node *begin_node, std::vector<std::shared_ptr<Node>> *removed_nodes) {
  if (change->find(this->type_) != change->end()) {
    auto change_pair = (*change)[this->type_];
    op_desc->GetInputs()[change_pair.second] =
...@@ -266,7 +124,9 @@ void Node::Folder(
  if (index > 0) {
    --index;
    for (auto output : outputs_) {
      removed_nodes->push_back(output);
output->Folder(op_desc, outputs, index, change, begin_node,
removed_nodes);
    }
  } else {
    for (auto &op_output : this->op_desc_->outputs_) {
...@@ -285,7 +145,7 @@ void Node::Folder(
    }
  }
}
#ifdef PADDLE_MOBILE_DEBUG
std::string Node::ToString(std::string blank, const Node *node) const {
  std::stringstream ss;
  ss << type_ << "-> \n";
...@@ -316,6 +176,7 @@ Print &operator<<(Print &printer, const Node &node) {
  printer << node.ToString();
  return printer;
}
#endif
}  // namespace framework
}  // namespace paddle_mobile
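Depth() determines how many ops Folder collapses: it returns the length of the longest output chain, counting the node itself. A self-contained toy mirror of the recursion (ToyNode is illustrative, not the real Node):

#include <memory>
#include <vector>

struct ToyNode {
  std::vector<std::shared_ptr<ToyNode>> outputs;
  int Depth(int begin = 0) const {
    int depth = 0;
    begin++;
    for (const auto &out : outputs) {
      int d = out->Depth(begin);
      depth = d > depth ? d : depth;
    }
    return begin > depth ? begin : depth;
  }
};

int main() {
  auto conv = std::make_shared<ToyNode>();
  auto add = std::make_shared<ToyNode>();
  auto relu = std::make_shared<ToyNode>();
  conv->outputs = {add};
  add->outputs = {relu};
  return conv->Depth();  // 3: the chain conv -> add -> relu
}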
...@@ -14,20 +14,17 @@ limitations under the License. */
#pragma once

#include <cinttypes>
#include <map>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "common/log.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/op_desc.h"

namespace paddle_mobile {
namespace framework {

class Node {
  friend class ProgramOptimize;

 public:
...@@ -37,35 +34,34 @@ class Node : PaddleMobileObject {
      : op_desc_(op_desc), type_(op_desc->Type()) {}
  Node &operator>(std::shared_ptr<Node> node);
  bool operator==(const Node &in);
bool CanSplit(std::unordered_set<std::string> complex_compute_set);
#ifdef PADDLE_MOBILE_DEBUG
  std::string ToString() const;
void Description();
#endif
  std::shared_ptr<Node> To(int size);
  int Depth(int begin = 0);
  Node &Folder(
      int size, std::string type,
      std::map<std::string, std::pair<std::string, std::string>> change_map,
      std::vector<std::shared_ptr<Node>> *removed_nodes);
  std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(int size);
  std::shared_ptr<framework::OpDesc> OpDescOfNode() { return op_desc_; }
  std::string Type() { return type_; }
void Description();
 private:
  void CanSplit(bool *split, bool spliting, int complex_count,
                std::unordered_set<std::string> *complex_compute_set,
                Node *pre_node);
  void OpDescs(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
               Node *node, bool adding_thread, int thread_num);
  void OpDescs(int size,
               std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
  void To(int index, std::shared_ptr<Node>);
  void Folder(
      std::shared_ptr<framework::OpDesc> op_desc,
      std::vector<std::shared_ptr<Node>> *outputs, int index,
      std::map<std::string, std::pair<std::string, std::string>> *change,
      Node *begin_node, std::vector<std::shared_ptr<Node>> *removed_nodes);
  std::shared_ptr<framework::OpDesc> op_desc_;
#ifdef PADDLE_MOBILE_DEBUG
  std::string ToString(std::string blank, const Node *node) const;
#endif
  std::vector<std::shared_ptr<Node>> outputs_;
  std::vector<Node *> inputs_;
  std::string type_;
......
...@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/program/program-optimize/program_optimize.h"
#include <algorithm>
#include "framework/program/program-optimize/fusion_op_register.h" #include "framework/program/program-optimize/fusion_op_register.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize( std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
std::shared_ptr<ProgramDesc> ori_des, bool add_split) { std::shared_ptr<ProgramDesc> ori_des, bool add_split) {
// ProgramDesc *optimize_program = new ProgramDesc(*ori_des); // ProgramDesc *optimize_program = new ProgramDesc(*ori_des);
std::shared_ptr<ProgramDesc> optimize_program = std::shared_ptr<ProgramDesc> optimize_program =
...@@ -31,6 +32,8 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
    std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>>
        type_map;
std::vector<std::shared_ptr<Node>> nodes;
    std::shared_ptr<Node> begin_node;
    auto block = optimize_program->Block(i);
    // DLOG << " ops size: " << block->Ops().size();
...@@ -38,11 +41,13 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
      auto op = block->Ops()[j];
      auto op_type = op->Type();
      if (op_input_output_key.find(op->Type()) == op_input_output_key.end()) {
LOG(kLOG_ERROR) << "return null "; LOG(kLOG_ERROR) << "has not support op return null "
<< " op type: " << op->Type();
        return nullptr;
      }
      std::shared_ptr<Node> node = std::make_shared<Node>(op);
nodes.push_back(node);
      //
      type_map[op->Type()].push_back(node);
...@@ -87,21 +92,29 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FushionOptimize(
        // DLOG << " match success " << " fusion node: \n" <<
        // matcher->BeginNode() << "\nsub node: \n" << *sub_node;
        // DLOG << "match node\n"<< *match_node;
matcher->FolderNodes(match_node.get());
// DLOG << " after match node\n"<< *match_node;
// match_node->Description();
// DLOG << "begin node: \n" << *begin_node; std::vector<std::shared_ptr<Node>> removed_nodes;
matcher->FolderNodes(match_node.get(), &removed_nodes);
for (int j = 0; j < removed_nodes.size(); ++j) {
auto removed_node = removed_nodes[j];
auto removed_ite =
std::find(nodes.begin(), nodes.end(), removed_node);
nodes.erase(removed_ite);
}
        }
      }
    }
// DLOG << "node: \n" << *begin_node;
    std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
    if (add_split) {
      GenerateOps(&op_descs, begin_node.get(), add_split);
    } else {
      for (int m = 0; m < nodes.size(); ++m) {
        auto &node = nodes[m];
        op_descs.push_back(node->op_desc_);
      }
    }
    block->ops_ = op_descs;
  }
...@@ -118,6 +131,14 @@ void ProgramOptimize::GenerateOps(
    Node *current_node) {
  if (current_node->inputs_.size() > 1 &&
      input_node != current_node->inputs_.back()) {
DLOG << " current type " << current_node->type_;
DLOG << " inputs size of current node > 0 ";
for (int i = 0; i < current_node->inputs_.size(); ++i) {
DLOG << " input i: " << current_node->inputs_[i]->type_;
}
    return;
  } else if (current_node->inputs_.size() > 1 &&
             input_node == current_node->inputs_.back()) {
...@@ -250,12 +271,12 @@ void ProgramOptimize::GenerateOps(
}

void ProgramOptimize::GenerateOps(
    std::vector<std::shared_ptr<framework::OpDesc>> *op_descs, Node *begin_node,
    bool can_add_split) {
  // std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
  // Node *input_node, Node *current_node, bool adding_thread, int
  // thread_num
  if (can_add_split) {
    this->GenerateOps(op_descs, begin_node, begin_node, false, -1, nullptr);
  } else {
    this->GenerateOps(op_descs, begin_node, begin_node);
......
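A hedged sketch of the call site for the renamed entry point; the variable names are placeholders, and the nullptr branch mirrors the unsupported-op check above:

framework::ProgramOptimize optimizer;
std::shared_ptr<framework::ProgramDesc> optimized =
    optimizer.FusionOptimize(origin_program_desc, /*add_split=*/false);
if (optimized == nullptr) {
  // Any op missing from op_input_output_key aborts the optimization.
}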
...@@ -27,14 +27,14 @@ namespace framework {
class ProgramOptimize {
 public:
  ProgramOptimize() {}
  std::shared_ptr<ProgramDesc> FusionOptimize(
      std::shared_ptr<ProgramDesc> ori_des, bool add_split = false);

 private:
  int current_block_;
  std::vector<std::shared_ptr<BlockDesc>> new_blocks_;

  void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_descs,
                   Node *begin_node, bool can_add_split);
  void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
                   Node *input_node, Node *current_node);
  void GenerateOps(std::vector<std::shared_ptr<framework::OpDesc>> *op_desc,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
namespace paddle_mobile {
namespace framework {}
} // namespace paddle_mobile
...@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once

#include "common/types.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/program_desc.h"
#include "framework/scope.h"
...@@ -23,12 +22,14 @@ namespace paddle_mobile {
namespace framework {

template <typename Dtype, Precision P = Precision::FP32>
class Program {
 public:
  std::shared_ptr<ProgramDesc> originProgram;
  std::shared_ptr<ProgramDesc> optimizeProgram;
  std::shared_ptr<Scope> scope;
  std::string model_path;
std::string para_path;
  bool is_combine = false;
 private:
};
......
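The two new fields back the combined model format, where all parameters sit in one file beside the model. A hedged usage sketch; the paths are placeholders and the Loader overloads are the ones defined in io.cpp below:

paddle_mobile::Loader<paddle_mobile::CPU> loader;
// Separate-files format: a __model__ file plus one file per parameter.
auto program = loader.Load("/sdcard/mobilenet", /*optimize=*/true);
// Combined format: one model file and one parameter file; Load fills in
// program.para_path and sets program.is_combine = true.
auto combined =
    loader.Load("/sdcard/model", "/sdcard/params", /*optimize=*/true);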
...@@ -18,13 +18,12 @@ limitations under the License. */
#include "common/types.h"
#include "framework/framework.pb-c.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/block_desc.h" #include "framework/program/block_desc.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
class ProgramDesc : PaddleMobileObject { class ProgramDesc {
public: public:
friend class Node; friend class Node;
friend class ProgramOptimize; friend class ProgramOptimize;
......
...@@ -14,40 +14,14 @@ limitations under the License. */
#pragma once

#include <string>
#include "framework/framework.pb-c.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/tensor_desc.h"

namespace paddle_mobile {
namespace framework {
/*
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL = 0,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16 = 1,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32 = 2,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64 = 3,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR = 7,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS = 8,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH = 9,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST = 10,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES = 11,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE = 12,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_ARRAY = 13,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST = 14,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER = 15,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL = 16,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW = 17,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE = 18
*/
class VarDesc {
 public:
  VarDesc(const VarDesc &var_desc) {
...@@ -56,14 +30,6 @@ class VarDesc {
    this->persistable_ = var_desc.persistable_;
    this->tensor_desc_ = var_desc.tensor_desc_;
    this->type_ = var_desc.type_;
/*
*
* std::string name_;
bool persistable_;
TensorDesc tensor_desc_;
VarType_Type type_;
VarType_Type data_type_;
* */
  }
  VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) {
    type_ = (VarType_Type)desc->type->type;
...@@ -102,39 +68,6 @@ class VarDesc {
  const TensorDesc &Tensor_desc() const { return tensor_desc_; }
// const proto::VarType::ChannelDesc &channel_desc() const {
// switch (desc_.type().type()) {
// case proto::VarType::CHANNEL:
// return desc_.type().channel();
// default:
// break;
// }
// }
// proto::VarType::Type GetDataType() const {
// switch (desc_.type().type()) {
// case proto::VarType::CHANNEL:
// return channel_desc().data_type();
// break;
// default:
// return tensor_desc().data_type();
// }
// }
// template <typename T>
// std::vector<T> RepeatedToVector(
// const google::protobuf::RepeatedField<T> &repeated_field) const {
// std::vector<T> ret;
// ret.reserve(repeated_field.size());
// std::copy(repeated_field.begin(), repeated_field.end(),
// std::back_inserter(ret));
// return ret;
// }
// std::vector<int64_t> GetShape() const {
// return this->RepeatedToVector(tensor_desc().dims());
// }
 private:
  std::string name_;
  bool persistable_;
......
...@@ -14,6 +14,7 @@ limitations under the License. */
#include "framework/scope.h"
#include <algorithm>
#include <set>
#include <string>
#include <vector>
...@@ -22,7 +23,6 @@ namespace paddle_mobile {
namespace framework {

Scope &Scope::NewScope() const {
std::unique_lock<std::mutex> lock(mutex_);
  kids_.push_back(new Scope(this));
  return *kids_.back();
}
...@@ -72,11 +72,9 @@ std::vector<std::string> Scope::LocalVarNames() const {
}

void Scope::DeleteScope(Scope *scope) const {
std::unique_lock<std::mutex> lock(mutex_);
  auto it = std::find(kids_.begin(), kids_.end(), scope);
  kids_.erase(it);
  delete scope;
// deferent
}

void Scope::EraseVars(const std::vector<std::string> &var_names) {
...@@ -104,14 +102,6 @@ void Scope::Rename(const std::string &origin_name,
  vars_[new_name] = origin_it->second;
  vars_.erase(origin_it);
}
//
// std::string Scope::Rename(const std::string& origin_name)
// const {
// auto var_name = string::Sprintf("%p.%d", this,
// vars_.size());
// Rename(origin_name, var_name);
// return var_name;
// }
Variable *Scope::FindVarLocally(const std::string &name) const {
  auto it = vars_.find(name);
......
...@@ -14,17 +14,16 @@ limitations under the License. */
#pragma once

#include <list>
#include <unordered_map>
#include "variable.h" #include "variable.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
class Scope { class Scope {
public: public:
Scope() {} Scope() = default;
~Scope() {} ~Scope() = default;
Scope &NewScope() const; Scope &NewScope() const;
...@@ -70,8 +69,6 @@ class Scope {
  mutable std::unordered_map<std::string, Variable *> vars_;
  mutable std::list<Scope *> kids_;
  Scope const *parent_{nullptr};
mutable std::mutex mutex_;
};
}  // namespace framework
}  // namespace paddle_mobile
...@@ -14,14 +14,15 @@ limitations under the License. */
#pragma once
#include <common/enforce.h>
#include <cstdint>
#include <cstring>
#include <memory>
#include <type_traits>
#include <typeindex>
#include <vector>
#include "common/enforce.h"
#include "common/enforce.h"
#include "framework/data_layout.h" #include "framework/data_layout.h"
#include "framework/ddim.h" #include "framework/ddim.h"
#include "memory/t_malloc.h" #include "memory/t_malloc.h"
...@@ -84,6 +85,12 @@ class Tensor {
    }
  }
Tensor(const Tensor &inTensor) {
this->dims_ = inTensor.dims_;
this->holder_ = inTensor.holder_;
this->offset_ = inTensor.offset_;
}
  /*! Return a pointer to mutable memory block. */
  template <typename T>
  inline T *data() {
...@@ -130,7 +137,6 @@ class Tensor {
    }
    PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must be >= 0.")
    int64_t size = numel() * SizeOfType(type);
/* some versions of boost::variant don't have operator!= */
    if (holder_ == nullptr || holder_->size() < size + offset_) {
      holder_.reset(new PlaceholderImpl(size, type));
      offset_ = 0;
...@@ -169,7 +175,9 @@ class Tensor {
  /*! The internal of two tensors share the same memory block. */
  inline Tensor &ShareDataWith(const Tensor &src) {
    src.check_memory_size();
    if (holder_.get() != src.holder_.get()) {
*this = src;
}
    return *this;
  }
...@@ -198,7 +206,6 @@ class Tensor {
    size_t base = numel() / dims_[0];
    Tensor dst;
    dst.holder_ = holder_;
dst.set_layout(layout_);
    DDim dst_dims = dims_;
    dst_dims[0] = end_idx - begin_idx;
    dst.Resize(dst_dims);
...@@ -227,10 +234,6 @@ class Tensor {
                          "Tensor's dims_ is out of bound. ");
  }
inline DataLayout layout() const { return layout_; }
inline void set_layout(const DataLayout layout) { layout_ = layout; }
 private:
  /**
   * @note Placeholder hides type T, so it doesn't appear as a
...@@ -288,21 +291,6 @@ class Tensor {
  DDim dims_;
/**
* @brief the layout of memory block, default is NHWC.
*
* @note the memory allocation order, describe how weight/data is
* stored
* For example, in 4-D Tensor(rank=4), there are three
* commonly
* used layout. They are
* NCHW, NHWC, CHWN.
* N,C,H,W for respectively the batch size, the number of
* feature maps, the height, the width.
*/
DataLayout layout_ = DataLayout::kNHWC;
  /**
   * @brief A PlaceHolder may be shared by more than one tensor.
   *
......
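The copy constructor and the self-share guard in ShareDataWith both preserve the invariant that aliasing tensors hold one shared, reference-counted holder_. A hedged sketch of the aliasing behavior, assuming the Tensor API above:

paddle_mobile::framework::Tensor a;
a.Resize(paddle_mobile::framework::make_ddim({1, 3, 224, 224}));
a.mutable_data<float>();                  // allocate a's holder_

paddle_mobile::framework::Tensor b;
b.ShareDataWith(a);                       // b aliases a's holder_; no copy
b.ShareDataWith(b);                       // self-share: the guard makes it a no-op

paddle_mobile::framework::Tensor c(a);    // copy ctor shares dims_, holder_, offset_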
...@@ -13,137 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "tensor_util.h"
#include <algorithm>
#include <limits>
#include <vector>
namespace paddle_mobile {
namespace framework {

void TensorCopy(const Tensor &src, Tensor *dst) {
// VLOG(3) << "TensorCopy " << src.dims() << " from " <<
// src.place() << " to
// "
// << dst_place;
src.check_memory_size();
dst->Resize(src.dims());
dst->set_layout(src.layout());
auto src_ptr = src.data<void>();
auto dst_ptr = dst->mutable_data(src.type());
auto size = src.numel() * SizeOfType(src.type());
memory::Copy(dst_ptr, src_ptr, size);
}
void TensorCopySync(const Tensor &src, Tensor *dst) {
// VLOG(3) << "TensorCopySync " << src.dims() << " from " <<
// src.place()
// << " to " << dst_place;
  src.check_memory_size();
  dst->Resize(src.dims());
dst->set_layout(src.layout());
  auto src_ptr = src.data<void>();
  auto dst_ptr = dst->mutable_data(src.type());
  auto size = src.numel() * SizeOfType(src.type());
  memory::Copy(dst_ptr, src_ptr, size);
}
template <typename Predicate>
struct AnyDTypeVisitor {
Predicate predicate_;
const Tensor &tensor_;
Tensor *out_;
AnyDTypeVisitor(Predicate predicate, const Tensor &tensor, Tensor *out)
: predicate_(predicate), tensor_(tensor), out_(out) {}
template <typename T>
void operator()() const {
// auto t = EigenVector<T>::Flatten(tensor_);
// auto o = EigenScalar<bool>::From(*out_);
// return any of predicate_(t) is true.
// o.device(*ctx_.eigen_device()) = predicate_(t).any();
}
};
template <typename Predicate>
inline void AnyImpl(Predicate predicate, const Tensor &tensor,
framework::Tensor *out) {
VisitDataType(ToDataType(tensor.type()),
AnyDTypeVisitor<Predicate>(predicate, tensor, out));
}
template <typename Predicate>
struct AnyVisitor {
const framework::Tensor &tensor_;
Predicate predicate_;
AnyVisitor(const framework::Tensor &tensor, Predicate predicate)
: tensor_(tensor), predicate_(std::move(predicate)) {}
bool operator()(void) const {
framework::Tensor out;
out.Resize({1});
out.mutable_data<bool>();
AnyImpl(predicate_, tensor_, &out);
return this->GetResult(out);
}
bool GetResult(const framework::Tensor &out) const {
return *out.data<bool>();
}
};
template <typename Predicate>
inline bool Any(const framework::Tensor &tensor, Predicate predicate) {
AnyVisitor<Predicate> visitor(tensor, predicate);
// return platform::VisitPlace(visitor);
return visitor();
}
struct ContainsNANPredicate {
template <typename T>
auto operator()(const T &eigen_vec) const
-> decltype(std::declval<T>().isnan()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isnan();
}
};
bool TensorContainsNAN(const framework::Tensor &tensor) {
ContainsNANPredicate predicate;
return Any(tensor, predicate);
}
struct ContainsInfPredicate {
template <typename T>
auto operator()(const T &eigen_vec) const
-> decltype(std::declval<T>().isinf()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isinf();
}
};
bool TensorContainsInf(const framework::Tensor &tensor) {
ContainsInfPredicate predicate;
return Any(tensor, predicate);
}
struct DeserializedDataFunctor {
DeserializedDataFunctor(void **buf, Tensor *tensor)
: buf_(buf), tensor_(tensor) {}
template <typename T>
void operator()() {
*buf_ = tensor_->mutable_data<T>();
}
void **buf_;
Tensor *tensor_;
};
}  // namespace framework
}  // namespace paddle_mobile
...@@ -15,51 +15,12 @@ limitations under the License. */
#pragma once

#include <vector>
#include "memory/t_malloc.h"
#include "platform/data_type.h"
#include "tensor.h" #include "tensor.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
void TensorCopy(const Tensor &src, Tensor *dst); void TensorCopy(const Tensor &src, Tensor *dst);
void TensorCopySync(const Tensor &src, Tensor *dst);
template <typename T>
void TensorFromVector(const std::vector<T> &src, Tensor *dst);
template <typename T>
void TesnorToVector(const Tensor &src, std::vector<T> *dst);
bool TensorContainsNAN(const framework::Tensor &tensor);
bool TensorContainsInf(const framework::Tensor &tensor);
void TensorToStream(std::ostream &os, const Tensor &tensor);
void TensorFromStream(std::istream &is, Tensor *tensor);
//
// The implementation of template functions.
//
template <typename T>
void TensorFromVector(const std::vector<T> &src, Tensor *dst) {
auto src_ptr = static_cast<const void *>(src.data());
dst->Resize({static_cast<int64_t>(src.size())});
auto dst_ptr = static_cast<void *>(dst->mutable_data<T>());
auto size = src.size() * sizeof(T);
memory::Copy(dst_ptr, src_ptr, size);
}
template <typename T>
void TensorToVector(const Tensor &src, std::vector<T> *dst) {
auto src_ptr = static_cast<const void *>(src.data<T>());
auto size = src.numel() * sizeof(T);
dst->resize(src.numel());
auto dst_ptr = static_cast<void *>(dst->data());
memory::Copy(dst_ptr, src_ptr, size);
}
}  // namespace framework
}  // namespace paddle_mobile
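With the NaN/Inf and vector helpers gone, the header exposes only the synchronous deep copy. A hedged usage sketch:

paddle_mobile::framework::Tensor src, dst;
src.Resize(paddle_mobile::framework::make_ddim({2, 3}));
float *p = src.mutable_data<float>();
for (int i = 0; i < src.numel(); ++i) p[i] = static_cast<float>(i);

// Resizes dst and copies numel() * SizeOfType(type) bytes into dst's own holder_.
paddle_mobile::framework::TensorCopy(src, &dst);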
...@@ -14,19 +14,17 @@ limitations under the License. */
#pragma once
#include <iostream>
#include <memory>
#include <string>
#include <typeindex>
#include <typeinfo>
#include "../common/variant.h"
#include "paddle_mobile_object.h"
namespace paddle_mobile {
namespace framework {
using std::string;

class Variable {
 public:
  template <typename T>
  const T *Get() const {
......
...@@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "io.h" #include "io/io.h"
#include <fstream> #include <algorithm>
#include <vector> #include <vector>
#include "common/log.h"
#include "common/enforce.h" #include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h" #include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h" #include "framework/lod_tensor.h"
#include "framework/operator.h" #include "framework/operator.h"
...@@ -26,20 +25,29 @@ limitations under the License. */
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
#include <utility>
#include "common/threadpool.h"
#endif
namespace paddle_mobile {
using framework::Variable;
char *Get_binary_data(std::string filename) {
  FILE *file = fopen(filename.c_str(), "rb");
  PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
                        filename.c_str());
  fseek(file, 0, SEEK_END);
  long size = ftell(file);
  PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
  rewind(file);
  char *data = new char[size];
  size_t bytes_read = fread(data, 1, size, file);
  PADDLE_MOBILE_ENFORCE(bytes_read == size,
                        "read binary file bytes do not match with fseek");
  fclose(file);
  return data;
}
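Get_binary_data hands the caller ownership of a buffer allocated with new[], so it must be released with delete[] (the call sites below are adjusted accordingly). A hedged sketch of the intended usage; the path is a placeholder:

char *origin_data = paddle_mobile::Get_binary_data("/sdcard/params");
char *data = origin_data;   // parsing advances this cursor, not the owner pointer
// ... e.g. LoadMemory(var_desc, tensor, data) consumes bytes and moves data ...
delete[] origin_data;       // pair new[] with delete[]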
static size_t ReadBuffer(const char *file_name, uint8_t **out) {
...@@ -66,110 +74,28 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
}
template <typename Dtype, Precision P>
void Loader<Dtype, P>::LoadVar(framework::Variable *variable,
                               const framework::VarDesc &var_desc,
                               const std::string &file_path) {
  auto tensor = variable->GetMutable<framework::LoDTensor>();
  std::ifstream is(file_path);
  PADDLE_MOBILE_ENFORCE(is.is_open(), "open file: %s failed",
                        file_path.c_str());
std::fpos<mbstate_t> pos;
pos = is.tellg(); // save current position
is.seekg(0, std::ios::end);
is.seekg(pos); // restore saved position
// 1. version
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
// 2 Lod information
uint64_t lod_level;
is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
auto &lod = *tensor->mutable_lod();
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::vector<size_t> tmp(size / sizeof(size_t));
is.read(reinterpret_cast<char *>(tmp.data()),
static_cast<std::streamsize>(size));
for (auto j : tmp) {
LOG(kLOG_DEBUG1) << " lod - " << j;
}
lod[i] = tmp;
}
// 3. tensor version
uint32_t tensor_version;
is.read(reinterpret_cast<char *>(&tensor_version), sizeof(tensor_version));
// 4. tensor desc
int32_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char *>(buf.get()), size);
const framework::TensorDesc &desc = var_desc.Tensor_desc();
PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor_desc = NULL;
// void *v;
// PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure()(tensor_desc,
// buf.get());
// DLOG << "PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure- " <<
// tensor_desc;
// framework::TensorDesc &tensor_desc = variable->
// PaddleMobile__Framework__Proto__ProgramDesc *c_program;
// uint8_t *proto_buf = NULL;
// size_t read_size = ReadBuffer(file_path.c_str(), &proto_buf);
// c_program = paddle_mobile__framework__proto__program_desc__unpack(NULL,
// read_size, buf);
// paddle_mobile__framework__proto__var_type__tensor_desc__init()
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
tensor->Resize(framework::make_ddim(desc.Dims()));
void *memory = tensor;
int type_size = 0;
switch (desc.DataType()) {
case framework::VARTYPE_TYPE_FP16:
type_size = 2;
break;
case framework::VARTYPE_TYPE_FP32:
type_size = 4;
memory = tensor->mutable_data<float>();
break;
case framework::VARTYPE_TYPE_FP64:
type_size = 8;
break;
case framework::VARTYPE_TYPE_INT32:
type_size = 4;
break;
case framework::VARTYPE_TYPE_INT64:
type_size = 8;
break;
case framework::VARTYPE_TYPE_BOOL:
type_size = 1;
break;
default:
break;
}
is.read(static_cast<char *>(memory), memory_size * type_size);
is.close();
}

template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
    const std::string &dirname, bool optimize, bool can_add_split) {
  auto program =
      this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
  program.model_path = dirname;
  return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
    const std::string &model_path, const std::string &para_path,
    bool optimize) {
  auto program = this->LoadProgram(model_path, optimize);
  program.para_path = para_path;
  program.is_combine = true;
  return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
const std::string &model_path, bool optimize, bool can_add_split) {
std::string model_filename = model_path;
  PaddleMobile__Framework__Proto__ProgramDesc *c_program;
  uint8_t *buf = NULL;
  size_t read_size = ReadBuffer(model_filename.c_str(), &buf);
...@@ -183,22 +109,16 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
  //
  DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
  //
  auto originProgramDesc = std::make_shared<framework::ProgramDesc>(c_program);

  framework::Program<Dtype, P> program;
  program.originProgram = originProgramDesc;

  auto scope = std::make_shared<framework::Scope>();
  program.scope = scope;

  for (const auto &block : originProgramDesc->Blocks()) {
    for (auto var_desc : block->Vars()) {
      auto var = scope->Var(var_desc->Name());
      if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
...@@ -224,7 +144,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
  if (optimize) {
    framework::ProgramOptimize program_optimize;
    program.optimizeProgram =
        program_optimize.FusionOptimize(originProgramDesc, can_add_split);
  }
  if (optimize) {
    program.optimizeProgram->Description("optimize: ");
...@@ -237,9 +157,10 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
}

template class Loader<CPU, Precision::FP32>;
template class Loader<FPGA, Precision::FP32>;
template class Loader<GPU_MALI, Precision::FP32>;
#pragma mark - executor

template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
                             bool use_optimize)
...@@ -253,6 +174,9 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
  variable_ptr[0].SetValue<int>(batch_size);
  const std::vector<std::shared_ptr<framework::BlockDesc>> blocks =
      to_predict_program_->Blocks();
#ifdef PADDLE_EXECUTOR_MULTITHREAD
depManager.resize(blocks.size());
#endif
  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
...@@ -263,40 +187,54 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
          program_.scope);
      op_base->InferShape();
      ops_of_block_[*block_desc.get()].push_back(op_base);
#ifdef PADDLE_EXECUTOR_MULTITHREAD
depManager[i].analysisDep(ops_of_block_[*block_desc.get()]);
#endif
    }
  }
  if (program_.is_combine) {
InitCombineMemory();
} else {
InitMemory();
}
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
for (const auto &op : ops) {
op->Init();
}
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
                                    framework::LoDTensor *tensor, char *&data) {
  // 1. version
  uint32_t version = *(uint32_t *)data;
  data += sizeof(uint32_t);

  // 2. LoD information
  uint64_t *lod_level_ptr = new uint64_t();
  memcpy(lod_level_ptr, data, sizeof(uint64_t));
  uint64_t lod_level = *lod_level_ptr;
  delete lod_level_ptr;
  data += sizeof(uint64_t);

  auto &lod = *tensor->mutable_lod();
  lod.resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(uint64_t *)data;
    data += sizeof(uint64_t);
    DLOG << "lod size: " << i << size;
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *(size_t *)data;
      DLOG << "tmp[k]: " << k << *(size_t *)data;
      data += sizeof(size_t);
    }
    for (auto j : tmp) {
      LOG(kLOG_DEBUG1) << " lod - " << j;
    }
...@@ -304,17 +242,20 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
  }
  // 3. tensor version
  uint32_t tensor_version = *(uint32_t *)data;
  data += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *(int32_t *)data;
  data += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
buf.get()[m] = data[m];
}
data += (sizeof(char) * size);
  const framework::TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
...@@ -348,8 +289,10 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
      break;
  }
  for (int n = 0; n < memory_size * type_size; ++n) {
    static_cast<char *>(memory)[n] = data[n];
}
data += (sizeof(char) * memory_size * type_size);
}
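For reference, the byte layout that LoadMemory walks, reconstructed from the parsing code above; the field names are descriptive only:

// Serialized LoDTensor record, consumed sequentially via the data cursor:
//   uint32_t version;                  // 1. variable version
//   uint64_t lod_level;                // 2. number of LoD levels
//   per level { uint64_t byte_size;    //    byte size of this level,
//               size_t offsets[...]; } //    then byte_size / sizeof(size_t) entries
//   uint32_t tensor_version;           // 3. tensor version
//   int32_t  desc_size;                // 4. serialized TensorDesc size
//   char     tensor_desc[desc_size];   //    proto-encoded tensor description
//   char     data[numel * type_size];  //    raw tensor contents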
template <typename Dtype, Precision P>
...@@ -362,8 +305,12 @@ void Executor<Dtype, P>::InitMemory() {
      if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
        continue;
      }
      char *origin_data =
Get_binary_data(program_.model_path + "/" + var_desc->Name());
char *data = origin_data;
LoadMemory(*var_desc, tensor, data);
      delete[] origin_data;
    } else {
      if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
        auto tensor = var->template GetMutable<framework::LoDTensor>();
...@@ -375,6 +322,32 @@ void Executor<Dtype, P>::InitMemory() {
  }
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitCombineMemory() {
LOG(kLOG_INFO) << " begin init combine memory";
char *origin_data = Get_binary_data(program_.para_path);
char *data = origin_data;
for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
auto tensor = var->template GetMutable<framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadMemory(*var_desc, tensor, data);
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto tensor = var->template GetMutable<framework::LoDTensor>();
tensor->template mutable_data<Ptype>();
}
}
}
}
  delete[] origin_data;
LOG(kLOG_INFO) << " end init combine memory ";
}
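InitCombineMemory leans on Get_binary_data to slurp the whole parameter file into a single heap buffer, then threads one cursor through every persistable variable. The helper itself is outside this hunk; a plausible minimal version under the usual read-whole-file idiom (illustrative only, error handling elided):

#include <cstdio>

char *ReadWholeFile(const char *path, long *out_size) {
  FILE *fp = fopen(path, "rb");
  if (fp == nullptr) return nullptr;
  fseek(fp, 0, SEEK_END);
  long size = ftell(fp);
  rewind(fp);
  char *buffer = new char[size];
  fread(buffer, 1, size, fp);
  fclose(fp);
  if (out_size != nullptr) *out_size = size;
  return buffer;  // allocated with new[], so release with delete[]
}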
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
const framework::Tensor &t) {
@@ -385,19 +358,135 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
feed_tensor->ShareDataWith(t);
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
#ifdef PADDLE_MOBILE_PROFILE
std::vector<ProfInfo> profile(ops.size());
#endif
#ifdef PADDLE_EXECUTOR_MULTITHREAD
std::mutex m;
std::condition_variable cv;
std::queue<int> next;
next.push(0);
int rsize = ops.size();
std::vector<int> status(rsize, 0);
auto &threadPool = ThreadPool::getThreadPool();
auto &dep = depManager[0];
auto finishF = [&ops, &m, &cv, &next, &status, &rsize, &dep](int opi) {
std::lock_guard<std::mutex> lk(m);
rsize--;
status[opi] = 2;
for (int i : dep.getNext(opi)) {
bool ok = true;
for (int j : dep.getDeps(i)) {
if (status[j] != 2) {
ok = false;
break;
}
}
if (ok && (status[i] == 0)) {
next.push(i);
}
}
cv.notify_one();
};
for (;;) {
std::unique_lock<std::mutex> lk(m);
cv.wait(lk, [&next, &rsize] { return rsize == 0 || !next.empty(); });
if (rsize == 0) {
break;
}
while (next.size() > 0) {
int opi = next.front();
next.pop();
status[opi] = 1;
threadPool.enqueue([opi, &ops, &finishF, &profile] {
auto &op = ops[opi];
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[opi].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
profile[opi].tid = ThreadPool::getThreadPoolThreadId();
#endif
ops[opi]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[opi].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
finishF(opi);
});
}
}
#else
for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
// to Run
ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
}
#endif
auto last_op = ops.rbegin();
auto output_map = (*last_op)->Outputs();
std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
framework::LoDTensor *output_tensor =
framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
*(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
#ifdef PADDLE_EXECUTOR_MULTITHREAD
// TODO: expose profile info through an interface so users can retrieve it
// to analyze the performance of their network.
FILE *df = fopen("net.dot", "w");
fprintf(df, "digraph {\n");
for (int i = 0; i < ops.size(); i++) {
for (int j : dep.getNext(i)) {
fprintf(df, "op_%d -> op_%d\n", i, j);
}
}
for (int i = 0; i < ops.size(); i++) {
fprintf(df, "op_%d[label=\"%s (%d)\"]\n", i, ops[i]->Type().c_str(), i);
}
fprintf(df, "}\n");
fclose(df);
#endif
FILE *pf = fopen("profile.out", "w");
std::unordered_map<std::string, uint64_t> _tp;
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost;
fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i, ops[i]->Type().c_str(),
pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost);
}
fclose(pf);
printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>;
std::vector<prof_t> _tv(_tp.begin(), _tp.end());
uint64_t _ptotal = 0;
for (auto const &p : _tv) {
_ptotal += p.second;
}
auto compf = [](const prof_t &a, const prof_t &b) {
return a.second > b.second;
};
std::sort(_tv.begin(), _tv.end(), compf);
_tv.push_back(std::make_pair("total", _ptotal));
for (auto const &p : _tv) {
printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(), (float)p.second,
(float)p.second / _ptotal * 100.0);
}
printf("====================[---------]======================\n");
#endif
return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
}
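Under PADDLE_EXECUTOR_MULTITHREAD, Predict becomes a small dataflow scheduler: each op is marked running (1) or finished (2) in status, and finishF pushes a successor onto next only once all of its dependencies have reached state 2. The same counting idea, condensed into a standalone sketch with explicit in-degrees (illustrative, not the depCore API; the dispatch callback must hand the task to another thread, as Predict does via its thread pool):

#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <vector>

// Execute a DAG of tasks: next[i] lists the successors of task i and
// indegree[i] its unmet dependencies. dispatch(i, done) must run task i
// on a worker thread and call done(i) when it finishes.
void RunDag(const std::vector<std::vector<int>> &next,
            std::vector<int> indegree,
            const std::function<void(int, const std::function<void(int)> &)>
                &dispatch) {
  std::mutex m;
  std::condition_variable cv;
  std::queue<int> ready;
  int remaining = static_cast<int>(indegree.size());
  for (int i = 0; i < static_cast<int>(indegree.size()); ++i)
    if (indegree[i] == 0) ready.push(i);

  std::function<void(int)> done = [&](int i) {
    std::lock_guard<std::mutex> lk(m);
    --remaining;
    for (int j : next[i])
      if (--indegree[j] == 0) ready.push(j);  // successor became runnable
    cv.notify_one();
  };

  for (;;) {
    std::unique_lock<std::mutex> lk(m);
    cv.wait(lk, [&] { return remaining == 0 || !ready.empty(); });
    if (remaining == 0) break;
    while (!ready.empty()) {
      int i = ready.front();
      ready.pop();
      dispatch(i, done);  // must not run inline while the lock is held
    }
  }
}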
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
@@ -420,5 +509,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
}
template class Executor<CPU, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
} // namespace paddle_mobile
@@ -14,51 +14,80 @@ limitations under the License. */
#pragma once
#include <memory.h>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "common/types.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/paddle_mobile_object.h"
#include "framework/program/program.h" #include "framework/program/program.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <condition_variable>
#include <mutex>
#include <thread>
#include "common/dep_core.h"
#endif
namespace paddle_mobile {
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Loader {
public:
/*
 * @b load a fluid model stored as separate parameter files
 * */
const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false,
bool can_add_split = false);
/*
 * @b load a fluid model stored in combined format (one model file plus one params file)
 * */
const framework::Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false);
private:
const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false,
bool can_add_split = false);
};
template <typename Dtype = CPU, Precision P = Precision::FP32>
class Executor {
public:
typedef typename PrecisionTrait<P>::ptype Ptype;
/*
 * @b init the executor with a program loaded by the Loader class
 * */
Executor(const framework::Program<Dtype> p, int batch_size = 1,
bool use_optimize = true);
/*
 * @b run prediction on an input tensor
 * */
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
/*
 * @b run prediction given an input vector and its dimension info
 * */
std::vector<Ptype> Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims);
protected:
Executor() = default;
void InitMemory();
void LoadMemory(const framework::VarDesc var_desc,
framework::LoDTensor *tensor, char *&data);
void InitCombineMemory();
framework::Program<Dtype> program_;
int batch_size_ = 1;
std::shared_ptr<framework::ProgramDesc> to_predict_program_;
@@ -68,6 +97,16 @@ class Executor {
std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
#ifdef PADDLE_EXECUTOR_MULTITHREAD
std::vector<depCore> depManager;
#endif
#ifdef PADDLE_MOBILE_PROFILE
struct ProfInfo {
int tid = 0;
uint64_t runBegin = 0UL;
uint64_t runEnd = 0UL;
};
#endif
};
} // namespace paddle_mobile
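Put together, the reworked io.h gives two entry points into the same Executor. A hedged usage sketch (all paths are placeholders):

#include "io/io.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;

  // Separate format: a directory with one file per parameter.
  auto program = loader.Load("/sdcard/mobilenet", /*optimize=*/true);

  // Combined format: one model file plus one params file; the Executor
  // would then populate weights through InitCombineMemory().
  // auto program = loader.Load("/sdcard/model", "/sdcard/params");

  paddle_mobile::Executor<paddle_mobile::CPU> executor(program,
                                                       /*batch_size=*/1,
                                                       /*use_optimize=*/true);
  // paddle_mobile::framework::Tensor input;  // e.g. shaped {1, 3, 224, 224}
  // auto output = executor.Predict(input);
  return 0;
}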
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ANDROID
#include "paddle_mobile_jni.h"
#ifdef __cplusplus
extern "C" {
#endif
namespace paddle_mobile {
namespace jni {
using framework::DDim;
using framework::Program;
using framework::Tensor;
using paddle_mobile::CPU;
using std::string;
extern const char *ANDROID_LOG_TAG =
"paddle_mobile LOG built on " __DATE__ " " __TIME__;
static Executor<CPU> *shared_executor_instance = nullptr;
// TODO: guard instance creation with a mutex
// static std::mutex shared_mutex;
Executor<CPU> *getExecutorInstance(const Program<CPU> p, int batch_size,
bool use_optimize) {
if (nullptr == shared_executor_instance) {
shared_executor_instance = new Executor<CPU>(p, batch_size, use_optimize);
}
return shared_executor_instance;
}
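The TODO above is real: two threads calling load concurrently can both pass the null check and leak an Executor. One conventional fix is std::call_once; this is a sketch of an alternative, not what the commit does (the Safe suffix marks it as hypothetical):

#include <mutex>

static std::once_flag executor_once;

Executor<CPU> *getExecutorInstanceSafe(const Program<CPU> p, int batch_size,
                                       bool use_optimize) {
  // call_once guarantees the constructor runs exactly once even under races.
  std::call_once(executor_once, [&] {
    shared_executor_instance = new Executor<CPU>(p, batch_size, use_optimize);
  });
  return shared_executor_instance;
}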
string jstring2cppstring(JNIEnv *env, jstring jstr) {
const char *cstr = env->GetStringUTFChars(jstr, 0);
string cppstr(cstr);
env->ReleaseStringUTFChars(jstr, cstr);
return cppstr;
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
jclass thiz,
jstring modelPath) {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
bool optimize = true;
auto program = loader.Load(jstring2cppstring(env, modelPath), optimize);
shared_executor_instance = getExecutorInstance(program, 1, optimize);
return shared_executor_instance != nullptr ? JNI_TRUE : JNI_FALSE;
}
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
JNIEnv *env, jclass thiz, jfloatArray buf) {
jfloatArray result = NULL;
int count = 0;
float *dataPointer = nullptr;
if (nullptr != buf) {
dataPointer = env->GetFloatArrayElements(buf, NULL);
}
framework::Tensor input;
framework::DDim ddim = framework::make_ddim({1, 3, 224, 224});
input.Resize(ddim);
auto input_ptr = input.mutable_data<float>();
for (int i = 0; i < framework::product(ddim); i++) {
input_ptr[i] = dataPointer[i];
}
auto output = shared_executor_instance->Predict(input);
count = output->numel();
result = env->NewFloatArray(count);
env->SetFloatArrayRegion(result, 0, count, output->data<float>());
return result;
}
JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env,
jclass thiz) {}
} // namespace jni
} // namespace paddle_mobile
#ifdef __cplusplus
}
#endif
#endif
@@ -13,25 +13,39 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef ANDROID
#include <jni.h>
#include "common/log.h"
#include "framework/tensor.h"
#include "io/io.h"
#ifdef __cplusplus
extern "C" {
#endif
namespace paddle_mobile {
namespace jni {

/**
 * load model & params of the net for android
 */
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
jclass thiz,
jstring modelPath);

/**
 * object detection for android
 */
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
JNIEnv *env, jclass thiz, jfloatArray buf);

/**
 * clear data of the net when it is destroyed, for android
 */
JNIEXPORT void JNICALL Java_com_baidu_paddle_PML_clear(JNIEnv *env,
jclass thiz);
} // namespace jni
} // namespace paddle_mobile
#ifdef __cplusplus
}
#endif
#endif
@@ -12,19 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "memory/t_malloc.h"
#include <cstdlib>
#include <cstring>
namespace paddle_mobile {
namespace memory {
const int MALLOC_ALIGN = 64;
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
void *Alloc(size_t size) {
size_t offset = sizeof(void *) + MALLOC_ALIGN - 1;
...
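The Alloc body is elided above, but the visible offset computation is the standard over-allocate-and-stash trick: grab sizeof(void *) + MALLOC_ALIGN - 1 extra bytes, round up to the (now cache-line-sized, 64-byte) boundary, and keep the raw malloc pointer just below the returned block so the matching free can recover it. A sketch of the full pair under that assumption (function names are illustrative):

#include <cstdint>
#include <cstdlib>

void *AlignedAlloc(size_t size) {
  const size_t offset = sizeof(void *) + MALLOC_ALIGN - 1;
  char *p = static_cast<char *>(malloc(offset + size));
  if (p == nullptr) return nullptr;
  // Round p + offset down to a MALLOC_ALIGN boundary; this still leaves
  // at least sizeof(void *) bytes below r for the stashed pointer.
  void *r = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(p + offset) &
                                     ~static_cast<uintptr_t>(MALLOC_ALIGN - 1));
  static_cast<void **>(r)[-1] = p;  // remember the original allocation
  return r;
}

void AlignedFree(void *ptr) {
  if (ptr != nullptr) free(static_cast<void **>(ptr)[-1]);
}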
@@ -12,20 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#include "batchnorm_op.h" #include "batchnorm_op.h"
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void BatchNormOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
this->param_.OutputY()->Resize(x_dims);
}
template class BatchNormOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(batch_norm);
REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(batch_norm);
REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#pragma once
#include <string>
@@ -23,27 +25,24 @@ namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class BatchNormOp
: public framework::OperatorWithKernel<DeviceType, BatchNormParam,
BatchNormKernel<DeviceType, T>> {
public:
BatchNormOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, BatchNormParam,
BatchNormKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;

protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BOXCODER_OP
#include "operators/box_coder_op.h" #include "operators/box_coder_op.h"
#include <vector> #include <vector>
namespace paddle_mobile { namespace paddle_mobile {
...@@ -19,11 +21,11 @@ namespace operators { ...@@ -19,11 +21,11 @@ namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void BoxCoderOp<Dtype, T>::InferShape() const { void BoxCoderOp<Dtype, T>::InferShape() const {
auto input_priorbox_dims = param_.InputPriorBox()->dims(); auto input_priorbox_dims = this->param_.InputPriorBox()->dims();
auto input_priorboxvar_dims = param_.InputPriorBoxVar()->dims(); auto input_priorboxvar_dims = this->param_.InputPriorBoxVar()->dims();
auto input_targetbox_dims = param_.InputTargetBox()->dims(); auto input_targetbox_dims = this->param_.InputTargetBox()->dims();
auto code_type = param_.CodeType(); auto code_type = this->param_.CodeType();
if (code_type == "encode_center_size") { if (code_type == "encode_center_size") {
if (input_targetbox_dims.size() != 2) { if (input_targetbox_dims.size() != 2) {
@@ -42,7 +44,7 @@ void BoxCoderOp<Dtype, T>::InferShape() const {
LOG(kLOG_ERROR) << " dimension not match";
}
}
this->param_.OutputBox()->Resize(framework::make_ddim(
{input_targetbox_dims[0], input_priorbox_dims[0], 4}));
}
template class BoxCoderOp<CPU, float>;
@@ -50,5 +52,13 @@ template class BoxCoderOp<CPU, float>;
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(box_coder);
REGISTER_OPERATOR_CPU(box_coder, ops::BoxCoderOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BOXCODER_OP
#pragma once
#include <string>
@@ -26,27 +28,27 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class BoxCoderOp
: public framework::OperatorWithKernel<
DeviceType, BoxCoderParam, operators::BoxCoderKernel<DeviceType, T>> {
public:
BoxCoderOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, BoxCoderParam,
operators::BoxCoderKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}

using framework::OperatorWithKernel<
DeviceType, BoxCoderParam,
operators::BoxCoderKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;

protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#include "concat_op.h" #include "concat_op.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -19,7 +21,7 @@ namespace operators { ...@@ -19,7 +21,7 @@ namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void ConcatOp<Dtype, T>::InferShape() const { void ConcatOp<Dtype, T>::InferShape() const {
auto inputs = param_.Inputs(); auto inputs = this->param_.Inputs();
const size_t n = inputs.size(); const size_t n = inputs.size();
std::vector<DDim> inputs_dims; std::vector<DDim> inputs_dims;
...@@ -28,7 +30,7 @@ void ConcatOp<Dtype, T>::InferShape() const { ...@@ -28,7 +30,7 @@ void ConcatOp<Dtype, T>::InferShape() const {
inputs_dims.push_back(inputs[i]->dims()); inputs_dims.push_back(inputs[i]->dims());
} }
auto axis = static_cast<size_t>(param_.Axis()); auto axis = static_cast<size_t>(this->param_.Axis());
if (n == 1) { if (n == 1) {
DLOG << "Warning: concat op have only one input, " DLOG << "Warning: concat op have only one input, "
...@@ -52,7 +54,7 @@ void ConcatOp<Dtype, T>::InferShape() const { ...@@ -52,7 +54,7 @@ void ConcatOp<Dtype, T>::InferShape() const {
out_dims[axis] = -1; out_dims[axis] = -1;
} }
param_.Out()->Resize(out_dims); this->param_.Out()->Resize(out_dims);
} }
template class ConcatOp<CPU, float>; template class ConcatOp<CPU, float>;
...@@ -60,5 +62,15 @@ template class ConcatOp<CPU, float>; ...@@ -60,5 +62,15 @@ template class ConcatOp<CPU, float>;
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
USE_OP(concat); #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR(concat, ops::ConcatOp); USE_OP_CPU(concat);
REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(concat);
REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#pragma once
#include <string>
@@ -22,26 +24,26 @@ namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class ConcatOp
: public framework::OperatorWithKernel<
DeviceType, ConcatParam, operators::ConcatKernel<DeviceType, T>> {
public:
ConcatOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, ConcatParam,
operators::ConcatKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}

using framework::OperatorWithKernel<
DeviceType, ConcatParam,
operators::ConcatKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;

protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,42 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#include "operators/conv_op.h" #include "operators/conv_op.h"
#include <vector> #include <vector>
#include "framework/data_type.h"
#include "framework/op_proto_maker.h" #include "framework/op_proto_maker.h"
#include "framework/op_registry.h" #include "framework/op_registry.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void ConvOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
@@ -56,13 +39,13 @@ void ConvOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
template class ConvOp<CPU, float>;
@@ -71,5 +54,17 @@ template class ConvOp<CPU, float>;
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv2d);
REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv2d);
REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(conv2d);
REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include <string>
@@ -22,34 +24,26 @@ namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class ConvOp
: public framework::OperatorWithKernel<
DeviceType, ConvParam, operators::ConvKernel<DeviceType, T>> {
public:
ConvOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, ConvParam,
operators::ConvKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}

using framework::OperatorWithKernel<
DeviceType, ConvParam,
operators::ConvKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;

private:
};
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
return output_size;
}
} // namespace operators
} // namespace paddle_mobile
#endif
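To make the moved formula concrete: a 224×224 input with a 3×3 filter, padding 1, stride 2, and dilation 1 gives dkernel = 1 · (3 − 1) + 1 = 3 and output_size = (224 + 2·1 − 3) / 2 + 1 = 112, the familiar halving step at the first convolution of MobileNet-style backbones.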
@@ -12,24 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEPTHWISECONV_OP
#include "operators/depthwise_conv_op.h" #include "operators/depthwise_conv_op.h"
#include <vector> #include <vector>
#include "framework/data_type.h"
#include "framework/op_proto_maker.h" #include "framework/op_proto_maker.h"
#include "framework/op_registry.h" #include "framework/op_registry.h"
#include "operators/conv_op.h" #include "operators/conv_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void DepthwiseConvOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
@@ -38,13 +40,13 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
template class DepthwiseConvOp<CPU, float>;
@@ -53,5 +55,13 @@ template class DepthwiseConvOp<CPU, float>;
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(depthwise_conv2d);
REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEPTHWISECONV_OP
#pragma once
#include <string>
@@ -22,28 +24,28 @@ namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class DepthwiseConvOp : public framework::OperatorWithKernel<
DeviceType, ConvParam,
operators::DepthwiseConvKernel<DeviceType, T>> {
public:
DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, ConvParam,
operators::DepthwiseConvKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, ConvParam,
operators::DepthwiseConvKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;

private:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#include "elementwise_add_op.h" #include "elementwise_add_op.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -19,13 +21,23 @@ namespace operators { ...@@ -19,13 +21,23 @@ namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void ElementwiseAddOp<Dtype, T>::InferShape() const { void ElementwiseAddOp<Dtype, T>::InferShape() const {
auto x_dim = param_.InputX()->dims(); auto x_dim = this->param_.InputX()->dims();
param_.Out()->Resize(x_dim); this->param_.Out()->Resize(x_dim);
} }
template class ElementwiseAddOp<CPU, float>; template class ElementwiseAddOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
USE_OP(elementwise_add); #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR(elementwise_add, ops::ElementwiseAddOp); USE_OP_CPU(elementwise_add);
REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(elementwise_add);
REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#pragma once
#include <string>
@@ -23,26 +25,27 @@ namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class ElementwiseAddOp : public framework::OperatorWithKernel<
DeviceType, ElementwiseAddParam,
operators::ElementwiseAddKernel<DeviceType, T>> {
public:
ElementwiseAddOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, ElementwiseAddParam,
operators::ElementwiseAddKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}

using framework::OperatorWithKernel<
DeviceType, ElementwiseAddParam,
operators::ElementwiseAddKernel<DeviceType, T>>::OperatorWithKernel;

void InferShape() const override;

protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -32,6 +32,8 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
param_(inputs, outputs, attrs, *scope) {}
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
void Init() const {}
void InferShape() const {
auto out_dims = param_.Out()->dims();
out_dims[0] = param_.BatchSize();
@@ -43,8 +45,16 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
};
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(feed);
REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(feed);
REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
@@ -33,6 +33,8 @@ class FetchOp : public framework::OperatorBase<DeviceType> {
param_(inputs, outputs, attrs, *scope) {}
void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
void Init() const {}
void InferShape() const {
auto x_dims = param_.InputX()->dims();
param_.Out()->Resize(x_dims);
@@ -43,8 +45,16 @@ class FetchOp : public framework::OperatorBase<DeviceType> {
};
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fetch);
REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fetch);
REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
@@ -12,66 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/fusion_conv_add.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
template class FusionConvAddOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv_add);
REGISTER_OPERATOR_CPU(conv_add, ops::FusionConvAddOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv_add);
REGISTER_OPERATOR_MALI_GPU(conv_add, ops::FusionConvAddOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define FUSION_CONVADD_OP
#ifdef FUSION_CONVADD_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/conv_add_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionConvAddMatcher : public framework::FusionOpMatcher {
public:
FusionConvAddMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes);
}
std::string Type() { return G_OP_TYPE_CONV_ADD; }
};
template <typename DeviceType, typename T>
class FusionConvAddOp : public framework::OperatorWithKernel<
DeviceType, FusionConvAddParam,
operators::ConvAddKernel<DeviceType, T>> {
public:
FusionConvAddOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, FusionConvAddParam,
operators::ConvAddKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvAddParam,
operators::ConvAddKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,4 +12,49 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONVADDRELU_OP
#include "fusion_conv_add_relu_op.h" #include "fusion_conv_add_relu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_relu);
REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,38 +12,72 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONVADDRELU_OP
#pragma once
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
class FusionConvAddReluOpMatcher : public framework::FusionOpMatcher {
public:
FusionConvAddReluOpMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; }
};
template <typename DeviceType, typename T>
class FusionConvAddReluOp : public framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam,
operators::ConvAddReluKernel<DeviceType, T>> {
public: public:
private: FusionConvAddReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam,
operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvAddReluParam,
operators::ConvAddReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_RELU_REGISTER
#define CONV_ADD_RELU_REGISTER
// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new
// FusionConvAddReluOpMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_OP
#include "operators/fusion_fc_op.h" #include "operators/fusion_fc_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void FushionFcOp<Dtype, T>::InferShape() const { void FusionFcOp<Dtype, T>::InferShape() const {
auto x_dims = param_.InputX()->dims(); auto x_dims = this->param_.InputX()->dims();
auto y_dims = param_.InputY()->dims(); auto y_dims = this->param_.InputY()->dims();
int x_num_col_dims = param_.XNumColDims(); int x_num_col_dims = this->param_.XNumColDims();
int y_num_col_dims = param_.YNumColDims(); int y_num_col_dims = this->param_.YNumColDims();
assert(x_dims.size() > x_num_col_dims); assert(x_dims.size() > x_num_col_dims);
assert(y_dims.size() > y_num_col_dims); assert(y_dims.size() > y_num_col_dims);
@@ -45,12 +47,22 @@ void FushionFcOp<Dtype, T>::InferShape() const {
} }
framework::DDim ddim = framework::make_ddim(output_dims); framework::DDim ddim = framework::make_ddim(output_dims);
param_.Out()->Resize(ddim); this->param_.Out()->Resize(ddim);
} }
template class FushionFcOp<CPU, float>; template class FusionFcOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
USE_OP(fc); #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR(fc, ops::FushionFcOp); USE_OP_CPU(fc);
REGISTER_OPERATOR_CPU(fc, ops::FusionFcOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fc);
REGISTER_OPERATOR_MALI_GPU(fc, ops::FusionFcOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
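For reference, the shape rule InferShape applies here: X is flattened to a 2-D matrix at x_num_col_dims, Y at y_num_col_dims, and the output keeps X's leading dims followed by Y's trailing dims (standard mul/fc semantics). A hedged, self-contained sketch of that rule; FcOutDims and the example shapes are illustrative, not repo code:
// Output dims for fc/mul: x_dims[0 : x_num_col_dims] ++ y_dims[y_num_col_dims :].
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>
std::vector<int64_t> FcOutDims(const std::vector<int64_t> &x_dims, int x_num_col_dims,
                               const std::vector<int64_t> &y_dims, int y_num_col_dims) {
  assert(static_cast<int>(x_dims.size()) > x_num_col_dims);
  assert(static_cast<int>(y_dims.size()) > y_num_col_dims);
  std::vector<int64_t> out(x_dims.begin(), x_dims.begin() + x_num_col_dims);
  out.insert(out.end(), y_dims.begin() + y_num_col_dims, y_dims.end());
  return out;
}
int main() {
  // X: [8, 3, 4, 4] flattened at 1 -> [8, 48]; Y: [48, 10] -> output [8, 10]
  for (int64_t d : FcOutDims({8, 3, 4, 4}, 1, {48, 10}, 1)) std::cout << d << " ";
}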
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once #pragma once
#include <string> #include <string>
@@ -19,7 +21,7 @@ limitations under the License. */
#include "framework/operator.h" #include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h" #include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/fushion_fc_kernel.h" #include "operators/kernel/fusion_fc_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
@@ -32,40 +34,55 @@ class FusionFcMatcher : public framework::FusionOpMatcher {
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD); node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
} }
void FolderNodes(framework::Node *node) { void FolderNodes(
vector<std::shared_ptr<framework::OpDesc>> origin_descs = framework::Node *node,
node->OpDescs(node_.Depth()); std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(), node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}); {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}, removed_nodes);
} }
std::string Type() { return G_OP_TYPE_FC; } std::string Type() { return G_OP_TYPE_FC; }
}; };
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class FushionFcOp : public framework::OperatorWithKernel<DeviceType> { class FusionFcOp
: public framework::OperatorWithKernel<
DeviceType, FusionFcParam, operators::FusionFcKernel<DeviceType, T>> {
public: public:
FushionFcOp(const string &type, const VariableNameMap &inputs, FusionFcOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const VariableNameMap &outputs,
const framework::AttributeMap attrs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope) std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType>(type, inputs, outputs, attrs, : framework::OperatorWithKernel<DeviceType, FusionFcParam,
scope), operators::FusionFcKernel<DeviceType, T>>(
param_(inputs, outputs, attrs, *scope) {} type, inputs, outputs, attrs, scope) {}
void RunImpl() const {
operators::FushionFcKernel<DeviceType, T> kernel;
kernel.Compute(param_);
}
using framework::OperatorWithKernel<DeviceType>::OperatorWithKernel; using framework::OperatorWithKernel<
DeviceType, FusionFcParam,
operators::FusionFcKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override; void InferShape() const override;
protected: protected:
FushionFcParam param_;
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_CPU_REGISTER
#define CONV_CPU_REGISTER
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef CONV_CPU_REGISTER
#define CONV_CPU_REGISTER
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher()); static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
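The #ifndef CONV_CPU_REGISTER guards make sure the static fc_registrar is emitted only once even when several backend blocks are active (note the MALI_GPU block reuses the CPU guard name, which reads like a copied macro rather than a deliberate choice). The registrar itself is the usual static self-registration idiom: a global object whose constructor populates a registry before main runs. A minimal sketch, assuming nothing about the real FusionOpRegistrar; all names below are illustrative:
// Static self-registration: the Registrar constructor runs at program start
// and adds an entry to a construct-on-first-use registry.
#include <functional>
#include <iostream>
#include <map>
#include <string>
std::map<std::string, std::function<void()>> &Registry() {
  static std::map<std::string, std::function<void()>> r;  // avoids init-order issues
  return r;
}
struct Registrar {
  Registrar(const std::string &name, std::function<void()> fn) {
    Registry()[name] = std::move(fn);
  }
};
static Registrar fc_registrar("fc", [] { std::cout << "run fc matcher\n"; });
int main() { Registry()["fc"](); }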
@@ -12,82 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef BATCHNORM_OP
#include "operators/kernel/batchnorm_kernel.h" #include "operators/kernel/batchnorm_kernel.h"
#include "operators/kernel/central-arm-func/batchnorm_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const { bool BatchNormKernel<CPU, float>::Init(const BatchNormParam &para) const {
/// todo: test. return true;
const Tensor *input_x = param.InputX(); }
auto input_x_ptr = input_x->data<float>();
const auto &x_dims = input_x->dims();
const int N = x_dims[0];
const int C = x_dims[1];
const int H = x_dims[2];
const int W = x_dims[3];
const int stride0 = C * H * W;
const int stride1 = H * W;
const int stride2 = W;
Tensor *out = param.OutputY();
auto out_ptr = out->mutable_data<float>();
const float epsilon = param.Epsilon();
const Tensor *mean = param.InputMean();
const Tensor *variance = param.InputVariance();
const Tensor *scale = param.InputScale();
const Tensor *bias = param.InputBias();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
Tensor inv_std;
auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
if (C != variance->numel()) {
std::cout << "C must equal to variance.numel()" << std::endl;
}
assert(C == variance->numel());
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr = new_scale.mutable_data<float>(make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(make_ddim({C}));
/// ((x - est_mean) * (inv_var) * scale + bias equal to template <>
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
for (int i = 0; i < C; i++) { BatchnormCompute<float>(param);
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
}
}
}
}
DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
DLOG << "input_x_ptr : " << input_x_ptr[102];
DLOG << "variance : " << variance_ptr[5];
DLOG << "inv_std_ptr : " << inv_std_ptr[5];
DLOG << "new_scale_ptr : " << new_scale_ptr[5];
DLOG << "new_bias_ptr : " << new_bias_ptr[5];
DLOG << "out_ptr : " << out_ptr[102];
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
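This change adds a bool Init(const Param &) const hook beside Compute on every CPU kernel; the bodies only return true for now, but the split gives each kernel a one-time setup point (weight repacking, algorithm selection) separate from the per-run Compute. A hedged sketch of the shape of that interface, not the actual OpKernelBase:
// Init runs once when the op is prepared; Compute runs per inference.
#include <iostream>
template <typename Param>
class KernelBase {
 public:
  virtual bool Init(const Param &param) const = 0;     // one-time setup
  virtual void Compute(const Param &param) const = 0;  // per-run work
  virtual ~KernelBase() = default;
};
struct DemoParam { float x; };
class DemoKernel : public KernelBase<DemoParam> {
 public:
  bool Init(const DemoParam &) const override { return true; }  // nothing to set up yet
  void Compute(const DemoParam &p) const override { std::cout << p.x * 2 << "\n"; }
};
int main() {
  DemoKernel k;
  DemoParam p{3.0f};
  if (k.Init(p)) k.Compute(p);  // prints 6
}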
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef BOXCODER_OP
#include "operators/kernel/box_coder_kernel.h" #include "operators/kernel/box_coder_kernel.h"
#include <cmath>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
@@ -109,6 +110,11 @@ void DecodeCenterSize(const framework::Tensor& target_box,
} }
} }
template <>
bool BoxCoderKernel<CPU, float>::Init(const BoxCoderParam& para) const {
return true;
}
template <> template <>
void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam& param) const { void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam& param) const {
const auto* input_priorbox = param.InputPriorBox(); const auto* input_priorbox = param.InputPriorBox();
@@ -135,3 +141,5 @@ void BoxCoderKernel<CPU, float>::Compute(const BoxCoderParam& param) const {
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h" #include "operators/kernel/concat_kernel.h"
@@ -52,6 +52,11 @@ class ConcatFunctor {
} }
}; };
template <>
bool ConcatKernel<CPU, float>::Init(const ConcatParam &para) const {
return true;
}
template <> template <>
void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const { void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const {
auto inputs = param.Inputs(); auto inputs = param.Inputs();
@@ -85,3 +90,5 @@ void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddKernel<CPU, float>::Init(const FusionConvAddParam &para) const {
return true;
}
template <>
void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam &param) const {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
int axis = param.Axis();
Tensor *output = param.Output();
math::expand_bias(bias, axis, output->dims());
output->ShareDataWith(bias);
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1));
}
}
}
template class ConvAddKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
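The kernel lowers convolution to im2col (vol2col for 3-D data) followed by GEMM: the col buffer has shape [C_in/groups, k_h, k_w, out_h, out_w], which is flattened to 2-D and multiplied by the filter reshaped to [C_out, C_in/groups * k_h * k_w]. A small worked sketch of that shape bookkeeping; the concrete dims are assumed for illustration:
// Mirrors col_shape_vec above for an assumed example:
// input NCHW [1, 8, 32, 32], filter [16, 8, 3, 3], stride 1, pad 1, groups 1
// -> output [1, 16, 32, 32].
#include <cstdint>
#include <iostream>
#include <vector>
int main() {
  std::vector<int64_t> in = {1, 8, 32, 32}, filt = {16, 8, 3, 3}, out = {1, 16, 32, 32};
  int groups = 1;
  size_t data_dim = filt.size() - 2;  // 2 for a 2-D conv
  std::vector<int64_t> col(1 + 2 * data_dim);
  col[0] = in[1] / groups;  // input channels per group
  for (size_t j = 0; j < data_dim; ++j) {
    col[j + 1] = filt[j + 2];            // kernel h, w
    col[j + 1 + data_dim] = out[j + 2];  // output h, w
  }
  // col = [8, 3, 3, 32, 32]; as a matrix [8*3*3, 32*32] = [72, 1024].
  // filter matrix [16, 72]; GEMM yields [16, 1024], one batch slice of output.
  for (int64_t d : col) std::cout << d << " ";
}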
@@ -12,9 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "var_desc.h" #ifdef FUSION_CONVADD_RELU_OP
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators {
template <>
bool ConvAddReluKernel<CPU, float>::Init(
const FusionConvAddReluParam &para) const {
return true;
}
namespace framework {} // namespace framework template <>
void ConvAddReluKernel<CPU, float>::Compute(
const FusionConvAddReluParam &param) const {
ConvAddReluCompute<float>(param);
}
template class ConvAddReluKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,103 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h" #include "operators/kernel/conv_kernel.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
void ConvKernel<CPU, float>::Compute(const ConvParam &param) const { bool ConvKernel<CPU, float>::Init(const ConvParam &para) const {
LOG(kLOG_DEBUG) << param; return true;
}
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
// DLOG << " compute end get Attrs " << strides[0];
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
DLOG << " filter.dims() = " << filter.dims();
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm template <>
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); void ConvKernel<CPU, float>::Compute(const ConvParam &param) const {
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); ConvCompute<float>(param);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
}
}
} }
template class ConvKernel<CPU, float>; template class ConvKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,115 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef DEPTHWISECONV_OP
#include "operators/kernel/depthwise_conv_kernel.h" #include "operators/kernel/depthwise_conv_kernel.h"
#include "operators/kernel/conv_kernel.h" #include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const { bool DepthwiseConvKernel<CPU, float>::Init(const ConvParam &para) const {
LOG(kLOG_DEBUG) << param; return true;
}
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
// DLOG << " compute end get Attrs " << strides[0];
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
// DLOG << " col_shape = " << col_shape;
// DLOG << " col_matrix_shape = " << col_matrix_shape;
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
// DLOG << " input_shape = " << input_shape;
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
// DLOG << " filter.dims() = " << filter.dims();
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
// DLOG << " in_batch.dims() = " << in_batch.dims();
// DLOG << " out_batch.dims() = " << out_batch.dims();
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm template <>
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); DepthwiseConvCompute<float>(param);
// DLOG << " out_slice " << out_slice.dims();
// DLOG << " filter_slice " << filter_slice.dims();
// DLOG << " col_matrix " << col_matrix.dims();
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
auto filter_ptr = filter_slice.data<float>();
}
}
} }
template class DepthwiseConvKernel<CPU, float>; template class DepthwiseConvKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#pragma once #pragma once
#include "operators/kernel/elementwise_add_kernel.h" #include "operators/kernel/elementwise_add_kernel.h"
@@ -24,6 +26,12 @@ struct AddFunctor {
inline T operator()(T a, T b) const { return a + b; } inline T operator()(T a, T b) const { return a + b; }
}; };
template <>
bool ElementwiseAddKernel<CPU, float>::Init(
const ElementwiseAddParam &para) const {
return true;
}
template <> template <>
void ElementwiseAddKernel<CPU, float>::Compute( void ElementwiseAddKernel<CPU, float>::Compute(
const ElementwiseAddParam &param) const { const ElementwiseAddParam &param) const {
@@ -40,3 +48,5 @@ template class ElementwiseAddKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,15 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once #pragma once
#include "operators/kernel/fushion_fc_kernel.h" #include "operators/kernel/fusion_fc_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
void FushionFcKernel<CPU, float>::Compute(const FushionFcParam &param) const { bool FusionFcKernel<CPU, float>::Init(const FusionFcParam &para) const {
return true;
}
template <>
void FusionFcKernel<CPU, float>::Compute(const FusionFcParam &param) const {
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY(); const Tensor *input_y = param.InputY();
const Tensor *input_z = param.InputZ(); const Tensor *input_z = param.InputZ();
@@ -65,3 +72,5 @@ void FushionFcKernel<CPU, float>::Compute(const FushionFcParam &param) const {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef LRN_OP
#pragma once #pragma once
#include "operators/kernel/lrn_kernel.h" #include "operators/kernel/lrn_kernel.h"
@@ -19,17 +21,23 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <>
bool LrnKernel<CPU, float>::Init(const LrnParam &para) const {
return true;
}
template <> template <>
void LrnKernel<CPU, float>::Compute(const LrnParam &param) const { void LrnKernel<CPU, float>::Compute(const LrnParam &param) const {
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
auto x_dims = input_x->dims(); auto x_dims = input_x->dims();
Tensor *out = param.Out();
out->mutable_data<float>();
/// data_format = NCHW /// data_format = NCHW
const int N = x_dims[0]; const int N = x_dims[0];
const int C = x_dims[1]; const int C = x_dims[1];
const int H = x_dims[2]; const int H = x_dims[2];
const int W = x_dims[3]; const int W = x_dims[3];
Tensor *out = param.Out();
out->mutable_data<float>();
const int n = param.N(); const int n = param.N();
const float alpha = param.Alpha(); const float alpha = param.Alpha();
const float beta = param.Beta(); const float beta = param.Beta();
@@ -42,3 +50,5 @@ template class LrnKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef MUL_OP
#pragma once #pragma once
#include "operators/kernel/mul_kernel.h" #include "operators/kernel/mul_kernel.h"
@@ -19,6 +21,11 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <>
bool MulKernel<CPU, float>::Init(const MulParam &para) const {
return true;
}
template <> template <>
void MulKernel<CPU, float>::Compute(const MulParam &param) const { void MulKernel<CPU, float>::Compute(const MulParam &param) const {
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
@@ -48,3 +55,5 @@ template class MulKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef MULTICLASSNMS_OP
#include "operators/kernel/multiclass_nms_kernel.h" #include "operators/kernel/multiclass_nms_kernel.h"
#include <algorithm>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
@@ -203,6 +203,12 @@ void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
} }
} }
template <>
bool MultiClassNMSKernel<CPU, float>::Init(
const MultiClassNMSParam& para) const {
return true;
}
template <> template <>
void MultiClassNMSKernel<CPU, float>::Compute( void MultiClassNMSKernel<CPU, float>::Compute(
const MultiClassNMSParam& param) const { const MultiClassNMSParam& param) const {
@@ -273,3 +279,5 @@ void MultiClassNMSKernel<CPU, float>::Compute(
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef POOL_OP
#include <operators/kernel/pool_kernel.h> #include <operators/kernel/pool_kernel.h>
#include "common/log.h" #include "common/log.h"
@@ -33,6 +35,11 @@ inline void PoolBasic(std::string pooling_type, std::vector<int> ksize,
} }
} }
template <>
bool PoolKernel<CPU, float>::Init(const PoolParam &para) const {
return true;
}
template <> template <>
void PoolKernel<CPU, float>::Compute(const PoolParam &param) const { void PoolKernel<CPU, float>::Compute(const PoolParam &param) const {
const Tensor *in_x = param.Input(); const Tensor *in_x = param.Input();
@@ -54,22 +61,25 @@ void PoolKernel<CPU, float>::Compute(const PoolParam &param) const {
paddings[i] = 0; paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]); ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
} }
} } else if (ksize[0] == 3 && ksize[0] == ksize[1]) {
if (pooling_type == "max") {
math::Pool3x3Max(strides, paddings, in_x, out);
} else if (pooling_type == "avg") {
math::Pool3x3Avg(strides, paddings, in_x, out);
}
} else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
if (pooling_type == "max") {
math::Pool2x2Max(strides, paddings, in_x, out);
} else if (pooling_type == "avg") {
math::Pool2x2Avg(strides, paddings, in_x, out);
}
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); } else {
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
// if (param.isGlobalPooling() || ksize[0] != ksize[1] || }
// strides[0] != strides[1] || strides[1] != 2 ||
// paddings[0] != paddings[1] || paddings[1] > 1) {
// PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
//
// } else if (ksize[0] == 2) {
//
// } else if (ksize[0] == 3) {
//
// } else {
// PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
// }
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
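The rewritten Compute now dispatches on the window size: square 3x3 and 2x2 kernels go to the hand-tuned paths (math::Pool3x3Max, math::Pool2x2Avg, and so on) and everything else falls back to the generic PoolBasic. A condensed sketch of that dispatch, with the math:: calls stubbed out as plain functions:
// Dispatch mirroring the new PoolKernel::Compute; the three Pool* functions
// below are stand-ins for the specialized and generic implementations.
#include <iostream>
#include <string>
#include <vector>
void Pool3x3(const std::string &t) { std::cout << "fast 3x3 " << t << "\n"; }
void Pool2x2(const std::string &t) { std::cout << "fast 2x2 " << t << "\n"; }
void PoolBasic(const std::string &t) { std::cout << "generic " << t << "\n"; }
void Dispatch(const std::vector<int> &ksize, const std::string &pooling_type) {
  if (ksize[0] == 3 && ksize[0] == ksize[1]) {
    Pool3x3(pooling_type);
  } else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
    Pool2x2(pooling_type);
  } else {
    PoolBasic(pooling_type);
  }
}
int main() {
  Dispatch({3, 3}, "max");  // fast 3x3 max
  Dispatch({5, 5}, "avg");  // generic avg
}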
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef PRIORBOX_OP
#include "operators/kernel/prior_box_kernel.h" #include "operators/kernel/prior_box_kernel.h"
@@ -26,6 +26,11 @@ struct ClipFunctor {
} }
}; };
template <>
bool PriorBoxKernel<CPU, float>::Init(const PriorBoxParam &para) const {
return true;
}
template <> template <>
void PriorBoxKernel<CPU, float>::Compute(const PriorBoxParam &param) const { void PriorBoxKernel<CPU, float>::Compute(const PriorBoxParam &param) const {
const auto *input_ = param.Input(); const auto *input_ = param.Input();
@@ -143,3 +148,5 @@ void PriorBoxKernel<CPU, float>::Compute(const PriorBoxParam &param) const {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef RELU_OP
#include "operators/kernel/relu_kernel.h" #include "operators/kernel/relu_kernel.h"
#include <operators/math/transform.h> #include <operators/math/transform.h>
@@ -25,6 +25,11 @@ struct ReluFunctor {
inline T operator()(T in) const { return in > 0 ? in : 0; } inline T operator()(T in) const { return in > 0 ? in : 0; }
}; };
template <>
bool ReluKernel<CPU, float>::Init(const ReluParam &para) const {
return true;
}
/* /*
* @b Platform-specific implementation; the param is passed in from the op layer * @b Platform-specific implementation; the param is passed in from the op layer
* */ * */
@@ -35,13 +40,74 @@ void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
auto *out = param.Out(); auto *out = param.Out();
auto *out_ptr = out->mutable_data<float>(); auto *out_ptr = out->mutable_data<float>();
int numel = input_x->numel();
// if (numel > 64) {
// asm volatile(
// "pld [%[input_x_ptr], #0] \n\t"
// "vmov.f32 q8, #0.0 \n\t"
// "subs %[num], %[num], #32 \n\t"
// "blt end_num_%= \n\t"
// "loop_num_%=: \n\t"
// "pld [%[input_x_ptr], #1024] \n\t"
//
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
//
// "vmax.f32 q0, q0, q8 \n\t"
// "vmax.f32 q1, q1, q8 \n\t"
// "vmax.f32 q2, q2, q8 \n\t"
// "vmax.f32 q3, q3, q8 \n\t"
// "vmax.f32 q4, q4, q8 \n\t"
// "vmax.f32 q5, q5, q8 \n\t"
// "vmax.f32 q6, q6, q8 \n\t"
// "vmax.f32 q7, q7, q8 \n\t"
//
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
//
// "subs %[num], %[num], #32 \n\t"
// "bge loop_num_%= \n\t"
// "end_num_%=: \n\t"
// "cmp %[num], #0 \n\t"
// "bge end_%= \n\t"
// "mov r6, #4 \n\t"
// "mul r5, %[num], r6 \n\t"
// "add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
// "vmax.f32 q0, q0, q8 \n\t"
// "vmax.f32 q1, q1, q8 \n\t"
// "vmax.f32 q2, q2, q8 \n\t"
// "vmax.f32 q3, q3, q8 \n\t"
// "vmax.f32 q4, q4, q8 \n\t"
// "vmax.f32 q5, q5, q8 \n\t"
// "vmax.f32 q6, q6, q8 \n\t"
// "vmax.f32 q7, q7, q8 \n\t"
// "add %[out_ptr], %[out_ptr], r5 \n\t"
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
// "end_%=: \n\t"
// :
// :
// [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num]
// "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
// "q7", "q8", "r5",
// "r6");
// } else {
ReluFunctor<float> func_; ReluFunctor<float> func_;
math::Transform trans; math::Transform trans;
trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_); trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
// for (int i = 0; i < input_x->numel(); i++) {
// out_ptr[i] = input_x_ptr[i] > 0 ? input_x_ptr[i] : 0;
// } // }
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
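The hand-written NEON assembly above stays commented out and the kernel falls back to math::Transform over the scalar ReluFunctor. The same vectorization is easier to keep correct with NEON intrinsics; a hedged sketch of what the disabled assembly computes (an illustration, not repo code):
// ReLU over a float buffer: 4 lanes at a time under NEON, scalar tail for the
// remainder and for non-NEON builds. Equivalent in effect to the assembly.
#include <algorithm>
#include <cstddef>
#if defined(__ARM_NEON)
#include <arm_neon.h>
#endif
void ReluBuffer(const float *in, float *out, size_t n) {
  size_t i = 0;
#if defined(__ARM_NEON)
  const float32x4_t zero = vdupq_n_f32(0.0f);
  for (; i + 4 <= n; i += 4) {
    vst1q_f32(out + i, vmaxq_f32(vld1q_f32(in + i), zero));
  }
#endif
  for (; i < n; ++i) {
    out[i] = std::max(in[i], 0.0f);
  }
}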
@@ -12,13 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #ifdef RESHAPE_OP
#include "operators/kernel/reshape_kernel.h" #include "operators/kernel/reshape_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <>
bool ReshapeKernel<CPU, float>::Init(const ReshapeParam &para) const {
return true;
}
template <> template <>
void ReshapeKernel<CPU, float>::Compute(const ReshapeParam &param) const { void ReshapeKernel<CPU, float>::Compute(const ReshapeParam &param) const {
const auto *input_x = param.InputX(); const auto *input_x = param.InputX();
...@@ -49,3 +54,5 @@ void ReshapeKernel<CPU, float>::Compute(const ReshapeParam &param) const { ...@@ -49,3 +54,5 @@ void ReshapeKernel<CPU, float>::Compute(const ReshapeParam &param) const {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,11 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef SIGMOID_OP
#include "../sigmoid_kernel.h" #include "../sigmoid_kernel.h"
#if __ARM_NEON #if __ARM_NEON
#include "../../math/math_func_neon.h" #include "../../math/math_func_neon.h"
#endif #endif
#include <cmath>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
@@ -25,35 +27,23 @@ using framework::Tensor;
void sigmoid(const Tensor *X, Tensor *Y) { void sigmoid(const Tensor *X, Tensor *Y) {
#if __ARM_NEON #if __ARM_NEON
DLOG << "step1";
const float *input = X->data<float>(); const float *input = X->data<float>();
DLOG << "step11";
float *output = Y->mutable_data<float>(); float *output = Y->mutable_data<float>();
DLOG << "step2";
const DDim &dDim = X->dims(); const DDim &dDim = X->dims();
DLOG << "step3";
int axis_index = 1; int axis_index = 1;
if (dDim.size() < 4) { if (dDim.size() < 4) {
axis_index = 0; axis_index = 0;
} }
DLOG << "step4";
DDim outer_ddim = DDim outer_ddim =
paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1); paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1);
DDim inner_ddim = DDim inner_ddim =
paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size()); paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size());
DLOG << "step5";
int out_size = paddle_mobile::framework::product(outer_ddim); int out_size = paddle_mobile::framework::product(outer_ddim);
int inner_size = paddle_mobile::framework::product(inner_ddim); int inner_size = paddle_mobile::framework::product(inner_ddim);
DLOG << "step6";
#pragma omp parallel for
DLOG << "outsize=" << out_size; DLOG << "outsize=" << out_size;
DLOG << "innersize=" << inner_size; DLOG << "innersize=" << inner_size;
#pragma omp parallel for
for (int i = 0; i < out_size; ++i) { for (int i = 0; i < out_size; ++i) {
const float *input_outer_ptr = input + i * inner_size; const float *input_outer_ptr = input + i * inner_size;
float *output_outer_ptr = output + i * inner_size; float *output_outer_ptr = output + i * inner_size;
@@ -81,6 +71,11 @@ void sigmoid(const Tensor *X, Tensor *Y) {
#endif #endif
} }
template <>
bool SigmoidKernel<CPU, float>::Init(const SigmoidParam &para) const {
return true;
}
template <> template <>
void SigmoidKernel<CPU, float>::Compute(const SigmoidParam &param) const { void SigmoidKernel<CPU, float>::Compute(const SigmoidParam &param) const {
const Tensor *in_x = param.InputX(); const Tensor *in_x = param.InputX();
...@@ -93,3 +88,5 @@ void SigmoidKernel<CPU, float>::Compute(const SigmoidParam &param) const { ...@@ -93,3 +88,5 @@ void SigmoidKernel<CPU, float>::Compute(const SigmoidParam &param) const {
template class SigmoidKernel<CPU, float>; template class SigmoidKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
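The scalar tail of sigmoid() evaluates 1 / (1 + exp(-x)); for large-magnitude negative x the naive form overflows exp() in float (IEEE arithmetic still rounds the result to 0, but branching on the sign avoids the overflow entirely). A minimal, hedged reference, independent of the NEON path above and not repo code:
// Numerically safe scalar sigmoid: both branches call exp() on a value <= 0.
#include <cmath>
#include <iostream>
float Sigmoid(float x) {
  if (x >= 0.0f) {
    return 1.0f / (1.0f + std::exp(-x));
  }
  const float e = std::exp(x);  // x < 0, so e is in (0, 1)
  return e / (1.0f + e);
}
int main() {
  std::cout << Sigmoid(0.0f) << " " << Sigmoid(-100.0f) << "\n";  // 0.5, ~0
}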
@@ -12,11 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef SOFTMAX_OP
#include "../softmax_kernel.h" #include "../softmax_kernel.h"
#include "../../math/softmax.h" #include "../../math/softmax.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <>
bool SoftmaxKernel<CPU, float>::Init(const SoftmaxParam &para) const {
return true;
}
template <> template <>
void SoftmaxKernel<CPU, float>::Compute(const SoftmaxParam &param) const { void SoftmaxKernel<CPU, float>::Compute(const SoftmaxParam &param) const {
const Tensor *in_x = param.InputX(); const Tensor *in_x = param.InputX();
@@ -29,3 +36,5 @@ void SoftmaxKernel<CPU, float>::Compute(const SoftmaxParam &param) const {
template class SoftmaxKernel<CPU, float>; template class SoftmaxKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -11,28 +11,32 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef TRANSPOSE_OP
#pragma once
#include "operators/kernel/transpose_kernel.h" #include "operators/kernel/transpose_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename T> // vector<int> pos;
void TransposeFunc(const int numel, const T* input, const vector<int> axis, // template <typename T>
const vector<int> old_strides, const vector<int> new_strides, // void TransposeFunc(const int numel, const T* input, const vector<int> axis,
T* output) { // const vector<int> old_strides, const vector<int>
for (int i = 0; i < numel; ++i) { // new_strides, T* output) {
int old_idx = 0; // for (int i = 0; i < numel; ++i) {
int idx = i; // int old_idx = 0;
for (int j = 0; j < axis.size(); ++j) { // int idx = i;
int order = axis[j]; // for (int j = 0; j < axis.size(); ++j) {
old_idx += (idx / new_strides[j]) * old_strides[order]; // int order = axis[j];
idx %= new_strides[j]; // old_idx += (idx / new_strides[j]) * old_strides[order];
} // idx %= new_strides[j];
output[i] = input[old_idx]; // }
} // output[i] = input[old_idx];
// }
// }
template <>
bool TransposeKernel<CPU, float>::Init(const TransposeParam& para) const {
return true;
} }
template <> template <>
@@ -44,29 +48,41 @@ void TransposeKernel<CPU, float>::Compute(const TransposeParam& param) const {
const auto* input_x_data = input_x->data<float>(); const auto* input_x_data = input_x->data<float>();
auto* out_data = out->mutable_data<float>(); auto* out_data = out->mutable_data<float>();
size_t axis_size = axis.size(); size_t ndim = axis.size();
std::vector<int> new_dims; std::vector<int> xdim(ndim);
new_dims.reserve(axis_size); std::vector<int> xstride(ndim);
for (auto c : axis) { std::vector<int> xout(ndim);
new_dims.push_back(input_x_dims[c]); for (int i = 0; i < ndim; i++) {
int j = ndim - 1 - i;
xdim[j] = input_x_dims[axis[i]];
xstride[j] = 1;
for (int k = axis[i] + 1; k < ndim; k++) {
xstride[j] *= input_x_dims[k];
}
xout[j] = xstride[j] * xdim[j];
} }
std::vector<int> old_strides; auto numel = input_x->numel();
std::vector<int> new_strides; size_t pind = 0;
for (int i = 0; i < axis.size(); i++) { std::vector<int> ind(ndim);
int temp_old = 1; for (int i = 0; i < numel; i++) {
int temp_new = 1; out_data[i] = input_x_data[pind];
for (int j = i + 1; j < axis.size(); j++) { ind[0]++;
temp_old *= input_x_dims[j]; pind += xstride[0];
temp_new *= new_dims[j]; for (int j = 0; j < ndim - 1; j++) {
if (ind[j] == xdim[j]) {
ind[j + 1]++;
ind[j] = 0;
pind += xstride[j + 1];
pind -= xout[j];
} else {
break;
}
} }
old_strides.push_back(temp_old);
new_strides.push_back(temp_new);
} }
TransposeFunc<float>(input_x->numel(), input_x_data, axis, old_strides,
new_strides, out_data);
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
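The rewritten kernel replaces the old per-element division/modulo indexing of TransposeFunc with an odometer walk: ind[] counts in the output coordinate order while pind tracks the matching flat input offset, adding xstride[0] on each step and, on a carry, stepping the next axis and rewinding the exhausted one via xout. Each element costs one add instead of an ndim loop of divisions. A self-contained sketch of the same walk, checked on a 2x3 -> 3x2 transpose:
// Odometer-style transpose with the same bookkeeping as the kernel above:
// xstride[j] is the input step for output axis j (fastest axis in slot 0),
// xout[j] the span to unwind on carry.
#include <iostream>
#include <vector>
void Transpose(const std::vector<int> &dims, const std::vector<int> &axis,
               const float *in, float *out) {
  const int ndim = static_cast<int>(axis.size());
  std::vector<int> xdim(ndim), xstride(ndim), xout(ndim), ind(ndim, 0);
  int numel = 1;
  for (int i = 0; i < ndim; i++) {
    const int j = ndim - 1 - i;  // output's fastest-varying axis goes to slot 0
    xdim[j] = dims[axis[i]];
    xstride[j] = 1;
    for (int k = axis[i] + 1; k < ndim; k++) xstride[j] *= dims[k];
    xout[j] = xstride[j] * xdim[j];
    numel *= dims[i];
  }
  int pind = 0;
  for (int i = 0; i < numel; i++) {
    out[i] = in[pind];
    ind[0]++;
    pind += xstride[0];
    for (int j = 0; j < ndim - 1; j++) {
      if (ind[j] != xdim[j]) break;
      ind[j] = 0;
      ind[j + 1]++;
      pind += xstride[j + 1] - xout[j];  // carry: step next axis, rewind this one
    }
  }
}
int main() {
  const float in[6] = {0, 1, 2, 3, 4, 5};
  float out[6];
  Transpose({2, 3}, {1, 0}, in, out);
  for (float v : out) std::cout << v << " ";  // 0 3 1 4 2 5
}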
@@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef BATCHNORM_OP
#pragma once
#include "framework/operator.h" #include "framework/operator.h"
#include "operators/op_param.h" #include "operators/op_param.h"
#pragma once;
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
@@ -26,7 +29,10 @@ class BatchNormKernel
: public framework::OpKernelBase<DeviceType, BatchNormParam> { : public framework::OpKernelBase<DeviceType, BatchNormParam> {
public: public:
void Compute(const BatchNormParam &param) const; void Compute(const BatchNormParam &param) const;
bool Init(const BatchNormParam &para) const;
}; };
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef BOXCODER_OP
#pragma once
#include <vector> #include <vector>
#include "framework/operator.h" #include "framework/operator.h"
#include "operators/math/transform.h" #include "operators/math/transform.h"
#include "operators/op_param.h" #include "operators/op_param.h"
#pragma once;
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
@@ -28,6 +30,9 @@ class BoxCoderKernel
: public framework::OpKernelBase<DeviceType, BoxCoderParam> { : public framework::OpKernelBase<DeviceType, BoxCoderParam> {
public: public:
void Compute(const BoxCoderParam& param) const; void Compute(const BoxCoderParam& param) const;
bool Init(const BoxCoderParam& para) const;
}; };
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#pragma once
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void BatchnormCompute(const BatchNormParam &param) {
const Tensor *input_x = param.InputX();
auto input_x_ptr = input_x->data<float>();
const auto &x_dims = input_x->dims();
const int N = x_dims[0];
const int C = x_dims[1];
const int H = x_dims[2];
const int W = x_dims[3];
const int stride0 = C * H * W;
const int stride1 = H * W;
const int stride2 = W;
Tensor *out = param.OutputY();
auto out_ptr = out->mutable_data<float>();
const float epsilon = param.Epsilon();
const Tensor *mean = param.InputMean();
const Tensor *variance = param.InputVariance();
const Tensor *scale = param.InputScale();
const Tensor *bias = param.InputBias();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
// Tensor inv_std;
// auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
PADDLE_MOBILE_ENFORCE(C == variance->numel(),
"C must equal to variance.numel()");
int HXW = H * W;
if (HXW > 32) {
int NXC = N * C;
float *inv_std_ptr = new float[NXC * 4];
float *volatile new_scale_ptr = new float[NXC * 4];
float *volatile new_bias_ptr = new float[NXC * 4];
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for (int i = 0; i < C * 4; i += 4) {
int index = i / 4;
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[index] + epsilon), 0.5));
inv_std_ptr[i + 1] = inv_std_ptr[i];
inv_std_ptr[i + 2] = inv_std_ptr[i];
inv_std_ptr[i + 3] = inv_std_ptr[i];
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[index];
new_scale_ptr[i + 1] = new_scale_ptr[i];
new_scale_ptr[i + 2] = new_scale_ptr[i];
new_scale_ptr[i + 3] = new_scale_ptr[i];
new_bias_ptr[i] =
bias_ptr[index] - mean_ptr[index] * inv_std_ptr[i] * scale_ptr[index];
new_bias_ptr[i + 1] = new_bias_ptr[i];
new_bias_ptr[i + 2] = new_bias_ptr[i];
new_bias_ptr[i + 3] = new_bias_ptr[i];
}
for (int j = C * 4; j < NXC * 4; ++j) {
new_scale_ptr[j] = new_scale_ptr[j - C * 4];
new_bias_ptr[j] = new_bias_ptr[j - C * 4];
}
asm volatile(
"subs %[N], %[N], #1 \n\t"
"blt end_n_%= \n\t"
"loop_n_%=: \n\t"
"subs %[C], %[C], #1 \n\t"
"blt end_c_%= \n\t"
"loop_c_%=: \n\t"
"vld1.32 {q9}, [%[new_scale_ptr]]! \n\t"
"vld1.32 {q10}, [%[new_bias_ptr]]! \n\t"
"mov r6, %[HXW] \n\t"
"subs r6, r6, #32 \n\t"
"blt end_hw_%= \n\t"
"loop_hw_%=: \n\t"
"vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
"vmul.f32 q1, q1, q9 \n\t"
"vmul.f32 q2, q2, q9 \n\t"
"vmul.f32 q3, q3, q9 \n\t"
"vmul.f32 q4, q4, q9 \n\t"
"vmul.f32 q5, q5, q9 \n\t"
"vmul.f32 q6, q6, q9 \n\t"
"vmul.f32 q7, q7, q9 \n\t"
"vmul.f32 q8, q8, q9 \n\t"
"vadd.f32 q1, q1, q10 \n\t"
"vadd.f32 q2, q2, q10 \n\t"
"vadd.f32 q3, q3, q10 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vadd.f32 q5, q5, q10 \n\t"
"vadd.f32 q6, q6, q10 \n\t"
"vadd.f32 q7, q7, q10 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
"vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
"subs r6, r6, #32 \n\t"
"bge loop_hw_%= \n\t"
"end_hw_%=: \n\t"
"cmp r6, #0 \n\t"
"bge end_remainder_%= \n\t"
"mov r5, #4 \n\t"
"mul r6, r6, r5 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r6 \n\t"
"vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
"vmul.f32 q1, q1, q9 \n\t"
"vmul.f32 q2, q2, q9 \n\t"
"vmul.f32 q3, q3, q9 \n\t"
"vmul.f32 q4, q4, q9 \n\t"
"vmul.f32 q5, q5, q9 \n\t"
"vmul.f32 q6, q6, q9 \n\t"
"vmul.f32 q7, q7, q9 \n\t"
"vmul.f32 q8, q8, q9 \n\t"
"vadd.f32 q1, q1, q10 \n\t"
"vadd.f32 q2, q2, q10 \n\t"
"vadd.f32 q3, q3, q10 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vadd.f32 q5, q5, q10 \n\t"
"vadd.f32 q6, q6, q10 \n\t"
"vadd.f32 q7, q7, q10 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"add %[out_ptr], %[out_ptr], r6 \n\t"
"vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
"vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
"end_remainder_%=: \n\t"
"subs %[C], %[C], #1 \n\t"
"bge loop_c_%= \n\t"
"end_c_%=: \n\t"
"subs %[N], %[N], #1 \n\t"
"bge loop_n_%= \n\t"
"end_n_%=: \n\t"
:
: [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr),
[new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr),
[N] "r"(N), [C] "r"(C), [HXW] "r"(HXW)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "r5", "r6");
delete[] inv_std_ptr;
delete[] new_scale_ptr;
delete[] new_bias_ptr;
} else {
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr =
new_scale.mutable_data<float>(framework::make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
    /// (x - est_mean) * inv_var * scale + bias is equal to
    /// x * (inv_var * scale) + (bias - est_mean * inv_var * scale)
    for (int i = 0; i < C; i++) {
      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
      new_bias_ptr[i] =
          bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
      // Apply the folded per-channel scale/bias to every element of channel i.
      for (int n = 0; n < N; n++) {
        for (int h = 0; h < H; h++) {
          int tmp_index = n * stride0 + i * stride1 + h * stride2;
          for (int w = 0; w < W; w++) {
            int index = tmp_index + w;
            out_ptr[index] =
                input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
          }
        }
      }
    }
delete[] inv_std_ptr;
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
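Both branches above rest on the same algebraic folding: (x - mean) / sqrt(var + eps) * scale + bias == x * new_scale + new_bias, with new_scale = scale / sqrt(var + eps) and new_bias = bias - mean * new_scale, so the per-element work collapses to one multiply-add. A minimal stand-alone check of that identity (plain C++, made-up values, no paddle-mobile dependencies):

#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  // Hypothetical per-channel batch-norm parameters.
  const float x = 2.5f, mean = 0.8f, var = 1.7f, eps = 1e-5f;
  const float scale = 1.3f, bias = -0.2f;

  const float inv_std = 1.f / std::sqrt(var + eps);
  // Direct form, as the batch-norm definition states it.
  const float direct = (x - mean) * inv_std * scale + bias;
  // Folded form, as the kernel precomputes it once per channel.
  const float new_scale = inv_std * scale;
  const float new_bias = bias - mean * new_scale;
  const float folded = x * new_scale + new_bias;

  assert(std::fabs(direct - folded) < 1e-5f);
  std::printf("direct=%f folded=%f\n", direct, folded);
  return 0;
}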
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_RELU_OP
#pragma once
#include <vector>
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void ConvAddReluCompute(const FusionConvAddReluParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
int axis = param.Axis();
Tensor *output = param.Output();
math::expand_bias(bias, axis, output->dims());
output->ShareDataWith(bias);
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1), true);
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
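One detail worth spelling out: the kernel pre-fills the output with the axis-expanded bias (output->ShareDataWith(bias)) and then calls math::matmul with beta = 1, so the GEMM accumulates onto the bias instead of overwriting it; the trailing true argument is, by all appearances, a fused-relu switch in this fork's matmul. A scalar sketch of that contract (the gemm_bias_relu name and signature are illustrative, not paddle-mobile API):

#include <algorithm>
#include <cstdio>

// Sketch of C = relu(alpha * A * B + beta * C): C arrives pre-filled with the
// broadcast bias and beta == 1, so the bias add and the relu both ride along
// with the GEMM, exactly the contract the kernel above relies on.
void gemm_bias_relu(const float *A, const float *B, float *C, int M, int N,
                    int K, float alpha, float beta, bool fuse_relu) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) acc += A[m * K + k] * B[k * N + n];
      float v = alpha * acc + beta * C[m * N + n];
      C[m * N + n] = fuse_relu ? std::max(v, 0.f) : v;
    }
  }
}

int main() {
  const float A[2] = {1.f, -3.f}, B[1] = {2.f};  // M=2, K=1, N=1
  float C[2] = {0.5f, 0.5f};                     // pre-filled "bias"
  gemm_bias_relu(A, B, C, 2, 1, 1, 1.f, 1.f, true);
  std::printf("%f %f\n", C[0], C[1]);  // 2.5, and 0 (negative clamped)
  return 0;
}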
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void ConvCompute(const ConvParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
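The shape bookkeeping above is the whole trick: per group, im2col unrolls the input into a (C/g * kh * kw) x (oh * ow) matrix, the filter is viewed as (oc/g) x (C/g * kh * kw), and one GEMM yields the (oc/g) x (oh * ow) output slice. A small sketch with hypothetical dimensions:

#include <cstdio>

int main() {
  // Hypothetical convolution: 8 input channels, 16 output channels,
  // 3x3 kernel, 32x32 output, 2 groups -- mirroring the shape math above.
  const int ic = 8, oc = 16, kh = 3, kw = 3, oh = 32, ow = 32, groups = 2;

  // col buffer: [ic/groups, kh, kw, oh, ow], flattened to 2-D after dim 3.
  const long col_rows = (ic / groups) * kh * kw;  // 4*3*3 = 36
  const long col_cols = (long)oh * ow;            // 1024

  // filter matrix slice: one row per output channel of the group.
  const long filter_rows = oc / groups;  // 8
  const long filter_cols = col_rows;     // 36

  // GEMM result per group: (oc/groups) x (oh*ow), written into out_slice.
  std::printf("col: %ldx%ld, filter: %ldx%ld, out: %ldx%ld\n", col_rows,
              col_cols, filter_rows, filter_cols, filter_rows, col_cols);
  return 0;
}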
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEPTHWISECONV_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void DepthwiseConvCompute(const ConvParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
// DLOG << " compute end get Attrs " << strides[0];
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0));
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
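The depthwise variant reuses the grouped path unchanged; what makes it depthwise is the parameterization, groups == input channels, so in_step and out_step collapse to 1 and every group convolves a single channel. A tiny sketch of that slicing arithmetic (hypothetical channel counts):

#include <cstdio>

int main() {
  // The grouped loop above slices channels in blocks of in_step/out_step.
  const int ic = 32, oc = 32, groups = 32;  // hypothetical depthwise config
  const int in_step = ic / groups;          // 1
  const int out_step = oc / groups;         // 1
  for (int g = 0; g < 3; ++g) {             // first three groups only
    std::printf("group %d: in channels [%d, %d), out channels [%d, %d)\n", g,
                g * in_step, (g + 1) * in_step, g * out_step,
                (g + 1) * out_step);
  }
  return 0;
}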
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef CONCAT_OP
+
 #pragma once
 #include "framework/operator.h"
 #include "operators/op_param.h"
@@ -25,7 +27,10 @@ template <typename DeviceType, typename T>
 class ConcatKernel : public framework::OpKernelBase<DeviceType, ConcatParam> {
  public:
   void Compute(const ConcatParam &param) const;
+  bool Init(const ConcatParam &para) const;
 };
 }  // namespace operators
 }  // namespace paddle_mobile
+
+#endif
@@ -12,24 +12,36 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef FUSION_CONVADD_OP
+
 #pragma once
-#include "framework.pb.h"
-#include "lod_tensor.h"
-#include "selected_rows.h"
-#include "variable.h"
+#include <vector>
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#include "framework/ddim.h"
+#include "framework/operator.h"
+#include "operators/math/conv_func.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
-namespace framework {
-inline proto::VarType::Type ToVarType(std::type_index type) {
-  if (type.hash_code() == typeid(LoDTensor).hash_code()) {
-    return proto::VarType_Type_LOD_TENSOR;
-  } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
-    return proto::VarType_Type_SELECTED_ROWS;
-  } else {
-    // PADDLE_THROW("ToVarType:Unsupported type %s",
-    //              type.name());
-  }
-}
-}  // namespace framework
+namespace operators {
+
+using framework::DDim;
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class ConvAddKernel : public OpKernelBase<DeviceType, FusionConvAddParam> {
+ public:
+  void Compute(const FusionConvAddParam &param) const;
+  bool Init(const FusionConvAddParam &para) const;
+};
+}  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -14,24 +14,32 @@ limitations under the License. */
 #pragma once
-#include <functional>
-#include <utility>
-#include <vector>
-#include "framework/op_kernel_type.h"
-#include "framework/selected_rows.h"
-#include "framework/tensor.h"
-#include "framework/variable.h"
+#ifdef FUSION_CONVADD_RELU_OP
+
+#include <vector>
+#include "framework/ddim.h"
+#include "framework/operator.h"
+#include "operators/math/conv_func.h"
+#include "operators/math/im2col.h"
+#include "operators/math/math_function.h"
+#include "operators/math/vol2col.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
-namespace framework {
-void DataTransform(const OpKernelType &expected_kernel_type,
-                   const OpKernelType &kernel_type_for_var,
-                   const Tensor &input_tensor, Tensor *out);
-void CopyVariableWithTensor(const Variable &in_var, const Tensor &tensor,
-                            Variable *out_var);
-}  // namespace framework
+namespace operators {
+
+using framework::DDim;
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class ConvAddReluKernel
+    : public OpKernelBase<DeviceType, FusionConvAddReluParam> {
+ public:
+  void Compute(const FusionConvAddReluParam &param) const;
+  bool Init(const FusionConvAddReluParam &para) const;
+};
+}  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef CONV_OP
+
+#pragma once
+
 #include <vector>
 #include "framework/operator.h"
 #include "operators/math/im2col.h"
@@ -19,8 +23,6 @@ limitations under the License. */
 #include "operators/math/vol2col.h"
 #include "operators/op_param.h"
-
-#pragma once;
 namespace paddle_mobile {
 namespace operators {
@@ -30,22 +32,10 @@ template <typename DeviceType, typename T>
 class ConvKernel : public OpKernelBase<DeviceType, ConvParam> {
  public:
   void Compute(const ConvParam &param) const;
+  bool Init(const ConvParam &para) const;
 };
-inline bool IsExpand(const std::vector<int64_t> &filter_dim,
-                     const std::vector<int> &strides,
-                     const std::vector<int> &paddings,
-                     const std::vector<int> &dilations) {
-  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
-  for (size_t j = 0; j < strides.size(); ++j) {
-    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
-    strides_1 = strides_1 && (strides[j] == 1);
-    padding_0 = padding_0 && (paddings[j] == 0);
-    dilation_1 = dilation_1 && (dilations[j] == 1);
-  }
-  return !(filter_1 && strides_1 && padding_0 && dilation_1);
-}
 }  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef DEPTHWISECONV_OP
+
+#pragma once
+
 #include "framework/operator.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
 #include "operators/op_param.h"
-
-#pragma once;
 namespace paddle_mobile {
 namespace operators {
@@ -29,6 +31,9 @@ template <typename DeviceType, typename T>
 class DepthwiseConvKernel : public OpKernelBase<DeviceType, ConvParam> {
  public:
   void Compute(const ConvParam &param) const;
+  bool Init(const ConvParam &para) const;
 };
 }  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once;
+#ifdef ELEMENTWISEADD_OP
+
+#pragma once
 #include "framework/operator.h"
 #include "operators/math/elementwise_op_function.h"
@@ -28,6 +30,9 @@ class ElementwiseAddKernel
     : public framework::OpKernelBase<DeviceType, ElementwiseAddParam> {
  public:
   void Compute(const ElementwiseAddParam &param) const;
+  bool Init(const ElementwiseAddParam &para) const;
 };
 }  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -12,13 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef CONV_OP
+
+#include "operators/kernel/conv_kernel.h"
+
 namespace paddle_mobile {
 namespace operators {
-// template<>
-// void ConvKernel<FPGA, float>::Compute(const ConvParam &param) const
-// {}
-//
-// template class ConvKernel<FPGA, float>;
+template <>
+bool ConvKernel<FPGA, float>::Init(const ConvParam &para) const {
+  return true;
+}
+
+template <>
+void ConvKernel<FPGA, float>::Compute(const ConvParam &param) const {}
+
+template class ConvKernel<FPGA, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -12,20 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef FUSION_FC_OP
+
+#pragma once
+
 #include "framework/operator.h"
 #include "operators/math/math_function.h"
 #include "operators/op_param.h"
-
-#pragma once;
 namespace paddle_mobile {
 namespace operators {
 template <typename DeviceType, typename T>
-class FushionFcKernel
-    : public framework::OpKernelBase<DeviceType, FushionFcParam> {
+class FusionFcKernel
+    : public framework::OpKernelBase<DeviceType, FusionFcParam> {
  public:
-  void Compute(const FushionFcParam& param) const;
+  void Compute(const FusionFcParam& param) const;
+  bool Init(const FusionFcParam& para) const;
 };
 }  // namespace operators
 }  // namespace paddle_mobile
+#endif
@@ -12,9 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef LRN_OP
+
 #include "framework/operator.h"
 #include "operators/op_param.h"
-
-#pragma once;
+#include <cmath>
+#ifdef __ARM_NEON
+#include "arm_neon.h"
+#include "operators/math/math_func_neon.h"
+#endif
 namespace paddle_mobile {
 namespace operators {
@@ -24,42 +32,137 @@ using namespace framework;
 template <typename T>
 struct LRNFunctor {
   void operator()(const framework::Tensor &input, framework::Tensor *out, int N,
-                  int C, int H, int W, int n, T k, T alpha, T beta) {
-    auto input_ptr = input.data<T>();
+                  int C, int H, int W, int n, float k, float alpha,
+                  float beta) {
+    const float *input_ptr = input.data<float>();
     const int start = -(n - 1) / 2;
     const int end = start + n;
-    auto out_ptr = out->data<T>();
     const int stride0 = C * H * W;
     const int stride1 = H * W;
    const int stride2 = W;
-    const int stride3 = 1;
     framework::Tensor sqr_buffer;
-    auto sqr_buffer_ptr = sqr_buffer.mutable_data<T>(input.dims());
-    std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), k);
+    auto sqr_buffer_ptr = sqr_buffer.mutable_data<float>(input.dims());
+    std::fill(sqr_buffer_ptr, sqr_buffer_ptr + sqr_buffer.numel(), 0.0);
     for (int a = 0; a < N; a++) {
       for (int b = 0; b < C; b++) {
         for (int index = start; index < end; index++) {
           int channel = b + index;
           if (channel >= 0 && channel < C) {
-            int tmp_u = a * stride0 + b * stride1;
-            int tmp_i = a * stride0 + channel * stride1;
-            for (int c = 0; c < H; c++) {
-              for (int d = 0; d < W; d++) {
-                int tmp = c * stride2 + d;
-                int u = tmp_u + tmp;
-                int i = tmp_i + tmp;
-                sqr_buffer_ptr[u] += alpha * input_ptr[i] * input_ptr[i];
-              }
-            }
+            int tmp_s = a * stride0 + b * stride1;
+            int tmp_c = a * stride0 + channel * stride1;
+#ifdef __ARM_NEON
+            int n4 = stride1 / 4;
+            int m4 = stride1 % 4;
+            float32x4_t sqr0;
+            float32x4_t in0;
+            float32x4_t res0;
+            for (int i = 0; i < n4; i++) {
+              sqr0 = vld1q_f32(sqr_buffer_ptr + tmp_s);
+              in0 = vld1q_f32(input_ptr + tmp_c);
+              res0 = vmlaq_f32(sqr0, in0, in0);
+              vst1q_f32(sqr_buffer_ptr + tmp_s, res0);
+              tmp_s += 4;
+              tmp_c += 4;
+            }
+            for (int i = 0; i < m4; i++) {
+              int s_i = tmp_s + i;
+              int c_i = tmp_c + i;
+              sqr_buffer_ptr[s_i] += input_ptr[c_i] * input_ptr[c_i];
+            }
+#else
+            for (int tmp = 0; tmp < stride1; tmp++) {
+              int s_i = tmp_s + tmp;
+              int c_i = tmp_c + tmp;
+              sqr_buffer_ptr[s_i] += input_ptr[c_i] * input_ptr[c_i];
+            }
+#endif
           }
         }
       }
     }
+    auto out_ptr = out->data<T>();
+#ifdef __ARM_NEON
+    float32x4_t sqr1, sqr2, sqr3, sqr4;
+    float32x4_t alpha4;
+    float32x4_t k4;
+    float32x4_t beta4;
+    float32x4_t res1, res2, res3, res4;
+    float32x4_t in1, in2, in3, in4;
+    beta4 = vdupq_n_f32(beta);
+    alpha4 = vdupq_n_f32(alpha);
+    k4 = vdupq_n_f32(k);
+    auto out_tmp_ptr = out_ptr;
+    int n16 = input.numel() / 16;
+    int m16 = input.numel() % 16;
+    int m16n4 = m16 / 4;
+    int m16m4 = m16 % 4;
+    for (int i = 0; i < n16; i++) {
+      sqr1 = vld1q_f32(sqr_buffer_ptr);
+      sqr2 = vld1q_f32(sqr_buffer_ptr + 4);
+      sqr3 = vld1q_f32(sqr_buffer_ptr + 8);
+      sqr4 = vld1q_f32(sqr_buffer_ptr + 12);
+      in1 = vld1q_f32(input_ptr);
+      in2 = vld1q_f32(input_ptr + 4);
+      in3 = vld1q_f32(input_ptr + 8);
+      in4 = vld1q_f32(input_ptr + 12);
+      sqr1 = vmlaq_f32(k4, sqr1, alpha4);
+      sqr2 = vmlaq_f32(k4, sqr2, alpha4);
+      sqr3 = vmlaq_f32(k4, sqr3, alpha4);
+      sqr4 = vmlaq_f32(k4, sqr4, alpha4);
+      sqr1 = pow_ps(sqr1, -beta4);
+      sqr2 = pow_ps(sqr2, -beta4);
+      sqr3 = pow_ps(sqr3, -beta4);
+      sqr4 = pow_ps(sqr4, -beta4);
+      sqr1 = vmulq_f32(sqr1, in1);
+      sqr2 = vmulq_f32(sqr2, in2);
+      sqr3 = vmulq_f32(sqr3, in3);
+      sqr4 = vmulq_f32(sqr4, in4);
+      vst1q_f32(out_tmp_ptr, sqr1);
+      vst1q_f32(out_tmp_ptr + 4, sqr2);
+      vst1q_f32(out_tmp_ptr + 8, sqr3);
+      vst1q_f32(out_tmp_ptr + 12, sqr4);
+      sqr_buffer_ptr += 4 * 4;
+      input_ptr += 4 * 4;
+      out_tmp_ptr += 4 * 4;
+    }
+    for (int i = 0; i < m16n4; i++) {
+      sqr4 = vld1q_f32(sqr_buffer_ptr);
+      in4 = vld1q_f32(input_ptr);
+      sqr4 = vmlaq_f32(k4, sqr4, alpha4);
+      sqr4 = pow_ps(sqr4, -beta4);
+      sqr4 = vmulq_f32(sqr4, in4);
+      vst1q_f32(out_tmp_ptr, sqr4);
+      sqr_buffer_ptr += 4;
+      input_ptr += 4;
+      out_tmp_ptr += 4;
+    }
+    for (int i = 0; i < m16m4; i++) {
+      out_tmp_ptr[i] = input_ptr[i] / pow(k + alpha * sqr_buffer_ptr[i], beta);
+    }
+#else
     for (int i = 0; i < input.numel(); i++) {
-      out_ptr[i] = input_ptr[i] / pow(sqr_buffer_ptr[i], beta);
+      out_ptr[i] = input_ptr[i] / pow(k + alpha * sqr_buffer_ptr[i], beta);
     }
+#endif
   }
 };
@@ -67,6 +170,9 @@ template <typename DeviceType, typename T>
 class LrnKernel : public framework::OpKernelBase<DeviceType, LrnParam> {
  public:
   void Compute(const LrnParam &param) const;
+  bool Init(const LrnParam &para) const;
 };
 }  // namespace operators
 }  // namespace paddle_mobile
+#endif
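Both the NEON and the scalar paths in the new lrn_kernel.h compute the same across-channel formula, out = in / (k + alpha * sum of in^2 over the n nearest channels)^beta. A scalar reference (plain C++, hypothetical sizes) that is handy as a cross-check for the vectorized path:

#include <cmath>
#include <cstdio>

// Scalar reference for across-channel LRN, matching the formula the NEON
// path above implements: out = in / (k + alpha * local_sum_sq) ^ beta.
void lrn_ref(const float *in, float *out, int N, int C, int H, int W, int n,
             float k, float alpha, float beta) {
  const int hw = H * W;
  for (int a = 0; a < N; ++a)
    for (int c = 0; c < C; ++c)
      for (int p = 0; p < hw; ++p) {
        float sum = 0.f;
        // Window of n channels centered on c, clipped at the channel edges,
        // same as the start/end bounds in LRNFunctor.
        for (int j = c - (n - 1) / 2; j < c - (n - 1) / 2 + n; ++j)
          if (j >= 0 && j < C) {
            float v = in[(a * C + j) * hw + p];
            sum += v * v;
          }
        out[(a * C + c) * hw + p] =
            in[(a * C + c) * hw + p] / std::pow(k + alpha * sum, beta);
      }
}

int main() {
  const float in[4] = {1.f, 2.f, 3.f, 4.f};  // N=1, C=4, H=W=1
  float out[4];
  lrn_ref(in, out, 1, 4, 1, 1, /*n=*/5, /*k=*/2.f, /*alpha=*/1e-4f,
          /*beta=*/0.75f);
  for (float v : out) std::printf("%f ", v);
  std::printf("\n");
  return 0;
}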
Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if USE_ACL == 1
#include "acl_operator.h"
unsigned int bypass_acl_class_layer =
(0 | FLAGS_ENABLE_ACL_CONCAT |
/*0xffffffff |*/ /*FLAGS_ENABLE_ACL_FC |*/ /*FLAGS_ENABLE_ACL_LRN
|*/
0);
int enable_schedule = 0;
#ifdef USE_PROFILING
#include "arm_neon.h"
unsigned int acl_log_flags =
(0 | MASK_LOG_APP_TIME | /*MASK_LOG_ALLOCATE | */ /*MASK_LOG_ALLOCATE | */
/*MASK_LOG_RUN | */ /*MASK_LOG_CONFIG | */ /*MASK_LOG_COPY | */
MASK_LOG_ABSVAL | MASK_LOG_BNLL | MASK_LOG_CONV | MASK_LOG_FC |
MASK_LOG_LRN | MASK_LOG_POOLING | MASK_LOG_RELU | MASK_LOG_SIGMOID |
MASK_LOG_SOFTMAX | MASK_LOG_TANH | MASK_LOG_LC | MASK_LOG_BN |
MASK_LOG_CONCAT | 0);
#include <stdio.h> /* printf */
#include <stdlib.h> /* getenv */
#endif // USE_PROFILING
static bool force_enable_gpu = false;
bool AclEnableSchedule(int enable) {
enable_schedule = enable;
if (enable) {
force_enable_gpu = true;
}
return true;
}
int isScheduleEnable() { return enable_schedule; }
namespace paddle_mobile {
namespace operators {
namespace acl {
bool ACLOperator::init_gpu_env = true;
#ifdef USE_OPENCL
bool ACLOperator::support_opencl_ = false;
bool opencl_is_available() { return arm_compute::opencl_is_available(); }
#elif defined(USE_OPENGLES)
bool ACLOperator::support_opengles_ = false;
#endif
ACLOperator::ACLOperator(bool is_gpu)
: operator_state_(operator_not_init),
force_bypass_acl_path_(false),
target_hint_(TargetHint::DONT_CARE),
convolution_method_hint_(ConvolutionMethodHint::GEMM),
_group(1),
name_(""),
input_idx_(0),
output_idx_(0),
is_gpu_(is_gpu) {
const char* pBypassACL;
if (init_gpu_env) {
#ifdef USE_OPENCL
try {
if (opencl_is_available()) {
arm_compute::CLScheduler::get().default_init();
support_opencl_ = true;
}
} catch (std::exception& e) {
support_opencl_ = false;
}
#elif defined(USE_OPENGLES)
try {
arm_compute::GCScheduler::get().default_init();
support_opengles_ = true;
} catch (std::exception& e) {
support_opengles_ = false;
}
#endif
init_gpu_env = false;
}
if (force_enable_gpu) is_gpu_ = true;
pBypassACL = getenv("BYPASSACL");
if (pBypassACL) {
unsigned int bacl;
sscanf(pBypassACL, "%i", &bacl);
if (bacl != bypass_acl_class_layer) {
bypass_acl_class_layer = bacl;
printf("BYPASSACL<%s>\n", pBypassACL);
printf("BYPASSACL: %x\n", bypass_acl_class_layer);
}
}
#ifdef USE_PROFILING
const char* pLogACL;
pLogACL = getenv("LOGACL");
if (pLogACL) {
unsigned int alf;
sscanf(pLogACL, "%i", &alf);
if (alf != acl_log_flags) {
acl_log_flags = alf;
printf("LOGACL<%s>\n", pLogACL);
printf("LOGACL: %x\n", acl_log_flags);
}
}
#endif // USE_PROFILING
const char* pEnableSchedule;
pEnableSchedule = getenv("ENABLESCHEDULE");
if (pEnableSchedule) {
int bshedule;
sscanf(pEnableSchedule, "%i", &bshedule);
if (bshedule != enable_schedule) {
enable_schedule = bshedule;
printf("ENABLESCHEDULE<%s>\n", pEnableSchedule);
printf("ENABLESCHEDULE: %x\n", enable_schedule);
}
if (enable_schedule) {
AclEnableSchedule(1);
}
}
}
ACLOperator::~ACLOperator() {}
bool ACLOperator::new_tensor(std::unique_ptr<ACLTensor>& tensor,
arm_compute::TensorShape& shape, void* mem,
bool commit) {
auto acl_tensor =
new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32));
acl_tensor->set_target(getTargetHint());
acl_tensor->bindmem(mem);
if (commit) acl_tensor->commit();
  tensor.reset(acl_tensor);
return true;
}
bool ACLOperator::new_tensor(std::unique_ptr<ACLSubTensor>& tensor,
std::unique_ptr<ACLTensor>& parent,
arm_compute::TensorShape& shape,
arm_compute::Coordinates& coord) {
auto acl_tensor = new ACLSubTensor(parent, shape, coord);
acl_tensor->set_target(getTargetHint());
  tensor.reset(acl_tensor);
return true;
}
void ACLTensor::commit(TensorType type) {
settensortype(type);
if (mem_) {
if (!allocate_) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_ALLOCATE_INFO);
#endif // USE_PROFILING
allocate();
allocate_ = true;
}
if (type_ != tensor_output) {
tensor_copy(mem_);
}
mem_ = nullptr;
}
}
int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor, void* mem,
bool toTensor) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_COPY_INFO);
#endif // USE_PROFILING
arm_compute::Window window;
// Iterate through the rows (not each element)
window.use_tensor_dimensions(tensor->info()->tensor_shape(),
/* first_dimension =*/arm_compute::Window::DimY);
int width = tensor->info()->tensor_shape()[0];
int height = tensor->info()->tensor_shape()[1];
  int depth = tensor->info()->tensor_shape()[2];
map();
// Create an iterator:
arm_compute::Iterator it(tensor, window);
// Except it works for an arbitrary number of dimensions
if (toTensor) { // mem->tensor
arm_compute::execute_window_loop(
window,
[&](const arm_compute::Coordinates& id) {
memcpy(it.ptr(),
((char*)mem) +
                   ((id[3] * (width * height * depth) +
id.z() * (width * height) + id.y() * width + id.x()) *
tensor->info()->element_size()),
width * tensor->info()->element_size());
},
it);
} else { // tensor-->mem
arm_compute::execute_window_loop(
window,
[&](const arm_compute::Coordinates& id) {
          memcpy(((char*)mem) + ((id[3] * (width * height * depth) +
id.z() * (width * height) + id.y() * width) *
tensor->info()->element_size()),
it.ptr(), width * tensor->info()->element_size());
},
it);
}
unmap();
return 0;
}
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
#endif
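The bypass mechanism above is a plain bitmask: bypass_acl_class_layer is OR-ed together from the FLAGS_ENABLE_ACL_* bits and can be overridden at runtime through the BYPASSACL environment variable (parsed with sscanf "%i", so hex literals work). A minimal sketch of the convention, assuming a set bit means that layer skips ACL and falls back to the plain CPU kernel, which is how the variable name reads (the per-op checks themselves live outside this excerpt):

#include <cstdio>
#include <cstdlib>

// Bit values copied from acl_operator.h below; each layer type owns one bit.
#define FLAGS_ENABLE_ACL_CONV 0x00000004
#define FLAGS_ENABLE_ACL_CONCAT 0x00001000

int main() {
  unsigned int bypass = 0 | FLAGS_ENABLE_ACL_CONCAT;  // compile-time default
  if (const char *env = std::getenv("BYPASSACL")) {
    std::sscanf(env, "%i", &bypass);  // "%i" accepts 0x... hex, as above
  }
  if (bypass & FLAGS_ENABLE_ACL_CONV) {
    std::printf("conv: bypassing ACL, using the plain CPU kernel\n");
  } else {
    std::printf("conv: running through ACL\n");
  }
  return 0;
}

With this reading, running a binary as BYPASSACL=0x4 ./demo would route convolution off the ACL path while leaving the other layers on it.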
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef ACL_OPERATOR_H_
#define ACL_OPERATOR_H_
#include <framework/tensor.h>
#include <operators/op_param.h>
#if USE_ACL == 1
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
#endif
#ifdef USE_OPENGLES
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
#endif
#include "acl_tensor.h"
#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001
#define FLAGS_ENABLE_ACL_BNLL 0x00000002
#define FLAGS_ENABLE_ACL_CONV 0x00000004
#define FLAGS_ENABLE_ACL_FC 0x00000008
#define FLAGS_ENABLE_ACL_LRN 0x00000010
#define FLAGS_ENABLE_ACL_POOLING 0x00000020
#define FLAGS_ENABLE_ACL_RELU 0x00000040
#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080
#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100
#define FLAGS_ENABLE_ACL_TANH 0x00000200
#define FLAGS_ENABLE_ACL_LC 0x00000400
#define FLAGS_ENABLE_ACL_BN 0x00000800
#define FLAGS_ENABLE_ACL_CONCAT 0x00001000
extern unsigned int bypass_acl_class_layer;
#ifdef USE_PROFILING
#include <sys/time.h>
#define NANO_SEC_CONV 1000000
#define MASK_LOG_APP_TIME 0x00000001
#define MASK_LOG_ALLOCATE 0x00000002
#define MASK_LOG_RUN 0x00000004
#define MASK_LOG_CONFIG 0x00000008
#define MASK_LOG_COPY 0x00000010
#define MASK_LOG_ABSVAL 0x00000020
#define MASK_LOG_BNLL 0x00000040
#define MASK_LOG_CONV 0x00000080
#define MASK_LOG_FC 0x00000100
#define MASK_LOG_LRN 0x00000200
#define MASK_LOG_POOLING 0x00000400
#define MASK_LOG_RELU 0x00000800
#define MASK_LOG_SIGMOID 0x00001000
#define MASK_LOG_SOFTMAX 0x00002000
#define MASK_LOG_TANH 0x00004000
#define MASK_LOG_LC 0x00008000
#define MASK_LOG_BN 0x00010000
#define MASK_LOG_CONCAT 0x00020000
#define APP_TIME_INFO MASK_LOG_APP_TIME, "time: \t"
#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE, "allocate: \t\t"
#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t"
#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t"
#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t"
#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t"
#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t"
#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t"
#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t"
#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t"
#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_SIGMOID_INFO \
MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_SOFTMAX_INFO \
MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_TANH_INFO \
MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_BN_INFO \
MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_CONCAT_INFO \
MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
extern unsigned int acl_log_flags;
class logtime_util {
public:
logtime_util() { mask = 0; }
logtime_util(int mask_, const char *information_) {
setlogtime_info(mask_, information_);
}
void setlogtime_info(int mask_, const char *information_) {
mask = mask_;
if (acl_log_flags & mask) {
strncpy(information, information_, 255);
gettimeofday(&tv[0], NULL);
}
}
~logtime_util() {
if (acl_log_flags & mask) {
      long long time[2];  // tv_sec * NANO_SEC_CONV overflows a 32-bit int
      gettimeofday(&tv[1], NULL);
      time[0] = tv[0].tv_sec * (long long)NANO_SEC_CONV + tv[0].tv_usec;
      time[1] = tv[1].tv_sec * (long long)NANO_SEC_CONV + tv[1].tv_usec;
printf("%s %.6lf\n", information,
(((double)time[1] - time[0]) / NANO_SEC_CONV));
}
}
void log_time(bool start) {
if (acl_log_flags & mask) {
if (start) {
gettimeofday(&tv[0], NULL);
} else {
        long long time[2];  // tv_sec * NANO_SEC_CONV overflows a 32-bit int
        gettimeofday(&tv[1], NULL);
        time[0] = tv[0].tv_sec * (long long)NANO_SEC_CONV + tv[0].tv_usec;
        time[1] = tv[1].tv_sec * (long long)NANO_SEC_CONV + tv[1].tv_usec;
printf("%s %.6lf\n", information,
(((double)time[1] - time[0]) / NANO_SEC_CONV));
}
}
}
private:
struct timeval tv[2];
int mask;
char information[256];
};
#endif // USE_PROFILING
namespace paddle_mobile {
namespace operators {
namespace acl {
class AclParameters {
public:
AclParameters() {
dilated = false;
dim = 2;
num_group = 1;
}
int batch;
int in_depth;
int in_rows;
int in_cols;
int out_depth;
int out_rows;
int out_cols;
int out_num;
int filter_rows;
int filter_cols;
int stride_rows;
int stride_cols;
int pad_rows;
int pad_cols;
int dilation_rows;
int dilation_cols;
int num_group;
bool dilated;
int dim;
int epsilon;
int nsize;
float alpha;
float beta;
float knorm;
void *input_data;
void *output_data;
void *weight_data;
void *biases_data;
void *mean_data;
void *var_data;
std::string pool_type;
std::string act_type;
std::string data_layout;
bool is_global_pool;
bool is_channel_concat;
std::vector<framework::LoDTensor *> in_tensor;
};
enum TensorType {
tensor_input,
tensor_output,
tensor_weights,
tensor_biases,
tensor_mean,
tensor_var,
tensor_beta,
tensor_gamma,
tensor_concat,
tensor_data,
};
enum OperatorState {
operator_not_init,
operator_init_done,
operator_reinit,
};
enum OperateType {
operate_type_pooling,
operate_type_activation,
operate_type_lrn,
operate_type_conv,
operate_type_lc,
operate_type_fc,
operate_type_bn,
operate_type_softmax,
operate_type_concat,
};
class BaseACLTensor {
public:
BaseACLTensor() : type_(tensor_input), allocate_(false) {}
virtual ~BaseACLTensor() {}
virtual void bindmem(void *mem) { mem_ = mem; }
virtual void settensortype(TensorType type) { type_ = type; }
virtual void map(bool blocking = true) {}
virtual void unmap() {}
virtual void commit(TensorType type = tensor_data) {}
int tensor_copy(arm_compute::ITensor *tensor, void *mem,
bool toTensor = true);
protected:
void *mem_;
TensorType type_;
bool allocate_;
};
class ACLTensor : public BaseACLTensor, public Tensor {
public:
explicit ACLTensor(arm_compute::TensorInfo &&info) : Tensor(info) {}
virtual void map(bool blocking = true) {
if (!allocate_) {
Tensor::allocate();
allocate_ = true;
}
Tensor::map(blocking);
}
virtual int tensor_copy(void *mem, bool toTensor = true) {
auto acl_tensor = this;
arm_compute::ITensor *tensor = acl_tensor->tensor();
BaseACLTensor::tensor_copy(tensor, mem, toTensor);
return 0;
}
virtual void unmap() { Tensor::unmap(); }
virtual void commit(TensorType type = tensor_data);
};
class ACLSubTensor : public BaseACLTensor, public SubTensor {
public:
ACLSubTensor(std::unique_ptr<ACLTensor> &parent,
arm_compute::TensorShape &shape, arm_compute::Coordinates &coord)
: SubTensor(parent.get(), shape, coord) {}
virtual int tensor_copy(void *mem, bool toTensor = true) { return 0; }
};
template <typename T>
class TensorPair {
public:
TensorPair() {}
~TensorPair() {}
TensorType type;
std::unique_ptr<T> tensor;
};
template <typename T>
std::unique_ptr<T> &tensor_item(
std::vector<std::unique_ptr<TensorPair<T>>> &pool, TensorType type,
int idx) {
int count = 0;
for (auto &item : pool) {
if (item.get()->type == type) {
++count;
}
if (item.get()->type == type && idx == count - 1) {
return item.get()->tensor;
}
}
  pool.push_back(std::unique_ptr<TensorPair<T>>(new TensorPair<T>()));
  auto item = pool[pool.size() - 1].get();
  item->type = type;
  item->tensor = nullptr;
return item->tensor;
}
class ACLOperator {
public:
virtual void commit() {
for (auto &item : tensor_pool_) {
if (item.get()->tensor) item.get()->tensor->commit(item.get()->type);
}
}
inline void run() {
commit();
#ifdef USE_PROFILING
logtime_util log_time(ACL_RUN_INFO);
#endif // USE_PROFILING
for (auto &c : funcs_) {
c->run();
}
}
inline std::vector<std::unique_ptr<arm_compute::IFunction>> &funcs() {
return funcs_;
}
inline std::unique_ptr<ACLSubTensor> &sinput(int idx = 0) {
return tensor_item(subtensor_pool_, tensor_input, idx);
}
inline std::unique_ptr<ACLSubTensor> &soutput(int idx = 0) {
return tensor_item(subtensor_pool_, tensor_output, idx);
}
inline std::unique_ptr<ACLSubTensor> &sweights(int idx = 0) {
return tensor_item(subtensor_pool_, tensor_weights, idx);
}
inline std::unique_ptr<ACLSubTensor> &sbiases(int idx = 0) {
return tensor_item(subtensor_pool_, tensor_biases, idx);
}
inline std::unique_ptr<ACLTensor> &cinput(int idx = 0) {
return tensor_item(tensor_pool_, tensor_concat, idx);
}
inline std::unique_ptr<ACLTensor> &input(int idx = 0) {
return tensor_item(tensor_pool_, tensor_input, idx);
}
inline std::unique_ptr<ACLTensor> &output(int idx = 0) {
return tensor_item(tensor_pool_, tensor_output, idx);
}
inline std::unique_ptr<ACLTensor> &weights(int idx = 0) {
return tensor_item(tensor_pool_, tensor_weights, idx);
}
inline std::unique_ptr<ACLTensor> &biases(int idx = 0) {
return tensor_item(tensor_pool_, tensor_biases, idx);
}
inline std::unique_ptr<ACLTensor> &mean(int idx = 0) {
return tensor_item(tensor_pool_, tensor_mean, idx);
}
inline std::unique_ptr<ACLTensor> &var(int idx = 0) {
return tensor_item(tensor_pool_, tensor_var, idx);
}
inline std::unique_ptr<ACLTensor> &beta(int idx = 0) {
return tensor_item(tensor_pool_, tensor_beta, idx);
}
inline std::unique_ptr<ACLTensor> &gamma(int idx = 0) {
return tensor_item(tensor_pool_, tensor_gamma, idx);
}
inline std::unique_ptr<ACLTensor> &tensor(TensorType type) {
switch (type) {
case tensor_biases:
return biases();
break;
case tensor_weights:
return weights();
break;
case tensor_output:
return output();
break;
default:
case tensor_input:
return input();
break;
}
return input();
}
explicit ACLOperator(bool is_gpu = false);
virtual ~ACLOperator();
inline TargetHint getTargetHint() {
#ifdef USE_OPENCL
if (target_hint_ == TargetHint::DONT_CARE) {
if (is_gpu_) {
return TargetHint::OPENCL;
}
return TargetHint::NEON;
}
return target_hint_;
#elif defined(USE_OPENGLES)
if (target_hint_ == TargetHint::DONT_CARE) {
if (is_gpu_) {
return TargetHint::OPENGLES;
}
return TargetHint::NEON;
}
return target_hint_;
#else
return TargetHint::NEON;
#endif
}
inline void setTargetHint(TargetHint hint) { target_hint_ = hint; }
inline ConvolutionMethodHint &getConvMethod() {
return convolution_method_hint_;
}
inline void setConvMethod() {
convolution_method_hint_ = ConvolutionMethodHint::DIRECT;
}
inline bool tensor_mem(std::unique_ptr<ACLTensor> &tensor, void *mem) {
tensor->bindmem(mem);
return true;
}
inline bool tensor_mem(void *mem, std::unique_ptr<ACLTensor> &tensor) {
tensor->tensor_copy(mem, false);
return true;
}
bool new_tensor(std::unique_ptr<ACLTensor> &tensor,
arm_compute::TensorShape &shape, void *mem = nullptr,
bool commit = false);
bool new_tensor(std::unique_ptr<ACLSubTensor> &tensor,
std::unique_ptr<ACLTensor> &parent,
arm_compute::TensorShape &shape,
arm_compute::Coordinates &coord);
inline int &group() { return _group; }
inline void set_operator_property(OperateType type, const char *name) {
name_ = name;
type_ = type;
}
inline void acl_run(void *input_data, void *output_data) {
if (input_data) tensor_mem(input(), input_data);
run();
tensor_mem(output_data, output());
}
inline int &input_idx() { return input_idx_; }
inline int &output_idx() { return output_idx_; }
protected:
inline bool isGPUMode() {
#ifdef USE_OPENCL
if (!support_opencl_) return false;
return getTargetHint() == TargetHint::OPENCL;
#elif defined(USE_OPENGLES)
if (!support_opengles_) return false;
return getTargetHint() == TargetHint::OPENGLES;
#endif
return false;
}
inline OperatorState &opstate() { return operator_state_; }
inline bool is_operator_init_done(arm_compute::TensorShape shape,
TensorType type = tensor_input) {
checkreshape(shape, type);
return operator_state_ == operator_init_done;
}
inline void set_operator_init_done() {
opstate() = operator_init_done;
set_bypass_state(false);
}
inline void set_bypass_state(bool state = false) {
force_bypass_acl_path_ = state;
}
inline OperatorState checkreshape(arm_compute::TensorShape shape,
TensorType type = tensor_input) {
opstate() = reshape(shape, type);
if (opstate() == operator_reinit) {
freeres();
}
return opstate();
}
inline OperatorState reshape(arm_compute::TensorShape &shape,
TensorType type) {
arm_compute::TensorShape _shape;
std::unique_ptr<ACLTensor> &acl_tensor = tensor(type);
if (!acl_tensor.get()) return operator_not_init;
_shape = acl_tensor->info().tensor_shape();
if (_shape.total_size() == shape.total_size() && _shape[0] == shape[0] &&
_shape[1] == shape[1]) {
return operator_init_done;
}
return operator_reinit;
}
inline void freeres() {
tensor_pool_.clear();
subtensor_pool_.clear();
funcs_.clear();
}
inline const char *&name() { return name_; }
inline void set_in_out_index(int indata_idx, int outdata_idx) {
input_idx() = indata_idx;
output_idx() = outdata_idx;
}
protected:
std::vector<std::unique_ptr<TensorPair<ACLTensor>>> tensor_pool_;
std::vector<std::unique_ptr<TensorPair<ACLSubTensor>>> subtensor_pool_;
std::vector<std::unique_ptr<arm_compute::IFunction>> funcs_;
OperatorState operator_state_;
bool force_bypass_acl_path_;
TargetHint target_hint_;
ConvolutionMethodHint convolution_method_hint_;
static bool support_opengles_;
static bool support_opencl_;
static bool init_gpu_env;
int _group;
const char *name_;
OperateType type_;
int input_idx_, output_idx_;
bool is_gpu_;
};
int isScheduleEnable();
template <typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
arm_compute::ITensor *input, arm_compute::ITensor *output) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(dynamic_cast<TensorType *>(input),
dynamic_cast<TensorType *>(output));
return std::move(op);
}
template <typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate(
arm_compute::ITensor *input, arm_compute::ITensor *output) {
return instantiate_function<OperatorType, TensorType>(input, output);
}
template <typename OpType, typename OpTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
func = instantiate<OpType, OpTensor>(input->tensor(), output->tensor());
return func;
}
template <typename OperatorType, typename TensorType, typename VectorTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
VectorTensor inputs, arm_compute::ITensor *output) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(inputs, dynamic_cast<TensorType *>(output));
return std::move(op);
}
template <typename OperatorType, typename TensorType, typename VectorTensor>
std::unique_ptr<arm_compute::IFunction> instantiate(
VectorTensor inputs, arm_compute::ITensor *output) {
return instantiate_function<OperatorType, TensorType, VectorTensor>(inputs,
output);
}
template <typename OpType, typename OpTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func_lists(
ACLOperator *&acl_op, std::unique_ptr<ACLTensor> &output, int num,
TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
static std::vector<OpTensor *> tensors;
tensors.clear();
for (int i = 0; i < num; ++i) {
tensors.push_back(
dynamic_cast<OpTensor *>(acl_op->cinput(i).get()->tensor()));
}
func = instantiate<OpType, OpTensor, std::vector<OpTensor *>>(
tensors, output->tensor());
return func;
}
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
arm_compute::ITensor *input, arm_compute::ITensor *output,
const OperatorInfo &info) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(dynamic_cast<TensorType *>(input),
dynamic_cast<TensorType *>(output), info);
return std::move(op);
}
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate(
arm_compute::ITensor *input, arm_compute::ITensor *output,
const OperatorInfo &info) {
return instantiate_function<OperatorType, TensorType, OperatorInfo>(
input, output, info);
}
template <typename OpType, typename OpTensor, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
const OperatorInfo &info, TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
func = instantiate<OpType, OpTensor, OperatorInfo>(input->tensor(),
output->tensor(), info);
return func;
}
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
arm_compute::ITensor *input, arm_compute::ITensor *weights,
arm_compute::ITensor *biases, arm_compute::ITensor *output,
const OperatorInfo &info) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(dynamic_cast<TensorType *>(input),
dynamic_cast<TensorType *>(weights),
dynamic_cast<TensorType *>(biases),
dynamic_cast<TensorType *>(output), info);
return std::move(op);
}
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate(
arm_compute::ITensor *input, arm_compute::ITensor *weights,
arm_compute::ITensor *biases, arm_compute::ITensor *output,
const OperatorInfo &info) {
return instantiate_function<OperatorType, TensorType, OperatorInfo>(
input, weights, biases, output, info);
}
template <typename OpType, typename OpTensor, typename OperatorInfo,
typename ACLTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &weights,
std::unique_ptr<ACLTensor> &biases, std::unique_ptr<ACLTensor> &output,
const OperatorInfo &info, TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
arm_compute::ITensor *biases_tensor = NULL;
if (biases.get()) {
biases_tensor = biases->tensor();
}
func = instantiate<OpType, OpTensor, OperatorInfo>(
input->tensor(), weights->tensor(), biases_tensor, output->tensor(),
info);
return func;
}
template <typename Dtype, typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
arm_compute::ITensor *input, arm_compute::ITensor *output,
arm_compute::ITensor *mean, arm_compute::ITensor *var,
arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype &eps) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(
dynamic_cast<TensorType *>(input), dynamic_cast<TensorType *>(output),
dynamic_cast<TensorType *>(mean), dynamic_cast<TensorType *>(var),
dynamic_cast<TensorType *>(beta), dynamic_cast<TensorType *>(gamma), eps);
return std::move(op);
}
template <typename Dtype, typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate(
arm_compute::ITensor *input, arm_compute::ITensor *output,
arm_compute::ITensor *mean, arm_compute::ITensor *var,
arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype eps) {
return instantiate_function<Dtype, OperatorType, TensorType>(
input, output, mean, var, beta, gamma, eps);
}
template <typename Dtype, typename OpType, typename OpTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
std::unique_ptr<ACLTensor> &mean, std::unique_ptr<ACLTensor> &var,
std::unique_ptr<ACLTensor> &beta, std::unique_ptr<ACLTensor> &gamma,
Dtype eps, TargetHint hint) {
std::unique_ptr<arm_compute::IFunction> func;
func = instantiate<Dtype, OpType, OpTensor>(
input->tensor(), output->tensor(), mean->tensor(), var->tensor(),
beta->tensor(), gamma->tensor(), eps);
return func;
}
template <typename OperatorInfo>
bool instantiate_op_pooling(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLPoolingLayer, arm_compute::ICLTensor,
arm_compute::PoolingLayerInfo>(input, output, info,
hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<arm_compute::GCPoolingLayer, arm_compute::IGCTensor,
arm_compute::PoolingLayerInfo>(input, output, info,
hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<arm_compute::NEPoolingLayer, arm_compute::ITensor,
arm_compute::PoolingLayerInfo>(input, output, info,
hint));
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_activation(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLActivationLayer,
arm_compute::ICLTensor,
arm_compute::ActivationLayerInfo>(
input, output, info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCActivationLayer,
arm_compute::IGCTensor,
arm_compute::ActivationLayerInfo>(
input, output, info, hint));
return true;
}
#endif
{
func.push_back(instantiate_op_func<arm_compute::NEActivationLayer,
arm_compute::ITensor,
arm_compute::ActivationLayerInfo>(
input, output, info, hint));
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_lrn(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLNormalizationLayer,
arm_compute::ICLTensor,
arm_compute::NormalizationLayerInfo>(
input, output, info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCNormalizationLayer,
arm_compute::IGCTensor,
arm_compute::NormalizationLayerInfo>(
input, output, info, hint));
return true;
}
#endif
{
func.push_back(instantiate_op_func<arm_compute::NENormalizationLayer,
arm_compute::ITensor,
arm_compute::NormalizationLayerInfo>(
input, output, info, hint));
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_conv(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
std::unique_ptr<ACLTensor> &weights = acl_op->weights();
std::unique_ptr<ACLTensor> &biases = acl_op->biases();
ConvolutionMethodHint &conv_method = acl_op->getConvMethod();
  bool has_biases = (biases.get() != nullptr);
int &groups = acl_op->group();
arm_compute::TensorShape input_shape = input->info().tensor_shape();
arm_compute::TensorShape weights_shape = weights->info().tensor_shape();
arm_compute::TensorShape biases_shape;
if (has_biases) {
biases_shape = biases->info().tensor_shape();
}
arm_compute::TensorShape output_shape = output->info().tensor_shape();
if (groups == 1) {
if (conv_method == ConvolutionMethodHint::GEMM) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCConvolutionLayer,
arm_compute::IGCTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#endif
{
func.push_back(instantiate_op_func<arm_compute::NEConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
}
} else {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLDirectConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<arm_compute::GCDirectConvolutionLayer,
arm_compute::IGCTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<arm_compute::NEDirectConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
}
}
return true;
}
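  // Grouped convolution falls through to here: the input/output channel
  // dimensions and the weights' OFM dimension are split into `groups` equal
  // slices. Worked example: groups == 2, C_in == 8, C_out == 16 gives
  // input_split == 4, output_split == 8 and weights_split == 8, i.e. each
  // sub-convolution maps a 4-channel input slice to an 8-channel output slice.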
// Calculate sub-tensor splits
const int input_split = input_shape.z() / groups;
const int output_split = output_shape.z() / groups;
const int weights_split = weights_shape[3] / groups;
  const int biases_split = has_biases ? biases_shape.x() / groups : 0;
// Calculate sub-tensor shapes
input_shape.set(2, input_split);
output_shape.set(2, output_split);
weights_shape.set(3, weights_split);
  if (has_biases) {
    biases_shape.set(0, biases_split);
  }
for (auto i = 0; i < groups; ++i) {
// Calculate sub-tensors starting coordinates
arm_compute::Coordinates input_coord(0, 0, input_split * i);
arm_compute::Coordinates output_coord(0, 0, output_split * i);
arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i);
arm_compute::Coordinates biases_coord(biases_split * i);
// Create sub-tensors for input, output, weights and bias
acl_op->new_tensor(acl_op->sinput(i), acl_op->input(), input_shape,
input_coord);
acl_op->new_tensor(acl_op->soutput(i), acl_op->output(), output_shape,
output_coord);
acl_op->new_tensor(acl_op->sweights(i), acl_op->weights(), weights_shape,
weights_coord);
if (has_biases) {
acl_op->new_tensor(acl_op->sbiases(i), acl_op->biases(), biases_shape,
biases_coord);
}
bool use_opencl = false;
if (conv_method == ConvolutionMethodHint::GEMM) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
use_opencl = true;
func.push_back(
instantiate_op_func<arm_compute::CLConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
#endif
if (!use_opencl) {
func.push_back(
instantiate_op_func<arm_compute::NEConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
} else {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
use_opencl = true;
func.push_back(
instantiate_op_func<arm_compute::CLDirectConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
#endif
if (!use_opencl) {
func.push_back(
instantiate_op_func<arm_compute::NEDirectConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
}
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_lc(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
std::unique_ptr<ACLTensor> &weights = acl_op->weights();
std::unique_ptr<ACLTensor> &biases = acl_op->biases();
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLLocallyConnectedLayer,
arm_compute::ICLTensor, arm_compute::PadStrideInfo>(
input, weights, biases, output, info, hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<arm_compute::NELocallyConnectedLayer,
arm_compute::ITensor, arm_compute::PadStrideInfo>(
input, weights, biases, output, info, hint));
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_fc(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
std::unique_ptr<ACLTensor> &weights = acl_op->weights();
std::unique_ptr<ACLTensor> &biases = acl_op->biases();
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLFullyConnectedLayer,
arm_compute::ICLTensor, bool>(
input, weights, biases, output, info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCFullyConnectedLayer,
arm_compute::IGCTensor, bool>(
input, weights, biases, output, info, hint));
return true;
}
#endif
{
func.push_back(instantiate_op_func<arm_compute::NEFullyConnectedLayer,
arm_compute::ITensor, bool>(
input, weights, biases, output, info, hint));
}
return true;
}
template <typename Dtype>
bool instantiate_op_bn(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, Dtype eps) {
std::unique_ptr<ACLTensor> &mean = acl_op->mean();
std::unique_ptr<ACLTensor> &var = acl_op->var();
std::unique_ptr<ACLTensor> &beta = acl_op->beta();
std::unique_ptr<ACLTensor> &gamma = acl_op->gamma();
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<Dtype, arm_compute::CLBatchNormalizationLayer,
arm_compute::ICLTensor>(input, output, mean, var,
beta, gamma, eps, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<Dtype, arm_compute::GCBatchNormalizationLayer,
arm_compute::IGCTensor>(input, output, mean, var,
beta, gamma, eps, hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<Dtype, arm_compute::NEBatchNormalizationLayer,
arm_compute::ITensor>(input, output, mean, var,
beta, gamma, eps, hint));
}
return true;
}
inline bool instantiate_op_softmax(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, void *data) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLSoftmaxLayer,
arm_compute::ICLTensor>(input, output, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<arm_compute::GCSoftmaxLayer,
arm_compute::IGCTensor>(input, output, hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<arm_compute::NESoftmaxLayer, arm_compute::ITensor>(
input, output, hint));
}
return true;
}
inline bool instantiate_op_concat(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, int num) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func_lists<arm_compute::CLDepthConcatenateLayer,
arm_compute::ICLTensor>(acl_op, output, num,
hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func_lists<arm_compute::GCDepthConcatenateLayer,
arm_compute::IGCTensor>(acl_op, output, num,
hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func_lists<arm_compute::NEDepthConcatenateLayer,
arm_compute::ITensor>(acl_op, output, num,
hint));
}
return true;
}
template <typename Dtype>
void *InputdataPtr(ACLOperator *op,
const std::vector<framework::LoDTensor *> &input_data,
Dtype type, int index = -1) {
if (index == -1) index = 0;
return (void *)(input_data[index]->mutable_data<Dtype>());
}
template <typename Dtype>
void acl_run(ACLOperator *op,
const std::vector<framework::LoDTensor *> &in_data, void *out_data,
Dtype type, bool multi_input_run = true) {
  for (int i = 0; i < static_cast<int>(in_data.size()); ++i) {
op->tensor_mem(op->cinput(i), InputdataPtr(op, in_data, type, i));
}
op->acl_run(NULL, out_data);
}
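// acl_run above binds each LoDTensor's buffer to the operator's i-th cached
// input (cinput(i)) via tensor_mem, then forwards the output pointer to
// ACLOperator::acl_run; the otherwise-unused `type` argument exists only so
// Dtype can be deduced at the call site.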
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
#ifdef USE_PROFILING
#define acl_configure(opname, acl_op, args...) \
{ \
set_operator_property(acl::operate_type_##opname, #opname); \
logtime_util log_time(ACL_CONFIG_INFO); \
instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \
acl_op->output(), acl_op->getTargetHint(), args); \
}
#else
#define acl_configure(opname, acl_op, args...) \
{ \
set_operator_property(acl::operate_type_##opname, #opname); \
instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \
acl_op->output(), acl_op->getTargetHint(), args); \
}
#endif
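// Expansion sketch (profiling disabled): a call like
//   acl_configure(conv, this, conv_info);
// becomes, roughly,
//   {
//     set_operator_property(acl::operate_type_conv, "conv");
//     instantiate_op_conv(this, this->funcs(), this->input(), this->output(),
//                         this->getTargetHint(), conv_info);
//   }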
#define ACLOp_Ptr(a) dynamic_cast<ACLOperator *>(a)
#endif // USE_ACL
#endif // ACL_OPERATOR_H_
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "acl_tensor.h"
namespace paddle_mobile {
namespace operators {
namespace acl {
#ifdef USE_ACL
template <typename TensorType>
std::unique_ptr<arm_compute::ITensor> initialise_tensor(
arm_compute::TensorInfo &info) {
auto tensor = cpp14::make_unique<TensorType>();
tensor->allocator()->init(info);
return std::move(tensor);
}
template <typename TensorType>
void tensor_allocate(arm_compute::ITensor &tensor) {
auto itensor = dynamic_cast<TensorType *>(&tensor);
itensor->allocator()->allocate();
}
Tensor::Tensor(arm_compute::TensorInfo &info) noexcept
: _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr) {}
Tensor::Tensor(Tensor &&src) noexcept
: _target(src._target),
_info(std::move(src._info)),
_tensor(std::move(src._tensor)) {}
arm_compute::ITensor *Tensor::set_target(TargetHint target) {
switch (target) {
#ifdef USE_OPENCL
case TargetHint::OPENCL:
_tensor = initialise_tensor<arm_compute::CLTensor>(_info);
break;
#elif defined(USE_OPENGLES)
case TargetHint::OPENGLES:
_tensor = initialise_tensor<arm_compute::GCTensor>(_info);
break;
#endif
case TargetHint::NEON:
_tensor = initialise_tensor<arm_compute::Tensor>(_info);
break;
default:
break;
}
_target = target;
return _tensor.get();
}
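// set_target lazily materializes the backing tensor for the chosen backend;
// allocate()/map()/unmap() below must dispatch on the same _target so that
// their dynamic_casts to the concrete tensor type succeed.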
void Tensor::allocate() {
switch (_target) {
#ifdef USE_OPENCL
case TargetHint::OPENCL:
tensor_allocate<arm_compute::CLTensor>(*_tensor);
break;
#elif defined(USE_OPENGLES)
case TargetHint::OPENGLES:
tensor_allocate<arm_compute::GCTensor>(*_tensor);
break;
#endif
case TargetHint::NEON:
tensor_allocate<arm_compute::Tensor>(*_tensor);
break;
default:
break;
}
}
void Tensor::map(bool blocking) {
#ifdef USE_OPENCL
if (_target == TargetHint::OPENCL)
dynamic_cast<arm_compute::CLTensor *>(tensor())->map(blocking);
#elif defined(USE_OPENGLES)
if (_target == TargetHint::OPENGLES)
dynamic_cast<arm_compute::GCTensor *>(tensor())->map(blocking);
#endif
}
void Tensor::unmap() {
#ifdef USE_OPENCL
if (_target == TargetHint::OPENCL)
dynamic_cast<arm_compute::CLTensor *>(tensor())->unmap();
#elif defined(USE_OPENGLES)
if (_target == TargetHint::OPENGLES)
dynamic_cast<arm_compute::GCTensor *>(tensor())->unmap();
#endif
}
template <typename SubTensorType, typename ParentTensorType>
std::unique_ptr<arm_compute::ITensor> initialise_subtensor(
arm_compute::ITensor *parent, arm_compute::TensorShape shape,
arm_compute::Coordinates coords) {
auto ptensor = dynamic_cast<ParentTensorType *>(parent);
auto subtensor = cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
return std::move(subtensor);
}
SubTensor::SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
arm_compute::Coordinates &coords) noexcept
: _target(TargetHint::DONT_CARE),
_tensor_shape(tensor_shape),
_coords(coords),
_parent(nullptr),
_subtensor(nullptr) {
_parent = parent->tensor();
_target = parent->target();
instantiate_subtensor();
}
arm_compute::ITensor *SubTensor::set_target(TargetHint target) {
return (target == _target) ? _subtensor.get() : nullptr;
}
arm_compute::ITensor *SubTensor::tensor() { return _subtensor.get(); }
const arm_compute::ITensor *SubTensor::tensor() const {
return _subtensor.get();
}
TargetHint SubTensor::target() const { return _target; }
void SubTensor::allocate() {
// NOP for sub-tensors
}
void SubTensor::instantiate_subtensor() {
switch (_target) {
#ifdef USE_OPENCL
case TargetHint::OPENCL:
_subtensor = initialise_subtensor<arm_compute::CLSubTensor,
arm_compute::ICLTensor>(
_parent, _tensor_shape, _coords);
break;
#endif
default:
case TargetHint::NEON:
_subtensor =
initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(
_parent, _tensor_shape, _coords);
break;
}
}
#endif
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef ACL_TENSOR_H_
#define ACL_TENSOR_H_
#ifdef USE_ACL
#ifdef USE_OPENCL
#include "arm_compute/runtime/CL/CLSubTensor.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#elif defined(USE_OPENGLES)
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#endif
#include "arm_compute/runtime/SubTensor.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace paddle_mobile {
namespace operators {
namespace acl {
enum class TargetHint {
DONT_CARE,
OPENCL,
OPENGLES,
NEON,
};
enum class ConvolutionMethodHint {
GEMM,
DIRECT,
};
namespace cpp14 {
template <class T>
struct _Unique_if {
typedef std::unique_ptr<T> _Single_object;
};
template <class T>
struct _Unique_if<T[]> {
typedef std::unique_ptr<T[]> _Unknown_bound;
};
template <class T, size_t N>
struct _Unique_if<T[N]> {
typedef void _Known_bound;
};
template <class T, class... Args>
typename _Unique_if<T>::_Single_object make_unique(Args &&... args) {
return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}
template <class T>
typename _Unique_if<T>::_Unknown_bound make_unique(size_t n) {
typedef typename std::remove_extent<T>::type U;
return std::unique_ptr<T>(new U[n]());
}
template <class T, class... Args>
typename _Unique_if<T>::_Known_bound make_unique(Args &&...);
} // namespace cpp14
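// cpp14::make_unique is the usual C++14 backport for C++11 builds; the
// _Unique_if machinery selects between the single-object and unknown-bound
// array forms and rejects arrays of known bound. Illustration only:
//   auto info = cpp14::make_unique<arm_compute::TensorInfo>();
//   auto buf = cpp14::make_unique<float[]>(16);  // value-initialized to 0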
class Tensor {
public:
explicit Tensor(arm_compute::TensorInfo &info) noexcept;
virtual ~Tensor() {}
Tensor(Tensor &&src) noexcept;
  void set_info(arm_compute::TensorInfo &&info) { _info = std::move(info); }
arm_compute::ITensor *set_target(TargetHint target);
const arm_compute::TensorInfo &info() const { return _info; }
arm_compute::ITensor *tensor() { return _tensor.get(); }
void allocate();
void init() {}
TargetHint target() const { return _target; }
virtual void map(bool blocking = true);
virtual void unmap();
private:
TargetHint _target;
arm_compute::TensorInfo _info;
std::unique_ptr<arm_compute::ITensor> _tensor;
};
class SubTensor {
public:
SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
arm_compute::Coordinates &coords) noexcept;
~SubTensor() {}
arm_compute::ITensor *tensor();
const arm_compute::ITensor *tensor() const;
TargetHint target() const;
void allocate();
arm_compute::ITensor *set_target(TargetHint target);
private:
/** Instantiates a sub-tensor */
void instantiate_subtensor();
private:
/**< Target that this tensor is pinned on */
TargetHint _target;
/**< SubTensor shape */
arm_compute::TensorShape _tensor_shape;
/**< SubTensor Coordinates */
arm_compute::Coordinates _coords;
/**< Parent tensor */
arm_compute::ITensor *_parent;
/**< SubTensor */
std::unique_ptr<arm_compute::ITensor> _subtensor;
};
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // ACL_TENSOR_H_
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#include "operators/kernel/batchnorm_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclBatchNormOp : public acl::ACLOperator {
public:
AclBatchNormOp() {
this->force_bypass_acl_path_ = bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN;
}
~AclBatchNormOp() = default;
AclBatchNormOp(const AclBatchNormOp&) = delete;
AclBatchNormOp& operator=(const AclBatchNormOp&) = delete;
AclBatchNormOp(AclBatchNormOp&&) = delete;
AclBatchNormOp& operator=(AclBatchNormOp&&) = delete;
acl::AclParameters& getargs() { return args; }
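  // Per-channel batch normalization: y = gamma * (x - mean) / sqrt(var + eps)
  // + beta. Note the mapping below: ACL's beta tensor is fed from the Paddle
  // bias input and gamma from the scale input.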
void InitAclLayer(const BatchNormParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth, args.batch);
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth, args.out_num);
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
arm_compute::TensorShape mean_shape(args.in_depth);
arm_compute::TensorShape var_shape = mean_shape;
arm_compute::TensorShape beta_shape = mean_shape;
arm_compute::TensorShape gamma_shape = mean_shape;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
new_tensor(mean(), mean_shape, args.mean_data);
new_tensor(var(), var_shape, args.var_data);
new_tensor(beta(), beta_shape, args.biases_data);
new_tensor(gamma(), gamma_shape, args.weight_data);
acl_configure(bn, this, args.epsilon);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const BatchNormParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // bypass the ACL path when requested via the bypass flags
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const BatchNormParam& param) {
const Tensor* in_x = param.InputX();
Tensor* out = param.OutputY();
const Tensor* scale = param.InputScale();
const Tensor* bias = param.InputBias();
const Tensor* saved_mean = param.InputMean();
const Tensor* saved_variance = param.InputVariance();
const T* input_data = in_x->data<T>();
T* output_data = out->mutable_data<T>();
const T* weight_data = scale->data<T>();
const T* bias_data = bias->data<T>();
const T* mean_data = saved_mean->data<T>();
const T* var_data = saved_variance->data<T>();
float epsilon = param.Epsilon();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
// args.weight_data = (void*)weight_data;
// args.biases_data = (void*)bias_data;
args.mean_data = (void*)mean_data;
args.var_data = (void*)var_data;
args.epsilon = epsilon;
args.dim = in_x->dims().size();
args.batch = in_x->dims()[0];
args.in_depth = in_x->dims()[1];
args.in_rows = in_x->dims()[2];
args.in_cols = in_x->dims()[3];
args.out_num = out->dims()[0];
args.out_depth = out->dims()[1];
args.out_rows = out->dims()[2];
args.out_cols = out->dims()[3];
args.weight_data = (void*)weight_data;
args.biases_data = (void*)bias_data;
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl::AclParameters args;
};
template <>
bool BatchNormKernel<GPU_MALI, float>::Init(const BatchNormParam& param) const {
AclBatchNormOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclBatchNormOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void BatchNormKernel<GPU_MALI, float>::Compute(
const BatchNormParam& param) const {
std::cout << "init acl" << std::endl;
AclBatchNormOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
}
template class BatchNormKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclConcatOp : public acl::ACLOperator {
public:
AclConcatOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT;
}
~AclConcatOp() = default;
AclConcatOp(const AclConcatOp&) = delete;
AclConcatOp& operator=(const AclConcatOp&) = delete;
AclConcatOp(AclConcatOp&&) = delete;
AclConcatOp& operator=(AclConcatOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const ConcatParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
const std::vector<framework::LoDTensor*>* input_data = &args.in_tensor;
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth, args.batch);
if (is_operator_init_done(output_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
T type;
    for (int i = 0; i < static_cast<int>(input_data->size()); i++) {
      // NCHW layout: dims()[1] = channels, dims()[2] = height, dims()[3] = width.
      int in_channels = (*input_data)[i]->dims()[1];
      int in_rows = (*input_data)[i]->dims()[2];
      int in_cols = (*input_data)[i]->dims()[3];
      // ACL TensorShape expects (width, height, channels).
      arm_compute::TensorShape in_shape(in_cols, in_rows, in_channels);
new_tensor(cinput(i), in_shape,
acl::InputdataPtr(this, args.in_tensor, type, i));
}
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(concat, this, input_data->size());
}
void RunAcl(const std::vector<framework::LoDTensor*>& input, void* output) {
T type;
acl::acl_run(this, input, output, type);
}
bool Bypass_acl(const ConcatParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // bypass when requested via flags, or when not concatenating on channels
if (this->force_bypass_acl_path_ || !args.is_channel_concat) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const ConcatParam& param) {
auto inputs = param.Inputs();
auto* output = param.Out();
int64_t axis = param.Axis();
T* output_data = output->mutable_data<T>();
args.is_channel_concat = (axis == 1);
args.in_tensor = inputs;
args.output_data = (void*)output_data;
args.batch = output->dims()[0];
args.out_depth = output->dims()[1];
args.out_rows = output->dims()[2];
args.out_cols = output->dims()[3];
}
acl::AclParameters args;
};
template <>
bool ConcatKernel<GPU_MALI, float>::Init(const ConcatParam& param) const {
AclConcatOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclConcatOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void ConcatKernel<GPU_MALI, float>::Compute(const ConcatParam& param) const {
std::cout << "init acl" << std::endl;
AclConcatOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
std::vector<framework::LoDTensor*> temp_data = args.in_tensor;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl(temp_data, (void*)output_data);
}
template class ConcatKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclConvAddOp : public acl::ACLOperator {
public:
AclConvAddOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
}
~AclConvAddOp() = default;
AclConvAddOp(const AclConvAddOp&) = delete;
AclConvAddOp& operator=(const AclConvAddOp&) = delete;
AclConvAddOp(AclConvAddOp&&) = delete;
AclConvAddOp& operator=(AclConvAddOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const FusionConvAddParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth, args.batch);
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth, args.out_num);
arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
args.in_depth / args.num_group,
args.out_depth);
arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute::PadStrideInfo conv_info(
args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
arm_compute::DimensionRoundingType::FLOOR);
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
check_direct_conv();
//[kernel_x, kernel_y, IFM, OFM]
new_tensor(weights(), weights_shape, args.weight_data);
//[OFM]
if (args.biases_data) {
new_tensor(biases(), biases_shape, args.biases_data);
}
group() = args.num_group;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(conv, this, conv_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const FusionConvAddParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // many groups hurt GPU performance, so bypass ACL for group counts >= 5
if (this->force_bypass_acl_path_ || args.num_group >= 5) {
bypass_acl = true;
}
if (args.dim > 2) {
bypass_acl = true;
}
if (args.dilated) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void check_direct_conv() {
    bool use_direct_conv = false;
    const char* pDirectConv = getenv("DIRECTCONV");
    if (pDirectConv) {
      int bdirectconv = 0;
      sscanf(pDirectConv, "%d", &bdirectconv);
      if (bdirectconv != 0) {
        use_direct_conv = true;
        printf("DIRECTCONV<%s>\n", pDirectConv);
        printf("DIRECTCONV: %d\n", static_cast<int>(use_direct_conv));
      }
    }
int pad_data[2], kernel[2];
pad_data[1] = args.pad_rows;
pad_data[0] = args.pad_cols;
kernel[1] = args.filter_rows;
kernel[0] = args.filter_cols;
if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
pad_data[0] == 0 && pad_data[1] == 0) ||
(kernel[0] == 3 && kernel[1] == 3 &&
pad_data[0] <= 1 && pad_data[1] <= 1))) {
setConvMethod(); // NEDirectConvolutionLayer only for 1x1 and 3x3
}
}
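  // Usage sketch (hypothetical invocation): exporting DIRECTCONV=1 before
  // running opts in to the direct-convolution path; the shape check above
  // restricts it to 1x1/pad-0 and 3x3/pad<=1 kernels.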
void AclParametersByContext(const FusionConvAddParam& param) {
const Tensor* input = param.Input();
Tensor filter = *param.Filter();
Tensor* output = param.Output();
    Tensor* bias = nullptr;
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>();
const T* weight_data = filter.data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.weight_data = (void*)weight_data;
args.biases_data = nullptr;
    try {
      bias = param.Bias();
    } catch (const std::exception& e) {
      // Bias is optional; leave args.biases_data as nullptr if absent.
    }
if (bias) {
const T* biases_data = bias->data<T>();
args.biases_data = (void*)biases_data;
}
args.num_group = groups;
args.dilation_rows = dilations[0];
args.dilation_cols = dilations[1];
if (dilations[0] != 1 || dilations[1] != 1) {
args.dilated = true;
}
// NCHW
// std::cout << "In dims: " << (input->dims()).size() << std::endl;
args.batch = input->dims()[0];
args.in_depth = input->dims()[1];
args.in_rows = input->dims()[2];
args.in_cols = input->dims()[3];
// std::cout <<"In N: " << args.batch << " C: " << args.in_depth
// << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout << "Out dims: " << (output->dims()).size() << std::endl;
args.out_num = output->dims()[0];
args.out_depth = output->dims()[1];
args.out_rows = output->dims()[2];
args.out_cols = output->dims()[3];
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
args.filter_rows = filter.dims()[2];
args.filter_cols = filter.dims()[3];
// std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
// << " I: " << static_cast<int>(filter.dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args.stride_rows = strides[0];
args.stride_cols = strides[1];
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args.pad_rows = paddings[0];
args.pad_cols = paddings[1];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
}
acl::AclParameters args;
};
template <>
bool ConvAddKernel<GPU_MALI, float>::Init(
const FusionConvAddParam& param) const {
AclConvAddOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclConvAddOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void ConvAddKernel<GPU_MALI, float>::Compute(
const FusionConvAddParam& param) const {
std::cout << "init acl" << std::endl;
AclConvAddOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
}
template class ConvAddKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclConvOp : public acl::ACLOperator {
public:
AclConvOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
}
~AclConvOp() = default;
AclConvOp(const AclConvOp&) = delete;
AclConvOp& operator=(const AclConvOp&) = delete;
AclConvOp(AclConvOp&&) = delete;
AclConvOp& operator=(AclConvOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const ConvParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth, args.batch);
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth, args.out_num);
arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
args.in_depth / args.num_group,
args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute::PadStrideInfo conv_info(
args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
arm_compute::DimensionRoundingType::FLOOR);
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
check_direct_conv();
//[kernel_x, kernel_y, IFM, OFM]
new_tensor(weights(), weights_shape, args.weight_data);
//[OFM]
// if (args.biases_data) {
// new_tensor(biases(),biases_shape,args.biases_data);
//}
group() = args.num_group;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(conv, this, conv_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const ConvParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // many groups hurt GPU performance, so bypass ACL for group counts >= 5
if (this->force_bypass_acl_path_ || args.num_group >= 5) {
bypass_acl = true;
}
if (args.dim > 2) {
bypass_acl = true;
}
if (args.dilated) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void check_direct_conv() {
    bool use_direct_conv = false;
    const char* pDirectConv = getenv("DIRECTCONV");
    if (pDirectConv) {
      int bdirectconv = 0;
      sscanf(pDirectConv, "%d", &bdirectconv);
      if (bdirectconv != 0) {
        use_direct_conv = true;
        printf("DIRECTCONV<%s>\n", pDirectConv);
        printf("DIRECTCONV: %d\n", static_cast<int>(use_direct_conv));
      }
    }
int pad_data[2], kernel[2];
pad_data[1] = args.pad_rows;
pad_data[0] = args.pad_cols;
kernel[1] = args.filter_rows;
kernel[0] = args.filter_cols;
if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
pad_data[0] == 0 && pad_data[1] == 0) ||
(kernel[0] == 3 && kernel[1] == 3 &&
pad_data[0] <= 1 && pad_data[1] <= 1))) {
setConvMethod(); // NEDirectConvolutionLayer only for 1x1 and 3x3
}
}
void AclParametersByContext(const ConvParam& param) {
const Tensor* input = param.Input();
Tensor filter = *param.Filter();
Tensor* output = param.Output();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>();
const T* weight_data = filter.data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.weight_data = (void*)weight_data;
args.biases_data = nullptr;
// try {
// bias = context.Input<framework::Tensor>("Bias");
// } catch (const std::exception& e) {
// }
// if (bias) {
// const T* biases_data = bias->data<T>();
// args.biases_data = (void*)biases_data;
// }
args.num_group = groups;
args.dilation_rows = dilations[0];
args.dilation_cols = dilations[1];
if (dilations[0] != 1 || dilations[1] != 1) {
args.dilated = true;
}
// NCHW
// std::cout << "In dims: " << (input->dims()).size() << std::endl;
args.batch = input->dims()[0];
args.in_depth = input->dims()[1];
args.in_rows = input->dims()[2];
args.in_cols = input->dims()[3];
std::cout << "In N: " << args.batch << " C: " << args.in_depth
<< " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout << "Out dims: " << (output->dims()).size() << std::endl;
args.out_num = output->dims()[0];
args.out_depth = output->dims()[1];
args.out_rows = output->dims()[2];
args.out_cols = output->dims()[3];
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
args.filter_rows = filter.dims()[2];
args.filter_cols = filter.dims()[3];
// std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
// << " I: " << static_cast<int>(filter.dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args.stride_rows = strides[0];
args.stride_cols = strides[1];
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args.pad_rows = paddings[0];
args.pad_cols = paddings[1];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
}
acl::AclParameters args;
};
template <>
bool ConvKernel<GPU_MALI, float>::Init(const ConvParam& param) const {
AclConvOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclConvOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void ConvKernel<GPU_MALI, float>::Compute(const ConvParam& param) const {
std::cout << "init acl" << std::endl;
AclConvOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
}
template class ConvKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#pragma once
#include "operators/kernel/elementwise_add_kernel.h"
namespace paddle_mobile {
namespace operators {
template <typename T>
struct AddFunctor {
inline T operator()(T a, T b) const { return a + b; }
};
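// ElementwiseComputeEx applies AddFunctor element-wise, broadcasting input_y
// along `axis`. Assumed semantics (the usual Paddle broadcast rule): x of
// shape [N, C, H, W] plus y of shape [C] with axis == 1 adds y[c] to every
// x[n][c][h][w].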
template <>
bool ElementwiseAddKernel<GPU_MALI, float>::Init(
const ElementwiseAddParam &para) const {
return true;
}
template <>
void ElementwiseAddKernel<GPU_MALI, float>::Compute(
const ElementwiseAddParam &param) const {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *Out = param.Out();
Out->mutable_data<float>();
int axis = param.Axis();
ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
AddFunctor<float>(), Out);
}
template class ElementwiseAddKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once
#include "operators/kernel/fusion_fc_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FusionFcKernel<GPU_MALI, float>::Init(const FusionFcParam &para) const {
return true;
}
template <>
void FusionFcKernel<GPU_MALI, float>::Compute(
const FusionFcParam &param) const {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
const Tensor *input_z = param.InputZ();
auto *input_z_data = input_z->data<float>();
int axis = param.Axis();
Tensor *out = param.Out();
auto *out_data = out->mutable_data<float>();
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
: *input_x;
const Tensor y_matrix =
input_y->dims().size() > 2
? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
: *input_y;
auto out_dim = out->dims();
if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1,
                        "input_z dims size must be 1");
  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
                        "out_dim[1] must equal input_z dims[0]");
  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
  PADDLE_MOBILE_ENFORCE(axis == 1, "to fit broadcast, axis must be 1.");
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
}
for (int i = 0; i < out->numel(); i++) {
DLOG << out_data[i];
}
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(1));
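  // Net effect: out = x_matrix * y_matrix + broadcast(input_z). The bias is
  // first copied into every row of `out`, and matmul is then called with
  // alpha == 1 and beta == 1 so the GEMM accumulates into the pre-filled bias.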
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LRN_OP
#pragma once
#include "operators/kernel/lrn_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclLrnOp : public acl::ACLOperator {
public:
AclLrnOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN;
}
~AclLrnOp() = default;
AclLrnOp(const AclLrnOp&) = delete;
AclLrnOp& operator=(const AclLrnOp&) = delete;
AclLrnOp(AclLrnOp&&) = delete;
AclLrnOp& operator=(AclLrnOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const LrnParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape shape(args.in_cols, args.in_rows, args.in_depth);
if (is_operator_init_done(shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
arm_compute::NormalizationLayerInfo norm_info(
arm_compute::NormType::CROSS_MAP, args.nsize, args.alpha, args.beta,
args.knorm);
//[width, height, IFM]
new_tensor(input(), shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), shape, args.output_data);
acl_configure(lrn, this, norm_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const LrnParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // bypass the ACL path when requested via the bypass flags
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const LrnParam& param) {
const Tensor* in_x = param.InputX();
Tensor* out = param.Out();
int n = param.N();
T alpha = param.Alpha();
T beta = param.Beta();
T k = param.K();
const T* input_data = in_x->data<T>();
T* output_data = out->mutable_data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.nsize = n;
args.alpha = alpha;
args.beta = beta;
args.knorm = k;
// NCHW
args.batch = in_x->dims()[0];
args.in_depth = in_x->dims()[1];
args.in_rows = in_x->dims()[2];
args.in_cols = in_x->dims()[3];
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl::AclParameters args;
};
template <>
bool LrnKernel<GPU_MALI, float>::Init(const LrnParam& param) const {
AclLrnOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclLrnOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void LrnKernel<GPU_MALI, float>::Compute(const LrnParam& param) const {
std::cout << "init acl" << std::endl;
AclLrnOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
for (int n = 0; n < args.batch; ++n) {
acl_op->RunAcl((void*)input_data, (void*)output_data);
input_data += args.in_depth * args.in_cols * args.in_rows;
output_data += args.in_depth * args.in_cols * args.in_rows;
}
}
template class LrnKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#pragma once
#include "operators/kernel/mul_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool MulKernel<GPU_MALI, float>::Init(const MulParam &para) const {
return true;
}
template <>
void MulKernel<GPU_MALI, float>::Compute(const MulParam &param) const {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *out = param.Out();
out->mutable_data<float>();
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
: *input_x;
const Tensor y_matrix =
input_y->dims().size() > 2
? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
: *input_y;
auto out_dim = out->dims();
if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(0));
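  // Plain GEMM: out = x_matrix * y_matrix (alpha == 1, beta == 0); the
  // original >2-D output shape, if any, is restored below.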
if (out_dim.size() != 2) {
out->Resize(out_dim);
}
}
template class MulKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "operators/kernel/pool_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclPoolOp : public acl::ACLOperator {
public:
AclPoolOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING;
}
~AclPoolOp() = default;
AclPoolOp(const AclPoolOp&) = delete;
AclPoolOp& operator=(const AclPoolOp&) = delete;
AclPoolOp(AclPoolOp&&) = delete;
AclPoolOp& operator=(AclPoolOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const PoolParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth);
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute::PoolingLayerInfo pool_info;
if (args.pool_type == "max") {
pool_info = arm_compute::PoolingLayerInfo(
arm_compute::PoolingType::MAX, args.filter_rows,
arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
args.pad_cols, args.pad_rows,
arm_compute::DimensionRoundingType::CEIL));
} else {
pool_info = arm_compute::PoolingLayerInfo(
arm_compute::PoolingType::AVG, args.filter_rows,
arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
args.pad_cols, args.pad_rows,
arm_compute::DimensionRoundingType::CEIL));
}
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(pooling, this, pool_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const PoolParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // bypass the ACL path when requested via the bypass flags
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
if (args.pool_type != "max" && args.pool_type != "avg") {
bypass_acl = true;
}
if (args.filter_rows != args.filter_cols) {
bypass_acl = true;
}
// if (args.filter_rows!=2 && args.filter_rows!=3) {
// bypass_acl = true;
// }
return bypass_acl;
}
private:
void AclParametersByContext(const PoolParam& param) {
const Tensor* in_x = param.Input();
Tensor* out = param.Output();
std::string pooling_type = param.PoolingType();
std::vector<int> ksize = param.Ksize();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
bool is_global_pooling = param.isGlobalPooling();
const T* input_data = in_x->data<T>();
T* output_data = out->mutable_data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.is_global_pool = is_global_pooling;
args.pool_type = pooling_type;
args.filter_rows = ksize[0];
args.filter_cols = ksize[1];
args.dim = ksize.size();
// NCHW
args.batch = in_x->dims()[0];
args.in_depth = in_x->dims()[1];
args.in_rows = in_x->dims()[2];
args.in_cols = in_x->dims()[3];
// std::cout <<"In N: " << args.batch << " C: " << args.in_depth
// << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
// std::cout <<"Filter O: " << static_cast<int>(filter->dims()[0])
// << " I: " << static_cast<int>(filter->dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args.stride_rows = strides[0];
args.stride_cols = strides[1];
// std::cout <<"PoolingType: " << args.pool_type << "\n";
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args.pad_rows = paddings[0];
args.pad_cols = paddings[1];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
    args.out_depth = args.in_depth;
    // apply global pooling before deriving the output size; otherwise the
    // stale ksize would be used and out_rows/out_cols would not be 1
    if (is_global_pooling) {
      args.filter_rows = args.in_rows;
      args.filter_cols = args.in_cols;
      args.pad_rows = 0;
      args.pad_cols = 0;
    }
    // args.out_rows = out->dims()[2];
    // args.out_cols = out->dims()[3];
    args.out_rows = static_cast<int>(ceil(static_cast<float>(args.in_rows +
                                                             2 * args.pad_rows -
                                                             args.filter_rows) /
                                          args.stride_rows)) +
                    1;
    args.out_cols = static_cast<int>(ceil(static_cast<float>(args.in_cols +
                                                             2 * args.pad_cols -
                                                             args.filter_cols) /
                                          args.stride_cols)) +
                    1;
}
acl::AclParameters args;
};
template <>
bool PoolKernel<GPU_MALI, float>::Init(const PoolParam& param) const {
AclPoolOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclPoolOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void PoolKernel<GPU_MALI, float>::Compute(const PoolParam& param) const {
std::cout << "init acl" << std::endl;
AclPoolOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
for (int n = 0; n < args.batch; ++n) {
acl_op->RunAcl((void*)input_data, (void*)output_data);
input_data += args.in_depth * args.in_cols * args.in_rows;
output_data += args.in_depth * args.out_cols * args.out_rows;
}
}
template class PoolKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
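The pool kernel derives the output spatial size by hand instead of reading it back from out->dims(). A minimal self-check sketch of that arithmetic (the helper name is illustrative; only the formula is taken from AclParametersByContext above):

#include <cmath>
// out = ceil((in + 2*pad - filter) / stride) + 1, matching the
// arm_compute::DimensionRoundingType::CEIL rounding chosen in InitAclLayer.
static int PoolOutSizeCeil(int in, int filter, int pad, int stride) {
  return static_cast<int>(
             std::ceil(static_cast<float>(in + 2 * pad - filter) / stride)) +
         1;
}
// Example: a 7x7 input, 2x2 window, stride 2, pad 0 ->
// PoolOutSizeCeil(7, 2, 0, 2) == 4, i.e. a 4x4 output.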
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP
#pragma once
#include "operators/kernel/relu_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclReluOp : public acl::ACLOperator {
public:
AclReluOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU;
}
~AclReluOp() = default;
AclReluOp(const AclReluOp&) = delete;
AclReluOp& operator=(const AclReluOp&) = delete;
AclReluOp(AclReluOp&&) = delete;
AclReluOp& operator=(AclReluOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const ReluParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols * args.in_rows *
args.in_depth * args.batch);
arm_compute::TensorShape output_shape(args.in_cols * args.in_rows *
args.in_depth * args.out_num);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute::ActivationLayerInfo::ActivationFunction type;
type = arm_compute::ActivationLayerInfo::ActivationFunction::RELU;
arm_compute::ActivationLayerInfo act_info(type);
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(activation, this, act_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const ReluParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // fall back when the ACL path is force-bypassed for this op type
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const ReluParam& param) {
const auto* input_x = param.InputX();
auto* out = param.Out();
const T* input_data = input_x->data<T>();
T* output_data = out->mutable_data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.batch = input_x->dims()[0];
args.in_depth = input_x->dims()[1];
args.in_rows = input_x->dims()[2];
args.in_cols = input_x->dims()[3];
args.out_num = out->dims()[0];
}
acl::AclParameters args;
};
template <>
bool ReluKernel<GPU_MALI, float>::Init(const ReluParam& param) const {
AclReluOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclReluOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void ReluKernel<GPU_MALI, float>::Compute(const ReluParam& param) const {
std::cout << "init acl" << std::endl;
AclReluOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
acl_op->RunAcl((void*)input_data, (void*)output_data);
}
template class ReluKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
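The ACL-backed kernels in this commit (pool, relu, softmax) all follow the same three-step protocol. A hedged usage sketch, with param construction elided because it is framework-specific:

// AclReluOp<GPU_MALI, float> op;
// if (!op.Bypass_acl(param)) {   // fills op.getargs() from param, checks limits
//   op.InitAclLayer(param);      // builds ACL tensors/layer once per input shape
//   acl::AclParameters &args = op.getargs();
//   op.RunAcl(args.input_data, args.output_data);  // runs the OpenCL layer
// }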
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP
#pragma once
#include "operators/kernel/reshape_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ReshapeKernel<GPU_MALI, float>::Init(const ReshapeParam &para) const {
return true;
}
template <>
void ReshapeKernel<GPU_MALI, float>::Compute(const ReshapeParam &param) const {
const auto *input_x = param.InputX();
const auto &input_x_dims = input_x->dims();
auto *out = param.Out();
framework::DDim out_dims = out->dims();
const auto *input_shape = param.InputShape();
if (input_shape) {
auto *shape_data = input_shape->data<int>();
framework::Tensor cpu_shape_tensor;
auto shape =
std::vector<int>(shape_data, shape_data + input_shape->numel());
out_dims = ValidateShape(shape, input_x->dims());
}
bool inplace = param.Inplace();
out->Resize(out_dims);
if (!inplace) {
out->mutable_data<float>();
framework::TensorCopy(*input_x, out);
out->Resize(out_dims);
} else {
out->ShareDataWith(*input_x);
out->Resize(out_dims);
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
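ValidateShape is declared in reshape_kernel.h and its body is not shown here. The conventional rule such a helper implements, sketched under that assumption (illustrative name, not the actual source), is that a single -1 entry is inferred so the element count is preserved:

#include <vector>
static std::vector<int> InferReshapeSketch(std::vector<int> shape, int numel) {
  int known = 1;
  int neg = -1;
  for (int i = 0; i < static_cast<int>(shape.size()); ++i) {
    if (shape[i] == -1) {
      neg = i;  // at most one -1 entry is allowed
    } else {
      known *= shape[i];
    }
  }
  if (neg >= 0) {
    shape[neg] = numel / known;  // preserve the total element count
  }
  return shape;
}
// Example: numel = 24, shape = {2, -1, 4} -> {2, 3, 4}.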
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#pragma once
#include "operators/kernel/softmax_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclSoftmaxOp : public acl::ACLOperator {
public:
AclSoftmaxOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX;
}
~AclSoftmaxOp() = default;
AclSoftmaxOp(const AclSoftmaxOp&) = delete;
AclSoftmaxOp& operator=(const AclSoftmaxOp&) = delete;
AclSoftmaxOp(AclSoftmaxOp&&) = delete;
AclSoftmaxOp& operator=(AclSoftmaxOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const SoftmaxParam& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape shape(args.in_depth, args.batch);
if (is_operator_init_done(shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
//[width, height, IFM]
new_tensor(input(), shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), shape, args.output_data);
acl_configure(softmax, this, NULL);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const SoftmaxParam& param) {
bool bypass_acl = false;
AclParametersByContext(param);
    // fall back when the ACL path is force-bypassed for this op type
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const SoftmaxParam& param) {
const framework::Tensor* in_x = param.InputX();
framework::Tensor* out = param.Out();
auto x_dims = in_x->dims();
out->Resize(x_dims);
const T* input_data = in_x->data<T>();
    T* output_data = out->mutable_data<T>();  // allocate before writing
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
// NCHW
args.batch = in_x->dims()[0];
args.in_depth = in_x->dims()[1];
args.out_num = out->dims()[0];
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl::AclParameters args;
};
template <>
bool SoftmaxKernel<GPU_MALI, float>::Init(const SoftmaxParam& param) const {
AclSoftmaxOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclSoftmaxOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
return true;
}
template <>
void SoftmaxKernel<GPU_MALI, float>::Compute(const SoftmaxParam& param) const {
std::cout << "init acl" << std::endl;
AclSoftmaxOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
if (acl_op->Bypass_acl(param)) {
std::cout << "init acl failed" << std::endl;
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
acl_op->InitAclLayer(param);
for (int n = 0; n < args.out_num; ++n) {
acl_op->RunAcl((void*)input_data, (void*)output_data);
input_data += args.in_depth;
output_data += args.in_depth;
}
}
template class SoftmaxKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
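Compute above applies softmax row by row: for an [N, C] input, RunAcl is called N times with both pointers advanced by C floats per iteration. As a scalar reference for what each call normalizes (a sketch; whether ACL subtracts the row max internally is not visible from this file):

#include <algorithm>
#include <cmath>
static void SoftmaxRowReference(const float *x, float *y, int c) {
  float max_v = x[0];
  for (int i = 1; i < c; ++i) max_v = std::max(max_v, x[i]);  // for stability
  float sum = 0.f;
  for (int i = 0; i < c; ++i) {
    y[i] = std::exp(x[i] - max_v);
    sum += y[i];
  }
  for (int i = 0; i < c; ++i) y[i] /= sum;
}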
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef MUL_OP

#pragma once

#include "framework/operator.h"
#include "operators/math/math_function.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class MulKernel : public framework::OpKernelBase<DeviceType, MulParam> {
 public:
  void Compute(const MulParam &param) const;
  bool Init(const MulParam &para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef MULTICLASSNMS_OP

#pragma once

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class MultiClassNMSKernel
    : public framework::OpKernelBase<DeviceType, MultiClassNMSParam> {
 public:
  void Compute(const MultiClassNMSParam& param) const;
  bool Init(const MultiClassNMSParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef POOL_OP

#pragma once

#include "framework/operator.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class PoolKernel : public OpKernelBase<DeviceType, PoolParam> {
 public:
  void Compute(const PoolParam &param) const override;
  bool Init(const PoolParam &para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PRIORBOX_OP

#pragma once

#include <algorithm>
#include <cmath>
#include <vector>

#include "framework/operator.h"
#include "operators/math/transform.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class PriorBoxKernel
    : public framework::OpKernelBase<DeviceType, PriorBoxParam> {
 public:
  void Compute(const PriorBoxParam& param) const;
  bool Init(const PriorBoxParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef RELU_OP

#pragma once

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class ReluKernel : public framework::OpKernelBase<DeviceType, ReluParam> {
 public:
  void Compute(const ReluParam& param) const;
  bool Init(const ReluParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef RESHAPE_OP

#pragma once

#include <vector>

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class ReshapeKernel : public framework::OpKernelBase<DeviceType, ReshapeParam> {
 public:
  void Compute(const ReshapeParam& param) const;
  bool Init(const ReshapeParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef SIGMOID_OP

#pragma once

#include "framework/operator.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class SigmoidKernel : public OpKernelBase<DeviceType, SigmoidParam> {
 public:
  void Compute(const SigmoidParam& param) const override;
  bool Init(const SigmoidParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef SOFTMAX_OP

#pragma once

#include "framework/operator.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class SoftmaxKernel : public OpKernelBase<DeviceType, SoftmaxParam> {
 public:
  void Compute(const SoftmaxParam &param) const override;
  bool Init(const SoftmaxParam &para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef TRANSPOSE_OP

#pragma once

#include <vector>

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class TransposeKernel
    : public framework::OpKernelBase<DeviceType, TransposeParam> {
 public:
  void Compute(const TransposeParam& param) const;
  bool Init(const TransposeParam& para) const;
};
} // namespace operators
} // namespace paddle_mobile

#endif
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef LRN_OP

#include "lrn_op.h"

namespace paddle_mobile {
namespace operators {

template <typename Dtype, typename T>
void LrnOp<Dtype, T>::InferShape() const {
  auto x_dims = this->param_.InputX()->dims();
  this->param_.Out()->Resize(x_dims);
}
template class LrnOp<CPU, float>;

} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(lrn);
REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(lrn);
REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif

#endif
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef LRN_OP

#pragma once

#include <string>

namespace paddle_mobile {
namespace operators {

using std::string;
template <typename DeviceType, typename T>
class LrnOp : public framework::OperatorWithKernel<
                  DeviceType, LrnParam, operators::LrnKernel<DeviceType, T>> {
 public:
  LrnOp(const string &type, const VariableNameMap &inputs,
        const VariableNameMap &outputs, const framework::AttributeMap &attrs,
        std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, LrnParam,
                                      operators::LrnKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, LrnParam,
      operators::LrnKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 protected:
};
} // namespace operators
} // namespace paddle_mobile

#endif
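With this change the dispatch that LrnOp::RunImpl used to do by hand moves into the OperatorWithKernel base, now templated on the param and kernel types. Conceptually the base provides something like the following (a sketch of the pattern only, not the actual framework source):

// template <typename Dev, typename ParamT, typename KernelT>
// class OperatorWithKernel : public OperatorBase<Dev> {
//  public:
//   void RunImpl() const { kernel_.Compute(param_); }
//  protected:
//   ParamT param_;    // built from inputs/outputs/attrs/scope
//   KernelT kernel_;  // e.g. LrnKernel<Dev, T>
// };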
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if __ARM_NEON
#include <arm_neon.h>
#endif
#include "framework/ddim.h"
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::DDim;
using framework::Tensor;
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
int padding, int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
return output_size;
}
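// Worked example: input_size 224, filter_size 3, dilation 1, padding 1,
// stride 2 -> dkernel = 3 and output_size = (224 + 2 - 3) / 2 + 1 = 112.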
inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
auto bias_ptr = bias.data<float>();
const DDim bias_ddim = bias.dims();
PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1,
"the bias tensor's dims size != 1")
DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1);
DDim inner_ddim =
paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size());
int outer_size = paddle_mobile::framework::product(outer_ddim);
int inner_size = paddle_mobile::framework::product(inner_ddim);
bias.Resize(dDim);
auto new_ptr = bias.mutable_data<float>();
int axis_size = dDim[axis];
#if __ARM_NEON
for (int i = 0; i < outer_size; ++i) {
int inner_num = inner_size >> 4;
int remain = inner_size - (inner_num << 4);
float v_bias = bias_ptr[i * axis_size / outer_size];
for (; inner_num > 0; inner_num--) {
float32x4_t v_newptr1 = vdupq_n_f32(v_bias);
float32x4_t v_newptr2 = vdupq_n_f32(v_bias);
float32x4_t v_newptr3 = vdupq_n_f32(v_bias);
float32x4_t v_newptr4 = vdupq_n_f32(v_bias);
vst1q_f32(new_ptr, v_newptr1);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr2);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr3);
new_ptr += 4;
vst1q_f32(new_ptr, v_newptr4);
new_ptr += 4;
}
for (; remain > 0; remain--) {
*new_ptr = v_bias;
new_ptr++;
}
}
#else
for (int i = 0; i < outer_size; ++i) {
float v_bias = bias_ptr[i * axis_size / outer_size];
for (int j = 0; j < inner_size; ++j) {
new_ptr[i * inner_size + j] = v_bias;
}
}
#endif
}
inline bool IsExpand(const std::vector<int64_t> &filter_dim,
const std::vector<int> &strides,
const std::vector<int> &paddings,
const std::vector<int> &dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
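Two common cases make the IsExpand decision above concrete (dims are illustrative {out_c, in_c, kh, kw} values):

// IsExpand({oc, ic, 1, 1}, {1, 1}, {0, 0}, {1, 1}) == false  // 1x1 conv: plain GEMM
// IsExpand({oc, ic, 3, 3}, {2, 2}, {0, 0}, {1, 1}) == true   // 3x3 stride-2: im2col first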
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/depthwiseconv3x3s1p1.h"
#include <arm_neon.h>
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
void DepthwiseConv3x3s1p1(const Tensor *input, Tensor filter, Tensor *output,
Tensor bias, bool if_bias) {
const float *input_data = input->data<float>();
const float *filter_data = filter.data<float>();
float *output_data = output->data<float>();
const float *bias_data = bias.data<float>();
const int h = static_cast<int>(input->dims()[2]);
const int w = static_cast<int>(input->dims()[3]);
  const int l = h;  // this kernel assumes square feature maps (h == w)
const int batch_size = static_cast<int>(input->dims()[0]);
const int c = static_cast<int>(input->dims()[1]);
const int hxw = h * w;
float32x4_t vbias = vdupq_n_f32(0.0);
for (int b = 0; b < batch_size; ++b) {
const float *filter_data_tmp = filter_data;
for (int j = 0; j < c; ++j) {
      float bias_val = 0.f;
      if (if_bias) {
        bias_val = bias_data[j];
        vbias = vdupq_n_f32(bias_val);
      }
int l_mid = l - 2; // l=1->l_mid=-1,l=2->l_mid=0
float w00 = filter_data_tmp[0];
float w01 = filter_data_tmp[1];
float w02 = filter_data_tmp[2];
float w10 = filter_data_tmp[3];
float w11 = filter_data_tmp[4];
float w12 = filter_data_tmp[5];
float w20 = filter_data_tmp[6];
float w21 = filter_data_tmp[7];
float w22 = filter_data_tmp[8];
      // four corners of the output (scalar); bias_val is 0 when !if_bias,
      // so bias_data is never read for bias-less layers
      output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
                       w21 * input_data[l] + w22 * input_data[l + 1] +
                       bias_val;
      output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
                           w20 * input_data[2 * l - 2] +
                           w21 * input_data[2 * l - 1] + bias_val;
      output_data[(l - 1) * l] =
          w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
          w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1] +
          bias_val;
      output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
                               w01 * input_data[(l - 2) * (l + 1) + 1] +
                               w10 * input_data[l * l - 2] +
                               w11 * input_data[l * l - 1] + bias_val;
      // left and right output columns (scalar, padded taps dropped)
      for (int i = 1; i < l - 1; ++i) {
        output_data[i * l] =
            w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
            w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
            w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1] +
            bias_val;
        output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
                                     w01 * input_data[i * l + l - 1 - l] +
                                     w10 * input_data[i * l + l - 1 - 1] +
                                     w11 * input_data[i * l + l - 1] +
                                     w20 * input_data[i * l + l - 1 + l - 1] +
                                     w21 * input_data[i * l + l - 1 + l] +
                                     bias_val;
}
// top 1 row and bottom 1 row
const float *input_tmp = input_data;
float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
tmp3, tmp4, tmp5, out0;
in0 = vld1q_f32(input_tmp);
in2 = vld1q_f32(input_tmp + l);
const float *input_tmp_end = input_tmp + (l - 2) * l;
in4 = vld1q_f32(input_tmp_end);
in6 = vld1q_f32(input_tmp_end + l);
int c_mid = l_mid;
auto output_ptr = output_data + 1;
for (; c_mid > 3; c_mid -= 4) {
in1 = vld1q_f32(input_tmp + 4);
in3 = vld1q_f32(input_tmp + l + 4);
tmp0 = vextq_f32(in0, in1, 1);
tmp1 = vextq_f32(in0, in1, 2);
tmp2 = vextq_f32(in2, in3, 1);
tmp3 = vextq_f32(in2, in3, 2);
out0 = vmulq_n_f32(in0, w10);
out0 = vmlaq_n_f32(out0, tmp0, w11);
out0 = vmlaq_n_f32(out0, tmp1, w12);
out0 = vmlaq_n_f32(out0, in2, w20);
out0 = vmlaq_n_f32(out0, tmp2, w21);
out0 = vmlaq_n_f32(out0, tmp3, w22);
out0 = vaddq_f32(out0, vbias);
vst1q_f32(output_ptr, out0);
in5 = vld1q_f32(input_tmp_end + 4);
in7 = vld1q_f32(input_tmp_end + l + 4);
tmp0 = vextq_f32(in4, in5, 1);
tmp1 = vextq_f32(in4, in5, 2);
tmp2 = vextq_f32(in6, in7, 1);
tmp3 = vextq_f32(in6, in7, 2);
out0 = vmulq_n_f32(in4, w00);
out0 = vmlaq_n_f32(out0, tmp0, w01);
out0 = vmlaq_n_f32(out0, tmp1, w02);
out0 = vmlaq_n_f32(out0, in6, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias);
vst1q_f32(output_ptr + (l - 1) * l, out0);
        // could be optimized further to process 8 elements per iteration.
input_tmp += 4;
input_tmp_end += 4;
output_ptr += 4;
in0 = in1;
in2 = in3;
in4 = in5;
in6 = in7;
}
// top right pad
float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]);
tmp0 = vextq_f32(in0, pad0, 1);
tmp1 = vextq_f32(in0, pad0, 2);
tmp2 = vextq_f32(in2, pad1, 1);
tmp3 = vextq_f32(in2, pad1, 2);
out0 = vmulq_n_f32(in0, w10);
out0 = vmlaq_n_f32(out0, tmp0, w11);
out0 = vmlaq_n_f32(out0, tmp1, w12);
out0 = vmlaq_n_f32(out0, in2, w20);
out0 = vmlaq_n_f32(out0, tmp2, w21);
out0 = vmlaq_n_f32(out0, tmp3, w22);
out0 = vaddq_f32(out0, vbias);
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + i, out0, 0);
}
if (i == 1) {
vst1q_lane_f32(output_ptr + i, out0, 1);
}
if (i == 2) {
vst1q_lane_f32(output_ptr + i, out0, 2);
}
}
// bottom right pad
float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]);
float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]);
tmp0 = vextq_f32(in4, pad2, 1);
tmp1 = vextq_f32(in4, pad2, 2);
tmp2 = vextq_f32(in6, pad3, 1);
tmp3 = vextq_f32(in6, pad3, 2);
out0 = vmulq_n_f32(in4, w00);
out0 = vmlaq_n_f32(out0, tmp0, w01);
out0 = vmlaq_n_f32(out0, tmp1, w02);
out0 = vmlaq_n_f32(out0, in6, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vaddq_f32(out0, vbias);
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0);
}
if (i == 1) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1);
}
if (i == 2) {
vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2);
}
}
// mid
for (int i = 0; i < l - 2; ++i) {
auto output_ptr = output_data + (i + 1) * l + 1;
input_tmp = input_data + i * l;
auto in0_tmp = vld1q_f32(input_tmp);
auto in2_tmp = vld1q_f32(input_tmp + l);
auto in4_tmp = vld1q_f32(input_tmp + l + l);
c_mid = l_mid;
for (; c_mid > 3; c_mid -= 4) {
auto in1_tmp = vld1q_f32(input_tmp + 4);
auto in3_tmp = vld1q_f32(input_tmp + l + 4);
auto in5_tmp = vld1q_f32(input_tmp + l + l + 4);
tmp0 = vextq_f32(in0_tmp, in1_tmp, 1);
tmp1 = vextq_f32(in0_tmp, in1_tmp, 2);
tmp2 = vextq_f32(in2_tmp, in3_tmp, 1);
tmp3 = vextq_f32(in2_tmp, in3_tmp, 2);
tmp4 = vextq_f32(in4_tmp, in5_tmp, 1);
tmp5 = vextq_f32(in4_tmp, in5_tmp, 2);
out0 = vmulq_n_f32(in0_tmp, w00);
out0 = vmlaq_n_f32(out0, tmp0, w01);
out0 = vmlaq_n_f32(out0, tmp1, w02);
out0 = vmlaq_n_f32(out0, in2_tmp, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vmlaq_n_f32(out0, in4_tmp, w20);
out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22);
out0 = vaddq_f32(out0, vbias);
vst1q_f32(output_ptr, out0);
output_ptr += 4;
input_tmp += 4;
in0_tmp = in1_tmp;
in2_tmp = in3_tmp;
in4_tmp = in5_tmp;
}
float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]);
float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]);
float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]);
tmp0 = vextq_f32(in0_tmp, pad0, 1);
tmp1 = vextq_f32(in0_tmp, pad0, 2);
tmp2 = vextq_f32(in2_tmp, pad1, 1);
tmp3 = vextq_f32(in2_tmp, pad1, 2);
tmp4 = vextq_f32(in4_tmp, pad2, 1);
tmp5 = vextq_f32(in4_tmp, pad2, 2);
out0 = vmulq_n_f32(in0_tmp, w00);
out0 = vmlaq_n_f32(out0, tmp0, w01);
out0 = vmlaq_n_f32(out0, tmp1, w02);
out0 = vmlaq_n_f32(out0, in2_tmp, w10);
out0 = vmlaq_n_f32(out0, tmp2, w11);
out0 = vmlaq_n_f32(out0, tmp3, w12);
out0 = vmlaq_n_f32(out0, in4_tmp, w20);
out0 = vmlaq_n_f32(out0, tmp4, w21);
out0 = vmlaq_n_f32(out0, tmp5, w22);
out0 = vaddq_f32(out0, vbias);
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + i, out0, 0);
}
if (i == 1) {
vst1q_lane_f32(output_ptr + i, out0, 1);
}
if (i == 2) {
vst1q_lane_f32(output_ptr + i, out0, 2);
}
}
}
output_data += hxw;
input_data += hxw;
filter_data_tmp += 9;
}
}
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
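A hedged usage sketch for the kernel above; tensor construction is framework-specific and elided. All tensors are float32 NCHW, the spatial size must be square (see const int l = h;), and with a 3x3 kernel, stride 1, pad 1 the output keeps the input's h x w:

// framework::Tensor in;      // [n, c, h, h]
// framework::Tensor filter;  // 9 weights per channel, consumed in order
// framework::Tensor out;     // [n, c, h, h], pre-allocated
// framework::Tensor bias;    // [c], read only when if_bias is true
// math::DepthwiseConv3x3s1p1(&in, filter, &out, bias, /*if_bias=*/true);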
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "framework/tensor.h"

namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;

void DepthwiseConv3x3s1p1(const Tensor *input, Tensor filter, Tensor *output,
                          Tensor bias, bool if_bias);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
See the License for the specific language governing permissions and
limitations under the License. */

#include "operators/math/gemm.h"
#include "common/log.h"
#include "memory/t_malloc.h"
#ifndef X86
#include <arm_neon.h>
#endif

namespace paddle_mobile {
namespace operators {
namespace math {

alignas(64) float packedA[MC * KC];
alignas(64) float packedB[KC * NC];
alignas(64) float ab[MR * NR];

// Pack a block of matrix A into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                 float *buffer) {
  int i, j;
  const float *Aij;
  for (i = 0; i < m - m_tail; i += MR) {
    for (j = 0; j < k; ++j) {
      Aij = &A(i, j);
      *buffer++ = *Aij;
      *buffer++ = *(Aij + 1);
      *buffer++ = *(Aij + 2);
      *buffer++ = *(Aij + 3);
    }
  }
  if (m_tail != 0) {
    for (j = 0; j < k; ++j) {
      Aij = &A(m - m_tail, j);
      for (i = 0; i < m_tail; ++i) {
        *buffer++ = *(Aij + i);
      }
      for (i = m_tail; i < MR; ++i) {
        *buffer++ = 0;
      }
    }
  }
}

// Pack a block of matrix A into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
                  float *buffer) {
  int i, j;
  const float *Ai, *Ai1, *Ai2, *Ai3;
  for (i = 0; i < m - m_tail; i += MR) {
    Ai = &A(i, 0);
    Ai1 = &A(i + 1, 0);
    Ai2 = &A(i + 2, 0);
    Ai3 = &A(i + 3, 0);
    for (j = 0; j < k; ++j) {
      *buffer++ = *Ai++;
      *buffer++ = *Ai1++;
      *buffer++ = *Ai2++;
      *buffer++ = *Ai3++;
    }
  }
  if (m_tail != 0) {
    for (j = 0; j < k; ++j) {
      for (i = m - m_tail; i < m; ++i) {
        *buffer++ = A(i, j);
      }
      for (i = m; i < m + (MR - m_tail); ++i) {
        *buffer++ = 0;
      }
    }
  }
}

// Pack a block of matrix B into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                 float *buffer) {
  int i, j;
  const float *Bj, *Bj1, *Bj2, *Bj3;
  for (j = 0; j < n - n_tail; j += NR) {
    Bj = &B(0, j);
    Bj1 = &B(0, j + 1);
    Bj2 = &B(0, j + 2);
    Bj3 = &B(0, j + 3);
    for (i = 0; i < k; ++i) {
      *buffer++ = *Bj++;
      *buffer++ = *Bj1++;
      *buffer++ = *Bj2++;
      *buffer++ = *Bj3++;
    }
  }
  if (n_tail != 0) {
    for (i = 0; i < k; ++i) {
      for (int j = n - n_tail; j < n; ++j) {
        *buffer++ = B(i, j);
      }
      for (int j = n; j < n + (NR - n_tail); ++j) {
        *buffer++ = 0;
      }
    }
  }
}

// Pack a block of matrix B into contiguous memory (RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
                  float *buffer) {
  int i, j;
  const float *Bij;
  for (j = 0; j < n - n_tail; j += NR) {
    for (i = 0; i < k; ++i) {
      Bij = &B(i, j);
      // copy NR (= 4) consecutive floats with one NEON load/store
      asm volatile(
          "vld1.32 {q0}, [%[Bij]]        \n\t"
          "vst1.32 {q0}, [%[buffer]]!    \n\t"
          : [buffer] "+r"(buffer)
          : [Bij] "r"(Bij)
          : "memory", "q0");
    }
  }
  if (n_tail != 0) {
    for (i = 0; i < k; ++i) {
      Bij = &B(i, n - n_tail);
      for (int j = n - n_tail; j < n; ++j) {
        *buffer++ = *Bij++;
      }
      for (int j = n; j < n + (NR - n_tail); ++j) {
        *buffer++ = 0;
      }
    }
  }
}

// Blocked matrix multiplication over one MC x NC panel of C
void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
                 const float *B, int ldb, float beta, float *C, int ldc,
                 int first_time) {
  int m_block = (m + MR - 1) / MR * MR;
  int n_block = (n + NR - 1) / NR * NR;

  int m_tail = m % MR;
  int n_tail = n % NR;

  if (first_time) {
    PackMatrixB_(k, n, n_tail, B, ldb, packedB);
  }
  PackMatrixA_(m, k, m_tail, A, lda, packedA);

  int i, j, mc, nc;

  // take NR columns of B at a time (packed, cache-warm)
  for (j = 0; j < n_block; j += NR) {
    nc = (n - j) < NR ? n_tail : NR;
    // take MR rows of A at a time (packed, cache-warm)
    for (i = 0; i < m_block; i += MR) {
      mc = (m - i) < MR ? m_tail : MR;
      AddDot4x4(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
                &C(i, j), ldc, mc, nc);
    }
  }
}

// Blocked matrix multiplication with fused ReLU
void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda,
                      const float *B, int ldb, float beta, float *C, int ldc,
                      int first_time, bool relu = false) {
  int m_block = (m + MR - 1) / MR * MR;
  int n_block = (n + NR - 1) / NR * NR;

  int m_tail = m % MR;
  int n_tail = n % NR;

  if (first_time) {
    PackMatrixB_(k, n, n_tail, B, ldb, packedB);
  }
  PackMatrixA_(m, k, m_tail, A, lda, packedA);

  int i, j, mc, nc;

  // take NR columns of B at a time (packed, cache-warm)
  for (j = 0; j < n_block; j += NR) {
    nc = (n - j) < NR ? n_tail : NR;
    // take MR rows of A at a time (packed, cache-warm)
    for (i = 0; i < m_block; i += MR) {
      mc = (m - i) < MR ? m_tail : MR;
      AddDot4x4_relu(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
                     &C(i, j), ldc, mc, nc, relu);
    }
  }
}
// Compute a smaller 4 x 4 block of the C matrix
#if defined(IOS)
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc) {
// init C
float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0);
float32x4_t cv2 = vdupq_n_f32(0.0);
float32x4_t cv3 = vdupq_n_f32(0.0);
float32x4_t av;
float32x4_t bv;
float32x2_t av01;
float32x2_t av23;
for (int p = 0; p < k; p += 1) {
av = vld1q_f32(a);
bv = vld1q_f32(b);
av01 = vget_low_f32(av);
cv0 = vmlaq_lane_f32(cv0, bv, av01, 0);
cv1 = vmlaq_lane_f32(cv1, bv, av01, 1);
av23 = vget_high_f32(av);
cv2 = vmlaq_lane_f32(cv2, bv, av23, 0);
cv3 = vmlaq_lane_f32(cv3, bv, av23, 1);
a += MR;
b += NR;
}
float32x4x4_t cv = {cv0, cv1, cv2, cv3};
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (j == 0) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0);
} else if (j == 1) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1);
} else if (j == 2) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2);
} else if (j == 3) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3);
}
}
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu = false) {
// init C
float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0);
float32x4_t cv2 = vdupq_n_f32(0.0);
float32x4_t cv3 = vdupq_n_f32(0.0);
float32x4_t av;
float32x4_t bv;
float32x2_t av01;
float32x2_t av23;
for (int p = 0; p < k; p += 1) {
av = vld1q_f32(a);
bv = vld1q_f32(b);
av01 = vget_low_f32(av);
cv0 = vmlaq_lane_f32(cv0, bv, av01, 0);
cv1 = vmlaq_lane_f32(cv1, bv, av01, 1);
av23 = vget_high_f32(av);
cv2 = vmlaq_lane_f32(cv2, bv, av23, 0);
cv3 = vmlaq_lane_f32(cv3, bv, av23, 1);
a += MR;
b += NR;
}
float32x4x4_t cv = {cv0, cv1, cv2, cv3};
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (j == 0) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0);
} else if (j == 1) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1);
} else if (j == 2) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2);
} else if (j == 3) {
C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3);
}
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
}
}
#elif defined(ARMV7)
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc) {
int kc1 = k / 4, kc2 = k % 4;
int bytes_ldc = 4 * ldc;
int flag_alpha = (alpha == 1.0) ? 1 : 2;
int flag_beta;
if (beta == 0.0) {
flag_beta = 0;
} else if (beta == 1.0) {
flag_beta = 1;
} else {
flag_beta = 2;
}
asm volatile(
"pld [%[a]] \n\t"
"pld [%[b]] \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"pld [%[a], #64] \n\t"
"pld [%[b], #64] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t"
"vld1.32 {q0}, [%[a]]! \n\t"
"vld1.32 {q1}, [%[b]]! \n\t"
"vmla.f32 q10, q1, d0[0] \n\t"
"vmla.f32 q11, q1, d0[1] \n\t"
"vmla.f32 q12, q1, d1[0] \n\t"
"vmla.f32 q13, q1, d1[1] \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t"
"cmp %[mc], #4 \n\t"
"bne temp_%= \n\t"
"cmp %[nc], #4 \n\t"
"bne temp_%= \n\t"
"vmov.f32 d8[0], %[alpha] \n\t"
"vmov.f32 d8[1], %[beta] \n\t"
"cmp %[flag_alpha], #1 \n\t"
"bne alpha_%= \n\t"
"alpha_%=: \n\t"
"vmul.f32 q10, q10, d8[0] \n\t"
"vmul.f32 q11, q11, d8[0] \n\t"
"vmul.f32 q12, q12, d8[0] \n\t"
"vmul.f32 q13, q13, d8[0] \n\t"
"beta_%=: \n\t"
"cmp %[flag_beta], #0 \n\t"
"beq memory_%= \n\t"
"mov r4, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vld1.32 {q0}, [r4], r6 \n\t"
"vld1.32 {q1}, [r4], r6 \n\t"
"vld1.32 {q2}, [r4], r6 \n\t"
"vld1.32 {q3}, [r4] \n\t"
"cmp %[flag_beta], #1 \n\t"
"beq beta_eq1_%= \n\t"
"bne beta_ne1_%= \n\t"
"beta_eq1_%=: \n\t"
"vadd.f32 q10, q10, q0 \n\t"
"vadd.f32 q11, q11, q1 \n\t"
"vadd.f32 q12, q12, q2 \n\t"
"vadd.f32 q13, q13, q3 \n\t"
"b memory_%= \n\t"
"beta_ne1_%=: \n\t"
"vmla.f32 q10, q0, d8[1] \n\t"
"vmla.f32 q11, q1, d8[1] \n\t"
"vmla.f32 q12, q2, d8[1] \n\t"
"vmla.f32 q13, q3, d8[1] \n\t"
"memory_%=: \n\t"
"mov r5, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vst1.32 {q10}, [r5], r6 \n\t"
"vst1.32 {q11}, [r5], r6 \n\t"
"vst1.32 {q12}, [r5], r6 \n\t"
"vst1.32 {q13}, [r5] \n\t"
"b end_%= \n\t"
"temp_%=: \n\t"
"vst1.32 {q10, q11}, [%[ab]]!\n\t"
"vst1.32 {q12, q13}, [%[ab]] \n\t"
"end_%=: \n\t"
:
: [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1),
[kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
[beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
[flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
: "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13");
if (mc != MR || nc != NR) {
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
if (alpha != 1.0) {
C(i, j) = alpha * ab[i * MR + j];
} else {
C(i, j) = ab[i * MR + j];
}
} else {
if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * ab[i * MR + j];
} else {
C(i, j) += ab[i * MR + j];
}
}
}
}
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu = false) {
int kc1 = k / 4, kc2 = k % 4;
int bytes_ldc = 4 * ldc;
int flag_alpha = (alpha == 1.0) ? 1 : 2;
int flag_beta;
if (beta == 0.0) {
flag_beta = 0;
} else if (beta == 1.0) {
flag_beta = 1;
} else {
flag_beta = 2;
}
asm volatile(
"pld [%[a]] \n\t"
"pld [%[b]] \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"pld [%[a], #64] \n\t"
"pld [%[b], #64] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"vld1.32 {q0, q1}, [%[a]]! \n\t"
"vld1.32 {q2, q3}, [%[b]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q2, d0[1] \n\t"
"vmla.f32 q12, q2, d1[0] \n\t"
"vmla.f32 q13, q2, d1[1] \n\t"
"vmla.f32 q10, q3, d2[0] \n\t"
"vmla.f32 q11, q3, d2[1] \n\t"
"vmla.f32 q12, q3, d3[0] \n\t"
"vmla.f32 q13, q3, d3[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t"
"vld1.32 {q0}, [%[a]]! \n\t"
"vld1.32 {q1}, [%[b]]! \n\t"
"vmla.f32 q10, q1, d0[0] \n\t"
"vmla.f32 q11, q1, d0[1] \n\t"
"vmla.f32 q12, q1, d1[0] \n\t"
"vmla.f32 q13, q1, d1[1] \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t"
"cmp %[mc], #4 \n\t"
"bne temp_%= \n\t"
"cmp %[nc], #4 \n\t"
"bne temp_%= \n\t"
"vmov.f32 d8[0], %[alpha] \n\t"
"vmov.f32 d8[1], %[beta] \n\t"
"cmp %[flag_alpha], #1 \n\t"
"bne alpha_%= \n\t"
"alpha_%=: \n\t"
"vmul.f32 q10, q10, d8[0] \n\t"
"vmul.f32 q11, q11, d8[0] \n\t"
"vmul.f32 q12, q12, d8[0] \n\t"
"vmul.f32 q13, q13, d8[0] \n\t"
"beta_%=: \n\t"
"cmp %[flag_beta], #0 \n\t"
"beq memory_%= \n\t"
"mov r4, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vld1.32 {q0}, [r4], r6 \n\t"
"vld1.32 {q1}, [r4], r6 \n\t"
"vld1.32 {q2}, [r4], r6 \n\t"
"vld1.32 {q3}, [r4] \n\t"
"cmp %[flag_beta], #1 \n\t"
"beq beta_eq1_%= \n\t"
"bne beta_ne1_%= \n\t"
"beta_eq1_%=: \n\t"
"vadd.f32 q10, q10, q0 \n\t"
"vadd.f32 q11, q11, q1 \n\t"
"vadd.f32 q12, q12, q2 \n\t"
"vadd.f32 q13, q13, q3 \n\t"
"b memory_%= \n\t"
"beta_ne1_%=: \n\t"
"vmla.f32 q10, q0, d8[1] \n\t"
"vmla.f32 q11, q1, d8[1] \n\t"
"vmla.f32 q12, q2, d8[1] \n\t"
"vmla.f32 q13, q3, d8[1] \n\t"
"memory_%=: \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vmax.f32 q11, q11, q14 \n\t"
"vmax.f32 q12, q12, q14 \n\t"
"vmax.f32 q13, q13, q14 \n\t"
"mov r5, %[C] \n\t"
"mov r6, %[bytes_ldc]\n\t"
"vst1.32 {q10}, [r5], r6 \n\t"
"vst1.32 {q11}, [r5], r6 \n\t"
"vst1.32 {q12}, [r5], r6 \n\t"
"vst1.32 {q13}, [r5] \n\t"
"b end_%= \n\t"
"temp_%=: \n\t"
"vst1.32 {q10, q11}, [%[ab]]!\n\t"
"vst1.32 {q12, q13}, [%[ab]] \n\t"
"end_%=: \n\t"
:
: [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1),
[kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
[beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
[flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
: "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13",
"q14");
if (mc != MR || nc != NR) {
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
if (alpha != 1.0) {
C(i, j) = alpha * ab[i * MR + j];
} else {
C(i, j) = ab[i * MR + j];
}
} else {
if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * ab[i * MR + j];
} else {
C(i, j) += ab[i * MR + j];
}
}
if (relu) {
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
}
}
}
}
#else
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
               int ldb, float beta, float *C, int ldc, int mc, int nc) {
  float c[16] = {0};
  float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3;
  for (int p = 0; p < k; p += 1) {
    reg_b0 = *b++;
    reg_b1 = *b++;
    reg_b2 = *b++;
    reg_b3 = *b++;
reg_a0 = *a++;
reg_a1 = *a++;
reg_a2 = *a++;
reg_a3 = *a++;
// first row
c[0] += reg_a0 * reg_b0;
c[1] += reg_a0 * reg_b1;
c[2] += reg_a0 * reg_b2;
c[3] += reg_a0 * reg_b3;
// second row
c[4] += reg_a1 * reg_b0;
c[5] += reg_a1 * reg_b1;
c[6] += reg_a1 * reg_b2;
c[7] += reg_a1 * reg_b3;
// third row
c[8] += reg_a2 * reg_b0;
c[9] += reg_a2 * reg_b1;
c[10] += reg_a2 * reg_b2;
c[11] += reg_a2 * reg_b3;
// fourth row
c[12] += reg_a3 * reg_b0;
c[13] += reg_a3 * reg_b1;
c[14] += reg_a3 * reg_b2;
c[15] += reg_a3 * reg_b3;
}
int i, j;
for (i = 0; i < mc; ++i) {
for (j = 0; j < nc; ++j) {
if (beta == 0.0) {
C(i, j) = 0.0;
} else if (beta != 1.0) {
C(i, j) *= beta;
}
if (alpha != 1.0) {
C(i, j) += alpha * c[i * MR + j];
} else {
C(i, j) += c[i * MR + j];
}
}
}
}
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu) {
float c[16] = {0};
float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3;
  for (int p = 0; p < k; p += 1) {
    reg_b0 = *b++;
@@ -229,15 +731,26 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
      } else {
        C(i, j) += c[i * MR + j];
      }
if (relu) {
if (C(i, j) < 0) {
C(i, j) = 0;
}
}
    }
  }
}
#endif
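// Note (added): the sgemm below follows a GotoBLAS-style blocking scheme,
// as the loops that follow show: n is split into NC-wide panels, k into
// KC-deep panels and m into MC-tall panels, with the AddDot4x4 micro-kernel
// computing MR x NR (4 x 4) tiles. On every k panel except the first
// (p != 0), beta_ is forced to 1.0 so partial products accumulate into C
// instead of being rescaled by beta.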
// 32-bit float matrix multiplication
void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
           const float *B, int ldb, float beta, float *C, int ldc) {
  int i, j, p, mc, nc, kc;
  float beta_;
if (m == 1) {
VectorKernel(1, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
return;
}
  for (j = 0; j < n; j += NC) {
    nc = s_min(n - j, NC);
    for (p = 0; p < k; p += KC) {
@@ -256,6 +769,248 @@ void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
  }
}
void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc) {
int i, j, p, mc, nc, kc;
float beta_;
for (j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
for (p = 0; p < k; p += KC) {
kc = s_min(k - p, KC);
for (i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
if (p != 0) {
beta_ = 1.0;
} else {
beta_ = beta;
}
if (p + KC >= k) {
InnerKernel_relu(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb,
beta_, &C(i, j), ldc, i == 0, true);
} else {
InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_,
&C(i, j), ldc, i == 0);
}
}
}
}
}
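// Note (added): VectorKernel handles the m == 1 (vector times matrix) case
// dispatched from sgemm above. k is consumed four rows of B at a time
// (kc1 = k / 4, tail kc2), and n is tiled into 16-wide NEON blocks (nc1),
// 4-wide blocks (nc2) and a scalar tail (nc3). Results are staged in
// bufferC and folded into C at the end: accumulated when beta == 1.0,
// otherwise stored directly.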
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3;
float *c0, *C0;
int volatile kc1 = k / 4;
int volatile kc2 = k % 4;
int volatile nc1 = n / 16;
int _nc1 = n % 16;
int volatile nc2 = _nc1 / 4;
int volatile nc3 = _nc1 % 4;
for (int i = 0; i < kc1; i++) {
a0 = A + i * 4;
b0 = B + i * 4 * ldb;
b1 = b0 + ldb;
b2 = b1 + ldb;
b3 = b2 + ldb;
c0 = bufferC;
asm volatile(
"pld [%[a0], #16] \n\t"
"vld1.32 {q0}, [%[a0]] \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"cmp %[i], #0 \n\t"
"beq i_eq0_%= \n\t"
"bne i_ne0_%= \n\t"
"i_eq0_%=: \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"b gemm_nc1_%= \n\t"
"i_ne0_%=: \n\t"
"pld [%[c0], #64] \n\t"
"vld1.32 {q10, q11}, [%[c0]]! \n\t"
"vld1.32 {q12, q13}, [%[c0]] \n\t"
"sub %[c0], %[c0], #32 \n\t"
"gemm_nc1_%=: \n\t"
"pld [%[b0], #64] \n\t"
"vld1.32 {q2, q3}, [%[b0]]! \n\t"
"vld1.32 {q4, q5}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q3, d0[0] \n\t"
"vmla.f32 q12, q4, d0[0] \n\t"
"vmla.f32 q13, q5, d0[0] \n\t"
"pld [%[b1], #64] \n\t"
"vld1.32 {q2, q3}, [%[b1]]! \n\t"
"vld1.32 {q4, q5}, [%[b1]]! \n\t"
"vmla.f32 q10, q2, d0[1] \n\t"
"vmla.f32 q11, q3, d0[1] \n\t"
"vmla.f32 q12, q4, d0[1] \n\t"
"vmla.f32 q13, q5, d0[1] \n\t"
"pld [%[b2], #64] \n\t"
"vld1.32 {q2, q3}, [%[b2]]! \n\t"
"vld1.32 {q4, q5}, [%[b2]]! \n\t"
"vmla.f32 q10, q2, d1[0] \n\t"
"vmla.f32 q11, q3, d1[0] \n\t"
"vmla.f32 q12, q4, d1[0] \n\t"
"vmla.f32 q13, q5, d1[0] \n\t"
"pld [%[b3], #64] \n\t"
"vld1.32 {q2, q3}, [%[b3]]! \n\t"
"vld1.32 {q4, q5}, [%[b3]]! \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q4, d1[1] \n\t"
"vmla.f32 q13, q5, d1[1] \n\t"
"vst1.32 {q10, q11}, [%[c0]]! \n\t"
"vst1.32 {q12, q13}, [%[c0]]! \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"cmp %[i], #0 \n\t"
"beq ii_eq0_%= \n\t"
"bne ii_ne0_%= \n\t"
"ii_eq0_%=: \n\t"
"vmov.f32 q10, #0.0 \n\t"
"b gemm_nc2_%= \n\t"
"ii_ne0_%=: \n\t"
"pld [%[c0], #16] \n\t"
"vld1.32 {q10}, [%[c0]] \n\t"
"gemm_nc2_%=: \n\t"
"pld [%[b0], #16] \n\t"
"vld1.32 {q2}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"pld [%[b1], #16] \n\t"
"vld1.32 {q3}, [%[b1]]! \n\t"
"vmla.f32 q10, q3, d0[1] \n\t"
"pld [%[b2], #16] \n\t"
"vld1.32 {q4}, [%[b2]]! \n\t"
"vmla.f32 q10, q4, d1[0] \n\t"
"pld [%[b3], #16] \n\t"
"vld1.32 {q5}, [%[b3]]! \n\t"
"vmla.f32 q10, q5, d1[1] \n\t"
"vst1.32 {q10}, [%[c0]]! \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
: [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3),
[c0] "+r"(c0)
: [a0] "r"(a0), [i] "r"(i), [nc1] "r"(nc1), [nc2] "r"(nc2)
: "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13");
for (int j = 0; j < nc3; j++) {
if (i == 0) {
*c0 = (*a0) * (*b0++);
} else {
*c0 += (*a0) * (*b0++);
}
*c0 += (*(a0 + 1)) * (*b1++);
*c0 += (*(a0 + 2)) * (*b2++);
*c0 += (*(a0 + 3)) * (*b3++);
c0++;
}
}
for (int i = 0; i < kc2; ++i) {
a0 = A + 4 * kc1 + i;
b0 = B + (4 * kc1 + i) * ldb;
c0 = bufferC;
asm volatile(
"pld [%[a0], #16] \n\t"
"vld1.32 {d0}, [%[a0]] \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"pld [%[c0], #64] \n\t"
"vld1.32 {q10, q11}, [%[c0]]! \n\t"
"vld1.32 {q12, q13}, [%[c0]] \n\t"
"sub %[c0], %[c0], #32 \n\t"
"gemm_nc1_%=: \n\t"
"pld [%[b0], #64] \n\t"
"vld1.32 {q2, q3}, [%[b0]]! \n\t"
"vld1.32 {q4, q5}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vmla.f32 q11, q3, d0[0] \n\t"
"vmla.f32 q12, q4, d0[0] \n\t"
"vmla.f32 q13, q5, d0[0] \n\t"
"vst1.32 {q10, q11}, [%[c0]]! \n\t"
"vst1.32 {q12, q13}, [%[c0]]! \n\t"
"subs %[nc1], %[nc1], #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"pld [%[c0], #16] \n\t"
"vld1.32 {q10}, [%[c0]] \n\t"
"gemm_nc2_%=: \n\t"
"vld1.32 {q2}, [%[b0]]! \n\t"
"vmla.f32 q10, q2, d0[0] \n\t"
"vst1.32 {q10}, [%[c0]]! \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
: [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3),
[c0] "+r"(c0)
: [a0] "r"(a0), [nc1] "r"(nc1), [nc2] "r"(nc2)
: "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13");
for (int j = 0; j < nc3; j++) {
*c0 += (*a0) * (*b0++);
c0++;
}
}
c0 = bufferC;
C0 = C;
for (int i = 0; i < n; i++) {
if (beta == 1.0) {
*C0++ += *c0++;
} else {
*C0++ = *c0++;
}
}
}
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
@@ -20,9 +20,9 @@ limitations under the License. */
#define C(i, j) C[(i)*ldc + (j)]
// Block sizes for tiled computation; mc and kc correspond to the m and k of a tile
#define MC 128
#define KC 128
#define NC 1024
#define MR 4
#define NR 4
@@ -33,19 +33,19 @@ namespace operators {
namespace math {
// Pack a block of matrix A into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                 float *buffer);
// Pack a block of matrix B into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                 float *buffer);
// Pack a block of matrix A into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
                  float *buffer);
// Pack a block of matrix B into contiguous memory (RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
                  float *buffer);
// Tiled matrix multiplication
@@ -53,14 +53,25 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
                 const float *B, int ldb, float beta, float *C, int ldc,
                 int first_time);
// Vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc);
// Compute a smaller 4 * 4 tile of the C matrix
void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,
               int ldb, float beta, float *C, int ldc, int mc, int nc);
void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc,
bool relu);
// 32-bit float matrix multiplication
void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
           const float *B, int ldb, float beta, float *C, int ldc);
void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc);
// 64-bit double matrix multiplication
void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
           const double *B, int ldb, float beta, double *C, int ldc);
...
@@ -14,8 +14,10 @@ limitations under the License. */
#include "operators/math/im2col.h"
#include <vector>
#ifdef __ARM_NEON
#include "arm_neon.h"
#endif
#include "common/types.h" #include "common/types.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -65,9 +67,350 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> { ...@@ -65,9 +67,350 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
// are " "inconsistent."); // are " "inconsistent.");
int channels_col = im_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
const T *im_data = im.data<T>(); const T *im_data = im.data<T>();
T *col_data = col->data<T>(); T *col_data = col->data<T>();
#ifdef __ARM_NEON
const int osize = col_height;
const int isize = im_height;
bool pad1 = padding[0] > 0;
bool pad2 =
(pad1 &&
(((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 1 : 0));
int fill = isize % 2;
if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 &&
dilation[0] == 1) {
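      // Note (added): fast path for 3x3 filters with stride 1, same padding
      // and no dilation. Each of the nine im2col output planes (col0..col8,
      // one per filter tap) is just a shifted copy of the input image, so
      // interior rows are bulk-copied with 4-wide NEON loads/stores and only
      // the padded border rows/columns need the explicit zero fills below.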
for (int c = 0; c < im_channels; ++c) {
int oosize = osize * osize;
int nk4 = osize / 4;
int mk4 = osize % 4;
float *col0 = col_data + 0 * oosize + 2 * osize + 2;
float *col1 = col_data + 1 * oosize + 2 * osize + 1;
float *col2 = col_data + 2 * oosize + 2 * osize;
float *col3 = col_data + 3 * oosize + osize + 2;
float *col4 = col_data + 4 * oosize + osize + 1;
float *col5 = col_data + 5 * oosize + osize;
float *col6 = col_data + 6 * oosize + 2;
float *col7 = col_data + 7 * oosize + 1;
float *col8 = col_data + 8 * oosize;
float32x4_t im1;
const float *im_tmp_data = im_data + osize + 1;
int rrsize = oosize - osize - 1;
int nr4 = rrsize / 4;
int mr4 = rrsize % 4;
for (int i = 0; i < nr4; ++i) {
im1 = vld1q_f32(im_tmp_data);
vst1q_f32(col0, im1);
vst1q_f32(col1, im1);
vst1q_f32(col2, im1);
vst1q_f32(col3, im1);
vst1q_f32(col4, im1);
vst1q_f32(col5, im1);
vst1q_f32(col6, im1);
vst1q_f32(col7, im1);
vst1q_f32(col8, im1);
col0 += 4;
col1 += 4;
col2 += 4;
col3 += 4;
col4 += 4;
col5 += 4;
col6 += 4;
col7 += 4;
col8 += 4;
im_tmp_data += 4;
}
for (int i = 0; i < mr4; ++i) {
*col0 = *im_tmp_data;
*col1 = *im_tmp_data;
*col2 = *im_tmp_data;
*col3 = *im_tmp_data;
*col4 = *im_tmp_data;
*col5 = *im_tmp_data;
*col6 = *im_tmp_data;
*col7 = *im_tmp_data;
*col8 = *im_tmp_data;
col0++;
col1++;
col2++;
col3++;
col4++;
col5++;
col6++;
col7++;
col8++;
im_tmp_data++;
}
im_tmp_data = im_data + 1;
col0 = col_data + 0 * oosize + osize + 2;
col1 = col_data + 1 * oosize + osize + 1;
col2 = col_data + 2 * oosize + osize;
col3 = col_data + 3 * oosize + 2;
col4 = col_data + 4 * oosize + 1;
col5 = col_data + 5 * oosize;
for (int i = 0; i < nk4; i++) {
im1 = vld1q_f32(im_tmp_data);
vst1q_f32(col0, im1);
vst1q_f32(col1, im1);
vst1q_f32(col2, im1);
vst1q_f32(col3, im1);
vst1q_f32(col4, im1);
vst1q_f32(col5, im1);
col0 += 4;
col1 += 4;
col2 += 4;
col3 += 4;
col4 += 4;
col5 += 4;
im_tmp_data += 4;
}
for (int i = 0; i < mk4; i++) {
*col0 = *im_tmp_data;
*col1 = *im_tmp_data;
*col2 = *im_tmp_data;
*col3 = *im_tmp_data;
*col4 = *im_tmp_data;
*col5 = *im_tmp_data;
col0++;
col1++;
col2++;
col3++;
col4++;
col5++;
im_tmp_data++;
}
        // zero-fill the padded border columns: left edge of planes 0/3/6,
        // right edge of planes 2/5/8
for (int i = 0; i < osize; ++i) {
col_data[0 * oosize + i * osize] = 0.0;
col_data[3 * oosize + i * osize] = 0.0;
col_data[6 * oosize + i * osize] = 0.0;
col_data[2 * oosize + osize - 1 + i * osize] = 0.0;
col_data[5 * oosize + osize - 1 + i * osize] = 0.0;
col_data[8 * oosize + osize - 1 + i * osize] = 0.0;
}
col_data[0 * oosize + osize + 1] = im_data[0];
col_data[3 * oosize + 1] = im_data[0];
col_data[6 * oosize + 1] = im_data[osize];
col_data[1 * oosize + osize] = im_data[0];
col_data[4 * oosize] = im_data[0];
col_data[7 * oosize] = im_data[osize];
float32x4_t zero4;
zero4 = vdupq_n_f32(0.0);
auto col_z0 = col_data;
auto col_z1 = col_data + oosize;
auto col_z2 = col_data + 2 * oosize;
auto col_z6 = col_data + 6 * oosize + osize * (osize - 1);
auto col_z7 = col_data + 7 * oosize + osize * (osize - 1);
auto col_z8 = col_data + 8 * oosize + osize * (osize - 1);
for (int i = 0; i < nk4; ++i) {
vst1q_f32(col_z0, zero4);
vst1q_f32(col_z1, zero4);
vst1q_f32(col_z2, zero4);
vst1q_f32(col_z6, zero4);
vst1q_f32(col_z7, zero4);
vst1q_f32(col_z8, zero4);
col_z0 += 4;
col_z1 += 4;
col_z2 += 4;
col_z6 += 4;
col_z7 += 4;
col_z8 += 4;
}
for (int i = 0; i < mk4; ++i) {
col_z0[i] = 0.0;
col_z1[i] = 0.0;
col_z2[i] = 0.0;
col_z6[i] = 0.0;
col_z7[i] = 0.0;
col_z8[i] = 0.0;
}
col_data += 9 * oosize;
im_data += isize * isize;
}
} else if (stride[0] == 2 && filter_height == 3 && pad1 &&
dilation[0] == 1) {
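      // Note (added): fast path for 3x3 filters with stride 2. vld2q_f32
      // de-interleaves eight consecutive pixels into even/odd lanes, which
      // matches the stride-2 column sampling; one row pair feeds all nine
      // output planes per iteration, following the "3 2 3 1 0 1 3 2 3" tap
      // map noted below.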
for (int c = 0; c < im_channels; ++c) {
int oosize = osize * osize;
int nk4 = osize / 4;
int mk4 = osize % 4;
// 3 2 3 1 0 1 3 2 3
float *col0 = col_data + 0 * oosize + osize + 1;
float *col1 = col_data + 1 * oosize + osize;
float *col2 = col_data + 2 * oosize + osize;
float *col3 = col_data + 3 * oosize + 1;
float *col4 = col_data + 4 * oosize;
float *col5 = col_data + 5 * oosize;
float *col6 = col_data + 6 * oosize + 1;
float *col7 = col_data + 7 * oosize;
float *col8 = col_data + 8 * oosize;
float32x4x2_t im01;
float32x4x2_t im23;
const float *im_tmp_data0 = im_data;
const float *im_tmp_data2 = im_data + isize;
for (int j = 0; j < osize; ++j) {
for (int i = 0; i < nk4; ++i) {
im01 = vld2q_f32(im_tmp_data0);
im23 = vld2q_f32(im_tmp_data2);
vst1q_f32(col0, im23.val[1]);
vst1q_f32(col1, im23.val[0]);
vst1q_f32(col2, im23.val[1]);
vst1q_f32(col3, im01.val[1]);
vst1q_f32(col4, im01.val[0]);
vst1q_f32(col5, im01.val[1]);
vst1q_f32(col6, im23.val[1]);
vst1q_f32(col7, im23.val[0]);
vst1q_f32(col8, im23.val[1]);
col0 += 4;
col1 += 4;
col2 += 4;
col3 += 4;
col4 += 4;
col5 += 4;
col6 += 4;
col7 += 4;
col8 += 4;
im_tmp_data0 += 8;
im_tmp_data2 += 8;
}
const float *im_tmp_data1 = im_tmp_data0 + 1;
const float *im_tmp_data3 = im_tmp_data2 + 1;
for (int i = 0; i < mk4; ++i) {
*col0 = *im_tmp_data3;
*col1 = *im_tmp_data2;
*col2 = *im_tmp_data3;
*col3 = *im_tmp_data1;
*col4 = *im_tmp_data0;
*col5 = *im_tmp_data1;
*col6 = *im_tmp_data3;
*col7 = *im_tmp_data2;
*col8 = *im_tmp_data3;
col0++;
col1++;
col2++;
col3++;
col4++;
col5++;
col6++;
col7++;
col8++;
im_tmp_data0 += 2;
im_tmp_data1 += 2;
im_tmp_data2 += 2;
im_tmp_data3 += 2;
}
im_tmp_data0 += (isize - fill);
im_tmp_data2 += (isize - fill);
}
for (int i = 0; i < osize; ++i) {
col_data[0 * oosize + i * osize] = 0.0;
col_data[3 * oosize + i * osize] = 0.0;
col_data[6 * oosize + i * osize] = 0.0;
if (pad2) {
col_data[2 * oosize + osize - 1 + i * osize] = 0.0;
col_data[5 * oosize + osize - 1 + i * osize] = 0.0;
col_data[8 * oosize + osize - 1 + i * osize] = 0.0;
}
}
float32x4_t zero4;
zero4 = vdupq_n_f32(0.0);
auto col_z0 = col_data;
auto col_z1 = col_data + oosize;
auto col_z2 = col_data + 2 * oosize;
auto col_z6 = col_data + 6 * oosize + osize * (osize - 1);
auto col_z7 = col_data + 7 * oosize + osize * (osize - 1);
auto col_z8 = col_data + 8 * oosize + osize * (osize - 1);
for (int i = 0; i < nk4; ++i) {
vst1q_f32(col_z0, zero4);
vst1q_f32(col_z1, zero4);
vst1q_f32(col_z2, zero4);
if (pad2) {
vst1q_f32(col_z6, zero4);
vst1q_f32(col_z7, zero4);
vst1q_f32(col_z8, zero4);
}
col_z0 += 4;
col_z1 += 4;
col_z2 += 4;
col_z6 += 4;
col_z7 += 4;
col_z8 += 4;
}
for (int i = 0; i < mk4; ++i) {
col_z0[i] = 0.0;
col_z1[i] = 0.0;
col_z2[i] = 0.0;
if (pad2) {
col_z6[i] = 0.0;
col_z7[i] = 0.0;
col_z8[i] = 0.0;
}
}
col_data[1 * oosize + osize] = im_data[isize];
for (int i = 1; i < osize; ++i) {
col_data[3 * oosize + i] = im_data[(i - 1) * stride[0] + 1];
}
col_data[4 * oosize] = im_data[0];
col_data[7 * oosize] = im_data[isize];
col_data += 9 * oosize;
im_data += isize * isize;
}
} else {
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int c_im = c / (filter_width * filter_height);
for (int h = 0; h < col_height; ++h) {
int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
for (int w = 0; w < col_width; ++w) {
int im_col_idx =
w * stride[1] - padding[1] + w_offset * dilation[1];
int col_idx = (c * col_height + h) * col_width + w;
int im_idx =
(im_row_idx + c_im * im_height) * im_width + im_col_idx;
col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
im_col_idx < 0 || im_col_idx >= im_width)
? static_cast<T>(0)
: im_data[im_idx];
}
}
}
}
#else
    for (int c = 0; c < channels_col; ++c) {
      int w_offset = c % filter_width;
      int h_offset = (c / filter_width) % filter_height;
@@ -86,6 +429,7 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
        }
      }
    }
#endif
  }
};
@@ -158,7 +502,7 @@ class Col2ImFunctor<ColFormat::kCFO, CPU, T> {
};
template class Im2ColFunctor<ColFormat::kCFO, CPU, float>;
// template class Im2ColFunctor<ColFormat::kCFO, CPU, double>;
template class Col2ImFunctor<ColFormat::kCFO, CPU, float>;
template class Col2ImFunctor<ColFormat::kCFO, CPU, double>;
...
@@ -22,7 +22,7 @@ namespace math {
template <>
void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
                   const framework::Tensor &matrix_b, bool trans_b, float alpha,
                   framework::Tensor *matrix_out, float beta, bool relu) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -41,14 +41,20 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
  int N = dim_out[1];
  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
  if (relu) {
    sgemm_relu(M, N, K, alpha, matrix_a.data<float>(), K,
               matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N);
  } else {
    sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
          beta, matrix_out->data<float>(), N);
  }
}
template <>
void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
                    const framework::Tensor &matrix_b, bool trans_b,
                    double alpha, framework::Tensor *matrix_out, double beta,
                    bool relu) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
...
@@ -25,7 +25,7 @@ namespace math {
template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
            const framework::Tensor &matrix_b, bool trans_b, T alpha,
            framework::Tensor *matrix_out, T beta, bool relu = false);
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
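// Note (added): a minimal usage sketch of the relu-fused entry point,
// assuming row-major tensors already resized to {M, K}, {K, N} and {M, N}:
//   paddle_mobile::operators::math::matmul<float>(a, false, b, false, 1.0f,
//                                                 &out, 0.0f, true);
// With relu = true the GEMM epilogue clamps negative outputs in place,
// avoiding a separate activation pass over the output tensor.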
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "pool_2x2.h"
namespace paddle_mobile {
namespace operators {
namespace math {
void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#if __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = 2;
const int ksize_width = 2;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int out_w_num = output_width >> 2;
const int in_h_num = output_height >> 1;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
  int remain = output_width - (out_w_num << 2);  // scalar tail after the 4-wide vector loop
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
const float *input_data_chanel_row_next = input_data + input_width;
for (; output_height > 0; output_height--) {
if (out_w_num > 0) {
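          // Note (added): each pass loads eight contiguous floats from two
          // adjacent input rows, takes the vertical max (vmax.f32), then a
          // pairwise horizontal max (vpmax.f32), producing four 2x2-max
          // outputs per iteration. Caveat: out_w_num, remain and
          // output_height are written back by these loops, so they are
          // exhausted after the first pass; per-row copies would be needed
          // to vectorize every row and channel.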
asm volatile(
"max_loop: \n\t"
"vld1.f32 {q0,q1}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q2,q3}, [%[in_ptr2]]! \n\t"
"vmax.f32 q0, q0, q2 \n\t"
"vmax.f32 q1, q1, q3 \n\t"
"vpmax.f32 d4, d0, d1 \n\t"
"vpmax.f32 d5, d2, d3 \n\t"
"subs %[out_w_num], #1 \n\t"
"vst1.32 {q2}, [%[out_ptr]]! \n\t"
"bne max_loop \n\t"
: [in_ptr1] "+r"(input_data),
[in_ptr2] "+r"(input_data_chanel_row_next),
[out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num)
:
: "memory", "q0", "q1", "q2", "q3");
}
for (; remain > 0; remain--) {
float max_row1 = std::max(input_data[0], input_data[1]);
float max_row2 = std::max(input_data_chanel_row_next[0],
input_data_chanel_row_next[1]);
*output_data = std::max(max_row1, max_row2);
input_data += 2;
input_data_chanel_row_next += 2;
output_data++;
}
}
input_data += input_channel_stride;
output_data += output_channel_stride;
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#endif
}
void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#if __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = 2;
const int ksize_width = 2;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int out_w_num = output_width >> 2;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
float vqua[] = {0.25f, 0.25f, 0.25f, 0.25f};
  int remain = output_width - (out_w_num << 2);  // scalar tail after the 4-wide vector loop
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
const float *input_data_chanel_row_next = input_data + input_width;
for (; output_height > 0; output_height--) {
if (out_w_num > 0) {
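          // Note (added): same 4-outputs-per-iteration scheme as Pool2x2Max,
          // but with vadd.f32 + vpadd.f32 to form each 2x2 window sum, then
          // a multiply by q4, which holds the 0.25-per-lane constant vqua
          // loaded once before the loop.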
asm volatile(
"avg_loop: \n\t"
"vld1.32 {q0,q1}, [%[in_ptr1]]! \n\t"
"vld1.32 {q2,q3}, [%[in_ptr2]]! \n\t"
"vadd.f32 q0, q0, q2 \n\t"
"vadd.f32 q1, q1, q3 \n\t"
"vpadd.f32 d4, d0, d1 \n\t"
"vpadd.f32 d5, d2, d3 \n\t"
"vld1.32 {q4}, [%[vqua]]! \n\t"
"vmul.f32 q2, q2, q4 \n\t"
"subs %[out_w_num], #1 \n\t"
"vst1.32 {q2}, [%[out_ptr]]! \n\t"
"bne avg_loop \n\t"
: [in_ptr1] "+r"(input_data),
[in_ptr2] "+r"(input_data_chanel_row_next),
[out_ptr] "+r"(output_data), [out_w_num] "+r"(out_w_num)
: [vqua] "r"(vqua)
: "memory", "q0", "q1", "q2", "q3", "q4");
}
        for (; remain > 0; remain--) {
          float sum_row1 = input_data[0] + input_data[1];
          float sum_row2 = input_data_chanel_row_next[0] +
                           input_data_chanel_row_next[1];
          *output_data = (sum_row1 + sum_row2) * 0.25f;
input_data += 2;
input_data_chanel_row_next += 2;
output_data++;
}
}
input_data += input_channel_stride;
output_data += output_channel_stride;
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#endif
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,16 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "framework/tensor.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif  // __ARM_NEON
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::vector;
void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
                Tensor *output);
void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
                Tensor *out);
}  // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#define __ARM_NEON true
#include "pool_3x3.h"
#include "framework/tensor.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include <climits>
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::max;
using std::min;
using std::vector;
void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#if __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int _kernel_size = 3;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const float negative_max = -INT_MAX;
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *pos1, *pos2, *pos3, *output_ptr;
int hstart, wstart, hend, wend;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ph++) {
for (int pw = 0; pw < output_width; pw++) {
hstart = ph * stride_height - padding_height;
wstart = pw * stride_width - padding_width;
hend = min(hstart + _kernel_size, input_height + padding_height);
wend = min(wstart + _kernel_size, input_width + padding_width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, input_height);
wend = min(wend, input_width);
pos1 = input_data + hstart * input_width + wstart;
pos2 = input_data + (hstart + 1) * input_width + wstart;
pos3 = input_data + (hstart + 2) * input_width + wstart;
output_ptr = output_data + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
float max_value = -INT_MAX;
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
float value = input_data[h * input_width + w];
if (value > max_value) {
max_value = value;
}
}
}
output_data[ph * output_width + pw] = max_value;
} else {
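            // Note (added): full 3x3 window. Each row load brings in four
            // floats; after the row-wise max, the unused fourth lane is
            // overwritten with negative_max so that the two pairwise vpmax
            // reductions yield the maximum of the nine valid values only.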
#if defined(ARMV7)
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t"
"vld1.32 {q3}, [%[pos3]] \n\t"
"vmax.f32 q1, q1, q2 \n\t"
"vmax.f32 q2, q1, q3 \n\t"
"vmov.f32 d5[1], %[negative_max] \n\t"
"vpmax.f32 d6, d4, d5 \n\t"
"vpmax.f32 d7, d6, d6 \n\t"
"vst1.32 {d7[0]},[%[output_ptr]] \n\t"
:
: [input_data] "r"(input_data), [pos1] "r"(pos1),
[pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
: "memory", "q1", "q2", "q3", "q4");
#else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t max_data =
vmaxq_f32(vmaxq_f32(data1, data3), data2);
float32x2_t res =
vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
vget_low_f32(max_data));
res = vpmax_f32(res, res);
output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
#endif
}
}
}
input_data += input_channel_stride;
output_data += output_channel_stride;
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#endif
}
void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#if __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int _kernel_size = 3;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
const float zero = 0;
const float nine = 1.0 / 9.0;
const float nine_ptr[] = {nine, nine};
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ph++) {
for (int pw = 0; pw < output_width; pw++) {
int hstart = ph * stride_height - padding_height;
int wstart = pw * stride_width - padding_width;
int hend = min(hstart + _kernel_size, input_height + padding_height);
int wend = min(wstart + _kernel_size, input_width + padding_width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, input_height);
wend = min(wend, input_width);
const float *pos1 = input_data + hstart * input_width + wstart;
const float *pos2 = input_data + (hstart + 1) * input_width + wstart;
const float *pos3 = input_data + (hstart + 2) * input_width + wstart;
const float *output_ptr = output_data + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
float sum = 0;
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
sum += input_data[h * input_width + w];
}
}
output_data[ph * output_width + pw] = sum / 9.0;
} else {
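            // Note (added): same spare-lane trick as Pool3x3Max, with adds:
            // the fourth lane is zeroed before the pairwise vpadd reduction,
            // and the 3x3 window sum is scaled by 1/9 to get the average.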
#if defined(ARMV7)
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t"
"vld1.32 {q3}, [%[pos3]] \n\t"
"vadd.f32 q1, q1, q2 \n\t"
"vadd.f32 q2, q1, q3 \n\t"
"vmov.f32 d5[1], %[zero] \n\t"
"vpadd.f32 d6, d4, d5 \n\t"
"vpadd.f32 d6, d6, d6 \n\t"
"vld1.f32 d7, [%[nine_ptr]]! \n\t"
"vmul.f32 d6,d7 \n\t"
"vst1.32 {d6[0]},[%[output_ptr]] \n\t"
:
: [input_data] "r"(input_data), [pos1] "r"(pos1),
[pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [zero] "r"(zero),
[nine_ptr] "r"(nine_ptr)
: "memory", "r6", "q1", "q2", "q3", "q4");
#else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t sum_data =
vaddq_f32(vaddq_f32(data1, data3), data2);
float32x2_t res =
vpadd_f32(vget_high_f32(vsetq_lane_f32(0, sum_data, 3)),
vget_low_f32(sum_data));
res = vpadd_f32(res, res);
output_data[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
#endif
}
}
}
input_data += input_channel_stride;
output_data += output_channel_stride;
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#endif
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,16 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "framework/tensor.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif  // __ARM_NEON
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::vector;
void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output);
void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
Tensor *out);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "pooling.h" #include "pooling.h"
#include <common/types.h> #include "common/types.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -36,9 +38,7 @@ class PoolFunctor<CPU, PoolProcess, T> { ...@@ -36,9 +38,7 @@ class PoolFunctor<CPU, PoolProcess, T> {
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
const int input_width = input.dims()[3]; const int input_width = input.dims()[3];
if (output == nullptr) {
DLOG << "output tensor is null";
}
const int output_channels = output->dims()[1]; const int output_channels = output->dims()[1];
const int output_height = output->dims()[2]; const int output_height = output->dims()[2];
@@ -57,7 +57,7 @@ class PoolFunctor<CPU, PoolProcess, T> {
    T *output_data = output->mutable_data<T>();
    for (int i = 0; i < batch_size; i++) {
#pragma omp parallel for
      for (int c = 0; c < output_channels; ++c) {
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;
@@ -91,3 +91,5 @@ template class PoolFunctor<CPU, math::MaxPool<float>, float>;
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
#endif
@@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "common/log.h"
#include "framework/tensor.h"
#include "pool_2x2.h"
#include "pool_3x3.h"
namespace paddle_mobile {
namespace operators {
@@ -64,3 +68,5 @@ class PoolFunctor {
}
}  // namespace operators
}  // namespace paddle_mobile
#endif
@@ -11,6 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#include "operators/math/softmax.h" #include "operators/math/softmax.h"
#include "common/types.h" #include "common/types.h"
#if __ARM_NEON #if __ARM_NEON
...@@ -153,3 +156,4 @@ template class SoftmaxFuntor<CPU, float>; ...@@ -153,3 +156,4 @@ template class SoftmaxFuntor<CPU, float>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
@@ -26,3 +27,4 @@ class SoftmaxFuntor {
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#include "mul_op.h" #include "mul_op.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -19,10 +21,10 @@ namespace operators { ...@@ -19,10 +21,10 @@ namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void MulOp<Dtype, T>::InferShape() const { void MulOp<Dtype, T>::InferShape() const {
auto x_dims = param_.InputX()->dims(); auto x_dims = this->param_.InputX()->dims();
auto y_dims = param_.InputY()->dims(); auto y_dims = this->param_.InputY()->dims();
int x_num_col_dims = param_.XNumColDims(); int x_num_col_dims = this->param_.XNumColDims();
int y_num_col_dims = param_.YNumColDims(); int y_num_col_dims = this->param_.YNumColDims();
assert(x_dims.size() > x_num_col_dims); assert(x_dims.size() > x_num_col_dims);
assert(y_dims.size() > y_num_col_dims); assert(y_dims.size() > y_num_col_dims);
...@@ -46,12 +48,22 @@ void MulOp<Dtype, T>::InferShape() const { ...@@ -46,12 +48,22 @@ void MulOp<Dtype, T>::InferShape() const {
} }
framework::DDim ddim = framework::make_ddim(output_dims); framework::DDim ddim = framework::make_ddim(output_dims);
param_.Out()->Resize(ddim); this->param_.Out()->Resize(ddim);
} }
template class MulOp<CPU, float>; template class MulOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
USE_OP(mul); #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR(mul, ops::MulOp); USE_OP_CPU(mul);
REGISTER_OPERATOR_CPU(mul, ops::MulOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(mul);
REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
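// Note (added): the per-backend USE_OP_* / REGISTER_OPERATOR_* pairs above
// replace the old unconditional USE_OP / REGISTER_OPERATOR macros, so each
// operator is compiled and registered only for the backends enabled at
// build time via PADDLE_MOBILE_CPU, PADDLE_MOBILE_MALI_GPU or
// PADDLE_MOBILE_FPGA.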
@@ -11,6 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#pragma once
#include <string>
@@ -22,26 +25,25 @@ namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class MulOp : public framework::OperatorWithKernel<
                  DeviceType, MulParam, operators::MulKernel<DeviceType, T>> {
 public:
  MulOp(const std::string &type, const VariableNameMap &inputs,
        const VariableNameMap &outputs, const framework::AttributeMap &attrs,
        std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, MulParam,
                                      operators::MulKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  using framework::OperatorWithKernel<
      DeviceType, MulParam,
      operators::MulKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:
};
}  // namespace operators
}  // namespace paddle_mobile
#endif
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MULTICLASSNMS_OP
#include "operators/multiclass_nms_op.h" #include "operators/multiclass_nms_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void MultiClassNMSOp<Dtype, T>::InferShape() const { void MultiClassNMSOp<Dtype, T>::InferShape() const {
auto input_bboxes_dims = param_.InputBBoxes()->dims(); auto input_bboxes_dims = this->param_.InputBBoxes()->dims();
auto input_scores_dims = param_.InputScores()->dims(); auto input_scores_dims = this->param_.InputScores()->dims();
if (input_scores_dims.size() != 3) { if (input_scores_dims.size() != 3) {
LOG(kLOG_ERROR) << "Input Scores size must be 3"; LOG(kLOG_ERROR) << "Input Scores size must be 3";
} }
...@@ -30,12 +32,20 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const { ...@@ -30,12 +32,20 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const {
LOG(kLOG_ERROR) << "Predict bboxes must be equal"; LOG(kLOG_ERROR) << "Predict bboxes must be equal";
} }
// pre size, will change in Compute. // pre size, will change in Compute.
param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6})); this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6}));
} }
template class MultiClassNMSOp<CPU, float>; template class MultiClassNMSOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
USE_OP(multiclass_nms); #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp); USE_OP_CPU(multiclass_nms);
REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MULTICLASSNMS_OP
#pragma once
#include <string>
@@ -26,27 +28,28 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class MultiClassNMSOp : public framework::OperatorWithKernel<
                            DeviceType, MultiClassNMSParam,
                            operators::MultiClassNMSKernel<DeviceType, T>> {
 public:
  MultiClassNMSOp(const std::string &type, const VariableNameMap &inputs,
                  const VariableNameMap &outputs,
                  const framework::AttributeMap &attrs,
                  std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, MultiClassNMSParam,
            operators::MultiClassNMSKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  using framework::OperatorWithKernel<
      DeviceType, MultiClassNMSParam,
      operators::MultiClassNMSKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;
 protected:
};
}  // namespace operators
}  // namespace paddle_mobile
#endif
@@ -13,9 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef CONV_OP
Print &operator<<(Print &printer, const ConvParam &conv_param) { Print &operator<<(Print &printer, const ConvParam &conv_param) {
printer << "parameter of conv: " printer << "parameter of conv: "
<< "\n"; << "\n";
...@@ -36,5 +37,33 @@ Print &operator<<(Print &printer, const ConvParam &conv_param) { ...@@ -36,5 +37,33 @@ Print &operator<<(Print &printer, const ConvParam &conv_param) {
printer << " output dims: " << conv_param.Output()->dims(); printer << " output dims: " << conv_param.Output()->dims();
return printer; return printer;
} }
#endif
#ifdef FUSION_CONVADD_OP
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param) {
printer << "parameter of conv_add: "
<< "\n";
printer << " stride: "
<< " (" << conv_param.Strides()[0] << conv_param.Strides()[1] << ") "
<< "\n";
printer << " paddings: "
<< " (" << conv_param.Paddings()[0] << conv_param.Paddings()[1]
<< ") "
<< "\n";
printer << " dilations: "
<< " (" << conv_param.Dilations()[0] << conv_param.Dilations()[1]
<< ") "
<< "\n";
printer << " groups: " << conv_param.Groups() << "\n";
printer << " input dims: " << conv_param.Input()->dims() << "\n";
printer << " filter dims: " << conv_param.Filter()->dims() << "\n";
printer << " bias dims: " << conv_param.Bias()->dims() << "\n";
printer << " output dims: " << conv_param.Output()->dims();
return printer;
}
#endif
}  // namespace operators
}  // namespace paddle_mobile
@@ -34,7 +34,7 @@ using framework::Tensor;
using std::string;
using std::vector;
class OpParam {
 protected:
  template <typename T>
  static T *InputFrom(const VariableNameMap &inputs, const Scope &scope) {
@@ -165,10 +165,10 @@ class OpParam {
  template <typename T>
  static T *GetVarValue(const string &key, const VariableNameMap &var_map,
                        const Scope &scope) {
    PADDLE_MOBILE_ENFORCE(var_map.count(key) > 0,
                          "%s is not contained in var_map", key.c_str())
    auto var_vec = var_map.at(key);
    if (!var_vec.empty()) {
      auto var = scope.FindVar(var_vec[0]);
      return var->GetMutable<T>();
    } else {
@@ -191,6 +191,7 @@ class OpParam {
  }
};
#ifdef CONV_OP
class ConvParam : OpParam {
 public:
  ConvParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -230,7 +231,9 @@ class ConvParam : OpParam {
};
Print &operator<<(Print &printer, const ConvParam &conv_param);
#endif
#ifdef ELEMENTWISEADD_OP
class ElementwiseAddParam : OpParam {
 public:
  ElementwiseAddParam(const VariableNameMap &inputs,
@@ -258,6 +261,9 @@ class ElementwiseAddParam : OpParam {
  int axis_;
};
#endif
#ifdef MUL_OP
class MulParam : OpParam {
 public:
  MulParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -287,7 +293,9 @@ class MulParam : OpParam {
  int x_num_col_dims_;
  int y_num_col_dims_;
};
#endif
#ifdef CONCAT_OP
class ConcatParam : public OpParam {
 public:
  ConcatParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -309,7 +317,9 @@ class ConcatParam : public OpParam {
  Tensor *out_;
  int axis_;
};
#endif
#ifdef LRN_OP
class LrnParam : public OpParam {
 public:
  LrnParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -351,6 +361,9 @@ class LrnParam : public OpParam {
  float k_;
  string data_format_;
};
#endif
#ifdef BATCHNORM_OP
class BatchNormParam : OpParam {
 public:
  BatchNormParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -399,6 +412,9 @@ class BatchNormParam : OpParam {
  bool is_test_;
  string data_format_;
};
#endif
#ifdef POOL_OP
class PoolParam : public OpParam {
 public:
  PoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -442,6 +458,9 @@ class PoolParam : public OpParam {
  bool gloabal_pooling_ = false;
};
#endif
#ifdef PRIORBOX_OP
class PriorBoxParam : public OpParam {
 public:
  PriorBoxParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -503,7 +522,9 @@ class PriorBoxParam : public OpParam {
  float step_h_;
  float offset_;
};
#endif
#ifdef BOXCODER_OP
class BoxCoderParam : public OpParam {
 public:
  BoxCoderParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -533,7 +554,9 @@ class BoxCoderParam : public OpParam {
  Tensor *output_box_;
  std::string code_type_;
};
#endif
#ifdef SOFTMAX_OP
class SoftmaxParam : public OpParam {
 public:
  SoftmaxParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -549,7 +572,9 @@ class SoftmaxParam : public OpParam {
  Tensor *input_x_;
  Tensor *out_;
};
#endif
#ifdef SIGMOID_OP
class SigmoidParam : public OpParam {
 public:
  SigmoidParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -565,6 +590,9 @@ class SigmoidParam : public OpParam {
  Tensor *input_x_;
  Tensor *out_;
};
#endif
#ifdef MULTICLASSNMS_OP
class MultiClassNMSParam : public OpParam {
 public:
  MultiClassNMSParam(const VariableNameMap &inputs,
@@ -610,6 +638,7 @@ class MultiClassNMSParam : public OpParam {
  float nms_eta_;
  float score_threshold_;
};
#endif
class FeedParam : public OpParam {
 public:
@@ -646,6 +675,7 @@ class FetchParam : public OpParam {
  Tensor *out_;
};
#ifdef TRANSPOSE_OP
class TransposeParam : public OpParam {
 public:
  TransposeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -666,7 +696,9 @@ class TransposeParam : public OpParam {
  Tensor *out_;
  vector<int> axis_;
};
#endif
#ifdef RESHAPE_OP
class ReshapeParam : public OpParam {
 public:
  ReshapeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
@@ -695,7 +727,9 @@ class ReshapeParam : public OpParam {
  vector<int> shape_;
  bool inplace_;
};
#endif
#ifdef RELU_OP
/*
 * @b The op layer instantiates this param and passes it to the kernel layer.
 * */
@@ -715,11 +749,13 @@ class ReluParam : public OpParam {
  Tensor *input_x_;
  Tensor *out_;
};
#endif
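// The comment above ReluParam captures the layering contract used throughout
// this header: the op layer builds the param from inputs/outputs/attrs and
// hands it to the kernel layer, which reads everything from that one object.
// A condensed, hypothetical illustration (DemoParam/DemoKernel/DemoOp are not
// paddle-mobile types):
//
//   struct DemoParam {
//     const Tensor *input;  // resolved from the scope by the op layer
//     Tensor *output;       // resized in InferShape, written by the kernel
//   };
//   struct DemoKernel {
//     void Compute(const DemoParam &p) const;  // stateless: reads the param
//   };
//   struct DemoOp {
//     DemoParam param_;  // instantiated once, in the op layer
//     void Run(const DemoKernel &k) const { k.Compute(param_); }
//   };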
#ifdef FUSION_FC_OP
class FusionFcParam : public OpParam {
 public:
  FusionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
                const AttributeMap &attrs, const Scope &scope) {
    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
    input_y_ = InputYFrom<LoDTensor>(inputs, scope);
    input_z_ = InputZFrom<LoDTensor>(inputs, scope);
@@ -751,6 +787,66 @@ class FushionFcParam : public OpParam {
  int y_num_col_dims_;
  int axis_;
};
#endif
#ifdef FUSION_CONVADD_OP
class FusionConvAddParam : public OpParam {
 public:
  FusionConvAddParam(const VariableNameMap &inputs,
                     const VariableNameMap &outputs, const AttributeMap &attrs,
                     const Scope &scope) {
    bias_ = InputYFrom<LoDTensor>(inputs, scope);
    axis_ = GetAttr<int>("axis", attrs);
    filter_ = FilterFrom<LoDTensor>(inputs, scope);
    input_ = InputFrom<LoDTensor>(inputs, scope);
    output_ = OutFrom<LoDTensor>(outputs, scope);
    strides_ = GetAttr<vector<int>>("strides", attrs);
    paddings_ = GetAttr<vector<int>>("paddings", attrs);
    dilations_ = GetAttr<vector<int>>("dilations", attrs);
    groups = GetAttr<int>("groups", attrs);
  }
  Tensor *Bias() const { return bias_; }
  const int &Axis() const { return axis_; }
  const Tensor *Input() const { return input_; }
  const Tensor *Filter() const { return filter_; }
  Tensor *Output() const { return output_; }
  const vector<int> &Strides() const { return strides_; }
  const vector<int> &Paddings() const { return paddings_; }
  const vector<int> &Dilations() const { return dilations_; }
  const int &Groups() const { return groups; }

 protected:
  Tensor *bias_;
  int axis_;
  Tensor *input_;
  Tensor *output_;
  Tensor *filter_;
  vector<int> strides_;
  vector<int> paddings_;
  vector<int> dilations_;
  int groups;
};
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
#endif
#ifdef FUSION_CONVADD_RELU_OP
class FusionConvAddReluParam : public FusionConvAddParam {
 public:
  FusionConvAddReluParam(const VariableNameMap &inputs,
                         const VariableNameMap &outputs,
                         const AttributeMap &attrs, const Scope &scope)
      : FusionConvAddParam(inputs, outputs, attrs, scope) {}
};
#endif
class Im2SequenceParam : public OpParam {
 public:
......
@@ -12,7 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP

#include "pool_op.h"
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"

namespace paddle_mobile {
namespace operators {
@@ -30,13 +34,13 @@ int PoolOutputSize(int input_size, int filter_size, int padding, int stride,
}
template <typename DeviceType, typename T>
void PoolOp<DeviceType, T>::InferShape() const {
  auto in_x_dims = this->param_.Input()->dims();
  std::vector<int> ksize = this->param_.Ksize();
  std::vector<int> paddings = this->param_.Paddings();
  std::vector<int> strides = this->param_.Strides();
  bool ceil_mode = this->param_.isCeilMode();
  if (this->param_.isGlobalPooling()) {
    ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
    for (size_t i = 0; i < ksize.size(); ++i) {
      paddings[i] = 0;
@@ -48,12 +52,22 @@ void PoolOp<DeviceType, T>::InferShape() const {
    output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i],
                                          paddings[i], strides[i], ceil_mode));
  }
  this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
template class PoolOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(pool2d);
REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(pool2d);
REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
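The per-platform registration blocks above replace the old single USE_OP /
REGISTER_OPERATOR pair. A rough sketch of the idea behind such registries — a
static map from op-type string to a factory, populated by a registrar object
at namespace scope — is shown below. This is an illustrative pattern only, not
the actual expansion of paddle-mobile's USE_OP_CPU / REGISTER_OPERATOR_CPU
macros:

#include <functional>
#include <map>
#include <string>

// Illustrative registry: op name -> factory. Real frameworks key this per
// device (CPU / MALI_GPU / FPGA), which is what the separate
// REGISTER_OPERATOR_CPU / REGISTER_OPERATOR_MALI_GPU macros achieve.
struct OpBase {
  virtual ~OpBase() = default;
};
using OpCreator = std::function<OpBase *()>;

inline std::map<std::string, OpCreator> &CpuRegistry() {
  static std::map<std::string, OpCreator> registry;
  return registry;
}

struct CpuRegistrar {
  CpuRegistrar(const std::string &type, OpCreator creator) {
    CpuRegistry()[type] = std::move(creator);
  }
};

// e.g. a file-scope registrar, analogous to REGISTER_OPERATOR_CPU(pool2d, ...):
// static CpuRegistrar pool2d_reg("pool2d", [] { return new PoolOpImpl; });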
@@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP

#pragma once

#include <string>
#include "framework/operator.h"
#include "operators/kernel/pool_kernel.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {
using framework::AttributeMap;
@@ -26,24 +29,23 @@ using framework::OperatorWithKernel;
using framework::Scope;
using std::string;
template <typename DeviceType, typename T>
class PoolOp : public OperatorWithKernel<DeviceType, PoolParam,
                                         operators::PoolKernel<DeviceType, T>> {
 public:
  PoolOp(const string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const AttributeMap &attrs,
         std::shared_ptr<Scope> scope)
      : OperatorWithKernel<DeviceType, PoolParam,
                           operators::PoolKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using OperatorWithKernel<
      DeviceType, PoolParam,
      operators::PoolKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 private:
};
} // namespace operators
} // namespace paddle_mobile
#endif
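Every operator header in this commit follows the same refactor: the param
struct and the kernel become template arguments of OperatorWithKernel, so the
per-op RunImpl() bodies and param_ members deleted above now live once in the
base class. A minimal sketch of that shape (illustrative only; paddle-mobile's
real OperatorWithKernel also carries the type/inputs/outputs/attrs/scope
plumbing):

#include <utility>

// Illustrative base: one RunImpl and one param_ shared by all ops.
template <typename ParamType, typename KernelType>
class OperatorWithKernelSketch {
 public:
  explicit OperatorWithKernelSketch(ParamType param)
      : param_(std::move(param)) {}

  // Replaces the per-op RunImpl copies removed in this diff.
  void RunImpl() const { kernel_.Compute(param_); }

 protected:
  ParamType param_;
  KernelType kernel_;  // stateless; reads its operands from param_
};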
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PRIORBOX_OP

#include "operators/prior_box_op.h"
#include <vector>
namespace paddle_mobile {
@@ -19,13 +21,13 @@ namespace operators {
template <typename Dtype, typename T>
void PriorBoxOp<Dtype, T>::InferShape() const {
  auto input_dims = this->param_.Input()->dims();
  auto input_image_dims = this->param_.InputImage()->dims();
  auto min_sizes = this->param_.MinSizes();
  auto max_sizes = this->param_.MaxSizes();
  auto variances = this->param_.Variances();
  auto aspect_ratios = this->param_.AspectRatios();
  bool flip = this->param_.Flip();
  std::vector<float> aspect_ratios_vec;
  ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
@@ -39,13 +41,21 @@ void PriorBoxOp<Dtype, T>::InferShape() const {
  dim_vec[1] = input_dims[3];
  dim_vec[2] = num_priors;
  dim_vec[3] = 4;
  this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec));
  this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec));
}
template class PriorBoxOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(prior_box);
REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PRIORBOX_OP

#pragma once

#include <string>
@@ -26,27 +28,27 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class PriorBoxOp
    : public framework::OperatorWithKernel<
          DeviceType, PriorBoxParam, operators::PriorBoxKernel<DeviceType, T>> {
 public:
  PriorBoxOp(const std::string &type, const VariableNameMap &inputs,
             const VariableNameMap &outputs,
             const framework::AttributeMap &attrs,
             std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, PriorBoxParam,
                                      operators::PriorBoxKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, PriorBoxParam,
      operators::PriorBoxKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP

#include "operators/relu_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void ReluOp<Dtype, T>::InferShape() const {
  auto input_dims = this->param_.InputX()->dims();
  this->param_.Out()->Resize(input_dims);
}
template class ReluOp<CPU, float>;
} // namespace operators
@@ -31,5 +33,15 @@ template class ReluOp<CPU, float>;
 * these all need to correspond to the types in the model
 * */
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(relu);
REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(relu);
REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP

#pragma once

#include <string>
@@ -26,36 +28,29 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class ReluOp
    : public framework::OperatorWithKernel<
          DeviceType, ReluParam, operators::ReluKernel<DeviceType, T>> {
 public:
  /*
   * @b Constructor of the op: it must call the parent-class constructor and
   *    instantiate the op's own param struct.
   * */
  ReluOp(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const framework::AttributeMap &attrs,
         std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, ReluParam,
                                      operators::ReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, ReluParam,
      operators::ReluKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP

#include "operators/reshape_op.h"
#include <vector>
namespace paddle_mobile {
@@ -20,15 +22,25 @@ namespace operators {
template <typename Dtype, typename T>
void ReshapeOp<Dtype, T>::InferShape() const {
  /// todo: add InputShape() detection.
  auto &shape = this->param_.Shape();
  auto input_x_dims = this->param_.InputX()->dims();
  auto out_dims = ValidateShape(shape, input_x_dims);
  this->param_.Out()->Resize(out_dims);
}
template class ReshapeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(reshape);
REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(reshape);
REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP

#pragma once

#include <string>
@@ -26,26 +28,27 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class ReshapeOp
    : public framework::OperatorWithKernel<
          DeviceType, ReshapeParam, operators::ReshapeKernel<DeviceType, T>> {
 public:
  ReshapeOp(const std::string &type, const VariableNameMap &inputs,
            const VariableNameMap &outputs,
            const framework::AttributeMap &attrs,
            std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, ReshapeParam,
                                      operators::ReshapeKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, ReshapeParam,
      operators::ReshapeKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,18 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SIGMOID_OP

#include "operators/sigmoid_op.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
void SigmoidOp<DeviceType, T>::InferShape() const {
  this->param_.Out()->Resize(this->param_.InputX()->dims());
}
template class SigmoidOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(sigmoid);
REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,38 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SIGMOID_OP

#pragma once

#include <string>
#include "framework/operator.h"
#include "operators/kernel/sigmoid_kernel.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class SigmoidOp
    : public framework::OperatorWithKernel<
          DeviceType, SigmoidParam, operators::SigmoidKernel<DeviceType, T>> {
 public:
  SigmoidOp(const std::string &type, const VariableNameMap &inputs,
            const VariableNameMap &outputs,
            const framework::AttributeMap &attrs,
            std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, SigmoidParam,
                                      operators::SigmoidKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, SigmoidParam,
      operators::SigmoidKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,18 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP

#include "operators/softmax_op.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
void SoftmaxOp<DeviceType, T>::InferShape() const {
  this->param_.Out()->Resize(this->param_.InputX()->dims());
}
template class SoftmaxOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(softmax);
REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(softmax);
REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,38 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP

#pragma once

#include <string>
#include "framework/operator.h"
#include "operators/kernel/softmax_kernel.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class SoftmaxOp
    : public framework::OperatorWithKernel<
          DeviceType, SoftmaxParam, operators::SoftmaxKernel<DeviceType, T>> {
 public:
  SoftmaxOp(const std::string &type, const VariableNameMap &inputs,
            const VariableNameMap &outputs,
            const framework::AttributeMap &attrs,
            std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, SoftmaxParam,
                                      operators::SoftmaxKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, SoftmaxParam,
      operators::SoftmaxKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 private:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -12,16 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE_OP

#include <vector>
#include "common/enforce.h"
#include "operators/transpose_op.h"

namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void TransposeOp<Dtype, T>::InferShape() const {
  auto input_x_dims = this->param_.InputX()->dims();
  auto axis = this->param_.Axis();
  size_t x_dims_size = input_x_dims.size();
  size_t axis_size = axis.size();
@@ -42,12 +45,20 @@ void TransposeOp<Dtype, T>::InferShape() const {
  for (size_t i = 0; i < axis_size; i++) {
    out_dims[i] = input_x_dims[axis[i]];
  }
  this->param_.Out()->Resize(out_dims);
}
template class TransposeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(transpose);
REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE_OP

#pragma once

#include <string>
@@ -26,27 +28,26 @@ namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class TransposeOp : public framework::OperatorWithKernel<
                        DeviceType, TransposeParam,
                        operators::TransposeKernel<DeviceType, T>> {
 public:
  TransposeOp(const std::string &type, const VariableNameMap &inputs,
              const VariableNameMap &outputs,
              const framework::AttributeMap &attrs,
              std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, TransposeParam,
            operators::TransposeKernel<DeviceType, T>>(type, inputs, outputs,
                                                       attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, TransposeParam,
      operators::TransposeKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <typeindex>
#include "framework/program/tensor_desc.h"
namespace paddle_mobile {
namespace framework {
inline VarType_Type ToDataType(std::type_index type) {
  /*if (typeid(platform::float16).hash_code() == type.hash_code()) {
    return proto::VarType::FP16;
  } else */
  if (typeid(const float).hash_code() == type.hash_code()) {
    // CPPLint complains Using C-style cast. Use
    // static_cast<float>() instead
    // One fix to this is to replace float with const float because
    // typeid(T) == typeid(const T)
    // http://en.cppreference.com/w/cpp/language/typeid
    return VARTYPE_TYPE_FP32;
  } else if (typeid(const double).hash_code() == type.hash_code()) {
    return VARTYPE_TYPE_FP64;
  } else if (typeid(const int).hash_code() == type.hash_code()) {
    return VARTYPE_TYPE_INT32;
  } else if (typeid(const int64_t).hash_code() == type.hash_code()) {
    return VARTYPE_TYPE_INT64;
  } else if (typeid(const bool).hash_code() == type.hash_code()) {
    return VARTYPE_TYPE_BOOL;
  } else {
    // PADDLE_THROW("Not supported");
    // std::cout << "Not supported";
  }
}
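// Usage sketch (not part of this header): ToDataType compares hash_code
// values, and typeid(T) == typeid(const T), so a plain float resolves to
// VARTYPE_TYPE_FP32. <typeindex> is already included at the top of the file.
inline bool ToDataTypeDemo() {
  return ToDataType(std::type_index(typeid(float))) == VARTYPE_TYPE_FP32;
}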
inline std::type_index ToTypeIndex(VarType_Type type) {
  switch (type) {
    // case proto::VarType::FP16:
    //   return typeid(platform::float16);
    case VARTYPE_TYPE_FP32:
      return typeid(float);
    case VARTYPE_TYPE_FP64:
      return typeid(double);
    case VARTYPE_TYPE_INT32:
      return typeid(int);
    case VARTYPE_TYPE_INT64:
      return typeid(int64_t);
    case VARTYPE_TYPE_BOOL:
      return typeid(bool);
    default:
      // PADDLE_THROW("Not support type %d", type);
      printf("Not support type %d", type);
  }
}
template <typename Visitor>
inline void VisitDataType(VarType_Type type, Visitor visitor) {
  switch (type) {
    // case proto::VarType::FP16:
    //   visitor.template operator()<platform::float16>();
    //   break;
    case VARTYPE_TYPE_FP32:
      visitor.template operator()<float>();
      break;
    case VARTYPE_TYPE_FP64:
      visitor.template operator()<double>();
      break;
    case VARTYPE_TYPE_INT32:
      visitor.template operator()<int>();
      break;
    case VARTYPE_TYPE_INT64:
      visitor.template operator()<int64_t>();
      break;
    case VARTYPE_TYPE_BOOL:
      visitor.template operator()<bool>();
      break;
    default:
      // PADDLE_THROW("Not supported");
      printf("Not supported");
  }
}
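// Usage sketch (not part of this header): VisitDataType recovers a
// compile-time type from the runtime enum by calling
// visitor.template operator()<T>(). Any functor with a templated call
// operator works; note the visitor is taken by value above, so results
// must escape through a pointer or reference member.
struct SizeOfVisitor {
  int *out;  // write through a pointer: the visitor object itself is copied
  template <typename T>
  void operator()() {
    *out = static_cast<int>(sizeof(T));
  }
};
// int n = 0;
// VisitDataType(VARTYPE_TYPE_FP32, SizeOfVisitor{&n});  // n becomes 4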
inline std::string DataTypeToString(const VarType_Type type) {
  switch (type) {
    case VARTYPE_TYPE_FP16:
      return "float16";
    case VARTYPE_TYPE_FP32:
      return "float32";
    case VARTYPE_TYPE_FP64:
      return "float64";
    case VARTYPE_TYPE_INT16:
      return "int16";
    case VARTYPE_TYPE_INT32:
      return "int32";
    case VARTYPE_TYPE_INT64:
      return "int64";
    case VARTYPE_TYPE_BOOL:
      return "bool";
    default:
      // PADDLE_THROW("Not support type %d", type);
      printf("Not support type %d", type);
  }
}

inline std::ostream &operator<<(std::ostream &out, const VarType_Type &type) {
  out << DataTypeToString(type);
  return out;
}
} // namespace framework
} // namespace paddle_mobile
set(dir ${CMAKE_CURRENT_SOURCE_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build")

if (googlenet)
    # gen test
    ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-googlenet paddle-mobile)
elseif (mobilenet)
    # gen test
    ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-mobilenet paddle-mobile)
elseif (yolo)
    # gen test
    ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-yolo paddle-mobile)
elseif (squeezenet)
    # gen test
    ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-squeezenet paddle-mobile)
elseif(resnet)
    # gen test
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-resnet paddle-mobile)
else ()
    # gen test
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-resnet paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-squeezenet paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-yolo paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-googlenet paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-conv-op operators/test_cov_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-conv-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-mul-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-elementwiseadd-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-concat-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-lrn-op operators/test_lrn_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-lrn-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-batchnorm-op operators/test_batchnorm_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-batchnorm-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-priorbox-op operators/test_prior_box_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-priorbox-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-boxcoder-op operators/test_box_coder_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-boxcoder-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-transpose-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-multiclassnms-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-reshape-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-relu-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-fc-op paddle-mobile)
    # gen test log
    ADD_EXECUTABLE(test-log common/test_log.cpp)
    target_link_libraries(test-log paddle-mobile)
    # gen test log
    ADD_EXECUTABLE(test-load framework/test_load.cpp)
    target_link_libraries(test-load paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp)
    target_link_libraries(test-optimize paddle-mobile)
    #gen test
    ADD_EXECUTABLE(test-pool operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-pool paddle-mobile)
    #gen test
    ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-softmax paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-gemm common/test_gemm.cpp)
    target_link_libraries(test-gemm paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-enforce common/test_enforce.cpp)
    target_link_libraries(test-enforce paddle-mobile)
    # gen test - test if openmp works
    ADD_EXECUTABLE(test-openmp common/test_openmp.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-openmp paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-mobilenetssd paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h)
    target_link_libraries(test-sigmoid paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-depthwise-conv-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-mobilenet paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-conv-add-relu-op paddle-mobile)
    #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif()
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm.h"

#define a(i, j) a[(i)*lda + (j)]
@@ -29,10 +31,15 @@ int main() {
  int ldb = n;
  int ldc = n;

  float *a =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * k));
  float *b =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * k * n));
  float *c =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
  float *c1 =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
  for (int i = 0; i < m * k; ++i) {
    a[i] = 2;
  }
@@ -44,8 +51,11 @@ int main() {
    c1[i] = 2;
  }

  auto time1 = time();
  paddle_mobile::operators::math::sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3, c,
                                        ldc);
  auto time2 = time();
  DLOG << "gemm cost :" << time_diff(time1, time2) << "ms\n";
  for (int i = 0; i < m * n; ++i) {
    std::cout << c[i] << " | ";
    if (i % n == (n - 1)) {
......
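A natural companion to the timing added above is throughput: an sgemm of sizes
(m, n, k) performs about 2*m*n*k floating-point operations (one multiply and
one add per accumulated term). A small hypothetical helper, not part of the
test:

#include <cstdint>

// GFLOP/s from problem size and elapsed milliseconds.
inline double SgemmGflops(int64_t m, int64_t n, int64_t k, double elapsed_ms) {
  const double flops = 2.0 * static_cast<double>(m) * n * k;
  return flops / (elapsed_ms * 1e6);  // ms -> s, then scale down to 1e9 ops
}

// e.g. for this test's 62 x 63 x 74 problem:
//   double gflops = SgemmGflops(62, 63, 74, time_diff(time1, time2));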
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
//
// Created by liuRuiLong on 2018/6/6.
//
#include "test_lib_size.h"
static test_lib_size t;
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
//
// Created by liuRuiLong on 2018/6/6.
//
#ifndef PADDLE_MOBILE_TEST_LIB_SIZE_H
#define PADDLE_MOBILE_TEST_LIB_SIZE_H
#include <pthread.h>
#include <thread>
#include <vector>
//#include <list>
//#include <tuple>
//#include <typeinfo>
//#include <mutex>
//#include <initializer_list>
//#include <map>
//#include <string>
//#include <unordered_map>
//#include <unordered_set>
//#include <algorithm>
//#include <iostream>
//#include <sstream>
//#include <memory>
//#include <stdio.h>
//#include <cstring>
void foo() {
  // char *str = "1234";
  // char dst[10];
  // strcpy(dst, str);
  // std::cout << "12345" << std::endl;

  std::vector<int> vec = {1, 2, 3, 4, 5};
  vec.push_back(2);
  pthread_mutex_init(NULL, NULL);
  pthread_attr_destroy(NULL);
  // std::find(vec.begin(), vec.end(), 1);
  // std::list<int> l;
  // std::mutex mutex_;
  // std::map<int, float> m;
  // std::unordered_map<int, float> u_m;
  // std::unordered_set<int> u_s;
  // std::string ss = "12345";
  // printf("%f", ss.c_str());
  // std::initializer_list<int> init_list = {1, 2};
  // std::tuple<int, int> t = {1, 2};
  // std::tuple_element<I, std::tuple<ARGS...>>::type
  // std::tuple<>
  // int i;
  // int j;
  // if (typeid(i) == typeid(j)){
  //   int z = 10;
  // }
  // std::shared_ptr<int> s1 = std::make_shared<int>();
  // std::stringstream ss;
  // ss << "12345";
}

class test_lib_size {
 public:
  test_lib_size() {}
  // std::shared_ptr<int> Test(){
  //   std::vector<int> vec = {1, 2, 3};
  //   std::shared_ptr<int> si = std::make_shared<int>();
  //   return si;
  // }
  // void test(){
  //   int i = 9;
  // }
};
#endif // PADDLE_MOBILE_TEST_LIB_SIZE_H
@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
//#include <omp.h>
#include <iostream>

int main(void) {
#ifdef PADDLE_MOBILE_USE_OPENMP
#pragma omp parallel num_threads(2)
  {
    // int thread_id = omp_get_thread_num();
    // int nthreads = omp_get_num_threads();
    // std::cout << "Hello, OMP " << thread_id << "/" << nthreads << "\n";
  }
#endif
  return 0;
}
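For reference, the commented-out body corresponds to the classic OpenMP
hello-world below, a sketch that assumes the build defines
PADDLE_MOBILE_USE_OPENMP and compiles with -fopenmp so that <omp.h> is
available:

#include <omp.h>
#include <iostream>

int main() {
#pragma omp parallel num_threads(2)
  {
    int thread_id = omp_get_thread_num();  // 0 or 1 in this region
    int nthreads = omp_get_num_threads();  // 2 inside the region
#pragma omp critical
    std::cout << "Hello, OMP " << thread_id << "/" << nthreads << "\n";
  }
  return 0;
}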
@@ -19,7 +19,7 @@ limitations under the License. */
#include "common/log.h"
#include "framework/op_registry.h"
#include "io/io.h"
#include "operators/conv_op.h"
#include "operators/elementwise_add_op.h"
#include "operators/pool_op.h"
@@ -42,8 +42,10 @@ using std::vector;
template <typename DeviceType, typename OpType>
class Executor4Test : public Executor<DeviceType> {
 public:
  Executor4Test(Program<DeviceType> p, string op_type,
                bool use_optimize = false)
      : Executor<DeviceType>() {
    this->use_optimize_ = use_optimize;
    this->program_ = p;
    if (this->use_optimize_) {
      this->to_predict_program_ = this->program_.optimizeProgram;
@@ -61,13 +63,14 @@ class Executor4Test : public Executor<DeviceType> {
    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
    for (std::shared_ptr<OpDesc> op : ops) {
      if (op->Type() == op_type) {
        /// test first meeting op in program
        std::shared_ptr<paddle_mobile::framework::OperatorBase<DeviceType>>
            op_ptr =
                paddle_mobile::framework::OpRegistry<DeviceType>::CreateOp(
                    op->Type(), op->GetInputs(), op->GetOutputs(),
                    op->GetAttrMap(), this->program_.scope);
        this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
        break;
      }
......
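A usage sketch of the widened constructor (hedged: the exact Predict plumbing
lives in the Executor base class used elsewhere in these tests):

// Run only the first "pool2d" op of a loaded program, without the
// fusion-optimized program (use_optimize defaults to false):
//
//   paddle_mobile::Loader<paddle_mobile::CPU> loader;
//   auto program = loader.Load(g_googlenet);
//   Executor4Test<paddle_mobile::CPU,
//                 paddle_mobile::operators::PoolOp<paddle_mobile::CPU, float>>
//       executor(program, "pool2d");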
@@ -13,13 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_helper.h"
#include "io/io.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  // ../../../test/models/googlenet
  // ../../../test/models/mobilenet
  auto program = loader.Load(g_mobilenet_ssd, false, false);
  // auto program = loader.Load(g_googlenet_combine + "/model",
  //                            g_googlenet_combine + "/params", true);
  // program.originProgram->Description("program desc: ");
  return 0;
}
...@@ -15,17 +15,17 @@ limitations under the License. */ ...@@ -15,17 +15,17 @@ limitations under the License. */
#include "../test_helper.h" #include "../test_helper.h"
#include "framework/program/program-optimize/node.h" #include "framework/program/program-optimize/node.h"
#include "framework/program/program-optimize/program_optimize.h" #include "framework/program/program-optimize/program_optimize.h"
#include "io.h" #include "io/io.h"
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
// "../../../test/models/googlenet" // "../../../test/models/googlenet"
auto program = loader.Load(g_googlenet); auto program = loader.Load(g_mobilenet_ssd, true);
paddle_mobile::framework::ProgramOptimize optimize; paddle_mobile::framework::ProgramOptimize optimize;
// program.originProgram->Description("origin"); // program.originProgram->Description("origin");
auto optimize_program = optimize.FushionOptimize(program.originProgram); auto optimize_program = optimize.FusionOptimize(program.originProgram);
if (optimize_program != nullptr) { if (optimize_program != nullptr) {
optimize_program->Description("optimize"); // optimize_program->Description("optimize");
} else { } else {
LOG(paddle_mobile::kLOG_ERROR) << "optimize_program is null"; LOG(paddle_mobile::kLOG_ERROR) << "optimize_program is null";
} }
......
...@@ -20,7 +20,9 @@ int main() { ...@@ -20,7 +20,9 @@ int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
bool optimize = true; bool optimize = true;
auto time1 = time(); auto time1 = time();
auto program = loader.Load(g_googlenet, optimize); // auto program = loader.Load(g_googlenet, optimize);
auto program = loader.Load(g_googlenet_combine + "/model",
g_googlenet_combine + "/params", optimize);
auto time2 = time(); auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time2) << "ms\n"; DLOG << "load cost :" << time_diff(time1, time2) << "ms\n";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize); paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, optimize);
...@@ -28,7 +30,11 @@ int main() { ...@@ -28,7 +30,11 @@ int main() {
std::vector<int64_t> dims{1, 3, 224, 224}; std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims); GetInput<float>(g_test_image_1x3x224x224, &input, dims);
auto time3 = time(); auto time3 = time();
executor.Predict(input, dims);
for (int i = 0; i < 10; ++i) {
executor.Predict(input, dims);
}
auto time4 = time(); auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n"; DLOG << "predict cost :" << time_diff(time3, time4) << "ms\n";
return 0; return 0;
......
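Because the hunk above now times ten back-to-back Predict calls, dividing the elapsed time by the iteration count gives a steadier per-run figure; a one-line sketch, assuming time_diff returns milliseconds as a numeric value (as its other call sites suggest):
DLOG << "predict cost per run :" << time_diff(time3, time4) / 10 << "ms\n";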
...@@ -19,10 +19,10 @@ limitations under the License. */ ...@@ -19,10 +19,10 @@ limitations under the License. */
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto time1 = time(); auto time1 = time();
auto program = loader.Load(g_mobilenet_ssd, false); auto program = loader.Load(g_mobilenet_ssd, true);
auto time2 = time(); auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms"; DLOG << "load cost :" << time_diff(time1, time1) << "ms";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, false); paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, true);
std::vector<int64_t> dims{1, 3, 300, 300}; std::vector<int64_t> dims{1, 3, 300, 300};
Tensor input_tensor; Tensor input_tensor;
......
...@@ -19,14 +19,14 @@ limitations under the License. */ ...@@ -19,14 +19,14 @@ limitations under the License. */
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto time1 = time(); auto time1 = time();
auto program = loader.Load(g_mobilenet, false); auto program = loader.Load(g_mobilenet, true);
auto time2 = time(); auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms"; DLOG << "load cost :" << time_diff(time1, time1) << "ms";
paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 2, false); paddle_mobile::Executor<paddle_mobile::CPU> executor(program, 1, true);
std::vector<int64_t> dims{2, 3, 224, 224}; std::vector<int64_t> dims{1, 3, 224, 224};
Tensor input_tensor; Tensor input_tensor;
SetupTensor<float>(&input_tensor, {2, 3, 224, 224}, static_cast<float>(0), SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
static_cast<float>(1)); static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(), std::vector<float> input(input_tensor.data<float>(),
......
...@@ -41,7 +41,7 @@ class TestBatchNormOp { ...@@ -41,7 +41,7 @@ class TestBatchNormOp {
for (int j = 0; j < ops.size(); ++j) { for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j]; std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "batch_norm" && if (op->Type() == "batch_norm" &&
op->Input("X")[0] == "conv2d_0.tmp_0") { op->Input("X")[0] == "conv2d_5.tmp_0") {
DLOG << " mul attr size: " << op->GetAttrMap().size(); DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size(); DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size(); DLOG << " outputs size: " << op->GetOutputs().size();
...@@ -67,29 +67,29 @@ class TestBatchNormOp { ...@@ -67,29 +67,29 @@ class TestBatchNormOp {
const Tensor &t5) { const Tensor &t5) {
// feed // feed
auto scope = program_.scope; auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("conv2d_0.tmp_0"); Variable *x1_feed_value = scope->Var("conv2d_5.tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>(); auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1); tensor_x1->ShareDataWith(t1);
Variable *mean_feed_value = scope->Var("batch_norm_0.w_1"); Variable *mean_feed_value = scope->Var("batch_norm_10.w_1");
auto tensor_mean = mean_feed_value->GetMutable<LoDTensor>(); auto tensor_mean = mean_feed_value->GetMutable<LoDTensor>();
tensor_mean->ShareDataWith(t2); tensor_mean->ShareDataWith(t2);
Variable *scale_feed_value = scope->Var("batch_norm_0.w_0"); Variable *scale_feed_value = scope->Var("batch_norm_10.w_0");
auto tensor_scale = scale_feed_value->GetMutable<LoDTensor>(); auto tensor_scale = scale_feed_value->GetMutable<LoDTensor>();
tensor_scale->ShareDataWith(t3); tensor_scale->ShareDataWith(t3);
Variable *variance_feed_value = scope->Var("batch_norm_0.w_2"); Variable *variance_feed_value = scope->Var("batch_norm_10.w_2");
auto tensor_variance = variance_feed_value->GetMutable<LoDTensor>(); auto tensor_variance = variance_feed_value->GetMutable<LoDTensor>();
tensor_variance->ShareDataWith(t4); tensor_variance->ShareDataWith(t4);
Variable *bias_feed_value = scope->Var("batch_norm_0.b_0"); Variable *bias_feed_value = scope->Var("batch_norm_10.b_0");
auto tensor_bias = bias_feed_value->GetMutable<LoDTensor>(); auto tensor_bias = bias_feed_value->GetMutable<LoDTensor>();
tensor_bias->ShareDataWith(t5); tensor_bias->ShareDataWith(t5);
Variable *output = scope->Var("batch_norm_0.tmp_2"); Variable *output = scope->Var("batch_norm_10.tmp_2");
auto *output_tensor = output->GetMutable<LoDTensor>(); auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({4, 10, 2, 2}); output_tensor->mutable_data<float>({1, 256, 38, 38});
// DLOG << typeid(output_tensor).name(); // DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims(); // DLOG << "output_tensor dims: " << output_tensor->dims();
...@@ -128,30 +128,32 @@ int main() { ...@@ -128,30 +128,32 @@ int main() {
DLOG << "----------**********----------"; DLOG << "----------**********----------";
DLOG << "begin to run BatchNormOp Test"; DLOG << "begin to run BatchNormOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_resnet)); auto program = loader.Load(std::string(g_mobilenet_ssd));
/// input x (4,10,2,2) /// input x (4,10,2,2)
paddle_mobile::framework::Tensor inputx1; paddle_mobile::framework::Tensor inputx1;
SetupTensor<float>(&inputx1, {4, 10, 2, 2}, static_cast<float>(0), SetupTensor<float>(&inputx1, {1, 256, 38, 38}, static_cast<float>(0),
static_cast<float>(1)); static_cast<float>(1));
auto *inputx1_ptr = inputx1.data<float>(); auto *inputx1_ptr = inputx1.data<float>();
paddle_mobile::framework::Tensor mean; paddle_mobile::framework::Tensor mean;
SetupTensor<float>(&mean, {10}, static_cast<float>(0), static_cast<float>(1)); SetupTensor<float>(&mean, {256}, static_cast<float>(0),
static_cast<float>(1));
auto *mean_ptr = mean.data<float>(); auto *mean_ptr = mean.data<float>();
paddle_mobile::framework::Tensor scale; paddle_mobile::framework::Tensor scale;
SetupTensor<float>(&scale, {10}, static_cast<float>(0), SetupTensor<float>(&scale, {256}, static_cast<float>(0),
static_cast<float>(1)); static_cast<float>(1));
auto *scale_ptr = scale.data<float>(); auto *scale_ptr = scale.data<float>();
paddle_mobile::framework::Tensor variance; paddle_mobile::framework::Tensor variance;
SetupTensor<float>(&variance, {10}, static_cast<float>(0), SetupTensor<float>(&variance, {256}, static_cast<float>(0),
static_cast<float>(1)); static_cast<float>(1));
auto *variance_ptr = variance.data<float>(); auto *variance_ptr = variance.data<float>();
paddle_mobile::framework::Tensor bias; paddle_mobile::framework::Tensor bias;
SetupTensor<float>(&bias, {10}, static_cast<float>(0), static_cast<float>(1)); SetupTensor<float>(&bias, {256}, static_cast<float>(0),
static_cast<float>(1));
auto *bias_ptr = bias.data<float>(); auto *bias_ptr = bias.data<float>();
paddle_mobile::framework::TestBatchNormOp<paddle_mobile::CPU> testBatchNormOp( paddle_mobile::framework::TestBatchNormOp<paddle_mobile::CPU> testBatchNormOp(
...@@ -161,11 +163,13 @@ int main() { ...@@ -161,11 +163,13 @@ int main() {
testBatchNormOp.predict_bn(inputx1, mean, scale, variance, bias); testBatchNormOp.predict_bn(inputx1, mean, scale, variance, bias);
auto *output_bn_ptr = output_bn->data<float>(); auto *output_bn_ptr = output_bn->data<float>();
/// [2, 5, 1, 0] DLOG << " (" << inputx1_ptr[0] << " - " << mean_ptr[0] << ")/(("
DLOG << " (" << inputx1_ptr[102] << " - " << mean_ptr[5] << ")/((" << variance_ptr[0] << " + 0.00001"
<< variance_ptr[5] << " + 0.00001" << ")^0.5)* " << scale_ptr[0] << " + " << bias_ptr[0] << " = ";
<< ")^0.5)* " << scale_ptr[5] << " + " << bias_ptr[5] << " = "; DLOG << output_bn_ptr[0];
DLOG << output_bn_ptr[102];
DLOG << "input_ptr 0 : " << inputx1_ptr[0];
DLOG << "output_ptr 0 : " << output_bn_ptr[0];
return 0; return 0;
} }
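The DLOG lines above spell out the batch-norm identity y = (x - mean) / sqrt(variance + eps) * scale + bias. A hedged sketch of computing the expected value directly, for a cross-check against output_bn_ptr[0] (assumes <cmath> is available; eps matches the 0.00001 literal in the log line):
// Reference value for element 0, from the same formula the DLOG prints.
float eps = 0.00001f;
float expected = (inputx1_ptr[0] - mean_ptr[0]) /
                 std::sqrt(variance_ptr[0] + eps) * scale_ptr[0] +
                 bias_ptr[0];
DLOG << "expected output_bn_ptr[0] : " << expected;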
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/concat_op.h" #include "operators/concat_op.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/fusion_conv_add_relu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_googlenet, true);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<
paddle_mobile::CPU,
paddle_mobile::operators::FusionConvAddReluOp<paddle_mobile::CPU, float>>
executor(program, "fusion_conv_add_relu", true);
paddle_mobile::framework::Tensor input;
GetInput<float>(g_test_image_1x3x224x224, &input, {1, 3, 224, 224});
// // use SetupTensor if there is no local input image.
// SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
// static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112});
auto output = executor.Predict(input, "data", "conv2d_0.tmp_2", out_ddim);
auto output_ptr = output->data<float>();
for (int j = 0; j < 25; ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
}
...@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/conv_op.h"
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::GPU_MALI> loader;
// ../models/image_classification_resnet.inference.model // ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_googlenet); auto program = loader.Load(g_googlenet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail"); "program file read fail");
Executor4Test<paddle_mobile::CPU, Executor4Test<paddle_mobile::GPU_MALI, paddle_mobile::operators::ConvOp<
paddle_mobile::operators::ConvOp<paddle_mobile::CPU, float>> paddle_mobile::GPU_MALI, float>>
executor(program, "conv2d"); executor(program, "conv2d");
paddle_mobile::framework::Tensor input; paddle_mobile::framework::Tensor input;
...@@ -37,7 +37,7 @@ int main() { ...@@ -37,7 +37,7 @@ int main() {
auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim); auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim);
auto output_ptr = output->data<float>(); auto output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) { for (int j = 0; j < 20; ++j) {
DLOG << " value of output: " << output_ptr[j]; DLOG << " value of output: " << output_ptr[j];
} }
return 0; return 0;
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/depthwise_conv_op.h" #include "operators/depthwise_conv_op.h"
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
int main() { int main() {
......
...@@ -49,8 +49,8 @@ class TestFcOp { ...@@ -49,8 +49,8 @@ class TestFcOp {
DLOG << " Input Y is : " << op->Input("Y")[0]; DLOG << " Input Y is : " << op->Input("Y")[0];
DLOG << " Input Y is : " << op->Input("Z")[0]; DLOG << " Input Y is : " << op->Input("Z")[0];
DLOG << " Output Out is : " << op->Output("Out")[0]; DLOG << " Output Out is : " << op->Output("Out")[0];
std::shared_ptr<operators::FushionFcOp<Dtype, float>> testOp = std::shared_ptr<operators::FusionFcOp<Dtype, float>> testOp =
std::make_shared<operators::FushionFcOp<Dtype, float>>( std::make_shared<operators::FusionFcOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(), op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope); op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(testOp); ops_of_block_[*block_desc.get()].push_back(testOp);
...@@ -119,7 +119,7 @@ int main() { ...@@ -119,7 +119,7 @@ int main() {
auto program = loader.Load(g_googlenet); auto program = loader.Load(g_googlenet);
paddle_mobile::framework::ProgramOptimize optimize; paddle_mobile::framework::ProgramOptimize optimize;
// program.originProgram->Description("origin"); // program.originProgram->Description("origin");
auto optimize_program = optimize.FushionOptimize(program.originProgram); auto optimize_program = optimize.FusionOptimize(program.originProgram);
program.optimizeProgram = optimize_program; program.optimizeProgram = optimize_program;
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/lrn_op.h" #include "operators/lrn_op.h"
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/mul_op.h" #include "operators/mul_op.h"
......
...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h" #include "../test_include.h"
#include "../test_helper.h" #include "operators/pool_op.h"
#include "io.h"
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/relu_op.h" #include "operators/relu_op.h"
......
...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h" #include "../test_include.h"
#include "../test_helper.h" #include "operators/reshape_op.h"
#include "io.h"
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "../../src/operators/kernel/sigmoid_kernel.h" #include "../../src/operators/kernel/sigmoid_kernel.h"
#include "../test_helper.h" #include "../test_helper.h"
#include "io.h" #include "io/io.h"
int main() { int main() {
paddle_mobile::framework::Tensor input; paddle_mobile::framework::Tensor input;
......
...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h" #include "../test_include.h"
#include "../test_helper.h"
#include "io.h" #include "operators/softmax_op.h"
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
......
...@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../executor_for_test.h"
#include "../test_helper.h" #include "../test_helper.h"
#include "io.h" #include "../test_include.h"
#include "operators/transpose_op.h"
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_mobilenet_ssd)); auto program = loader.Load(std::string(g_mobilenet_ssd));
......
...@@ -22,12 +22,13 @@ limitations under the License. */ ...@@ -22,12 +22,13 @@ limitations under the License. */
#include "framework/ddim.h" #include "framework/ddim.h"
#include "framework/tensor.h" #include "framework/tensor.h"
static const std::string g_googlenet = "../models/googlenet";
static const std::string g_mobilenet = "../models/mobilenet";
static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd"; static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
static const std::string g_squeezenet = "../models/squeezenet"; static const std::string g_squeezenet = "../models/squeezenet";
static const std::string g_resnet = static const std::string g_googlenet = "../models/googlenet";
"../models/image_classification_resnet.inference.model"; static const std::string g_mobilenet = "../models/mobilenet";
static const std::string g_resnet_50 = "../models/resnet_50";
static const std::string g_resnet = "../models/resnet";
static const std::string g_googlenet_combine = "../models/googlenet_combine";
static const std::string g_yolo = "../models/yolo"; static const std::string g_yolo = "../models/yolo";
static const std::string g_test_image_1x3x224x224 = static const std::string g_test_image_1x3x224x224 =
"../images/test_image_1x3x224x224_float"; "../images/test_image_1x3x224x224_float";
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include "./test_helper.h" #include "./test_helper.h"
#include "common/enforce.h" #include "common/enforce.h"
#include "common/log.h" #include "common/log.h"
#include "executor_for_test.h"
#include "framework/lod_tensor.h" #include "framework/lod_tensor.h"
#include "framework/operator.h" #include "framework/operator.h"
#include "framework/program/block_desc.h" #include "framework/program/block_desc.h"
...@@ -29,4 +30,4 @@ limitations under the License. */ ...@@ -29,4 +30,4 @@ limitations under the License. */
#include "framework/scope.h" #include "framework/scope.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#include "framework/variable.h" #include "framework/variable.h"
#include "io.h" #include "io/io.h"
#!/usr/bin/env sh
push_fn () {
MODELS_PATH="../../test/models/*"
MODELS_SRC="../../test/models"
IMAGE_PATH="../../test/images/*"
EXE_FILE="../../test/build/*"
EXE_DIR="data/local/tmp/bin"
adb shell mkdir ${EXE_DIR}
MODELS_DIR="data/local/tmp/models"
adb shell mkdir ${MODELS_DIR}
for file in `ls ${MODELS_SRC}`
do
adb shell mkdir ${MODELS_DIR}"/"${file}
done
if [[ -d "../../src/operators/kernel/mali/ACL_Android/build" ]]; then
ACL_BUILD_PATH="../../src/operators/kernel/mali/ACL_Android/build/*"
adb push ${ACL_BUILD_PATH} ${EXE_DIR}
fi
IMAGES_DIR="data/local/tmp/images"
adb shell mkdir ${IMAGES_DIR}
LIB_PATH="../../build/release/arm-v7a/build/*"
adb push ${EXE_FILE} ${EXE_DIR}
adb push ${LIB_PATH} ${EXE_DIR}
if [[ $1 != "npm" ]]; then
adb push ${IMAGE_PATH} ${IMAGES_DIR}
adb push ${MODELS_PATH} ${MODELS_DIR}
fi
}
if [[ $1 == "npm" ]]; then
push_fn $1
else
push_fn
fi
#!/usr/bin/env sh
push_fn () {
MODELS_PATH="../../test/models/*"
MODELS_SRC="../../test/models"
IMAGE_PATH="../../test/images/*"
EXE_FILE="../../test/build/*"
EXE_DIR="data/local/tmp/bin"
adb shell mkdir ${EXE_DIR}
MODELS_DIR="data/local/tmp/models"
adb shell mkdir ${MODELS_DIR}
for file in `ls ${MODELS_SRC}`
do
adb shell mkdir ${MODELS_DIR}"/"${file}
done
IMAGES_DIR="data/local/tmp/images"
adb shell mkdir ${IMAGES_DIR}
LIB_PATH="../../build/release/arm-v7a/build/*"
adb push ${EXE_FILE} ${EXE_DIR}
adb push ${LIB_PATH} ${EXE_DIR}
if [[ $1 != "npm" ]]; then
adb push ${IMAGE_PATH} ${IMAGES_DIR}
adb push ${MODELS_PATH} ${MODELS_DIR}
fi
echo "test-op or test-net below : "
adb shell ls /data/local/tmp/bin
echo "**** choose OP or NET to test ****"
read -p "which to test : " test_name
adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./${test_name}"
}
if [[ $1 == "npm" ]]; then
push_fn $1
else
push_fn
fi
\ No newline at end of file
set(ARCH "armv7-a")
set(FLOAT_ABI "softfp" CACHE STRING "-mfloat-abi chosen")
set_property(CACHE FLOAT_ABI PROPERTY STRINGS "softfp" "soft" "hard")
set(FPU "neon")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${ARCH} -mfloat-abi=${FLOAT_ABI} -mfpu=${FPU}")
...@@ -15,17 +15,15 @@ build_for_mac() { ...@@ -15,17 +15,15 @@ build_for_mac() {
fi fi
PLATFORM="x86" PLATFORM="x86"
MODE="Release" MODE="Release"
CXX_FLAGS="-std=c++11 -O3 -s" BUILD_DIR=../build/release/"${PLATFORM}"
BUILD_DIR=build/release/"${PLATFORM}"
mkdir -p ${BUILD_DIR}/build mkdir -p ${BUILD_DIR}/build
mkdir -p ${BUILD_DIR}/test mkdir -p ${BUILD_DIR}/test
cp -r test/models ${BUILD_DIR}/test/models cp -r ../test/models ${BUILD_DIR}/test/models
cmake . \ cmake .. \
-B"${BUILD_DIR}" \ -B"${BUILD_DIR}" \
-DCMAKE_BUILD_TYPE="${MODE}" \ -DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-DIS_MAC=true -DIS_MAC=true
cd ${BUILD_DIR} cd ${BUILD_DIR}
...@@ -33,34 +31,38 @@ build_for_mac() { ...@@ -33,34 +31,38 @@ build_for_mac() {
} }
build_for_android() { build_for_android() {
rm -rf "../build"
if [ -z "${ANDROID_NDK}" ]; then if [ -z "${ANDROID_NDK}" ]; then
echo "ANDROID_NDK not found!" echo "ANDROID_NDK not found!"
exit -1 exit -1
fi fi
PLATFORM="arm-v7a" if [ -z "$PLATFORM" ]; then
# PLATFORM="arm-v8a" PLATFORM="arm-v7a" # Users could choose "arm-v8a" or other platforms from the command line.
fi
if [ "${PLATFORM}" = "arm-v7a" ]; then if [ "${PLATFORM}" = "arm-v7a" ]; then
ABI="armeabi-v7a with NEON" ABI="armeabi-v7a with NEON"
ARM_PLATFORM="V7" ARM_PLATFORM="V7"
CXX_FLAGS="-O3 -std=c++11 -s -march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security -llog" CXX_FLAGS="-march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security"
elif [ "${PLATFORM}" = "arm-v8a" ]; then elif [ "${PLATFORM}" = "arm-v8a" ]; then
ABI="arm64-v8a" ABI="arm64-v8a"
ARM_PLATFORM="V8" ARM_PLATFORM="V8"
CXX_FLAGS="-O3 -std=c++11 -s -march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog" CXX_FLAGS="-march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog"
else else
echo "unknown platform!" echo "unknown platform!"
exit -1 exit -1
fi fi
MODE="Release" MODE="Release"
ANDROID_PLATFORM_VERSION="android-15" ANDROID_PLATFORM_VERSION="android-22"
TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
ANDROID_ARM_MODE="arm" ANDROID_ARM_MODE="arm"
if [ $# -eq 1 ]; then
cmake . \ NET=$1
-B"build/release/${PLATFORM}" \ cmake .. \
-B"../build/release/${PLATFORM}" \
-DANDROID_ABI="${ABI}" \ -DANDROID_ABI="${ABI}" \
-DCMAKE_BUILD_TYPE="${MODE}" \ -DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
...@@ -68,30 +70,55 @@ build_for_android() { ...@@ -68,30 +70,55 @@ build_for_android() {
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-DANDROID_STL=c++_static \ -DANDROID_STL=c++_static \
-DANDROID=true \ -DANDROID=true \
-D"${NET}=true" \
-D"${ARM_PLATFORM}"=true -D"${ARM_PLATFORM}"=true
else
cd "./build/release/${PLATFORM}" cmake .. \
-B"../build/release/${PLATFORM}" \
-DANDROID_ABI="${ABI}" \
-DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DANDROID_PLATFORM="${ANDROID_PLATFORM_VERSION}" \
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-DANDROID_STL=c++_static \
-DANDROID=true \
-D"${ARM_PLATFORM}"=true
fi
cd "../build/release/${PLATFORM}"
make -j 8 make -j 8
} }
build_for_ios() { build_for_ios() {
rm -rf "../build"
PLATFORM="ios" PLATFORM="ios"
MODE="Release" MODE="Release"
BUILD_DIR=build/release/"${PLATFORM}" BUILD_DIR=../build/release/"${PLATFORM}"
TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
C_FLAGS="-fobjc-abi-version=2 -fobjc-arc -isysroot ${CMAKE_OSX_SYSROOT}" C_FLAGS="-fobjc-abi-version=2 -fobjc-arc -isysroot ${CMAKE_OSX_SYSROOT}"
CXX_FLAGS="-fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT}" CXX_FLAGS="-fobjc-abi-version=2 -fobjc-arc -std=gnu++14 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT}"
mkdir -p "${BUILD_DIR}" mkdir -p "${BUILD_DIR}"
if [ $# -eq 1 ]; then
cmake . \ NET=$1
-B"${BUILD_DIR}" \ cmake .. \
-DCMAKE_BUILD_TYPE="${MODE}" \ -B"${BUILD_DIR}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ -DCMAKE_BUILD_TYPE="${MODE}" \
-DIOS_PLATFORM=OS \ -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DCMAKE_C_FLAGS="${C_FLAGS}" \ -DIOS_PLATFORM=OS \
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ -DCMAKE_C_FLAGS="${C_FLAGS}" \
-DIS_IOS="true" \ -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-D"${NET}"=true \
-DIS_IOS="true"
else
cmake .. \
-B"${BUILD_DIR}" \
-DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DIOS_PLATFORM=OS \
-DCMAKE_C_FLAGS="${C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-DIS_IOS="true"
fi
cd "${BUILD_DIR}" cd "${BUILD_DIR}"
make -j 8 make -j 8
} }
...@@ -105,15 +132,43 @@ if [ $# -lt 1 ]; then ...@@ -105,15 +132,43 @@ if [ $# -lt 1 ]; then
echo "available targets: mac|linux|ios|android" echo "available targets: mac|linux|ios|android"
echo "sample usage: ./build.sh mac" echo "sample usage: ./build.sh mac"
else else
if [ $1 = "mac" ]; then if [ $# -eq 2 ]; then
build_for_mac if [ $2 != "googlenet" -a $2 != "mobilenet" -a $2 != "yolo" -a $2 != "squeezenet" -a $2 != "resnet" ]; then
elif [ $1 = "linux" ]; then if [ $1 = "mac" ]; then
build_for_linux build_for_mac
elif [ $1 = "android" ]; then elif [ $1 = "linux" ]; then
build_for_android build_for_linux
elif [ $1 = "ios" ]; then elif [ $1 = "android" ]; then
build_for_ios build_for_android
else elif [ $1 = "ios" ]; then
build_error build_for_ios
else
build_error
fi
else
if [ $1 = "mac" ]; then
build_for_mac $2
elif [ $1 = "linux" ]; then
build_for_linux $2
elif [ $1 = "android" ]; then
build_for_android $2
elif [ $1 = "ios" ]; then
build_for_ios $2
else
build_error
fi
fi
else
if [ $1 = "mac" ]; then
build_for_mac
elif [ $1 = "linux" ]; then
build_for_linux
elif [ $1 = "android" ]; then
build_for_android
elif [ $1 = "ios" ]; then
build_for_ios
else
build_error
fi
fi fi
fi fi
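With the two-argument dispatch above, a hedged example invocation is ./build.sh android googlenet: only the net names checked at the top of this block are forwarded, so the name reaches build_for_android and is handed to CMake as -D"googlenet=true", letting the build trim the operator set for that net. A plain ./build.sh android configures the full operator set.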
# This file is part of the ios-cmake project. It was retrieved from # This file is based off of the Platform/Darwin.cmake and Platform/UnixPaths.cmake
# https://github.com/cristeab/ios-cmake.git, which is a fork of # files which are included with CMake 2.8.4
# https://code.google.com/p/ios-cmake/. Which in turn is based off of # It has been altered for iOS development
# the Platform/Darwin.cmake and Platform/UnixPaths.cmake files which
# are included with CMake 2.8.4 # Options:
#
# The ios-cmake project is licensed under the new BSD license.
#
# Copyright (c) 2014, Bogdan Cristea and LTE Engineering Software,
# Kitware, Inc., Insight Software Consortium. All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# This file is based off of the Platform/Darwin.cmake and
# Platform/UnixPaths.cmake files which are included with CMake 2.8.4
# It has been altered for iOS development.
#
# Updated by Alex Stewart (alexs.mac@gmail.com)
#
# *****************************************************************************
# Now maintained by Alexander Widerberg (widerbergaren [at] gmail.com)
# under the BSD-Clause-3 licence
# *****************************************************************************
#
# INFORMATION / HELP
#
# The following variables control the behaviour of this toolchain:
#
# IOS_PLATFORM: OS (default) or SIMULATOR or SIMULATOR64 or TVOS or SIMULATOR_TVOS
# OS = Build for iPhoneOS.
# SIMULATOR = Build for x86 i386 iPhone Simulator.
# SIMULATOR64 = Build for x86_64 iPhone Simulator.
# TVOS = Build for AppleTVOS.
# SIMULATOR_TVOS = Build for x86_64 AppleTV Simulator.
# CMAKE_OSX_SYSROOT: Path to the iOS SDK to use. By default this is
# automatically determined from IOS_PLATFORM and xcodebuild, but
# can also be manually specified (although this should not be required).
# CMAKE_IOS_DEVELOPER_ROOT: Path to the Developer directory for the iOS platform
# being compiled for. By default this is automatically determined from
# CMAKE_OSX_SYSROOT, but can also be manually specified (although this should
# not be required).
# ENABLE_BITCODE: (1|0) Enables or disables bitcode support. Default 1 (true)
# ENABLE_ARC: (1|0) Enables or disables ARC support. Default 1 (true, ARC enabled by default)
# IOS_ARCH: (armv7 armv7s arm64 i386 x86_64) If specified, will override the default architectures for the given IOS_PLATFORM
# OS = armv7 armv7s arm64
# SIMULATOR = i386
# SIMULATOR64 = x86_64
# TVOS = arm64
# SIMULATOR_TVOS = x86_64
# #
# This toolchain defines the following variables for use externally: # IOS_PLATFORM = OS (default) or SIMULATOR or SIMULATOR64
# This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
# OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
# SIMULATOR - used to build for the Simulator platforms, which have an x86 arch.
# #
# XCODE_VERSION: Version number (not including Build version) of Xcode detected. # CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
# IOS_SDK_VERSION: Version of iOS SDK being used. # By default this location is automatically chosen based on the IOS_PLATFORM value above.
# CMAKE_OSX_ARCHITECTURES: Architectures being compiled for (generated from # If set manually, it will override the default location and force the user of a particular Developer Platform
# IOS_PLATFORM).
# #
# This toolchain defines the following macros for use externally: # CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
# By default this location is automatically chosen based on the CMAKE_IOS_DEVELOPER_ROOT value.
# In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path.
# If set manually, this will force the use of a specific SDK version
# Macros:
# #
# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE XCODE_VARIANT) # set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
# A convenience macro for setting xcode specific properties on targets. # A convenience macro for setting xcode specific properties on targets
# Available variants are: All, Release, RelWithDebInfo, Debug, MinSizeRel # example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
# example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1" "all").
# #
# find_host_package (PROGRAM ARGS) # find_host_package (PROGRAM ARGS)
# A macro used to find executable programs on the host system, not within the # A macro used to find executable programs on the host system, not within the iOS environment.
# iOS environment. Thanks to the android-cmake project for providing the # Thanks to the android-cmake project for providing the command
# command.
# Standard settings
# Fix for PThread library not in path set (CMAKE_SYSTEM_NAME Darwin)
set(CMAKE_THREAD_LIBS_INIT "-lpthread") set (CMAKE_SYSTEM_VERSION 1)
set(CMAKE_HAVE_THREADS_LIBRARY 1) set (UNIX True)
set(CMAKE_USE_WIN32_THREADS_INIT 0) set (APPLE True)
set(CMAKE_USE_PTHREADS_INIT 1) set (IOS True)
# Get the Xcode version being used. # Required as of cmake 2.8.10
execute_process(COMMAND xcodebuild -version set (CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
OUTPUT_VARIABLE XCODE_VERSION
ERROR_QUIET # Determine the cmake host system version so we know where to find the iOS SDKs
OUTPUT_STRIP_TRAILING_WHITESPACE) find_program (CMAKE_UNAME uname /bin /usr/bin /usr/local/bin)
string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}") if (CMAKE_UNAME)
string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}") exec_program(uname ARGS -r OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION)
message(STATUS "Building with Xcode version: ${XCODE_VERSION}") string (REGEX REPLACE "^([0-9]+)\\.([0-9]+).*$" "\\1" DARWIN_MAJOR_VERSION "${CMAKE_HOST_SYSTEM_VERSION}")
# Default to building for iPhoneOS if not specified otherwise, and we cannot endif (CMAKE_UNAME)
# determine the platform from the CMAKE_OSX_ARCHITECTURES variable. The use
# of CMAKE_OSX_ARCHITECTURES is such that try_compile() projects can correctly # Force the compilers to gcc for iOS
# determine the value of IOS_PLATFORM from the root project, as #include (CMakeForceCompiler)
# CMAKE_OSX_ARCHITECTURES is propagated to them by CMake. #CMAKE_C_COMPILER (/usr/bin/gcc)
if (NOT DEFINED IOS_PLATFORM) #CMAKE_CXX_COMPILER (/usr/bin/g++)
if (CMAKE_OSX_ARCHITECTURES) set(CMAKE_C_COMPILER /usr/bin/gcc)
if (CMAKE_OSX_ARCHITECTURES MATCHES ".*arm.*") set(CMAKE_CXX_COMPILER /usr/bin/g++)
set(IOS_PLATFORM "OS")
elseif (CMAKE_OSX_ARCHITECTURES MATCHES "i386")
set(IOS_PLATFORM "SIMULATOR")
elseif (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
set(IOS_PLATFORM "SIMULATOR64")
endif()
endif()
if (NOT IOS_PLATFORM)
set(IOS_PLATFORM "OS")
endif()
endif()
set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING
"Type of iOS platform for which to build.")
# Determine the platform name and architectures for use in xcodebuild commands
# from the specified IOS_PLATFORM name.
if (IOS_PLATFORM STREQUAL "OS")
set(XCODE_IOS_PLATFORM iphoneos)
if(NOT IOS_ARCH)
set(IOS_ARCH armv7 armv7s arm64)
endif()
elseif (IOS_PLATFORM STREQUAL "SIMULATOR")
set(XCODE_IOS_PLATFORM iphonesimulator)
if(NOT IOS_ARCH)
set(IOS_ARCH i386)
endif()
elseif(IOS_PLATFORM STREQUAL "SIMULATOR64")
set(XCODE_IOS_PLATFORM iphonesimulator)
if(NOT IOS_ARCH)
set(IOS_ARCH x86_64)
endif()
elseif (IOS_PLATFORM STREQUAL "TVOS")
set(XCODE_IOS_PLATFORM appletvos)
if(NOT IOS_ARCH)
set(IOS_ARCH arm64)
endif()
elseif (IOS_PLATFORM STREQUAL "SIMULATOR_TVOS")
set(XCODE_IOS_PLATFORM appletvsimulator)
if(NOT IOS_ARCH)
set(IOS_ARCH x86_64)
endif()
else()
message(FATAL_ERROR "Invalid IOS_PLATFORM: ${IOS_PLATFORM}")
endif()
message(STATUS "Configuring iOS build for platform: ${IOS_PLATFORM}, "
"architecture(s): ${IOS_ARCH}")
# If user did not specify the SDK root to use, then query xcodebuild for it.
if (NOT CMAKE_OSX_SYSROOT)
execute_process(COMMAND xcodebuild -version -sdk ${XCODE_IOS_PLATFORM} Path
OUTPUT_VARIABLE CMAKE_OSX_SYSROOT
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
message(STATUS "Using SDK: ${CMAKE_OSX_SYSROOT} for platform: ${IOS_PLATFORM}")
endif()
if (NOT EXISTS ${CMAKE_OSX_SYSROOT})
message(FATAL_ERROR "Invalid CMAKE_OSX_SYSROOT: ${CMAKE_OSX_SYSROOT} "
"does not exist.")
endif()
# Specify minimum version of deployment target.
if (NOT DEFINED IOS_DEPLOYMENT_TARGET)
# Unless specified, SDK version 8.0 is used by default as minimum target version.
set(IOS_DEPLOYMENT_TARGET "8.0"
CACHE STRING "Minimum iOS version to build for." )
message(STATUS "Using the default min-version since IOS_DEPLOYMENT_TARGET not provided!")
endif()
# Use bitcode or not
if (NOT DEFINED ENABLE_BITCODE)
# Unless specified, enable bitcode support by default
set(ENABLE_BITCODE TRUE CACHE BOOL "Whether or not to enable bitcode")
message(STATUS "Enabling bitcode support by default. ENABLE_BITCODE not provided!")
endif()
# Use ARC or not
if (NOT DEFINED ENABLE_ARC)
# Unless specified, enable ARC support by default
set(ENABLE_ARC TRUE CACHE BOOL "Whether or not to enable ARC")
message(STATUS "Enabling ARC support by default. ENABLE_ARC not provided!")
endif()
# Get the SDK version information.
execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion
OUTPUT_VARIABLE IOS_SDK_VERSION
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
# Find the Developer root for the specific iOS platform being compiled for
# from CMAKE_OSX_SYSROOT. Should be ../../ from SDK specified in
# CMAKE_OSX_SYSROOT. There does not appear to be a direct way to obtain
# this information from xcrun or xcodebuild.
if (NOT CMAKE_IOS_DEVELOPER_ROOT)
get_filename_component(IOS_PLATFORM_SDK_DIR ${CMAKE_OSX_SYSROOT} PATH)
get_filename_component(CMAKE_IOS_DEVELOPER_ROOT ${IOS_PLATFORM_SDK_DIR} PATH)
endif()
if (NOT EXISTS ${CMAKE_IOS_DEVELOPER_ROOT})
message(FATAL_ERROR "Invalid CMAKE_IOS_DEVELOPER_ROOT: "
"${CMAKE_IOS_DEVELOPER_ROOT} does not exist.")
endif()
# Find the C & C++ compilers for the specified SDK.
if (NOT CMAKE_C_COMPILER)
execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang
OUTPUT_VARIABLE CMAKE_C_COMPILER
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
message(STATUS "Using C compiler: ${CMAKE_C_COMPILER}")
endif()
if (NOT CMAKE_CXX_COMPILER)
execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++
OUTPUT_VARIABLE CMAKE_CXX_COMPILER
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
message(STATUS "Using CXX compiler: ${CMAKE_CXX_COMPILER}")
endif()
# Find (Apple's) libtool.
execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find libtool
OUTPUT_VARIABLE IOS_LIBTOOL
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
message(STATUS "Using libtool: ${IOS_LIBTOOL}")
# Configure libtool to be used instead of ar + ranlib to build static libraries.
# This is required on Xcode 7+, but should also work on previous versions of
# Xcode.
set(CMAKE_C_CREATE_STATIC_LIBRARY
"${IOS_LIBTOOL} -static -o <TARGET> <LINK_FLAGS> <OBJECTS> ")
set(CMAKE_CXX_CREATE_STATIC_LIBRARY
"${IOS_LIBTOOL} -static -o <TARGET> <LINK_FLAGS> <OBJECTS> ")
# Get the version of Darwin (OS X) of the host.
execute_process(COMMAND uname -r
OUTPUT_VARIABLE CMAKE_HOST_SYSTEM_VERSION
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
# Standard settings.
set(CMAKE_SYSTEM_NAME Darwin CACHE INTERNAL "")
set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION} CACHE INTERNAL "")
set(UNIX TRUE CACHE BOOL "")
set(APPLE TRUE CACHE BOOL "")
set(IOS TRUE CACHE BOOL "")
set(CMAKE_AR ar CACHE FILEPATH "" FORCE) set(CMAKE_AR ar CACHE FILEPATH "" FORCE)
set(CMAKE_RANLIB ranlib CACHE FILEPATH "" FORCE)
# Force unset of OS X-specific deployment target (otherwise autopopulated), # Skip the platform compiler checks for cross compiling
# required as of cmake 2.8.10. set (CMAKE_CXX_COMPILER_WORKS TRUE)
set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING set (CMAKE_C_COMPILER_WORKS TRUE)
"Must be empty for iOS builds." FORCE)
# Set the architectures for which to build. # All iOS/Darwin specific settings - some may be redundant
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS") set (CMAKE_SHARED_LIBRARY_PREFIX "lib")
# Skip the platform compiler checks for cross compiling. set (CMAKE_SHARED_LIBRARY_SUFFIX ".dylib")
set(CMAKE_CXX_COMPILER_FORCED TRUE) set (CMAKE_SHARED_MODULE_PREFIX "lib")
set(CMAKE_CXX_COMPILER_WORKS TRUE) set (CMAKE_SHARED_MODULE_SUFFIX ".so")
set(CMAKE_C_COMPILER_FORCED TRUE) set (CMAKE_MODULE_EXISTS 1)
set(CMAKE_C_COMPILER_WORKS TRUE) set (CMAKE_DL_LIBS "")
# All iOS/Darwin specific settings - some may be redundant.
set(CMAKE_SHARED_LIBRARY_PREFIX "lib") set (CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
set(CMAKE_SHARED_LIBRARY_SUFFIX ".dylib") set (CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
set(CMAKE_SHARED_MODULE_PREFIX "lib") set (CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
set(CMAKE_SHARED_MODULE_SUFFIX ".so") set (CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
set(CMAKE_MODULE_EXISTS 1)
set(CMAKE_DL_LIBS "") # Hidden visibility is required for cxx on iOS
set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ") set (CMAKE_C_FLAGS_INIT "")
set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ") set (CMAKE_CXX_FLAGS_INIT "-fvisibility=hidden -fvisibility-inlines-hidden")
set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}") set (CMAKE_C_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_C_LINK_FLAGS}")
message(STATUS "Building for minimum iOS version: ${IOS_DEPLOYMENT_TARGET}" set (CMAKE_CXX_LINK_FLAGS "-Wl,-search_paths_first ${CMAKE_CXX_LINK_FLAGS}")
" (SDK version: ${IOS_SDK_VERSION})")
# Note that only Xcode 7+ supports the newer more specific: set (CMAKE_PLATFORM_HAS_INSTALLNAME 1)
# -m${XCODE_IOS_PLATFORM}-version-min flags, older versions of Xcode use: set (CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
# -m(ios/ios-simulator)-version-min instead. set (CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
if (IOS_PLATFORM STREQUAL "OS") set (CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
if (XCODE_VERSION VERSION_LESS 7.0) set (CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
set(XCODE_IOS_PLATFORM_VERSION_FLAGS set (CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
"-mios-version-min=${IOS_DEPLOYMENT_TARGET}")
else() # hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree
# Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM. # (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
set(XCODE_IOS_PLATFORM_VERSION_FLAGS # and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
"-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") # hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
endif()
elseif (IOS_PLATFORM STREQUAL "TVOS")
set(XCODE_IOS_PLATFORM_VERSION_FLAGS
"-mtvos-version-min=${IOS_DEPLOYMENT_TARGET}")
elseif (IOS_PLATFORM STREQUAL "SIMULATOR_TVOS")
set(XCODE_IOS_PLATFORM_VERSION_FLAGS
"-mtvos-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
else()
# SIMULATOR or SIMULATOR64 both use -mios-simulator-version-min.
set(XCODE_IOS_PLATFORM_VERSION_FLAGS
"-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
endif()
message(STATUS "Version flags set to: ${XCODE_IOS_PLATFORM_VERSION_FLAGS}")
if (ENABLE_BITCODE)
set(BITCODE "-fembed-bitcode")
message(STATUS "Enabling bitcode support.")
else()
set(BITCODE "")
message(STATUS "Disabling bitcode support.")
endif()
if (ENABLE_ARC)
set(FOBJC_ARC "-fobjc-arc")
message(STATUS "Enabling ARC support.")
else()
set(FOBJC_ARC "-fno-objc-arc")
message(STATUS "Disabling ARC support.")
endif()
set(CMAKE_C_FLAGS
"${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} -fobjc-abi-version=2 ${FOBJC_ARC} ${C_FLAGS}")
# Hidden visibility is required for C++ on iOS.
set(CMAKE_CXX_FLAGS
"${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${BITCODE} -fvisibility=hidden -fvisibility-inlines-hidden -fobjc-abi-version=2 ${FOBJC_ARC} ${CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS} -DNDEBUG -Os -fomit-frame-pointer -ffast-math ${BITCODE} ${CXX_FLAGS_MINSIZEREL}")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -DNDEBUG -O2 -g -fomit-frame-pointer -ffast-math ${BITCODE} ${CXX_FLAGS_RELWITHDEBINFO}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -DNDEBUG -O3 -fomit-frame-pointer -ffast-math ${BITCODE} ${CXX_FLAGS_RELEASE}")
set(CMAKE_C_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${C_LINK_FLAGS}")
set(CMAKE_CXX_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first ${CXX_LINK_FLAGS}")
# In order to ensure that the updated compiler flags are used in try_compile()
# tests, we have to forcibly set them in the CMake cache, not merely set them
# in the local scope.
list(APPEND VARS_TO_FORCE_IN_CACHE
CMAKE_C_FLAGS
CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS_MINSIZEREL
CMAKE_CXX_FLAGS_RELEASE
CMAKE_C_LINK_FLAGS
CMAKE_CXX_LINK_FLAGS)
foreach(VAR_TO_FORCE ${VARS_TO_FORCE_IN_CACHE})
set(${VAR_TO_FORCE} "${${VAR_TO_FORCE}}" CACHE STRING "" FORCE)
endforeach()
set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
# Hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old
# build tree (where install_name_tool was hardcoded) and where
# CMAKE_INSTALL_NAME_TOOL isn't in the cache and still cmake didn't fail in
# CMakeFindBinUtils.cmake (because it isn't rerun) hardcode
# CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did
# before, Alex.
if (NOT DEFINED CMAKE_INSTALL_NAME_TOOL) if (NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool) find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
endif (NOT DEFINED CMAKE_INSTALL_NAME_TOOL) endif (NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
# Set the find root to the iOS developer roots and to user defined paths.
set(CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_OSX_SYSROOT} # Setup iOS platform unless specified manually with IOS_PLATFORM
${CMAKE_PREFIX_PATH} CACHE string "iOS find search path root" FORCE) if (NOT DEFINED IOS_PLATFORM)
# Default to searching for frameworks first. set (IOS_PLATFORM "OS")
set(CMAKE_FIND_FRAMEWORK FIRST) endif (NOT DEFINED IOS_PLATFORM)
# Set up the default search directories for frameworks. set (IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
set(CMAKE_SYSTEM_FRAMEWORK_PATH
${CMAKE_OSX_SYSROOT}/System/Library/Frameworks # Setup building for arm64 or not
${CMAKE_OSX_SYSROOT}/System/Library/PrivateFrameworks if (NOT DEFINED BUILD_ARM64)
${CMAKE_OSX_SYSROOT}/Developer/Library/Frameworks) set (BUILD_ARM64 true)
# Only search the specified iOS SDK, not the remainder of the host filesystem. endif (NOT DEFINED BUILD_ARM64)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) set (BUILD_ARM64 ${BUILD_ARM64} CACHE STRING "Build arm64 arch or not")
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) # Check the platform selection and setup for developer root
# This little macro lets you set any XCode specific property. if (${IOS_PLATFORM} STREQUAL "OS")
macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE XCODE_RELVERSION) set (IOS_PLATFORM_LOCATION "iPhoneOS.platform")
set(XCODE_RELVERSION_I "${XCODE_RELVERSION}")
if (XCODE_RELVERSION_I STREQUAL "All") # This causes the installers to properly locate the output libraries
set_property(TARGET ${TARGET} PROPERTY set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
XCODE_ATTRIBUTE_${XCODE_PROPERTY} "${XCODE_VALUE}") elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR")
else() set (SIMULATOR true)
set_property(TARGET ${TARGET} PROPERTY set (IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
XCODE_ATTRIBUTE_${XCODE_PROPERTY}[variant=${XCODE_RELVERSION_I}] "${XCODE_VALUE}")
endif() # This causes the installers to properly locate the output libraries
endmacro(set_xcode_property) set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
# This macro lets you find executable programs on the host system. elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64")
macro(find_host_package) set (SIMULATOR true)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set (IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER) # This causes the installers to properly locate the output libraries
set(IOS FALSE) set (CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
else (${IOS_PLATFORM} STREQUAL "OS")
message (FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please choose OS or SIMULATOR")
endif (${IOS_PLATFORM} STREQUAL "OS")
# Setup iOS developer location unless specified manually with CMAKE_IOS_DEVELOPER_ROOT
# Note Xcode 4.3 changed the installation location, choose the most recent one available
exec_program(/usr/bin/xcode-select ARGS -print-path OUTPUT_VARIABLE CMAKE_XCODE_DEVELOPER_DIR)
set (XCODE_POST_43_ROOT "${CMAKE_XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
set (XCODE_PRE_43_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
if (NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT)
if (EXISTS ${XCODE_POST_43_ROOT})
set (CMAKE_IOS_DEVELOPER_ROOT ${XCODE_POST_43_ROOT})
elseif(EXISTS ${XCODE_PRE_43_ROOT})
set (CMAKE_IOS_DEVELOPER_ROOT ${XCODE_PRE_43_ROOT})
endif (EXISTS ${XCODE_POST_43_ROOT})
endif (NOT DEFINED CMAKE_IOS_DEVELOPER_ROOT)
set (CMAKE_IOS_DEVELOPER_ROOT ${CMAKE_IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")
# Find and use the most recent iOS sdk unless specified manually with CMAKE_IOS_SDK_ROOT
if (NOT DEFINED CMAKE_IOS_SDK_ROOT)
file (GLOB _CMAKE_IOS_SDKS "${CMAKE_IOS_DEVELOPER_ROOT}/SDKs/*")
if (_CMAKE_IOS_SDKS)
list (SORT _CMAKE_IOS_SDKS)
list (REVERSE _CMAKE_IOS_SDKS)
list (GET _CMAKE_IOS_SDKS 0 CMAKE_IOS_SDK_ROOT)
else (_CMAKE_IOS_SDKS)
message (FATAL_ERROR "No iOS SDK's found in default search path ${CMAKE_IOS_DEVELOPER_ROOT}. Manually set CMAKE_IOS_SDK_ROOT or install the iOS SDK.")
endif (_CMAKE_IOS_SDKS)
message (STATUS "Toolchain using default iOS SDK: ${CMAKE_IOS_SDK_ROOT}")
endif (NOT DEFINED CMAKE_IOS_SDK_ROOT)
set (CMAKE_IOS_SDK_ROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")
# Set the sysroot default to the most recent SDK
set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
# set the architecture for iOS
if (${IOS_PLATFORM} STREQUAL "OS")
set (IOS_ARCH armv7 armv7s arm64)
elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR")
set (IOS_ARCH i386)
elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64")
set (IOS_ARCH x86_64)
endif (${IOS_PLATFORM} STREQUAL "OS")
set (CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
# Set the find root to the iOS developer roots and to user defined paths
set (CMAKE_FIND_ROOT_PATH ${CMAKE_IOS_DEVELOPER_ROOT} ${CMAKE_IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH} CACHE string "iOS find search path root")
# default to searching for frameworks first
set (CMAKE_FIND_FRAMEWORK FIRST)
# set up the default search directories for frameworks
set (CMAKE_SYSTEM_FRAMEWORK_PATH
${CMAKE_IOS_SDK_ROOT}/System/Library/Frameworks
${CMAKE_IOS_SDK_ROOT}/System/Library/PrivateFrameworks
${CMAKE_IOS_SDK_ROOT}/Developer/Library/Frameworks
)
# only search the iOS sdks, not the remainder of the host filesystem
set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
# This little macro lets you set any XCode specific property
macro (set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
endmacro (set_xcode_property)
# This macro lets you find executable programs on the host system
macro (find_host_package)
set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
set (IOS FALSE)
find_package(${ARGN}) find_package(${ARGN})
set(IOS TRUE)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY) set (IOS TRUE)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set (CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) set (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
endmacro(find_host_package) set (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
endmacro (find_host_package)
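# Hypothetical usage sketch for find_host_package: while cross-compiling for
# iOS, a tool that must run on the build machine can be located with
#   find_host_package(Protobuf REQUIRED)
# which lifts the find-root restrictions for the duration of the call and then
# restores them. (Protobuf here is only an illustration, not a dependency this
# project is known to resolve this way.)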
set(NET "googlenet" CACHE STRING "select net type")
set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "resnet")
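# Illustrative configure-time selection (value names follow the STRINGS list
# above; the build-directory layout is an assumption):
#   cmake -DNET=mobilenet ..
# compiles in only the operators that net needs; any other value falls
# through to the full operator set in the else() branch below.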
if (NET EQUAL "googlenet")
set(CONCAT_OP ON)
set(CONV_OP ON)
set(LRN_OP ON)
set(MUL_OP ON)
set(ELEMENTWISEADD_OP ON)
set(FUSION_FC_OP ON)
set(POOL_OP ON)
set(RELU_OP ON)
set(FUSION_CONVADD_OP ON)
set(FUSION_CONVADD_RELU_OP ON)
elseif (NET EQUAL "mobilenet")
set(CONV_OP ON)
set(ELEMENTWISEADD_OP ON)
set(RELU_OP ON)
set(SOFTMAX_OP ON)
set(DEPTHWISECONV_OP ON)
set(BATCHNORM_OP ON)
set(POOL_OP ON)
set(RESHAPE_OP ON)
elseif (NET EQUAL "yolo")
set(BATCHNORM_OP ON)
set(CONV_OP ON)
set(RELU_OP ON)
set(ELEMENTWISEADD_OP ON)
elseif (NET EQUAL "squeezenet")
set(CONCAT_OP ON)
set(CONV_OP ON)
set(RELU_OP ON)
set(ELEMENTWISEADD_OP ON)
set(POOL_OP ON)
set(RESHAPE_OP ON)
set(SOFTMAX_OP ON)
elseif (NET EQUAL "resnet")
set(CONV_OP ON)
set(BATCHNORM_OP ON)
set(ELEMENTWISEADD_OP ON)
set(SOFTMAX_OP ON)
set(MUL_OP ON)
set(POOL_OP ON)
set(RELU_OP ON)
else ()
set(BATCHNORM_OP ON)
set(BOXCODER_OP ON)
set(CONCAT_OP ON)
set(CONV_OP ON)
set(DEPTHWISECONV_OP ON)
set(ELEMENTWISEADD_OP ON)
set(FUSION_CONVADD_OP ON)
set(CONVADDRELU_OP ON)
set(FUSION_FC_OP ON)
set(LRN_OP ON)
set(MUL_OP ON)
set(MULTICLASSNMS_OP ON)
set(POOL_OP ON)
set(PRIORBOX_OP ON)
set(RELU_OP ON)
set(RESHAPE_OP ON)
set(SIGMOID_OP ON)
set(SOFTMAX_OP ON)
set(TRANSPOSE_OP ON)
set(FUSION_CONVADD_RELU_OP ON)
# option(BATCHNORM_OP "" ON)
# option(BOXCODER_OP "" ON)
# option(CONCAT_OP "" ON)
# option(CONV_OP "" ON)
# option(DEPTHWISECONV_OP "" ON)
# option(ELEMENTWISEADD_OP "" ON)
# option(FUSION_CONVADD_OP "" ON)
# option(CONVADDRELU_OP "" ON)
# option(FUSION_FC_OP "" ON)
# option(LRN_OP "" ON)
# option(MUL_OP "" ON)
# option(MULTICLASSNMS_OP "" ON)
# option(POOL_OP "" ON)
# option(PRIORBOX_OP "" ON)
# option(RELU_OP "" ON)
# option(RESHAPE_OP "" ON)
# option(SIGMOID_OP "" ON)
# option(SOFTMAX_OP "" ON)
# option(TRANSPOSE_OP "" ON)
# option(FUSION_CONVADD_RELU_OP "" ON)
endif ()
if (BATCHNORM_OP)
add_definitions(-DBATCHNORM_OP)
endif()
if (BOXCODER_OP)
add_definitions(-DBOXCODER_OP)
endif()
if (CONCAT_OP)
add_definitions(-DCONCAT_OP)
endif()
if (CONV_OP)
add_definitions(-DCONV_OP)
endif()
if (DEPTHWISECONV_OP)
add_definitions(-DDEPTHWISECONV_OP)
endif()
if (ELEMENTWISEADD_OP)
add_definitions(-DELEMENTWISEADD_OP)
endif()
if (FUSION_CONVADD_OP)
add_definitions(-DFUSION_CONVADD_OP)
endif()
if (CONVADDRELU_OP)
add_definitions(-DCONVADDRELU_OP)
endif()
if (FUSION_FC_OP)
add_definitions(-DFUSION_FC_OP)
endif()
if (LRN_OP)
add_definitions(-DLRN_OP)
endif()
if (MUL_OP)
add_definitions(-DMUL_OP)
endif()
if (MULTICLASSNMS_OP)
add_definitions(-DMULTICLASSNMS_OP)
endif()
if (POOL_OP)
add_definitions(-DPOOL_OP)
endif()
if (PRIORBOX_OP)
add_definitions(-DPRIORBOX_OP)
endif()
if (RELU_OP)
add_definitions(-DRELU_OP)
endif()
if (RESHAPE_OP)
add_definitions(-DRESHAPE_OP)
endif()
if (SIGMOID_OP)
add_definitions(-DSIGMOID_OP)
endif()
if (SOFTMAX_OP)
add_definitions(-DSOFTMAX_OP)
endif()
if (TRANSPOSE_OP)
add_definitions(-DTRANSPOSE_OP)
endif()
if (FUSION_CONVADD_RELU_OP)
add_definitions(-DFUSION_CONVADD_RELU_OP)
endif()
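# The *_OP definitions above act as preprocessor guards around operator
# registration in the sources. An illustrative (not verbatim) shape:
#   #ifdef CONV_OP
#   // the conv2d operator and its kernels are compiled and registered
#   #endif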
if ! [[ $version == *"$VERSION"* ]]; then
  exit -1
fi

# https://medicineyeh.wordpress.com/2017/07/13/clang-format-with-pragma/
shift
perl -i -pe 's|#pragma\s+omp|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> #pragma omp|' "$@"
clang-format -i $@
perl -i -pe 's|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> ||' "$@"
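# Worked example of the pragma trick above (hypothetical input): a source line
#   #pragma omp parallel for
# is rewritten by the first perl pass to
#   // <TRICKY-CLANG-FORMAT-PRAGMA-FIX> #pragma omp parallel for
# so clang-format sees an ordinary comment and leaves the pragma untouched;
# the second perl pass strips the marker, restoring the original line.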
#!/usr/bin/env sh
cat <<EOF
<html>
<head>
<style>
html, body {
position: absolute;
width: 100%;
height: 100%;
margin: 0;
}
div.timeview {
width: 100%;
position: relative;
overflow: scroll;
}
ul {
position: absolute;
margin: 0;
list-style:none;
padding: 0;
margin: 0;
}
li {
height: 15px;
position: absolute;
background: blue;
}
li:nth-child(odd) {
background: blue;
}
li:nth-child(even) {
background: rebeccapurple;
}
ul.timeline {
z-index: -1;
}
ul.timeline li {
position: relative;
height: 15px;
width: 100%;
}
ul.timeline li:nth-child(odd) {
background: beige;
}
ul.timeline li:nth-child(even) {
background: antiquewhite;
}
</style>
</head>
<body>
<div class="timeview">
<ul>
EOF
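# Assumed layout of the profile dump in $1, inferred from the field names used
# in the awk below: $1 opid, $2 optype, $3 tid, $4 clock-begin, $5 clock-end,
# $6 elapsed. min/max recover the global time span used to scale the timeline.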
min=$(awk 'NR==1{min=$4} NR>1{if($4 < min) min=$4} END{print min}' $1)
max=$(awk 'NR==1{max=$5} NR>1{if($5 > max) max=$5} END{print max}' $1)
sort $1 -k1,1n | awk -v max="$max" -v min="$min" '
BEGIN {
total = max - min
}
{
opid = $1
optype = $2
tid = $3
cb = $4
ce = $5
cl = $6
sum += ce - cb  # accumulate per-op elapsed time (not printed in this pass)
print "<li class=\"timeline\"" \
" data-opid=\"" opid "\"" \
" data-optype=\"" optype "\"" \
" data-tid=\"" tid "\"" \
" data-begin=\"" cb "\"" \
" data-end=\"" ce "\"" \
"></li>"
}
'
cat <<EOF
</ul>
</div>
<pre>
EOF
echo "==================[ profile ]==================="
cat $1 | awk '
NR>1{
optype = $2
sum += $5 - $4
count[$2] += $6
}
END {
for (t in count) {
msg = sprintf("%-16s\t%-10d\t%-.4f", t, count[t], count[t]*100 / sum);
print msg
}
}' | sort -k2,2nr
cat $1 | awk '
NR>1{
sum += $5 - $4
}
END {
msg = sprintf("%-16s\t%-10d\t%-.4f", "total", sum, 100);
print msg
}'
cat <<EOF
</pre>
<script>
const min = $min;
const max = $max;
const px_per_nanosecond = 1/1000000;
const scale = px_per_nanosecond;
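// Assuming the dumped clocks are in nanoseconds (as the constant's name
// suggests), 1/1e6 px per ns renders 1 ms as 1 px, so a 1 s run spans 1000 px.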
const li = document.querySelectorAll('li');
const thread = new Set();
for (let i = 0; i < li.length; i++) {
const prof = li[i].dataset;
li[i].style.width = (prof.end - prof.begin)*scale + 'px';
li[i].style.left = (prof.begin - min)*scale + 'px';
li[i].style.top = prof.tid * 15 + 'px';
thread.add(prof.tid);
}
const ul = document.createElement('ul');
ul.classList.add('timeline');
ul.style.width = (max - min)*scale + 'px';
thread.forEach(i => {
const l = document.createElement('li');
ul.appendChild(l);
});
const timeview = document.querySelector('.timeview');
timeview.appendChild(ul);
timeview.style.height = thread.size * 15 + 'px';
</script>
</body>
</html>
EOF
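# Hypothetical invocation (script and file names are assumptions):
#   sh profile_to_html.sh profile.txt > timeline.html
# emits a self-contained HTML page: the timeline laid out by the inline
# script above, plus the per-operator time breakdown inside <pre>.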
set(ANDROID_ARM_NEON ON)
include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
# CMake toolchain file for building ARM software on Linux environment
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_VERSION 1)
set(CMAKE_C_COMPILER /usr/bin/arm-linux-gnueabi-gcc)
set(CMAKE_CXX_COMPILER /usr/bin/arm-linux-gnueabi-g++)
set(CMAKE_STRIP /usr/bin/arm-linux-gnueabi-strip)
set(CMAKE_FIND_ROOT_PATH /usr/arm-linux-gnueabi)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
include("${CMAKE_CURRENT_LIST_DIR}/../arm-platform.cmake")