Merge remote-tracking branch 'upstream/develop' into develop

bbdd27ca · eclipsess · 4bef099b · 41a18aab · bbdd27ca · bbdd27ca
78 changed file
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "src/operators/kernel/mali/ACL_Android"]
+	path = src/operators/kernel/mali/ACL_Android
+	url = https://github.com/halsay/ACL_Android.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,42 +6,68 @@ option(USE_OPENMP "openmp support" OFF)
 option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build
-option(CPU "cpu" ON)
+option(CPU "armv7 with neon" ON)
-option(MALI_GPU "mali gpu" ON)
+option(MALI_GPU "mali gpu" OFF)
 option(FPGA "fpga" OFF)
+set(DEBUGING ON)
+file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c)
+file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
 if (CPU)
-    add_definitions(-DPADDLE_MOBILE_CPU)
+  add_definitions(-DPADDLE_MOBILE_CPU)
+else()
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/arm/*.h)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/arm/*.cc)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/arm/*.cpp)
 endif()
 if (MALI_GPU)
    add_definitions(-DPADDLE_MOBILE_MALI_GPU)
+    add_definitions(-DUSE_ACL=1)
+    add_definitions(-DUSE_OPENCL)
+    set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android)
+    include_directories(${ACL_ROOT} ${ACL_ROOT}/include)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
+else()
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/*.h)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/*.cc)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/*.cpp)
 endif()
 if(FPGA)
    add_definitions(-DPADDLE_MOBILE_FPGA)
+else()
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/fpga/*.h)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/fpga/*.cc)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/fpga/*.cpp)
 endif()
 set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
 if (DEBUGING)
-    set(CMAKE_BUILD_TYPE Debug)
+  message(STATUS "debug")
-    set(CMAKE_CXX_FLAGS_DEBUG "-O3 -DNDEBUG")
+  set(CMAKE_BUILD_TYPE Debug)
-else()
+  set(CMAKE_CXX_FLAGS_DEBUG "-g -DNDEBUG")
-    set(CMAKE_BUILD_TYPE Release)
+  add_definitions(-DPADDLE_MOBILE_DEBUG)
-    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+  if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
-endif ()
+    add_definitions(-DARMV7)
-if(DEBUGING)
-    message(STATUS "debuging")
-    add_definitions(-DPADDLE_MOBILE_DEBUG)
-    if(ANDROID)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
-    endif()
+  endif ()
+else ()
-else()
+  set(CMAKE_BUILD_TYPE Release)
-    message(STATUS "releasing")
+  set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
-    add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
+  add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
-endif()
+endif ()
 if (USE_EXCEPTION)
    message(STATUS "use exception")
@@ -55,110 +81,36 @@ if (LOG_PROFILE)
    add_definitions(-DPADDLE_MOBILE_PROFILE)
 endif()
-if(IS_MAC)
+if(USE_OPENMP)
-    add_definitions(-DX86)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
-elseif(IS_IOS)
+    add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
-    add_definitions(-DIOS)
-elseif(V7)
-    add_definitions(-DARMV7)
-elseif(V8)
-    add_definitions(-DARMV8)
-else ()
-    add_definitions(-DX86)
 endif()
-set(CMAKE_VERBOSE_MAKEFILE ON)
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
-file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c)
-file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
-if (NOT ANDROID)
-list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.cpp)
+if (NOT ANDROID_NDK_TOOLCHAIN_INCLUDED)
-list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.h)
+  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.cpp)
-list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
+  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/*.h)
+  list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
 endif ()
 include_directories(src/)
-if(USE_OPENMP)
+set(CMAKE_VERBOSE_MAKEFILE ON)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-    add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY build)
-endif()
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
-if (googlenet)
+include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
-    add_definitions(-DCONCAT_OP)
-    add_definitions(-DCONV_OP)
-    add_definitions(-DLRN_OP)
-    add_definitions(-DMUL_OP)
-    add_definitions(-DELEMENTWISEADD_OP)
-    add_definitions(-DFUSION_FC_OP)
-    add_definitions(-DPOOL_OP)
-    add_definitions(-DRELU_OP)
-    add_definitions(-DFUSION_CONVADD_OP)
-    add_definitions(-DFUSION_CONVADD_RELU_OP)
-elseif (mobilenet)
-    add_definitions(-DCONV_OP)
-    add_definitions(-DELEMENTWISEADD_OP)
-    add_definitions(-DRELU_OP)
-    add_definitions(-DSOFTMAX_OP)
-    add_definitions(-DSOFTMAX_OP)
-    add_definitions(-DDEPTHWISECONV_OP)
-    add_definitions(-DBATCHNORM_OP)
-    add_definitions(-DPOOL_OP)
-    add_definitions(-DRESHAPE_OP)
-elseif (yolo)
-    add_definitions(-DBATCHNORM_OP)
-    add_definitions(-DCONV_OP)
-    add_definitions(-DRELU_OP)
-    add_definitions(-DELEMENTWISEADD_OP)
-elseif (squeezenet)
-    add_definitions(-DCONCAT_OP)
-    add_definitions(-DCONV_OP)
-    add_definitions(-DRELU_OP)
-    add_definitions(-DELEMENTWISEADD_OP)
-    add_definitions(-DPOOL_OP)
-    add_definitions(-DRESHAPE_OP)
-    add_definitions(-DSOFTMAX_OP)
-elseif(resnet)
-    add_definitions(-DCONV_OP)
-    add_definitions(-DBATCHNORM_OP)
-    add_definitions(-DELEMENTWISEADD_OP)
-    add_definitions(-DSOFTMAX_OP)
-    add_definitions(-DMUL_OP)
-    add_definitions(-DPOOL_OP)
-    add_definitions(-DRELU_OP)
-else ()
-    add_definitions(-DBATCHNORM_OP)
-    add_definitions(-DBOXCODER_OP)
-    add_definitions(-DCONCAT_OP)
-    add_definitions(-DCONV_OP)
-    add_definitions(-DDEPTHWISECONV_OP)
-    add_definitions(-DELEMENTWISEADD_OP)
-    add_definitions(-DFUSION_CONVADD_OP)
-    add_definitions(-DCONVADDRELU_OP)
-    add_definitions(-DFUSION_FC_OP)
-    add_definitions(-DLRN_OP)
-    add_definitions(-DMUL_OP)
-    add_definitions(-DMULTICLASSNMS_OP)
-    add_definitions(-DPOOL_OP)
-    add_definitions(-DPRIORBOX_OP)
-    add_definitions(-DRELU_OP)
-    add_definitions(-DRESHAPE_OP)
-    add_definitions(-DSIGMOID_OP)
-    add_definitions(-DSOFTMAX_OP)
-    add_definitions(-DTRANSPOSE_OP)
-    add_definitions(-DFUSION_CONVADD_RELU_OP)
-endif()
-if (IS_IOS)
+# if (IS_IOS)
-    add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
+#     add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
-elseif(ANDROID)
+if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
+    list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
    add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
-else()
+else ()
    add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
 endif ()

--- a/Dockerfile
+++ b/Dockerfile
-FROM ubuntu:18.04
+FROM ubuntu:16.04
 RUN echo '\
-deb <mirror> bionic main restricted universe multiverse\n\
+deb <mirror> <version> main restricted universe multiverse\n\
-deb <mirror> bionic-updates main restricted universe multiverse\n\
+deb <mirror> <version>-updates main restricted universe multiverse\n\
-deb <mirror> bionic-backports main restricted universe multiverse\n\
+deb <mirror> <version>-backports main restricted universe multiverse\n\
-deb <mirror> bionic-security main restricted universe multiverse\n'\
+deb <mirror> <version>-security main restricted universe multiverse\n'\
 > /etc/apt/sources.list
 RUN sed -ie 's|<mirror>|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|' /etc/apt/sources.list
+RUN sed -ie 's|<version>|xenial|' /etc/apt/sources.list
 RUN apt-get update && apt-get upgrade -y
 RUN apt-get install -y --no-install-recommends \
        curl \
        unzip \
        git \
+        make \
        cmake \
+        cmake-curses-gui \
        python \
        python-pip \
        python-setuptools \
        clang-format-5.0 \
-        graphviz
+        graphviz \
+        g++-arm-linux-gnueabi \
+        gcc-arm-linux-gnueabi
 RUN apt-get autoremove -y && apt-get clean
-RUN pip install wheel pre-commit
+RUN pip install --upgrade pip
-RUN pre-commit autoupdate
+RUN pip install wheel && pip install pre-commit
 RUN ln -s clang-format-5.0 /usr/bin/clang-format
 RUN cd /tmp && curl -O http://mirrors.neusoft.edu.cn/android/repository/android-ndk-r17b-linux-x86_64.zip
 RUN cd /opt && unzip /tmp/android-ndk-r17b-linux-x86_64.zip
\ No newline at end of file
+ENV NDK_ROOT /opt/android-ndk-r17b
--- a/doc/build.md
+++ b/doc/build.md
+# 环境搭建
+## 使用 docker
+### 1. 安装 docker
+安装 docker 的方式，参考官方文档 [https://docs.docker.com/install/](https://docs.docker.com/install/)
+### 2. 使用 docker 搭建构建环境
+首先进入 paddle-mobile 的目录下，执行 `docker build`
+以 Linux/Mac 为例 (windows 建议在 'Docker Quickstart Terminal' 中执行)
+```
+$ docker build -t paddle-mobile:dev - < Dockerfile
+```
+使用 `docker images` 可以看到我们新建的 image
+```
+$ docker images
+REPOSITORY      TAG     IMAGE ID       CREATED         SIZE
+paddle-mobile   dev     33b146787711   45 hours ago    372MB
+```
+### 3. 使用 docker 构建
+进入 paddle-mobile 目录，执行 docker run
+```
+$ docker run -it --mount type=bind,source=$PWD,target=/paddle-mobile paddle-mobile:dev
+root@5affd29d4fc5:/ # cd /paddle-mobile
+# 生成构建 android 产出的 Makefile
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-android-neon.cmake
+# 生成构建 linux 产出的 Makefile
+root@5affd29d4fc5:/ # rm CMakeCache.txt
+root@5affd29d4fc5:/ # cmake -DCMAKE_TOOLCHAIN_FILE=tools/toolchains/arm-linux-gnueabi.cmake
+```
+### 4. 设置编译选项
+可以通过 ccmake 设置编译选项
+```
+root@5affd29d4fc5:/ # ccmake .
+                                                     Page 1 of 1
+ CMAKE_ASM_FLAGS
+ CMAKE_ASM_FLAGS_DEBUG
+ CMAKE_ASM_FLAGS_RELEASE
+ CMAKE_BUILD_TYPE
+ CMAKE_INSTALL_PREFIX             /usr/local
+ CMAKE_TOOLCHAIN_FILE             /paddle-mobile/tools/toolchains/arm-android-neon.cmake
+ CPU                              ON
+ DEBUGING                         ON
+ FPGA                             OFF
+ LOG_PROFILE                      ON
+ MALI_GPU                         OFF
+ NET                              googlenet
+ USE_EXCEPTION                    ON
+ USE_OPENMP                       OFF
+```
+修改选项后，按 `c`, `g` 更新 Makefile
+### 5. 构建
+使用 make 命令进行构建
+```
+root@5affd29d4fc5:/ # make
+```
+### 6. 查看构建产出
+构架产出可以在 host 机器上查看，在 paddle-mobile 的目录下，build 以及 test/build 下，可以使用 adb 指令或者 scp 传输到 device 上执行
+## 不使用 docker
+不使用 docker 的方法，可以直接用 cmake 生成 makefile 后构建。使用 ndk 构建 android 应用需要正确设置 NDK_ROOT。构建 linux 应用需要安装 arm-linux-gnueabi-gcc 或者类似的交叉编译工具，可能需要设置 CC，CXX 环境变量，或者在 tools/toolchains/ 中修改 arm-linux-gnueabi.cmake，或者增加自己需要的 toolchain file。
\ No newline at end of file
--- a/src/common/common.h
+++ b/src/common/common.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <chrono>
+using Time = decltype(std::chrono::high_resolution_clock::now());
+inline Time time() { return std::chrono::high_resolution_clock::now(); }
+inline double time_diff(Time t1, Time t2) {
+  typedef std::chrono::microseconds ms;
+  auto diff = t2 - t1;
+  ms counter = std::chrono::duration_cast<ms>(diff);
+  return counter.count() / 1000.0;
+}
--- a/src/common/depCore.h
+++ b/src/common/depCore.h
--- a/src/common/log.h
+++ b/src/common/log.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <vector>
 #ifdef PADDLE_MOBILE_DEBUG
+#include <cstring>
 #include <iostream>
 #include <sstream>
 #include <string>
@@ -115,26 +116,29 @@ struct ToLog {
  Print printer_;
 };
-#define LOG(level)                                                             \
+#define LOG(level)                                                           \
-  if (level > paddle_mobile::log_level) {                                      \
+  if (level > paddle_mobile::log_level) {                                    \
-  } else                                                                       \
+  } else                                                                     \
-    paddle_mobile::ToLog(                                                      \
+    paddle_mobile::ToLog(                                                    \
-        level,                                                                 \
+        level, static_cast<const std::stringstream &>(                       \
-        (std::stringstream()                                                   \
+                   std::stringstream()                                       \
-         << "[file: "                                                          \
+                   << "[file: "                                              \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
+                   << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \
-         << "] [line: " << __LINE__ << "] ")                                   \
+                                              : __FILE__)                    \
-            .str())
+                   << "] [line: " << __LINE__ << "] ")                       \
+                   .str())
-#define DLOG                                                                   \
-  if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) {                  \
+#define DLOG                                                          \
-  } else                                                                       \
+  if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) {         \
-    paddle_mobile::ToLog(                                                      \
+  } else                                                              \
-        paddle_mobile::kLOG_DEBUG,                                             \
+    paddle_mobile::ToLog(                                             \
-        (std::stringstream()                                                   \
+        paddle_mobile::kLOG_DEBUG,                                    \
-         << "[file: "                                                          \
+        static_cast<const std::stringstream &>(                       \
-         << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) : __FILE__) \
+            std::stringstream()                                       \
-         << "] [line: " << __LINE__ << "] ")                                   \
+            << "[file: "                                              \
+            << (strrchr(__FILE__, '/') ? (strrchr(__FILE__, '/') + 1) \
+                                       : __FILE__)                    \
+            << "] [line: " << __LINE__ << "] ")                       \
            .str())
 #define LOGF(level, format, ...)          \

--- a/src/common/type_define.h
+++ b/src/common/type_define.h
@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once;
+#pragma once
+#include <functional>
 #include <map>
 #include <string>
 #include <vector>

--- a/src/common/types.h
+++ b/src/common/types.h
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once;
+#pragma once
 #include <string>
 #include <unordered_map>
+#include <vector>
 namespace paddle_mobile {
 enum class Precision : int { FP32 = 0 };

--- a/src/framework/attribute.h
+++ b/src/framework/attribute.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string>
+#include <typeinfo>
 #include <unordered_map>
 #include <vector>

--- a/src/framework/ddim.h
+++ b/src/framework/ddim.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <initializer_list>
+#include <typeinfo>
 #include <vector>
 #include "common/enforce.h"
 #include "common/variant.h"

--- a/src/framework/lod_tensor.cpp
+++ b/src/framework/lod_tensor.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "lod_tensor.h"
+#include <algorithm>
 namespace paddle_mobile {
 namespace framework {

--- a/src/framework/program/block_desc.cpp
+++ b/src/framework/program/block_desc.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "block_desc.h"
+#include <algorithm>
 namespace paddle_mobile {
 namespace framework {

--- a/src/framework/program/program-optimize/program_optimize.cpp
+++ b/src/framework/program/program-optimize/program_optimize.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "framework/program/program-optimize/program_optimize.h"
+#include <algorithm>
 #include "framework/program/program-optimize/fusion_op_register.h"
 namespace paddle_mobile {

--- a/src/framework/program/program.h
+++ b/src/framework/program/program.h
@@ -29,7 +29,7 @@ class Program {
  std::shared_ptr<Scope> scope;
  std::string model_path;
  std::string para_path;
-  bool is_commbine = false;
+  bool combined = false;
 private:
 };

--- a/src/framework/scope.cpp
+++ b/src/framework/scope.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "framework/scope.h"
+#include <algorithm>
 #include <set>
 #include <string>
 #include <vector>

--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include <vector>
 #include "common/enforce.h"
+#include <fstream>
 #include "common/enforce.h"
 #include "framework/data_layout.h"
 #include "framework/ddim.h"
@@ -131,6 +132,22 @@ class Tensor {
    return reinterpret_cast<T *>(mutable_data(typeid(T)));
  }
+#ifdef PADDLE_MOBILE_DEBUG
+  template <typename T>
+  inline void dump(std::string filename) const {
+    const T *dataptr = data<T>();
+    std::ofstream out(filename.c_str());
+    for (int i = 0; i < numel(); ++i) {
+      out << dataptr[i] << " ";
+    }
+    out << "形状：";
+    for (int j = 0; j < dims_.size(); ++j) {
+      out << dims_[j] << " ";
+    }
+    out.close();
+  }
+#endif
  inline void *mutable_data(std::type_index type) {
    if (holder_ != nullptr) {
      holder_->set_type(type);

--- a/src/io/io.cpp
+++ b/src/io/io.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "io/io.h"
+#include <algorithm>
 #include <vector>
 #include "common/enforce.h"
 #include "common/log.h"
@@ -25,7 +26,6 @@ limitations under the License. */
 #include "framework/scope.h"
 #include "framework/tensor.h"
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
-#include <algorithm>
 #include <queue>
 #include <utility>
 #include "common/threadpool.h"
@@ -88,7 +88,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
    bool optimize) {
  auto program = this->LoadProgram(model_path, optimize);
  program.para_path = para_path;
-  program.is_commbine = true;
+  program.combined = true;
  return program;
 }
@@ -193,7 +193,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
 #endif
    }
  }
-  if (program_.is_commbine) {
+  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();

--- a/src/io/io.h
+++ b/src/io/io.h
@@ -27,7 +27,7 @@ limitations under the License. */
 #include <condition_variable>
 #include <mutex>
 #include <thread>
-#include "common/depCore.h"
+#include "common/dep_core.h"
 #endif
 namespace paddle_mobile {

--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
 #include "memory/t_malloc.h"
 #include <cstdlib>
 #include <cstring>

--- a/src/operators/batchnorm_op.cpp
+++ b/src/operators/batchnorm_op.cpp
@@ -36,6 +36,8 @@ USE_OP_CPU(batch_norm);
 REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(batch_norm);
+REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
@@ -67,6 +67,8 @@ USE_OP_CPU(concat);
 REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(concat);
+REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/elementwise_add_op.cpp
+++ b/src/operators/elementwise_add_op.cpp
@@ -34,6 +34,8 @@ USE_OP_CPU(elementwise_add);
 REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(elementwise_add);
+REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/fusion_conv_add.h
+++ b/src/operators/fusion_conv_add.h
@@ -11,9 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define FUSION_CONVADD_OP
-#ifdef FUSION_CONVADD_OP
+#ifdef FUSION_CONVADD_OP
 #pragma once
 #include <string>

--- a/src/operators/fusion_fc_op.cpp
+++ b/src/operators/fusion_fc_op.cpp
@@ -59,6 +59,8 @@ USE_OP_CPU(fc);
 REGISTER_OPERATOR_CPU(fc, ops::FusionFcOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(fc);
+REGISTER_OPERATOR_MALI_GPU(fc, ops::FusionFcOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/kernel/arm/batchnorm_kernel.cpp
+++ b/src/operators/kernel/arm/batchnorm_kernel.cpp
@@ -14,10 +14,8 @@ limitations under the License. */
 #ifdef BATCHNORM_OP
-#pragma once
 #include "operators/kernel/batchnorm_kernel.h"
-#include "operators/kernel/central-arm-func/batchnorm_func.h"
+#include "operators/kernel/central-arm-func/batchnorm_arm_func.h"
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/arm/box_coder_kernel.cpp
+++ b/src/operators/kernel/arm/box_coder_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef BOXCODER_OP
 #include "operators/kernel/box_coder_kernel.h"
+#include <cmath>
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/arm/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_kernel.cpp
@@ -23,8 +23,7 @@ bool ConvAddKernel<CPU, float>::Init(const FusionConvAddParam &para) const {
  return true;
 }
-template <>
+void ConvAddBasic(const FusionConvAddParam &param) {
-void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam &param) const {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();
@@ -102,7 +101,6 @@ void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam &param) const {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }
      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
@@ -112,6 +110,26 @@ void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam &param) const {
    }
  }
 }
+template <>
+void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam &param) const {
+  if (param.Groups() == param.Input()->dims()[1] &&
+      param.Input()->dims()[1] == param.Output()->dims()[1] &&
+      param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+      param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
+    math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
+                               param.Bias(), true);
+  } else if (param.Groups() == param.Input()->dims()[1] &&
+             param.Input()->dims()[1] == param.Output()->dims()[1] &&
+             param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
+             param.Filter()->dims()[2] == 3) {
+    math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
+                           param.Filter(), param.Bias(), param.Output(), true);
+  } else {
+    ConvAddBasic(param);
+  }
+}
 template class ConvAddKernel<CPU, float>;
 }  // namespace operators

--- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_relu_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADD_RELU_OP
 #include "operators/kernel/conv_add_relu_kernel.h"
-#include "operators/kernel/central-arm-func/conv_add_relu_func.h"
+#include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h"
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef CONV_OP
 #include "operators/kernel/conv_kernel.h"
-#include "operators/kernel/central-arm-func/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp
+++ b/src/operators/kernel/arm/depthwise_conv_kernel.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef DEPTHWISECONV_OP
 #include "operators/kernel/depthwise_conv_kernel.h"
-#include "operators/kernel/conv_kernel.h"
+#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h"
 namespace paddle_mobile {
 namespace operators {
@@ -27,91 +27,7 @@ bool DepthwiseConvKernel<CPU, float>::Init(const ConvParam &para) const {
 template <>
 void DepthwiseConvKernel<CPU, float>::Compute(const ConvParam &param) const {
-  LOG(kLOG_DEBUG) << param;
+  DepthwiseConvCompute<float>(param);
-  const Tensor *input = param.Input();
-  Tensor filter = *param.Filter();
-  Tensor *output = param.Output();
-  output->mutable_data<float>();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-  //  DLOG << " compute end get Attrs " << strides[0];
-  const int batch_size = static_cast<int>(input->dims()[0]);
-  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
-  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
-  size_t data_dim = filter_shape_vec.size() - 2;
-  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-  col_shape_vec[0] = input->dims()[1] / groups;
-  for (size_t j = 0; j < data_dim; ++j) {
-    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
-    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
-  }
-  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
-  framework::DDim col_matrix_shape =
-      framework::flatten_to_2d(col_shape, data_dim + 1);
-  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
-  Tensor col;
-  Tensor col_matrix;
-  if (is_expand) {
-    col.mutable_data<float>(col_shape);
-    col_matrix.ShareDataWith(col);
-    col_matrix.Resize(col_matrix_shape);
-  }
-  framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
-  framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                         filter.numel() / filter.dims()[0]};
-  filter.Resize(filter_matrix_shape);
-  framework::DDim output_matrix_shape = {
-      output->dims()[1],
-      output->numel() / (output->dims()[0] * output->dims()[1])};
-  // convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
-  for (int i = 0; i < batch_size; i++) {
-    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
-    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int g = 0; g < groups; g++) {
-      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
-      if (!is_expand) {
-        col.ShareDataWith(in_slice);
-        col_matrix.ShareDataWith(col);
-        col_matrix.Resize(col_matrix_shape);
-      } else if (data_dim == 2U) {
-        // im2col
-        im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &col);
-      } else if (data_dim == 3U) {
-        // vol2col
-        vol2col(in_slice, dilations, strides, paddings, &col);
-      }
-      // gemm
-      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
-      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(0));
-    }
-  }
 }
 template class DepthwiseConvKernel<CPU, float>;

--- a/src/operators/kernel/arm/multiclass_nms_kernel.cpp
+++ b/src/operators/kernel/arm/multiclass_nms_kernel.cpp
@@ -14,10 +14,8 @@ limitations under the License. */
 #ifdef MULTICLASSNMS_OP
-#pragma once
 #include "operators/kernel/multiclass_nms_kernel.h"
+#include <algorithm>
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/arm/prior_box_kernel.cpp
+++ b/src/operators/kernel/arm/prior_box_kernel.cpp
@@ -14,8 +14,6 @@ limitations under the License. */
 #ifdef PRIORBOX_OP
-#pragma once
 #include "operators/kernel/prior_box_kernel.h"
 namespace paddle_mobile {

--- a/src/operators/kernel/arm/relu_kernel.cpp
+++ b/src/operators/kernel/arm/relu_kernel.cpp
@@ -14,8 +14,6 @@ limitations under the License. */
 #ifdef RELU_OP
-#pragma once
 #include "operators/kernel/relu_kernel.h"
 #include <operators/math/transform.h>

--- a/src/operators/kernel/arm/reshape_kernel.cpp
+++ b/src/operators/kernel/arm/reshape_kernel.cpp
@@ -14,8 +14,6 @@ limitations under the License. */
 #ifdef RESHAPE_OP
-#pragma once
 #include "operators/kernel/reshape_kernel.h"
 namespace paddle_mobile {

--- a/src/operators/kernel/arm/sigmoid_kernel.cpp
+++ b/src/operators/kernel/arm/sigmoid_kernel.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 #if __ARM_NEON
 #include "../../math/math_func_neon.h"
 #endif
+#include <cmath>
 namespace paddle_mobile {
 namespace operators {

--- a/src/operators/kernel/central-arm-func/batchnorm_func.h
+++ b/src/operators/kernel/central-arm-func/batchnorm_func.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #pragma once
+#include <cmath>
 #include "operators/op_param.h"
 namespace paddle_mobile {

--- a/src/operators/kernel/central-arm-func/conv_add_relu_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_relu_func.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADD_RELU_OP
 #pragma once
+#include <vector>
 #include "operators/op_param.h"
 namespace paddle_mobile {

--- a/src/operators/kernel/central-arm-func/conv_func.h
+++ b/src/operators/kernel/central-arm-func/conv_func.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifdef CONV_OP
 #pragma once
+#include <vector>
+#include "operators/math/conv_func.h"
 #include "operators/op_param.h"
 namespace paddle_mobile {
@@ -48,7 +50,8 @@ void ConvCompute(const ConvParam &param) {
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, data_dim + 1);
-  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
+  bool is_expand =
+      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
  Tensor col;
  Tensor col_matrix;
  if (is_expand) {

--- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef DEPTHWISECONV_OP
+#pragma once
+#include <vector>
+#include "operators/math/conv_func.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename P>
+void DepthwiseConvCompute(const ConvParam &param) {
+  const Tensor *input = param.Input();
+  Tensor filter = *param.Filter();
+  Tensor *output = param.Output();
+  output->mutable_data<float>();
+  int groups = param.Groups();
+  std::vector<int> strides = param.Strides();
+  std::vector<int> paddings = param.Paddings();
+  std::vector<int> dilations = param.Dilations();
+  //  DLOG << " compute end get Attrs " << strides[0];
+  const int batch_size = static_cast<int>(input->dims()[0]);
+  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+  size_t data_dim = filter_shape_vec.size() - 2;
+  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+  col_shape_vec[0] = input->dims()[1] / groups;
+  for (size_t j = 0; j < data_dim; ++j) {
+    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+  }
+  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+  framework::DDim col_matrix_shape =
+      framework::flatten_to_2d(col_shape, data_dim + 1);
+  bool is_expand =
+      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
+  Tensor col;
+  Tensor col_matrix;
+  if (is_expand) {
+    col.mutable_data<float>(col_shape);
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+  }
+  framework::DDim input_shape = framework::slice_ddim(
+      input->dims(), 1, static_cast<int>(input->dims().size()));
+  framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                         filter.numel() / filter.dims()[0]};
+  filter.Resize(filter_matrix_shape);
+  framework::DDim output_matrix_shape = {
+      output->dims()[1],
+      output->numel() / (output->dims()[0] * output->dims()[1])};
+  // convolution operator: im2col(or vol2col) + gemm
+  int in_step = static_cast<int>(input->dims()[1]) / groups;
+  int out_step = static_cast<int>(output->dims()[1]) / groups;
+  math::Vol2ColFunctor<CPU, float> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+  for (int i = 0; i < batch_size; i++) {
+    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+    for (int g = 0; g < groups; g++) {
+      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+      if (!is_expand) {
+        col.ShareDataWith(in_slice);
+        col_matrix.ShareDataWith(col);
+        col_matrix.Resize(col_matrix_shape);
+      } else if (data_dim == 2U) {
+        // im2col
+        im2col(in_slice, dilations, strides,
+               std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                paddings[1]},
+               &col);
+      } else if (data_dim == 3U) {
+        // vol2col
+        vol2col(in_slice, dilations, strides, paddings, &col);
+      }
+      // gemm
+      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+      math::matmul<float>(filter_slice, false, col_matrix, false,
+                          static_cast<float>(1), &out_slice,
+                          static_cast<float>(0));
+    }
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/conv_add_kernel.h
+++ b/src/operators/kernel/conv_add_kernel.h
@@ -20,9 +20,11 @@ limitations under the License. */
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif
+#include "common/common.h"
 #include "framework/ddim.h"
 #include "framework/operator.h"
 #include "operators/math/conv_func.h"
+#include "operators/math/depthwise_conv_3x3.h"
 #include "operators/math/im2col.h"
 #include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"

--- a/src/operators/kernel/conv_kernel.h
+++ b/src/operators/kernel/conv_kernel.h
@@ -35,21 +35,6 @@ class ConvKernel : public OpKernelBase<DeviceType, ConvParam> {
  bool Init(const ConvParam &para) const;
 };
-inline bool IsExpand(const std::vector<int64_t> &filter_dim,
-                     const std::vector<int> &strides,
-                     const std::vector<int> &paddings,
-                     const std::vector<int> &dilations) {
-  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
-  for (size_t j = 0; j < strides.size(); ++j) {
-    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
-    strides_1 = strides_1 && (strides[j] == 1);
-    padding_0 = padding_0 && (paddings[j] == 0);
-    dilation_1 = dilation_1 && (dilations[j] == 1);
-  }
-  return !(filter_1 && strides_1 && padding_0 && dilation_1);
-}
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/lrn_kernel.h
+++ b/src/operators/kernel/lrn_kernel.h
@@ -14,10 +14,11 @@ limitations under the License. */
 #ifdef LRN_OP
-#pragma once
 #include "framework/operator.h"
 #include "operators/op_param.h"
+#include <cmath>
 #ifdef __ARM_NEON
 #include "arm_neon.h"
 #include "operators/math/math_func_neon.h"

--- a/ACL_Android @ 591027fc
+++ b/ACL_Android @ 591027fc
+Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5
--- a/src/operators/kernel/mali/acl_operator.cc
+++ b/src/operators/kernel/mali/acl_operator.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#if USE_ACL == 1
+#include "acl_operator.h"
+unsigned int bypass_acl_class_layer =
+    (0 | FLAGS_ENABLE_ACL_CONCAT |
+     /*0xffffffff |*/ /*FLAGS_ENABLE_ACL_FC |*/ /*FLAGS_ENABLE_ACL_LRN
+                                                   |*/
+     0);
+int enable_schedule = 0;
+#ifdef USE_PROFILING
+#include "arm_neon.h"
+unsigned int acl_log_flags =
+    (0 | MASK_LOG_APP_TIME | /*MASK_LOG_ALLOCATE | */  /*MASK_LOG_ALLOCATE | */
+     /*MASK_LOG_RUN      | */ /*MASK_LOG_CONFIG   | */ /*MASK_LOG_COPY     | */
+     MASK_LOG_ABSVAL | MASK_LOG_BNLL | MASK_LOG_CONV | MASK_LOG_FC |
+     MASK_LOG_LRN | MASK_LOG_POOLING | MASK_LOG_RELU | MASK_LOG_SIGMOID |
+     MASK_LOG_SOFTMAX | MASK_LOG_TANH | MASK_LOG_LC | MASK_LOG_BN |
+     MASK_LOG_CONCAT | 0);
+#include <stdio.h>  /* printf */
+#include <stdlib.h> /* getenv */
+#endif              // USE_PROFILING
+static bool force_enable_gpu = false;
+bool AclEnableSchedule(int enable) {
+  enable_schedule = enable;
+  if (enable) {
+    force_enable_gpu = true;
+  }
+  return true;
+}
+int isScheduleEnable() { return enable_schedule; }
+namespace paddle_mobile {
+namespace operators {
+namespace acl {
+bool ACLOperator::init_gpu_env = true;
+#ifdef USE_OPENCL
+bool ACLOperator::support_opencl_ = false;
+bool opencl_is_available() { return arm_compute::opencl_is_available(); }
+#elif defined(USE_OPENGLES)
+bool ACLOperator::support_opengles_ = false;
+#endif
+ACLOperator::ACLOperator(bool is_gpu)
+    : operator_state_(operator_not_init),
+      force_bypass_acl_path_(false),
+      target_hint_(TargetHint::DONT_CARE),
+      convolution_method_hint_(ConvolutionMethodHint::GEMM),
+      _group(1),
+      name_(""),
+      input_idx_(0),
+      output_idx_(0),
+      is_gpu_(is_gpu) {
+  const char* pBypassACL;
+  if (init_gpu_env) {
+#ifdef USE_OPENCL
+    try {
+      if (opencl_is_available()) {
+        arm_compute::CLScheduler::get().default_init();
+        support_opencl_ = true;
+      }
+    } catch (std::exception& e) {
+      support_opencl_ = false;
+    }
+#elif defined(USE_OPENGLES)
+    try {
+      arm_compute::GCScheduler::get().default_init();
+      support_opengles_ = true;
+    } catch (std::exception& e) {
+      support_opengles_ = false;
+    }
+#endif
+    init_gpu_env = false;
+  }
+  if (force_enable_gpu) is_gpu_ = true;
+  pBypassACL = getenv("BYPASSACL");
+  if (pBypassACL) {
+    unsigned int bacl;
+    sscanf(pBypassACL, "%i", &bacl);
+    if (bacl != bypass_acl_class_layer) {
+      bypass_acl_class_layer = bacl;
+      printf("BYPASSACL<%s>\n", pBypassACL);
+      printf("BYPASSACL: %x\n", bypass_acl_class_layer);
+    }
+  }
+#ifdef USE_PROFILING
+  const char* pLogACL;
+  pLogACL = getenv("LOGACL");
+  if (pLogACL) {
+    unsigned int alf;
+    sscanf(pLogACL, "%i", &alf);
+    if (alf != acl_log_flags) {
+      acl_log_flags = alf;
+      printf("LOGACL<%s>\n", pLogACL);
+      printf("LOGACL: %x\n", acl_log_flags);
+    }
+  }
+#endif  // USE_PROFILING
+  const char* pEnableSchedule;
+  pEnableSchedule = getenv("ENABLESCHEDULE");
+  if (pEnableSchedule) {
+    int bshedule;
+    sscanf(pEnableSchedule, "%i", &bshedule);
+    if (bshedule != enable_schedule) {
+      enable_schedule = bshedule;
+      printf("ENABLESCHEDULE<%s>\n", pEnableSchedule);
+      printf("ENABLESCHEDULE: %x\n", enable_schedule);
+    }
+    if (enable_schedule) {
+      AclEnableSchedule(1);
+    }
+  }
+}
+ACLOperator::~ACLOperator() {}
+bool ACLOperator::new_tensor(std::unique_ptr<ACLTensor>& tensor,
+                             arm_compute::TensorShape& shape, void* mem,
+                             bool commit) {
+  auto acl_tensor =
+      new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32));
+  acl_tensor->set_target(getTargetHint());
+  acl_tensor->bindmem(mem);
+  if (commit) acl_tensor->commit();
+  tensor = (std::unique_ptr<ACLTensor>)std::move(acl_tensor);
+  return true;
+}
+bool ACLOperator::new_tensor(std::unique_ptr<ACLSubTensor>& tensor,
+                             std::unique_ptr<ACLTensor>& parent,
+                             arm_compute::TensorShape& shape,
+                             arm_compute::Coordinates& coord) {
+  auto acl_tensor = new ACLSubTensor(parent, shape, coord);
+  acl_tensor->set_target(getTargetHint());
+  tensor = (std::unique_ptr<ACLSubTensor>)std::move(acl_tensor);
+  return true;
+}
+void ACLTensor::commit(TensorType type) {
+  settensortype(type);
+  if (mem_) {
+    if (!allocate_) {
+#ifdef USE_PROFILING
+      logtime_util log_time(ACL_ALLOCATE_INFO);
+#endif  // USE_PROFILING
+      allocate();
+      allocate_ = true;
+    }
+    if (type_ != tensor_output) {
+      tensor_copy(mem_);
+    }
+    mem_ = nullptr;
+  }
+}
+int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor, void* mem,
+                               bool toTensor) {
+#ifdef USE_PROFILING
+  logtime_util log_time(ACL_COPY_INFO);
+#endif  // USE_PROFILING
+  arm_compute::Window window;
+  // Iterate through the rows (not each element)
+  window.use_tensor_dimensions(tensor->info()->tensor_shape(),
+                               /* first_dimension =*/arm_compute::Window::DimY);
+  int width = tensor->info()->tensor_shape()[0];
+  int height = tensor->info()->tensor_shape()[1];
+  int deepth = tensor->info()->tensor_shape()[2];
+  map();
+  // Create an iterator:
+  arm_compute::Iterator it(tensor, window);
+  // Except it works for an arbitrary number of dimensions
+  if (toTensor) {  // mem->tensor
+    arm_compute::execute_window_loop(
+        window,
+        [&](const arm_compute::Coordinates& id) {
+          memcpy(it.ptr(),
+                 ((char*)mem) +
+                     ((id[3] * (width * height * deepth) +
+                       id.z() * (width * height) + id.y() * width + id.x()) *
+                      tensor->info()->element_size()),
+                 width * tensor->info()->element_size());
+        },
+        it);
+  } else {  // tensor-->mem
+    arm_compute::execute_window_loop(
+        window,
+        [&](const arm_compute::Coordinates& id) {
+          memcpy(((char*)mem) + ((id[3] * (width * height * deepth) +
+                                  id.z() * (width * height) + id.y() * width) *
+                                 tensor->info()->element_size()),
+                 it.ptr(), width * tensor->info()->element_size());
+        },
+        it);
+  }
+  unmap();
+  return 0;
+}
+}  // namespace acl
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/acl_operator.h
+++ b/src/operators/kernel/mali/acl_operator.h
--- a/src/operators/kernel/mali/acl_tensor.cc
+++ b/src/operators/kernel/mali/acl_tensor.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "acl_tensor.h"
+namespace paddle_mobile {
+namespace operators {
+namespace acl {
+#ifdef USE_ACL
+template <typename TensorType>
+std::unique_ptr<arm_compute::ITensor> initialise_tensor(
+    arm_compute::TensorInfo &info) {
+  auto tensor = cpp14::make_unique<TensorType>();
+  tensor->allocator()->init(info);
+  return std::move(tensor);
+}
+template <typename TensorType>
+void tensor_allocate(arm_compute::ITensor &tensor) {
+  auto itensor = dynamic_cast<TensorType *>(&tensor);
+  itensor->allocator()->allocate();
+}
+Tensor::Tensor(arm_compute::TensorInfo &info) noexcept
+    : _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr) {}
+Tensor::Tensor(Tensor &&src) noexcept
+    : _target(src._target),
+      _info(std::move(src._info)),
+      _tensor(std::move(src._tensor)) {}
+arm_compute::ITensor *Tensor::set_target(TargetHint target) {
+  switch (target) {
+#ifdef USE_OPENCL
+    case TargetHint::OPENCL:
+      _tensor = initialise_tensor<arm_compute::CLTensor>(_info);
+      break;
+#elif defined(USE_OPENGLES)
+    case TargetHint::OPENGLES:
+      _tensor = initialise_tensor<arm_compute::GCTensor>(_info);
+      break;
+#endif
+    case TargetHint::NEON:
+      _tensor = initialise_tensor<arm_compute::Tensor>(_info);
+      break;
+    default:
+      break;
+  }
+  _target = target;
+  return _tensor.get();
+}
+void Tensor::allocate() {
+  switch (_target) {
+#ifdef USE_OPENCL
+    case TargetHint::OPENCL:
+      tensor_allocate<arm_compute::CLTensor>(*_tensor);
+      break;
+#elif defined(USE_OPENGLES)
+    case TargetHint::OPENGLES:
+      tensor_allocate<arm_compute::GCTensor>(*_tensor);
+      break;
+#endif
+    case TargetHint::NEON:
+      tensor_allocate<arm_compute::Tensor>(*_tensor);
+      break;
+    default:
+      break;
+  }
+}
+void Tensor::map(bool blocking) {
+#ifdef USE_OPENCL
+  if (_target == TargetHint::OPENCL)
+    dynamic_cast<arm_compute::CLTensor *>(tensor())->map(blocking);
+#elif defined(USE_OPENGLES)
+  if (_target == TargetHint::OPENGLES)
+    dynamic_cast<arm_compute::GCTensor *>(tensor())->map(blocking);
+#endif
+}
+void Tensor::unmap() {
+#ifdef USE_OPENCL
+  if (_target == TargetHint::OPENCL)
+    dynamic_cast<arm_compute::CLTensor *>(tensor())->unmap();
+#elif defined(USE_OPENGLES)
+  if (_target == TargetHint::OPENGLES)
+    dynamic_cast<arm_compute::GCTensor *>(tensor())->unmap();
+#endif
+}
+template <typename SubTensorType, typename ParentTensorType>
+std::unique_ptr<arm_compute::ITensor> initialise_subtensor(
+    arm_compute::ITensor *parent, arm_compute::TensorShape shape,
+    arm_compute::Coordinates coords) {
+  auto ptensor = dynamic_cast<ParentTensorType *>(parent);
+  auto subtensor = cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
+  return std::move(subtensor);
+}
+SubTensor::SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
+                     arm_compute::Coordinates &coords) noexcept
+    : _target(TargetHint::DONT_CARE),
+      _tensor_shape(tensor_shape),
+      _coords(coords),
+      _parent(nullptr),
+      _subtensor(nullptr) {
+  _parent = parent->tensor();
+  _target = parent->target();
+  instantiate_subtensor();
+}
+arm_compute::ITensor *SubTensor::set_target(TargetHint target) {
+  return (target == _target) ? _subtensor.get() : nullptr;
+}
+arm_compute::ITensor *SubTensor::tensor() { return _subtensor.get(); }
+const arm_compute::ITensor *SubTensor::tensor() const {
+  return _subtensor.get();
+}
+TargetHint SubTensor::target() const { return _target; }
+void SubTensor::allocate() {
+  // NOP for sub-tensors
+}
+void SubTensor::instantiate_subtensor() {
+  switch (_target) {
+#ifdef USE_OPENCL
+    case TargetHint::OPENCL:
+      _subtensor = initialise_subtensor<arm_compute::CLSubTensor,
+                                        arm_compute::ICLTensor>(
+          _parent, _tensor_shape, _coords);
+      break;
+#endif
+    default:
+    case TargetHint::NEON:
+      _subtensor =
+          initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(
+              _parent, _tensor_shape, _coords);
+      break;
+  }
+}
+#endif
+}  // namespace acl
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/mali/acl_tensor.h
+++ b/src/operators/kernel/mali/acl_tensor.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef ACL_TENSOR_H_
+#define ACL_TENSOR_H_
+#ifdef USE_ACL
+#ifdef USE_OPENCL
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#elif defined(USE_OPENGLES)
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#endif
+#include "arm_compute/runtime/SubTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include <memory>
+namespace paddle_mobile {
+namespace operators {
+namespace acl {
+enum class TargetHint {
+  DONT_CARE,
+  OPENCL,
+  OPENGLES,
+  NEON,
+};
+enum class ConvolutionMethodHint {
+  GEMM,
+  DIRECT,
+};
+namespace cpp14 {
+template <class T>
+struct _Unique_if {
+  typedef std::unique_ptr<T> _Single_object;
+};
+template <class T>
+struct _Unique_if<T[]> {
+  typedef std::unique_ptr<T[]> _Unknown_bound;
+};
+template <class T, size_t N>
+struct _Unique_if<T[N]> {
+  typedef void _Known_bound;
+};
+template <class T, class... Args>
+typename _Unique_if<T>::_Single_object make_unique(Args &&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+template <class T>
+typename _Unique_if<T>::_Unknown_bound make_unique(size_t n) {
+  typedef typename std::remove_extent<T>::type U;
+  return std::unique_ptr<T>(new U[n]());
+}
+template <class T, class... Args>
+typename _Unique_if<T>::_Known_bound make_unique(Args &&...);
+}  // namespace cpp14
+class Tensor {
+ public:
+  explicit Tensor(arm_compute::TensorInfo &info) noexcept;
+  virtual ~Tensor() {}
+  Tensor(Tensor &&src) noexcept;
+  void set_info(arm_compute::TensorInfo &&info) { _info = info; }
+  arm_compute::ITensor *set_target(TargetHint target);
+  const arm_compute::TensorInfo &info() const { return _info; }
+  arm_compute::ITensor *tensor() { return _tensor.get(); }
+  void allocate();
+  void init() {}
+  TargetHint target() const { return _target; }
+  virtual void map(bool blocking = true);
+  virtual void unmap();
+ private:
+  TargetHint _target;
+  arm_compute::TensorInfo _info;
+  std::unique_ptr<arm_compute::ITensor> _tensor;
+};
+class SubTensor {
+ public:
+  SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
+            arm_compute::Coordinates &coords) noexcept;
+  ~SubTensor() {}
+  arm_compute::ITensor *tensor();
+  const arm_compute::ITensor *tensor() const;
+  TargetHint target() const;
+  void allocate();
+  arm_compute::ITensor *set_target(TargetHint target);
+ private:
+  /** Instantiates a sub-tensor */
+  void instantiate_subtensor();
+ private:
+  /**< Target that this tensor is pinned on */
+  TargetHint _target;
+  /**< SubTensor shape */
+  arm_compute::TensorShape _tensor_shape;
+  /**< SubTensor Coordinates */
+  arm_compute::Coordinates _coords;
+  /**< Parent tensor */
+  arm_compute::ITensor *_parent;
+  /**< SubTensor */
+  std::unique_ptr<arm_compute::ITensor> _subtensor;
+};
+}  // namespace acl
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif  // ACL_TENSOR_H_
--- a/src/operators/kernel/mali/batchnorm_kernel.cpp
+++ b/src/operators/kernel/mali/batchnorm_kernel.cpp
@@ -14,23 +14,153 @@ limitations under the License. */
 #ifdef BATCHNORM_OP
-#pragma once
 #include "operators/kernel/batchnorm_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
+template <typename DeviceType, typename T>
+class AclBatchNormOp : public acl::ACLOperator {
+ public:
+  AclBatchNormOp() {
+    this->force_bypass_acl_path_ = bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN;
+  }
+  ~AclBatchNormOp() = default;
+  AclBatchNormOp(const AclBatchNormOp&) = delete;
+  AclBatchNormOp& operator=(const AclBatchNormOp&) = delete;
+  AclBatchNormOp(AclBatchNormOp&&) = delete;
+  AclBatchNormOp& operator=(AclBatchNormOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const BatchNormParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth, args.batch);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.out_num);
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    arm_compute::TensorShape mean_shape(args.in_depth);
+    arm_compute::TensorShape var_shape = mean_shape;
+    arm_compute::TensorShape beta_shape = mean_shape;
+    arm_compute::TensorShape gamma_shape = mean_shape;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    new_tensor(mean(), mean_shape, args.mean_data);
+    new_tensor(var(), var_shape, args.var_data);
+    new_tensor(beta(), beta_shape, args.biases_data);
+    new_tensor(gamma(), gamma_shape, args.weight_data);
+    acl_configure(bn, this, args.epsilon);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const BatchNormParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const BatchNormParam& param) {
+    const Tensor* in_x = param.InputX();
+    Tensor* out = param.OutputY();
+    const Tensor* scale = param.InputScale();
+    const Tensor* bias = param.InputBias();
+    const Tensor* saved_mean = param.InputMean();
+    const Tensor* saved_variance = param.InputVariance();
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    const T* weight_data = scale->data<T>();
+    const T* bias_data = bias->data<T>();
+    const T* mean_data = saved_mean->data<T>();
+    const T* var_data = saved_variance->data<T>();
+    float epsilon = param.Epsilon();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    // args.weight_data = (void*)weight_data;
+    // args.biases_data = (void*)bias_data;
+    args.mean_data = (void*)mean_data;
+    args.var_data = (void*)var_data;
+    args.epsilon = epsilon;
+    args.dim = in_x->dims().size();
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.in_rows = in_x->dims()[2];
+    args.in_cols = in_x->dims()[3];
+    args.out_num = out->dims()[0];
+    args.out_depth = out->dims()[1];
+    args.out_rows = out->dims()[2];
+    args.out_cols = out->dims()[3];
+    args.weight_data = (void*)weight_data;
+    args.biases_data = (void*)bias_data;
+    // std::cout
+    //  << "Out C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+  }
+  acl::AclParameters args;
+};
 template <>
-bool BatchNormKernel<GPU_MALI, float>::Init(const BatchNormParam &para) const {
+bool BatchNormKernel<GPU_MALI, float>::Init(const BatchNormParam& param) const {
+  AclBatchNormOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclBatchNormOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
  return true;
 }
 template <>
 void BatchNormKernel<GPU_MALI, float>::Compute(
-    const BatchNormParam &param) const {}
+    const BatchNormParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclBatchNormOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
+}
+template class BatchNormKernel<GPU_MALI, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
 #endif
+#endif
--- a/src/operators/kernel/mali/concat_kernel.cpp
+++ b/src/operators/kernel/mali/concat_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef CONCAT_OP
+#include "operators/kernel/concat_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclConcatOp : public acl::ACLOperator {
+ public:
+  AclConcatOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT;
+  }
+  ~AclConcatOp() = default;
+  AclConcatOp(const AclConcatOp&) = delete;
+  AclConcatOp& operator=(const AclConcatOp&) = delete;
+  AclConcatOp(AclConcatOp&&) = delete;
+  AclConcatOp& operator=(AclConcatOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const ConcatParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    const std::vector<framework::LoDTensor*>* input_data = &args.in_tensor;
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.batch);
+    if (is_operator_init_done(output_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    T type;
+    for (int i = 0; i < input_data->size(); i++) {
+      const T* idata = (*input_data)[i]->data<T>();
+      const T* pdata = (*input_data)[i]->data<T>();
+      int in_batch = (*input_data)[i]->dims()[0];
+      int in_channels = (*input_data)[i]->dims()[1];
+      int in_width = (*input_data)[i]->dims()[2];
+      int in_height = (*input_data)[i]->dims()[3];
+      arm_compute::TensorShape in_shape(in_width, in_height, in_channels);
+      new_tensor(cinput(i), in_shape,
+                 acl::InputdataPtr(this, args.in_tensor, type, i));
+    }
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(concat, this, input_data->size());
+  }
+  void RunAcl(const std::vector<framework::LoDTensor*>& input, void* output) {
+    T type;
+    acl::acl_run(this, input, output, type);
+  }
+  bool Bypass_acl(const ConcatParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_ || !args.is_channel_concat) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const ConcatParam& param) {
+    auto inputs = param.Inputs();
+    auto* output = param.Out();
+    int64_t axis = param.Axis();
+    T* output_data = output->mutable_data<T>();
+    args.is_channel_concat = (axis == 1);
+    args.in_tensor = inputs;
+    args.output_data = (void*)output_data;
+    args.batch = output->dims()[0];
+    args.out_depth = output->dims()[1];
+    args.out_rows = output->dims()[2];
+    args.out_cols = output->dims()[3];
+  }
+  acl::AclParameters args;
+};
+template <>
+bool ConcatKernel<GPU_MALI, float>::Init(const ConcatParam& param) const {
+  AclConcatOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclConcatOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void ConcatKernel<GPU_MALI, float>::Compute(const ConcatParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclConcatOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  std::vector<framework::LoDTensor*> temp_data = args.in_tensor;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl(temp_data, (void*)output_data);
+}
+template class ConcatKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/conv_add_kernel.cpp
+++ b/src/operators/kernel/mali/conv_add_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVADD_OP
+#include "operators/kernel/conv_add_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclConvAddOp : public acl::ACLOperator {
+ public:
+  AclConvAddOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
+  }
+  ~AclConvAddOp() = default;
+  AclConvAddOp(const AclConvAddOp&) = delete;
+  AclConvAddOp& operator=(const AclConvAddOp&) = delete;
+  AclConvAddOp(AclConvAddOp&&) = delete;
+  AclConvAddOp& operator=(AclConvAddOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const FusionConvAddParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth, args.batch);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.out_num);
+    arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
+                                           args.in_depth / args.num_group,
+                                           args.out_depth);
+    arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::PadStrideInfo conv_info(
+        args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
+        arm_compute::DimensionRoundingType::FLOOR);
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    check_direct_conv();
+    //[kernel_x, kernel_y, IFM, OFM]
+    new_tensor(weights(), weights_shape, args.weight_data);
+    //[OFM]
+    if (args.biases_data) {
+      new_tensor(biases(), biases_shape, args.biases_data);
+    }
+    group() = args.num_group;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(conv, this, conv_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const FusionConvAddParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_ || args.num_group >= 5) {
+      bypass_acl = true;
+    }
+    if (args.dim > 2) {
+      bypass_acl = true;
+    }
+    if (args.dilated) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void check_direct_conv() {
+    bool use_direct_conv = false;
+    const char* pDirectConv;
+    pDirectConv = getenv("DIRECTCONV");
+    if (pDirectConv) {
+      unsigned int bdirectconv;
+      sscanf(pDirectConv, "%i", &bdirectconv);
+      if (bdirectconv != use_direct_conv) {
+        use_direct_conv = bdirectconv;
+        printf("DIRECTCONV<%s>\n", pDirectConv);
+        printf("DIRECTCONV: %x\n", use_direct_conv);
+      }
+    }
+    int pad_data[2], kernel[2];
+    pad_data[1] = args.pad_rows;
+    pad_data[0] = args.pad_cols;
+    kernel[1] = args.filter_rows;
+    kernel[0] = args.filter_cols;
+    if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
+                             pad_data[0] == 0 && pad_data[1] == 0) ||
+                            (kernel[0] == 3 && kernel[1] == 3 &&
+                             pad_data[0] <= 1 && pad_data[1] <= 1))) {
+      setConvMethod();  // NEDirectConvolutionLayer only for 1x1 and 3x3
+    }
+  }
+  void AclParametersByContext(const FusionConvAddParam& param) {
+    const Tensor* input = param.Input();
+    Tensor filter = *param.Filter();
+    Tensor* output = param.Output();
+    Tensor* bias;
+    int groups = param.Groups();
+    std::vector<int> strides = param.Strides();
+    std::vector<int> paddings = param.Paddings();
+    std::vector<int> dilations = param.Dilations();
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>();
+    const T* weight_data = filter.data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.weight_data = (void*)weight_data;
+    args.biases_data = nullptr;
+    try {
+      bias = param.Bias();
+    } catch (const std::exception& e) {
+    }
+    if (bias) {
+      const T* biases_data = bias->data<T>();
+      args.biases_data = (void*)biases_data;
+    }
+    args.num_group = groups;
+    args.dilation_rows = dilations[0];
+    args.dilation_cols = dilations[1];
+    if (dilations[0] != 1 || dilations[1] != 1) {
+      args.dilated = true;
+    }
+    // NCHW
+    // std::cout << "In dims: " << (input->dims()).size() << std::endl;
+    args.batch = input->dims()[0];
+    args.in_depth = input->dims()[1];
+    args.in_rows = input->dims()[2];
+    args.in_cols = input->dims()[3];
+    // std::cout <<"In N: " << args.batch << " C: " <<  args.in_depth
+    //  << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
+    // NCHW
+    // std::cout << "Out dims: " << (output->dims()).size() << std::endl;
+    args.out_num = output->dims()[0];
+    args.out_depth = output->dims()[1];
+    args.out_rows = output->dims()[2];
+    args.out_cols = output->dims()[3];
+    // std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
+    //  << " C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+    // MCHW = OIHW
+    args.filter_rows = filter.dims()[2];
+    args.filter_cols = filter.dims()[3];
+    // std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
+    //  << " I: " <<  static_cast<int>(filter.dims()[1])
+    //  << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
+    // strides(h_stride, w_stride)
+    args.stride_rows = strides[0];
+    args.stride_cols = strides[1];
+    // std::cout <<"Stride H: " << args.stride_rows << " W: " <<
+    // args.stride_cols << "\n";
+    // paddings(h_pad, w_pad)
+    args.pad_rows = paddings[0];
+    args.pad_cols = paddings[1];
+    // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
+    // "\n";
+  }
+  acl::AclParameters args;
+};
+template <>
+bool ConvAddKernel<GPU_MALI, float>::Init(
+    const FusionConvAddParam& param) const {
+  AclConvAddOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclConvAddOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void ConvAddKernel<GPU_MALI, float>::Compute(
+    const FusionConvAddParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclConvAddOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
+}
+template class ConvAddKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/conv_kernel.cpp
+++ b/src/operators/kernel/mali/conv_kernel.cpp
@@ -15,20 +15,213 @@ limitations under the License. */
 #ifdef CONV_OP
 #include "operators/kernel/conv_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
+template <typename DeviceType, typename T>
+class AclConvOp : public acl::ACLOperator {
+ public:
+  AclConvOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
+  }
+  ~AclConvOp() = default;
+  AclConvOp(const AclConvOp&) = delete;
+  AclConvOp& operator=(const AclConvOp&) = delete;
+  AclConvOp(AclConvOp&&) = delete;
+  AclConvOp& operator=(AclConvOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const ConvParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth, args.batch);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.out_num);
+    arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
+                                           args.in_depth / args.num_group,
+                                           args.out_depth);
+    // arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::PadStrideInfo conv_info(
+        args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
+        arm_compute::DimensionRoundingType::FLOOR);
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    check_direct_conv();
+    //[kernel_x, kernel_y, IFM, OFM]
+    new_tensor(weights(), weights_shape, args.weight_data);
+    //[OFM]
+    // if (args.biases_data) {
+    //    new_tensor(biases(),biases_shape,args.biases_data);
+    //}
+    group() = args.num_group;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(conv, this, conv_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const ConvParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_ || args.num_group >= 5) {
+      bypass_acl = true;
+    }
+    if (args.dim > 2) {
+      bypass_acl = true;
+    }
+    if (args.dilated) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void check_direct_conv() {
+    bool use_direct_conv = false;
+    const char* pDirectConv;
+    pDirectConv = getenv("DIRECTCONV");
+    if (pDirectConv) {
+      unsigned int bdirectconv;
+      sscanf(pDirectConv, "%i", &bdirectconv);
+      if (bdirectconv != use_direct_conv) {
+        use_direct_conv = bdirectconv;
+        printf("DIRECTCONV<%s>\n", pDirectConv);
+        printf("DIRECTCONV: %x\n", use_direct_conv);
+      }
+    }
+    int pad_data[2], kernel[2];
+    pad_data[1] = args.pad_rows;
+    pad_data[0] = args.pad_cols;
+    kernel[1] = args.filter_rows;
+    kernel[0] = args.filter_cols;
+    if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
+                             pad_data[0] == 0 && pad_data[1] == 0) ||
+                            (kernel[0] == 3 && kernel[1] == 3 &&
+                             pad_data[0] <= 1 && pad_data[1] <= 1))) {
+      setConvMethod();  // NEDirectConvolutionLayer only for 1x1 and 3x3
+    }
+  }
+  void AclParametersByContext(const ConvParam& param) {
+    const Tensor* input = param.Input();
+    Tensor filter = *param.Filter();
+    Tensor* output = param.Output();
+    int groups = param.Groups();
+    std::vector<int> strides = param.Strides();
+    std::vector<int> paddings = param.Paddings();
+    std::vector<int> dilations = param.Dilations();
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>();
+    const T* weight_data = filter.data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.weight_data = (void*)weight_data;
+    args.biases_data = nullptr;
+    // try {
+    //     bias = context.Input<framework::Tensor>("Bias");
+    // } catch (const std::exception& e) {
+    // }
+    // if (bias) {
+    //     const T* biases_data = bias->data<T>();
+    //     args.biases_data = (void*)biases_data;
+    // }
+    args.num_group = groups;
+    args.dilation_rows = dilations[0];
+    args.dilation_cols = dilations[1];
+    if (dilations[0] != 1 || dilations[1] != 1) {
+      args.dilated = true;
+    }
+    // NCHW
+    // std::cout << "In dims: " << (input->dims()).size() << std::endl;
+    args.batch = input->dims()[0];
+    args.in_depth = input->dims()[1];
+    args.in_rows = input->dims()[2];
+    args.in_cols = input->dims()[3];
+    std::cout << "In N: " << args.batch << " C: " << args.in_depth
+              << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
+    // NCHW
+    // std::cout << "Out dims: " << (output->dims()).size() << std::endl;
+    args.out_num = output->dims()[0];
+    args.out_depth = output->dims()[1];
+    args.out_rows = output->dims()[2];
+    args.out_cols = output->dims()[3];
+    // std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
+    //  << " C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+    // MCHW = OIHW
+    args.filter_rows = filter.dims()[2];
+    args.filter_cols = filter.dims()[3];
+    // std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
+    //  << " I: " <<  static_cast<int>(filter.dims()[1])
+    //  << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
+    // strides(h_stride, w_stride)
+    args.stride_rows = strides[0];
+    args.stride_cols = strides[1];
+    // std::cout <<"Stride H: " << args.stride_rows << " W: " <<
+    // args.stride_cols << "\n";
+    // paddings(h_pad, w_pad)
+    args.pad_rows = paddings[0];
+    args.pad_cols = paddings[1];
+    // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
+    // "\n";
+  }
+  acl::AclParameters args;
+};
 template <>
-bool ConvKernel<GPU_MALI, float>::Init(const ConvParam &para) const {
+bool ConvKernel<GPU_MALI, float>::Init(const ConvParam& param) const {
+  AclConvOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclConvOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
  return true;
 }
 template <>
-void ConvKernel<GPU_MALI, float>::Compute(const ConvParam &param) const {
+void ConvKernel<GPU_MALI, float>::Compute(const ConvParam& param) const {
-  //  ArmConvImplement imp;
+  std::cout << "init acl" << std::endl;
-  //  imp.Compute(param);
+  AclConvOp<GPU_MALI, float>* acl_op =
-  param.Output()->mutable_data<float>()[0] = 100.0;
+      reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
 }
 template class ConvKernel<GPU_MALI, float>;
@@ -36,3 +229,4 @@ template class ConvKernel<GPU_MALI, float>;
 }  // namespace paddle_mobile
 #endif
+#endif
--- a/src/operators/kernel/mali/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef ELEMENTWISEADD_OP
+#pragma once
+#include "operators/kernel/elementwise_add_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename T>
+struct AddFunctor {
+  inline T operator()(T a, T b) const { return a + b; }
+};
+template <>
+bool ElementwiseAddKernel<GPU_MALI, float>::Init(
+    const ElementwiseAddParam &para) const {
+  return true;
+}
+template <>
+void ElementwiseAddKernel<GPU_MALI, float>::Compute(
+    const ElementwiseAddParam &param) const {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  Tensor *Out = param.Out();
+  Out->mutable_data<float>();
+  int axis = param.Axis();
+  ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
+                                                 AddFunctor<float>(), Out);
+}
+template class ElementwiseAddKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/fushion_fc_kernel.cpp
+++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_FC_OP
+#pragma once
+#include "operators/kernel/fusion_fc_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool FusionFcKernel<GPU_MALI, float>::Init(const FusionFcParam &para) const {
+  return true;
+}
+template <>
+void FusionFcKernel<GPU_MALI, float>::Compute(
+    const FusionFcParam &param) const {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  const Tensor *input_z = param.InputZ();
+  auto *input_z_data = input_z->data<float>();
+  int axis = param.Axis();
+  Tensor *out = param.Out();
+  auto *out_data = out->mutable_data<float>();
+  const Tensor x_matrix =
+      input_x->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
+          : *input_x;
+  const Tensor y_matrix =
+      input_y->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
+          : *input_y;
+  auto out_dim = out->dims();
+  if (out_dim.size() != 2) {
+    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+  }
+  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
+  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "inpu_z size must be 1");
+  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
+                        " out_dim.size must be 2.");
+  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
+  PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ")
+  int64_t classes = input_z->numel();
+  for (int i = 0; i < out_dim[0]; i++) {
+    memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
+  }
+  for (int i = 0; i < out->numel(); i++) {
+    DLOG << out_data[i];
+  }
+  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+                      out, static_cast<float>(1));
+  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
+  //            if (out_dim.size() != 2) {
+  //                out->Resize(out_dim);
+  //            }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/lrn_kernel.cpp
+++ b/src/operators/kernel/mali/lrn_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef LRN_OP
+#pragma once
+#include "operators/kernel/lrn_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclLrnOp : public acl::ACLOperator {
+ public:
+  AclLrnOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN;
+  }
+  ~AclLrnOp() = default;
+  AclLrnOp(const AclLrnOp&) = delete;
+  AclLrnOp& operator=(const AclLrnOp&) = delete;
+  AclLrnOp(AclLrnOp&&) = delete;
+  AclLrnOp& operator=(AclLrnOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const LrnParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape shape(args.in_cols, args.in_rows, args.in_depth);
+    if (is_operator_init_done(shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    arm_compute::NormalizationLayerInfo norm_info(
+        arm_compute::NormType::CROSS_MAP, args.nsize, args.alpha, args.beta,
+        args.knorm);
+    //[width, height, IFM]
+    new_tensor(input(), shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), shape, args.output_data);
+    acl_configure(lrn, this, norm_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const LrnParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const LrnParam& param) {
+    const Tensor* in_x = param.InputX();
+    Tensor* out = param.Out();
+    int n = param.N();
+    T alpha = param.Alpha();
+    T beta = param.Beta();
+    T k = param.K();
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.nsize = n;
+    args.alpha = alpha;
+    args.beta = beta;
+    args.knorm = k;
+    // NCHW
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.in_rows = in_x->dims()[2];
+    args.in_cols = in_x->dims()[3];
+    // std::cout
+    //  << "Out C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+  }
+  acl::AclParameters args;
+};
+template <>
+bool LrnKernel<GPU_MALI, float>::Init(const LrnParam& param) const {
+  AclLrnOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclLrnOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void LrnKernel<GPU_MALI, float>::Compute(const LrnParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclLrnOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  for (int n = 0; n < args.batch; ++n) {
+    acl_op->RunAcl((void*)input_data, (void*)output_data);
+    input_data += args.in_depth * args.in_cols * args.in_rows;
+    output_data += args.in_depth * args.in_cols * args.in_rows;
+  }
+}
+template class LrnKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/mul_kernel.cpp
+++ b/src/operators/kernel/mali/mul_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef MUL_OP
+#pragma once
+#include "operators/kernel/mul_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool MulKernel<GPU_MALI, float>::Init(const MulParam &para) const {
+  return true;
+}
+template <>
+void MulKernel<GPU_MALI, float>::Compute(const MulParam &param) const {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  Tensor *out = param.Out();
+  out->mutable_data<float>();
+  const Tensor x_matrix =
+      input_x->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
+          : *input_x;
+  const Tensor y_matrix =
+      input_y->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
+          : *input_y;
+  auto out_dim = out->dims();
+  if (out_dim.size() != 2) {
+    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+  }
+  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+                      out, static_cast<float>(0));
+  if (out_dim.size() != 2) {
+    out->Resize(out_dim);
+  }
+}
+template class MulKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/pool_kernel.cpp
+++ b/src/operators/kernel/mali/pool_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POOL_OP
+#pragma once
+#include "operators/kernel/pool_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclPoolOp : public acl::ACLOperator {
+ public:
+  AclPoolOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING;
+  }
+  ~AclPoolOp() = default;
+  AclPoolOp(const AclPoolOp&) = delete;
+  AclPoolOp& operator=(const AclPoolOp&) = delete;
+  AclPoolOp(AclPoolOp&&) = delete;
+  AclPoolOp& operator=(AclPoolOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const PoolParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth);
+    // arm_compute::TensorShape weights_shape(
+    // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
+    // arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::PoolingLayerInfo pool_info;
+    if (args.pool_type == "max") {
+      pool_info = arm_compute::PoolingLayerInfo(
+          arm_compute::PoolingType::MAX, args.filter_rows,
+          arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
+                                     args.pad_cols, args.pad_rows,
+                                     arm_compute::DimensionRoundingType::CEIL));
+    } else {
+      pool_info = arm_compute::PoolingLayerInfo(
+          arm_compute::PoolingType::AVG, args.filter_rows,
+          arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
+                                     args.pad_cols, args.pad_rows,
+                                     arm_compute::DimensionRoundingType::CEIL));
+    }
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(pooling, this, pool_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const PoolParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    if (args.pool_type != "max" && args.pool_type != "avg") {
+      bypass_acl = true;
+    }
+    if (args.filter_rows != args.filter_cols) {
+      bypass_acl = true;
+    }
+    // if (args.filter_rows!=2 && args.filter_rows!=3) {
+    //     bypass_acl = true;
+    // }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const PoolParam& param) {
+    const Tensor* in_x = param.Input();
+    Tensor* out = param.Output();
+    std::string pooling_type = param.PoolingType();
+    std::vector<int> ksize = param.Ksize();
+    std::vector<int> strides = param.Strides();
+    std::vector<int> paddings = param.Paddings();
+    bool is_global_pooling = param.isGlobalPooling();
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.is_global_pool = is_global_pooling;
+    args.pool_type = pooling_type;
+    args.filter_rows = ksize[0];
+    args.filter_cols = ksize[1];
+    args.dim = ksize.size();
+    // NCHW
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.in_rows = in_x->dims()[2];
+    args.in_cols = in_x->dims()[3];
+    // std::cout <<"In N: " << args.batch << " C: " <<  args.in_depth
+    //  << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
+    // NCHW
+    // std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
+    //  << " C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+    // MCHW = OIHW
+    // std::cout <<"Filter O: " << static_cast<int>(filter->dims()[0])
+    //  << " I: " <<  static_cast<int>(filter->dims()[1])
+    //  << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
+    // strides(h_stride, w_stride)
+    args.stride_rows = strides[0];
+    args.stride_cols = strides[1];
+    // std::cout <<"PoolingType: " << args.pool_type << "\n";
+    // std::cout <<"Stride H: " << args.stride_rows << " W: " <<
+    // args.stride_cols << "\n";
+    // paddings(h_pad, w_pad)
+    args.pad_rows = paddings[0];
+    args.pad_cols = paddings[1];
+    // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
+    // "\n";
+    args.out_depth = args.in_depth;
+    // args.out_rows = out->dims()[2];
+    // args.out_cols = out->dims()[3];
+    args.out_rows = static_cast<int>(ceil(static_cast<float>(args.in_rows +
+                                                             2 * args.pad_rows -
+                                                             args.filter_rows) /
+                                          args.stride_rows)) +
+                    1;
+    args.out_cols = static_cast<int>(ceil(static_cast<float>(args.in_cols +
+                                                             2 * args.pad_cols -
+                                                             args.filter_cols) /
+                                          args.stride_cols)) +
+                    1;
+    if (is_global_pooling) {
+      args.filter_rows = args.in_rows;
+      args.filter_cols = args.in_cols;
+      args.pad_rows = 0;
+      args.pad_cols = 0;
+    }
+  }
+  acl::AclParameters args;
+};
+template <>
+bool PoolKernel<GPU_MALI, float>::Init(const PoolParam& param) const {
+  AclPoolOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclPoolOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void PoolKernel<GPU_MALI, float>::Compute(const PoolParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclPoolOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  for (int n = 0; n < args.batch; ++n) {
+    acl_op->RunAcl((void*)input_data, (void*)output_data);
+    input_data += args.in_depth * args.in_cols * args.in_rows;
+    output_data += args.in_depth * args.out_cols * args.out_rows;
+  }
+}
+template class PoolKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/relu_kernel.cpp
+++ b/src/operators/kernel/mali/relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef RELU_OP
+#pragma once
+#include "operators/kernel/relu_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclReluOp : public acl::ACLOperator {
+ public:
+  AclReluOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU;
+  }
+  ~AclReluOp() = default;
+  AclReluOp(const AclReluOp&) = delete;
+  AclReluOp& operator=(const AclReluOp&) = delete;
+  AclReluOp(AclReluOp&&) = delete;
+  AclReluOp& operator=(AclReluOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const ReluParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols * args.in_rows *
+                                         args.in_depth * args.batch);
+    arm_compute::TensorShape output_shape(args.in_cols * args.in_rows *
+                                          args.in_depth * args.out_num);
+    // arm_compute::TensorShape weights_shape(
+    // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
+    // arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::ActivationLayerInfo::ActivationFunction type;
+    type = arm_compute::ActivationLayerInfo::ActivationFunction::RELU;
+    arm_compute::ActivationLayerInfo act_info(type);
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(activation, this, act_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const ReluParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const ReluParam& param) {
+    const auto* input_x = param.InputX();
+    auto* out = param.Out();
+    const T* input_data = input_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.batch = input_x->dims()[0];
+    args.in_depth = input_x->dims()[1];
+    args.in_rows = input_x->dims()[2];
+    args.in_cols = input_x->dims()[3];
+    args.out_num = out->dims()[0];
+  }
+  acl::AclParameters args;
+};
+template <>
+bool ReluKernel<GPU_MALI, float>::Init(const ReluParam& param) const {
+  AclReluOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclReluOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void ReluKernel<GPU_MALI, float>::Compute(const ReluParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclReluOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
+}
+template class ReluKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/reshape_kernel.cpp
+++ b/src/operators/kernel/mali/reshape_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef RESHAPE_OP
+#pragma once
+#include "operators/kernel/reshape_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ReshapeKernel<GPU_MALI, float>::Init(const ReshapeParam &para) const {
+  return true;
+}
+template <>
+void ReshapeKernel<GPU_MALI, float>::Compute(const ReshapeParam &param) const {
+  const auto *input_x = param.InputX();
+  const auto &input_x_dims = input_x->dims();
+  auto *out = param.Out();
+  framework::DDim out_dims = out->dims();
+  const auto *input_shape = param.InputShape();
+  if (input_shape) {
+    auto *shape_data = input_shape->data<int>();
+    framework::Tensor cpu_shape_tensor;
+    auto shape =
+        std::vector<int>(shape_data, shape_data + input_shape->numel());
+    out_dims = ValidateShape(shape, input_x->dims());
+  }
+  bool inplace = param.Inplace();
+  out->Resize(out_dims);
+  if (!inplace) {
+    out->mutable_data<float>();
+    framework::TensorCopy(*input_x, out);
+    out->Resize(out_dims);
+  } else {
+    out->ShareDataWith(*input_x);
+    out->Resize(out_dims);
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/softmax_kernel.cpp
+++ b/src/operators/kernel/mali/softmax_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SOFTMAX_OP
+#pragma once
+#include "operators/kernel/softmax_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclSoftmaxOp : public acl::ACLOperator {
+ public:
+  AclSoftmaxOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX;
+  }
+  ~AclSoftmaxOp() = default;
+  AclSoftmaxOp(const AclSoftmaxOp&) = delete;
+  AclSoftmaxOp& operator=(const AclSoftmaxOp&) = delete;
+  AclSoftmaxOp(AclSoftmaxOp&&) = delete;
+  AclSoftmaxOp& operator=(AclSoftmaxOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const SoftmaxParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape shape(args.in_depth, args.batch);
+    if (is_operator_init_done(shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    //[width, height, IFM]
+    new_tensor(input(), shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), shape, args.output_data);
+    acl_configure(softmax, this, NULL);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const SoftmaxParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const SoftmaxParam& param) {
+    const framework::Tensor* in_x = param.InputX();
+    framework::Tensor* out = param.Out();
+    auto x_dims = in_x->dims();
+    out->Resize(x_dims);
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    // NCHW
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.out_num = out->dims()[0];
+    // std::cout
+    //  << "Out C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+  }
+  acl::AclParameters args;
+};
+template <>
+bool SoftmaxKernel<GPU_MALI, float>::Init(const SoftmaxParam& param) const {
+  AclSoftmaxOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclSoftmaxOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void SoftmaxKernel<GPU_MALI, float>::Compute(const SoftmaxParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclSoftmaxOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  for (int n = 0; n < args.out_num; ++n) {
+    acl_op->RunAcl((void*)input_data, (void*)output_data);
+    input_data += args.in_depth;
+    output_data += args.in_depth;
+  }
+}
+template class SoftmaxKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/prior_box_kernel.h
+++ b/src/operators/kernel/prior_box_kernel.h
@@ -16,8 +16,9 @@ limitations under the License. */
 #pragma once
+#include <algorithm>
+#include <cmath>
 #include <vector>
 #include "framework/operator.h"
 #include "operators/math/transform.h"
 #include "operators/op_param.h"

--- a/src/operators/lrn_op.cpp
+++ b/src/operators/lrn_op.cpp
@@ -34,6 +34,8 @@ USE_OP_CPU(lrn);
 REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(lrn);
+REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/math/depthwise_conv_3x3.cpp
+++ b/src/operators/math/depthwise_conv_3x3.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/math/depthwise_conv_3x3.h"
+#include <vector>
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
+                      vector<int> paddings, const Tensor *filter, Tensor *bias,
+                      Tensor *output, bool if_bias) {
+#if __ARM_NEON
+  const int batch_size = input->dims()[0];
+  const int input_height = input->dims()[2];
+  const int input_width = input->dims()[3];
+  const int output_channels = output->dims()[1];
+  const int output_height = output->dims()[2];
+  const int output_width = output->dims()[3];
+  const int _kernel_size = 3;
+  const int stride_height = strides[0];
+  const int stride_width = strides[1];
+  const int padding_height = paddings[0];
+  const int padding_width = paddings[1];
+  const float zero = 0;
+  const int input_channel_stride = input_height * input_width;
+  const int output_channel_stride = output_height * output_width;
+  const int filter_channel_stride = 9;
+  const float *input_data = input->data<float>();
+  const float *filter_data = filter->data<float>();
+  if (if_bias) {
+    math::expand_bias(*bias, 1, output->dims());
+    output->ShareDataWith(*bias);
+  }
+  float *output_data = output->mutable_data<float>();
+  const int input_batch_stride = output_channels * input_channel_stride;
+  const int output_batch_stride = output_channels * output_channel_stride;
+  const int filter_batch_stride = output_channels * output_channel_stride;
+  const float *pos1, *pos2, *pos3, *filter1, *filter2, *filter3, *output_ptr;
+  int hstart, wstart, hend, wend;
+  float result;
+  for (int i = 0; i < batch_size; ++i) {
+    for (int c = 0; c < output_channels; ++c) {
+      filter1 = filter_data;
+      filter2 = filter1 + 3;
+      filter3 = filter2 + 3;
+      for (int ph = 0; ph < output_height; ph++) {
+        for (int pw = 0; pw < output_width; pw++) {
+          hstart = ph * stride_height - padding_height;
+          wstart = pw * stride_width - padding_width;
+          hend = min(hstart + _kernel_size, input_height + padding_height);
+          wend = min(wstart + _kernel_size, input_width + padding_width);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          hend = min(hend, input_height);
+          wend = min(wend, input_width);
+          pos1 = input_data + hstart * input_width + wstart;
+          pos2 = input_data + (hstart + 1) * input_width + wstart;
+          pos3 = input_data + (hstart + 2) * input_width + wstart;
+          output_ptr = output_data + ph * output_width + pw;
+          if (hend - hstart != 3 || wend - wstart != 3) {
+            result = 0;
+            float fake_input[9] = {0};
+            if (hstart == 0 && wstart == 0) {
+              // 左上角
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j >= 3 - hend && k >= 3 - wend) {
+                    fake_input[3 * j + k] =
+                        input_data[(j - (3 - hend)) * input_width + k -
+                                   (3 - wend)];
+                  }
+                }
+              }
+            } else if (hstart == 0 && wend == input_width) {
+              // 右上角
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j >= 3 - hend && k <= input_width - wstart - 1) {
+                    fake_input[3 * j + k] =
+                        input_data[(j - (3 - hend)) * input_width + k + wstart];
+                  }
+                }
+              }
+            } else if (hend == input_height && wstart == 0) {
+              // 左下角
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j <= input_height - 1 - hstart && k >= 3 - wend) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width + k - (3 - wend)];
+                  }
+                }
+              }
+            } else if (hend == input_height && wend == input_width) {
+              // 右下角
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j <= input_height - hstart - 1 &&
+                      k <= input_width - wstart - 1) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width + k + wstart];
+                  }
+                }
+              }
+            } else if (hstart == 0) {
+              // 顶部
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j >= 3 - hend) {
+                    fake_input[3 * j + k] =
+                        input_data[(j - (3 - hend)) * input_width + k + wstart];
+                  }
+                }
+              }
+            } else if (hend == input_height) {
+              // 底部
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (j <= input_height - hstart - 1) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width + k + wstart];
+                  }
+                }
+              }
+            } else if (wstart == 0) {
+              // 左侧
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (k >= 3 - wend) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width +
+                                   (k - (3 - wend))];
+                  }
+                }
+              }
+            } else if (wend == input_width) {
+              // 右侧
+              for (int j = 0; j < 3; ++j) {
+                for (int k = 0; k < 3; ++k) {
+                  if (k <= input_width - wstart - 1) {
+                    fake_input[3 * j + k] =
+                        input_data[(j + hstart) * input_width + k + wstart];
+                  }
+                }
+              }
+            }
+            for (int l = 0; l < 9; ++l) {
+              result += fake_input[l] * filter1[l];
+            }
+            if (if_bias) {
+              output_data[ph * output_width + pw] += result;
+            } else {
+              output_data[ph * output_width + pw] = result;
+            }
+          } else {
+#if defined(ARMV17)
+            asm volatile(
+                "vld1.32  {q1}, [%[pos1]]        \n\t"
+                "vld1.32  {q4}, [%[filter1]]        \n\t"
+                "vmov.f32 q0,    #0.0              \n\t"
+                "vld1.32  {q2}, [%[pos2]]        \n\t"
+                "vld1.32  {q5}, [%[filter2]]        \n\t"
+                "vmla.f32 q0, q1, q4           \n\t"
+                "vld1.32  {q3}, [%[pos3]]        \n\t"
+                "vld1.32  {q6}, [%[filter3]]        \n\t"
+                "vmla.f32 q0, q2, q5           \n\t"
+                "vmla.f32 q0, q3, q6          \n\t"
+                "vmov.f32 d1[1],  %[zero]         \n\t"
+                "vadd.f32  d4, d0, d1           \n\t"
+                "vadd.f32  s10, s8, s9            \n\t"
+                "vst1.32 {d5[0]},[%[output_ptr]]    \n\t"
+                :
+                : [input_data] "r"(input_data), [pos1] "r"(pos1),
+                  [pos2] "r"(pos2), [pos3] "r"(pos3), [filter1] "r"(filter1),
+                  [filter2] "r"(filter2), [filter3] "r"(filter3),
+                  [output_ptr] "r"(output_ptr), [zero] "r"(zero)
+                : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
+#else
+            const float32x4_t data1 = vld1q_f32(pos1);
+            const float32x4_t data2 = vld1q_f32(pos2);
+            const float32x4_t data3 = vld1q_f32(pos3);
+            const float32x4_t v_filter1 = vld1q_f32(filter1);
+            const float32x4_t v_filter2 = vld1q_f32(filter2);
+            const float32x4_t v_filter3 = vld1q_f32(filter3);
+            float32x4_t mula = vmulq_f32(data1, v_filter1);
+            mula = vmlaq_f32(mula, data2, v_filter2);
+            mula = vmlaq_f32(mula, data3, v_filter3);
+            float32x2_t res = vpadd_f32(
+                vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
+            res = vpadd_f32(res, res);
+            if (if_bias) {
+              output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
+            } else {
+              output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
+            }
+#endif
+          }
+        }
+      }
+      input_data += input_channel_stride;
+      output_data += output_channel_stride;
+      filter_data += filter_channel_stride;
+    }
+    input_data += input_batch_stride;
+    output_data += output_batch_stride;
+  }
+#endif
+}
+void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
+                          Tensor *output, Tensor *bias, bool if_bias) {
+  const float *input_data = input->data<float>();
+  const float *filter_data = filter->data<float>();
+  float *output_data = output->data<float>();
+  const float *bias_data = bias->data<float>();
+  const int h = static_cast<int>(input->dims()[2]);
+  const int w = static_cast<int>(input->dims()[3]);
+  const int l = h;
+  const int batch_size = static_cast<int>(input->dims()[0]);
+  const int c = static_cast<int>(input->dims()[1]);
+  const int hxw = h * w;
+  float32x4_t vbias = vdupq_n_f32(0.0);
+  for (int b = 0; b < batch_size; ++b) {
+    const float *filter_data_tmp = filter_data;
+    for (int j = 0; j < c; ++j) {
+      if (if_bias) {
+        vbias = vdupq_n_f32(bias_data[j]);
+      }
+      int l_mid = l - 2;  // l=1->l_mid=-1,l=2->l_mid=0
+      float w00 = filter_data_tmp[0];
+      float w01 = filter_data_tmp[1];
+      float w02 = filter_data_tmp[2];
+      float w10 = filter_data_tmp[3];
+      float w11 = filter_data_tmp[4];
+      float w12 = filter_data_tmp[5];
+      float w20 = filter_data_tmp[6];
+      float w21 = filter_data_tmp[7];
+      float w22 = filter_data_tmp[8];
+      output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
+                       w21 * input_data[l] + w22 * input_data[l + 1] +
+                       bias_data[j];
+      output_data[l - 1] = w10 * input_data[l - 2] + w11 * input_data[l - 1] +
+                           w20 * input_data[2 * l - 2] +
+                           w21 * input_data[2 * l - 1] + bias_data[j];
+      output_data[(l - 1) * l] =
+          w01 * input_data[(l - 2) * l] + w02 * input_data[(l - 2) * l + 1] +
+          w11 * input_data[(l - 1) * l] + w12 * input_data[(l - 1) * l + 1] +
+          bias_data[j];
+      output_data[l * l - 1] = w00 * input_data[(l - 2) * (l + 1)] +
+                               w01 * input_data[(l - 2) * (l + 1) + 1] +
+                               w10 * input_data[l * l - 2] +
+                               w11 * input_data[l * l - 1] + bias_data[j];
+      for (int i = 1; i < l - 1; ++i) {
+        output_data[i * l] =
+            w01 * input_data[i * l - l] + w02 * input_data[i * l - l + 1] +
+            w11 * input_data[i * l] + w12 * input_data[i * l + 1] +
+            w21 * input_data[i * l + l] + w22 * input_data[i * l + l + 1] +
+            bias_data[j];
+        output_data[i * l + l - 1] = w00 * input_data[i * l + l - 1 - l - 1] +
+                                     w01 * input_data[i * l + l - 1 - l] +
+                                     w10 * input_data[i * l + l - 1 - 1] +
+                                     w11 * input_data[i * l + l - 1] +
+                                     w20 * input_data[i * l + l - 1 + l - 1] +
+                                     w21 * input_data[i * l + l - 1 + l] +
+                                     bias_data[j];
+      }
+      // top 1 row and bottom 1 row
+      const float *input_tmp = input_data;
+      float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
+          tmp3, tmp4, tmp5, out0;
+      in0 = vld1q_f32(input_tmp);
+      in2 = vld1q_f32(input_tmp + l);
+      const float *input_tmp_end = input_tmp + (l - 2) * l;
+      in4 = vld1q_f32(input_tmp_end);
+      in6 = vld1q_f32(input_tmp_end + l);
+      int c_mid = l_mid;
+      auto output_ptr = output_data + 1;
+      for (; c_mid > 3; c_mid -= 4) {
+        in1 = vld1q_f32(input_tmp + 4);
+        in3 = vld1q_f32(input_tmp + l + 4);
+        tmp0 = vextq_f32(in0, in1, 1);
+        tmp1 = vextq_f32(in0, in1, 2);
+        tmp2 = vextq_f32(in2, in3, 1);
+        tmp3 = vextq_f32(in2, in3, 2);
+        out0 = vmulq_n_f32(in0, w10);
+        out0 = vmlaq_n_f32(out0, tmp0, w11);
+        out0 = vmlaq_n_f32(out0, tmp1, w12);
+        out0 = vmlaq_n_f32(out0, in2, w20);
+        out0 = vmlaq_n_f32(out0, tmp2, w21);
+        out0 = vmlaq_n_f32(out0, tmp3, w22);
+        out0 = vaddq_f32(out0, vbias);
+        vst1q_f32(output_ptr, out0);
+        in5 = vld1q_f32(input_tmp_end + 4);
+        in7 = vld1q_f32(input_tmp_end + l + 4);
+        tmp0 = vextq_f32(in4, in5, 1);
+        tmp1 = vextq_f32(in4, in5, 2);
+        tmp2 = vextq_f32(in6, in7, 1);
+        tmp3 = vextq_f32(in6, in7, 2);
+        out0 = vmulq_n_f32(in4, w00);
+        out0 = vmlaq_n_f32(out0, tmp0, w01);
+        out0 = vmlaq_n_f32(out0, tmp1, w02);
+        out0 = vmlaq_n_f32(out0, in6, w10);
+        out0 = vmlaq_n_f32(out0, tmp2, w11);
+        out0 = vmlaq_n_f32(out0, tmp3, w12);
+        out0 = vaddq_f32(out0, vbias);
+        vst1q_f32(output_ptr + (l - 1) * l, out0);
+        // can optimize to each 8 stride.
+        input_tmp += 4;
+        input_tmp_end += 4;
+        output_ptr += 4;
+        in0 = in1;
+        in2 = in3;
+        in4 = in5;
+        in6 = in7;
+      }
+      // top right pad
+      float32x4_t pad0 = vdupq_n_f32(input_data[l - 1]);
+      float32x4_t pad1 = vdupq_n_f32(input_data[2 * l - 1]);
+      tmp0 = vextq_f32(in0, pad0, 1);
+      tmp1 = vextq_f32(in0, pad0, 2);
+      tmp2 = vextq_f32(in2, pad1, 1);
+      tmp3 = vextq_f32(in2, pad1, 2);
+      out0 = vmulq_n_f32(in0, w10);
+      out0 = vmlaq_n_f32(out0, tmp0, w11);
+      out0 = vmlaq_n_f32(out0, tmp1, w12);
+      out0 = vmlaq_n_f32(out0, in2, w20);
+      out0 = vmlaq_n_f32(out0, tmp2, w21);
+      out0 = vmlaq_n_f32(out0, tmp3, w22);
+      out0 = vaddq_f32(out0, vbias);
+      for (int i = 0; i < c_mid; ++i) {
+        if (i == 0) {
+          vst1q_lane_f32(output_ptr + i, out0, 0);
+        }
+        if (i == 1) {
+          vst1q_lane_f32(output_ptr + i, out0, 1);
+        }
+        if (i == 2) {
+          vst1q_lane_f32(output_ptr + i, out0, 2);
+        }
+      }
+      // bottom right pad
+      float32x4_t pad2 = vdupq_n_f32(input_data[l * l - 1 - l]);
+      float32x4_t pad3 = vdupq_n_f32(input_data[l * l - 1]);
+      tmp0 = vextq_f32(in4, pad2, 1);
+      tmp1 = vextq_f32(in4, pad2, 2);
+      tmp2 = vextq_f32(in6, pad3, 1);
+      tmp3 = vextq_f32(in6, pad3, 2);
+      out0 = vmulq_n_f32(in4, w00);
+      out0 = vmlaq_n_f32(out0, tmp0, w01);
+      out0 = vmlaq_n_f32(out0, tmp1, w02);
+      out0 = vmlaq_n_f32(out0, in6, w10);
+      out0 = vmlaq_n_f32(out0, tmp2, w11);
+      out0 = vmlaq_n_f32(out0, tmp3, w12);
+      out0 = vaddq_f32(out0, vbias);
+      for (int i = 0; i < c_mid; ++i) {
+        if (i == 0) {
+          vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 0);
+        }
+        if (i == 1) {
+          vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 1);
+        }
+        if (i == 2) {
+          vst1q_lane_f32(output_ptr + (l - 1) * l + i, out0, 2);
+        }
+      }
+      // mid
+      for (int i = 0; i < l - 2; ++i) {
+        auto output_ptr = output_data + (i + 1) * l + 1;
+        input_tmp = input_data + i * l;
+        auto in0_tmp = vld1q_f32(input_tmp);
+        auto in2_tmp = vld1q_f32(input_tmp + l);
+        auto in4_tmp = vld1q_f32(input_tmp + l + l);
+        c_mid = l_mid;
+        for (; c_mid > 3; c_mid -= 4) {
+          auto in1_tmp = vld1q_f32(input_tmp + 4);
+          auto in3_tmp = vld1q_f32(input_tmp + l + 4);
+          auto in5_tmp = vld1q_f32(input_tmp + l + l + 4);
+          tmp0 = vextq_f32(in0_tmp, in1_tmp, 1);
+          tmp1 = vextq_f32(in0_tmp, in1_tmp, 2);
+          tmp2 = vextq_f32(in2_tmp, in3_tmp, 1);
+          tmp3 = vextq_f32(in2_tmp, in3_tmp, 2);
+          tmp4 = vextq_f32(in4_tmp, in5_tmp, 1);
+          tmp5 = vextq_f32(in4_tmp, in5_tmp, 2);
+          out0 = vmulq_n_f32(in0_tmp, w00);
+          out0 = vmlaq_n_f32(out0, tmp0, w01);
+          out0 = vmlaq_n_f32(out0, tmp1, w02);
+          out0 = vmlaq_n_f32(out0, in2_tmp, w10);
+          out0 = vmlaq_n_f32(out0, tmp2, w11);
+          out0 = vmlaq_n_f32(out0, tmp3, w12);
+          out0 = vmlaq_n_f32(out0, in4_tmp, w20);
+          out0 = vmlaq_n_f32(out0, tmp4, w21);
+          out0 = vmlaq_n_f32(out0, tmp5, w22);
+          out0 = vaddq_f32(out0, vbias);
+          vst1q_f32(output_ptr, out0);
+          output_ptr += 4;
+          input_tmp += 4;
+          in0_tmp = in1_tmp;
+          in2_tmp = in3_tmp;
+          in4_tmp = in5_tmp;
+        }
+        float32x4_t pad0 = vdupq_n_f32(input_data[i * l + l - 1]);
+        float32x4_t pad1 = vdupq_n_f32(input_data[i * l + l - 1 + l]);
+        float32x4_t pad2 = vdupq_n_f32(input_data[i * l + l - 1 + l + l]);
+        tmp0 = vextq_f32(in0_tmp, pad0, 1);
+        tmp1 = vextq_f32(in0_tmp, pad0, 2);
+        tmp2 = vextq_f32(in2_tmp, pad1, 1);
+        tmp3 = vextq_f32(in2_tmp, pad1, 2);
+        tmp4 = vextq_f32(in4_tmp, pad2, 1);
+        tmp5 = vextq_f32(in4_tmp, pad2, 2);
+        out0 = vmulq_n_f32(in0_tmp, w00);
+        out0 = vmlaq_n_f32(out0, tmp0, w01);
+        out0 = vmlaq_n_f32(out0, tmp1, w02);
+        out0 = vmlaq_n_f32(out0, in2_tmp, w10);
+        out0 = vmlaq_n_f32(out0, tmp2, w11);
+        out0 = vmlaq_n_f32(out0, tmp3, w12);
+        out0 = vmlaq_n_f32(out0, in4_tmp, w20);
+        out0 = vmlaq_n_f32(out0, tmp4, w21);
+        out0 = vmlaq_n_f32(out0, tmp5, w22);
+        out0 = vaddq_f32(out0, vbias);
+        for (int i = 0; i < c_mid; ++i) {
+          if (i == 0) {
+            vst1q_lane_f32(output_ptr + i, out0, 0);
+          }
+          if (i == 1) {
+            vst1q_lane_f32(output_ptr + i, out0, 1);
+          }
+          if (i == 2) {
+            vst1q_lane_f32(output_ptr + i, out0, 2);
+          }
+        }
+      }
+      output_data += hxw;
+      input_data += hxw;
+      filter_data_tmp += 9;
+    }
+  }
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/math/depthwise_conv_3x3.h
+++ b/src/operators/math/depthwise_conv_3x3.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "framework/tensor.h"
+#include "operators/math/conv_func.h"
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+using framework::Tensor;
+using std::max;
+using std::min;
+using std::vector;
+void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
+                      vector<int> paddings, const Tensor *filter, Tensor *bias,
+                      Tensor *output, bool if_bias);
+void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
+                          Tensor *output, Tensor *bias, bool if_bias);
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/math/pool_3x3.cpp
+++ b/src/operators/math/pool_3x3.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif  // __ARM_NEON
+#include <climits>
 namespace paddle_mobile {
 namespace operators {
 namespace math {

--- a/src/operators/mul_op.cpp
+++ b/src/operators/mul_op.cpp
@@ -60,6 +60,8 @@ USE_OP_CPU(mul);
 REGISTER_OPERATOR_CPU(mul, ops::MulOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(mul);
+REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/pool_op.cpp
+++ b/src/operators/pool_op.cpp
@@ -64,6 +64,8 @@ USE_OP_CPU(pool2d);
 REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(pool2d);
+REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/relu_op.cpp
+++ b/src/operators/relu_op.cpp
@@ -38,6 +38,8 @@ USE_OP_CPU(relu);
 REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(relu);
+REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
@@ -37,6 +37,8 @@ USE_OP_CPU(reshape);
 REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(reshape);
+REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -32,6 +32,8 @@ USE_OP_CPU(softmax);
 REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(softmax);
+REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/test/net/test_mobilenet.cpp
+++ b/test/net/test_mobilenet.cpp
@@ -33,12 +33,8 @@ int main() {
                           input_tensor.data<float>() + input_tensor.numel());
  auto time3 = time();
  auto vec_result = executor.Predict(input, dims);
-  float sum = 0;
-  for (const auto item : vec_result) {
-    sum += item;
-  }
-  DLOG << "mobilenet output sum =" << sum;
  auto time4 = time();
  DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
  return 0;
 }
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -14,10 +14,10 @@ limitations under the License. */
 #pragma once
-#include <chrono>
 #include <fstream>
 #include <random>
+#include "common/common.h"
 #include "common/log.h"
 #include "framework/ddim.h"
 #include "framework/tensor.h"
@@ -35,17 +35,6 @@ static const std::string g_test_image_1x3x224x224 =
 using paddle_mobile::framework::DDim;
 using paddle_mobile::framework::Tensor;
-using Time = decltype(std::chrono::high_resolution_clock::now());
-Time time() { return std::chrono::high_resolution_clock::now(); }
-double time_diff(Time t1, Time t2) {
-  typedef std::chrono::microseconds ms;
-  auto diff = t2 - t1;
-  ms counter = std::chrono::duration_cast<ms>(diff);
-  return counter.count() / 1000.0;
-}
 template <typename T>
 void SetupTensor(paddle_mobile::framework::Tensor *input,
                 paddle_mobile::framework::DDim dims, T lower, T upper) {

--- a/tools/android-cmake/android.toolchain.cmake
+++ b/tools/android-cmake/android.toolchain.cmake
@@ -37,7 +37,7 @@
 # ANDROID_DISABLE_FORMAT_STRING_CHECKS
 # ANDROID_CCACHE
-cmake_minimum_required(VERSION 3.6.0)
+# cmake_minimum_required(VERSION 3.6.0)
 # Inhibit all of CMake's own NDK handling code.
 set(CMAKE_SYSTEM_VERSION 1)

--- a/tools/arm-platform.cmake
+++ b/tools/arm-platform.cmake
+set(ARCH "armv7-a")
+set(FLOAT_ABI "softfp" CACHE STRING "-mfloat-api chosen")
+set_property(CACHE FLOAT_ABI PROPERTY STRINGS "softfp" "soft" "hard")
+set(FPU "neon")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${ARCH} -mfloat-abi=${FLOAT_ABI} -mfpu=${FPU}")
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -56,7 +56,7 @@ build_for_android() {
    MODE="Release"
-    ANDROID_PLATFORM_VERSION="android-15"
+    ANDROID_PLATFORM_VERSION="android-22"
    TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
    ANDROID_ARM_MODE="arm"
    if [ $# -eq 1 ]; then

--- a/tools/op.cmake
+++ b/tools/op.cmake
--- a/tools/toolchains/arm-android-neon.cmake
+++ b/tools/toolchains/arm-android-neon.cmake
+set(ANDROID_ARM_NEON ON)
+include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
\ No newline at end of file
--- a/tools/toolchains/arm-linux-gnueabi.cmake
+++ b/tools/toolchains/arm-linux-gnueabi.cmake
+# CMake toolchain file for building ARM software on Linux environment
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_VERSION 1)
+set(CMAKE_C_COMPILER   /usr/bin/arm-linux-gnueabi-gcc)
+set(CMAKE_CXX_COMPILER /usr/bin/arm-linux-gnueabi-g++)
+set(CMAKE_STRIP /usr/bin/arm-linux-gnueabi-strip)
+set(CMAKE_FIND_ROOT_PATH  /usr/arm-linux-gnueabi)
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+include("${CMAKE_CURRENT_LIST_DIR}/../arm-platform.cmake")