ready for PR

bec8987d · sharper · 0342c83c · bec8987d · bec8987d · bec8987d
34 changed file
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "src/operators/kernel/mali/ACL_Android"]
+	path = src/operators/kernel/mali/ACL_Android
+	url = https://github.com/halsay/ACL_Android.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ option(USE_OPENMP "openmp support" OFF)
 option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build
-option(CPU "cpu" ON)
+option(CPU "cpu" OFF)
 option(MALI_GPU "mali gpu" ON)
 option(FPGA "fpga" OFF)
@@ -16,6 +16,17 @@ endif()
 if (MALI_GPU)
    add_definitions(-DPADDLE_MOBILE_MALI_GPU)
+    add_definitions(-DUSE_ACL=1)
+    add_definitions(-DUSE_OPENCL)
+    set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android)
+    include_directories(${ACL_ROOT} ${ACL_ROOT}/include)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
 endif()
 if(FPGA)

--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -138,9 +138,21 @@ class OpKernelBase {
   * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体,
   *    所有结构体存在与: paddle-mobile/src/operators/op_param.h
   * */
+#ifdef PADDLE_MOBILE_MALI_GPU
+  OpKernelBase() { acl_op_ = nullptr; }
+  void *GetAclOp() const { return acl_op_; }
+  void SetAclOp(void *op, void *ob) const {
+    reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
+  }
+#endif
  virtual void Compute(const P &para) const = 0;
  virtual bool Init(const P &para) const { return true; };
  virtual ~OpKernelBase() = default;
+ private:
+#ifdef PADDLE_MOBILE_MALI_GPU
+  void *acl_op_;
+#endif
 };
 #define DEFINE_OP_CONSTRUCTOR(cls, parent_cls)                                 \

--- a/src/operators/batchnorm_op.cpp
+++ b/src/operators/batchnorm_op.cpp
@@ -36,6 +36,8 @@ USE_OP_CPU(batch_norm);
 REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(batch_norm);
+REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
@@ -67,6 +67,8 @@ USE_OP_CPU(concat);
 REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(concat);
+REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/elementwise_add_op.cpp
+++ b/src/operators/elementwise_add_op.cpp
@@ -34,6 +34,8 @@ USE_OP_CPU(elementwise_add);
 REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(elementwise_add);
+REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -50,6 +50,8 @@ USE_OP_CPU(feed);
 REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(feed);
+REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/fetch_op.h
+++ b/src/operators/fetch_op.h
@@ -50,6 +50,8 @@ USE_OP_CPU(fetch);
 REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(fetch);
+REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/fusion_conv_add.cpp
+++ b/src/operators/fusion_conv_add.cpp
@@ -54,6 +54,8 @@ USE_OP_CPU(conv_add);
 REGISTER_OPERATOR_CPU(conv_add, ops::FusionConvAddOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(conv_add);
+REGISTER_OPERATOR_MALI_GPU(conv_add, ops::FusionConvAddOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/fusion_fc_op.cpp
+++ b/src/operators/fusion_fc_op.cpp
@@ -59,6 +59,8 @@ USE_OP_CPU(fc);
 REGISTER_OPERATOR_CPU(fc, ops::FusionFcOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(fc);
+REGISTER_OPERATOR_MALI_GPU(fc, ops::FusionFcOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/ACL_Android @ 591027fc
+++ b/ACL_Android @ 591027fc
+Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5
--- a/src/operators/kernel/mali/acl_operator.cc
+++ b/src/operators/kernel/mali/acl_operator.cc
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file acl_operator.cc
+ * \brief
+ * \author Joey
+ */
+#if USE_ACL == 1
+#include "acl_operator.h"
+unsigned int bypass_acl_class_layer =
+    (0 | FLAGS_ENABLE_ACL_CONCAT |
+     /*0xffffffff |*/ /*FLAGS_ENABLE_ACL_FC |*/ /*FLAGS_ENABLE_ACL_LRN
+                                                   |*/
+     0);
+int enable_schedule = 0;
+#ifdef USE_PROFILING
+#include "arm_neon.h"
+unsigned int acl_log_flags =
+    (0 | MASK_LOG_APP_TIME | /*MASK_LOG_ALLOCATE | */  /*MASK_LOG_ALLOCATE | */
+     /*MASK_LOG_RUN      | */ /*MASK_LOG_CONFIG   | */ /*MASK_LOG_COPY     | */
+     MASK_LOG_ABSVAL | MASK_LOG_BNLL | MASK_LOG_CONV | MASK_LOG_FC |
+     MASK_LOG_LRN | MASK_LOG_POOLING | MASK_LOG_RELU | MASK_LOG_SIGMOID |
+     MASK_LOG_SOFTMAX | MASK_LOG_TANH | MASK_LOG_LC | MASK_LOG_BN |
+     MASK_LOG_CONCAT | 0);
+#include <stdio.h>  /* printf */
+#include <stdlib.h> /* getenv */
+#endif              // USE_PROFILING
+static bool force_enable_gpu = false;
+bool AclEnableSchedule(int enable) {
+  enable_schedule = enable;
+  if (enable) {
+    force_enable_gpu = true;
+  }
+  return true;
+}
+int isScheduleEnable() { return enable_schedule; }
+namespace paddle_mobile {
+namespace operators {
+namespace acl {
+bool ACLOperator::init_gpu_env = true;
+#ifdef USE_OPENCL
+bool ACLOperator::support_opencl_ = false;
+bool opencl_is_available() { return arm_compute::opencl_is_available(); }
+#elif defined(USE_OPENGLES)
+bool ACLOperator::support_opengles_ = false;
+#endif
+ACLOperator::ACLOperator(bool is_gpu)
+    : operator_state_(operator_not_init),
+      force_bypass_acl_path_(false),
+      target_hint_(TargetHint::DONT_CARE),
+      convolution_method_hint_(ConvolutionMethodHint::GEMM),
+      _group(1),
+      name_(""),
+      input_idx_(0),
+      output_idx_(0),
+      is_gpu_(is_gpu) {
+  const char* pBypassACL;
+  if (init_gpu_env) {
+#ifdef USE_OPENCL
+    try {
+      if (opencl_is_available()) {
+        arm_compute::CLScheduler::get().default_init();
+        support_opencl_ = true;
+      }
+    } catch (std::exception& e) {
+      support_opencl_ = false;
+    }
+#elif defined(USE_OPENGLES)
+    try {
+      arm_compute::GCScheduler::get().default_init();
+      support_opengles_ = true;
+    } catch (std::exception& e) {
+      support_opengles_ = false;
+    }
+#endif
+    init_gpu_env = false;
+  }
+  if (force_enable_gpu) is_gpu_ = true;
+  pBypassACL = getenv("BYPASSACL");
+  if (pBypassACL) {
+    unsigned int bacl;
+    sscanf(pBypassACL, "%i", &bacl);
+    if (bacl != bypass_acl_class_layer) {
+      bypass_acl_class_layer = bacl;
+      printf("BYPASSACL<%s>\n", pBypassACL);
+      printf("BYPASSACL: %x\n", bypass_acl_class_layer);
+    }
+  }
+#ifdef USE_PROFILING
+  const char* pLogACL;
+  pLogACL = getenv("LOGACL");
+  if (pLogACL) {
+    unsigned int alf;
+    sscanf(pLogACL, "%i", &alf);
+    if (alf != acl_log_flags) {
+      acl_log_flags = alf;
+      printf("LOGACL<%s>\n", pLogACL);
+      printf("LOGACL: %x\n", acl_log_flags);
+    }
+  }
+#endif  // USE_PROFILING
+  const char* pEnableSchedule;
+  pEnableSchedule = getenv("ENABLESCHEDULE");
+  if (pEnableSchedule) {
+    int bshedule;
+    sscanf(pEnableSchedule, "%i", &bshedule);
+    if (bshedule != enable_schedule) {
+      enable_schedule = bshedule;
+      printf("ENABLESCHEDULE<%s>\n", pEnableSchedule);
+      printf("ENABLESCHEDULE: %x\n", enable_schedule);
+    }
+    if (enable_schedule) {
+      AclEnableSchedule(1);
+    }
+  }
+}
+ACLOperator::~ACLOperator() {}
+bool ACLOperator::new_tensor(std::unique_ptr<ACLTensor>& tensor,
+                             arm_compute::TensorShape& shape, void* mem,
+                             bool commit) {
+  auto acl_tensor =
+      new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32));
+  acl_tensor->set_target(getTargetHint());
+  acl_tensor->bindmem(mem);
+  if (commit) acl_tensor->commit();
+  tensor = (std::unique_ptr<ACLTensor>)std::move(acl_tensor);
+  return true;
+}
+bool ACLOperator::new_tensor(std::unique_ptr<ACLSubTensor>& tensor,
+                             std::unique_ptr<ACLTensor>& parent,
+                             arm_compute::TensorShape& shape,
+                             arm_compute::Coordinates& coord) {
+  auto acl_tensor = new ACLSubTensor(parent, shape, coord);
+  acl_tensor->set_target(getTargetHint());
+  tensor = (std::unique_ptr<ACLSubTensor>)std::move(acl_tensor);
+  return true;
+}
+void ACLTensor::commit(TensorType type) {
+  settensortype(type);
+  if (mem_) {
+    if (!allocate_) {
+#ifdef USE_PROFILING
+      logtime_util log_time(ACL_ALLOCATE_INFO);
+#endif  // USE_PROFILING
+      allocate();
+      allocate_ = true;
+    }
+    if (type_ != tensor_output) {
+      tensor_copy(mem_);
+    }
+    mem_ = nullptr;
+  }
+}
+int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor, void* mem,
+                               bool toTensor) {
+#ifdef USE_PROFILING
+  logtime_util log_time(ACL_COPY_INFO);
+#endif  // USE_PROFILING
+  arm_compute::Window window;
+  // Iterate through the rows (not each element)
+  window.use_tensor_dimensions(tensor->info()->tensor_shape(),
+                               /* first_dimension =*/arm_compute::Window::DimY);
+  int width = tensor->info()->tensor_shape()[0];
+  int height = tensor->info()->tensor_shape()[1];
+  int deepth = tensor->info()->tensor_shape()[2];
+  map();
+  // Create an iterator:
+  arm_compute::Iterator it(tensor, window);
+  // Except it works for an arbitrary number of dimensions
+  if (toTensor) {  // mem->tensor
+    arm_compute::execute_window_loop(
+        window,
+        [&](const arm_compute::Coordinates& id) {
+          memcpy(it.ptr(),
+                 ((char*)mem) +
+                     ((id[3] * (width * height * deepth) +
+                       id.z() * (width * height) + id.y() * width + id.x()) *
+                      tensor->info()->element_size()),
+                 width * tensor->info()->element_size());
+        },
+        it);
+  } else {  // tensor-->mem
+    arm_compute::execute_window_loop(
+        window,
+        [&](const arm_compute::Coordinates& id) {
+          memcpy(((char*)mem) + ((id[3] * (width * height * deepth) +
+                                  id.z() * (width * height) + id.y() * width) *
+                                 tensor->info()->element_size()),
+                 it.ptr(), width * tensor->info()->element_size());
+        },
+        it);
+  }
+  unmap();
+  return 0;
+}
+}  // namespace acl
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/acl_operator.h
+++ b/src/operators/kernel/mali/acl_operator.h
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file acl_operator.h
+ * \brief
+ * \author Joey
+ */
+#ifndef ACL_OPERATOR_H_
+#define ACL_OPERATOR_H_
+#include <framework/tensor.h>
+#include <operators/op_param.h>
+#if USE_ACL == 1
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+#endif
+#ifdef USE_OPENGLES
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
+#endif
+#include "acl_tensor.h"
+#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001
+#define FLAGS_ENABLE_ACL_BNLL 0x00000002
+#define FLAGS_ENABLE_ACL_CONV 0x00000004
+#define FLAGS_ENABLE_ACL_FC 0x00000008
+#define FLAGS_ENABLE_ACL_LRN 0x00000010
+#define FLAGS_ENABLE_ACL_POOLING 0x00000020
+#define FLAGS_ENABLE_ACL_RELU 0x00000040
+#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080
+#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100
+#define FLAGS_ENABLE_ACL_TANH 0x00000200
+#define FLAGS_ENABLE_ACL_LC 0x00000400
+#define FLAGS_ENABLE_ACL_BN 0x00000800
+#define FLAGS_ENABLE_ACL_CONCAT 0x00001000
+extern unsigned int bypass_acl_class_layer;
+#ifdef USE_PROFILING
+#include <sys/time.h>
+#define NANO_SEC_CONV 1000000
+#define MASK_LOG_APP_TIME 0x00000001
+#define MASK_LOG_ALLOCATE 0x00000002
+#define MASK_LOG_RUN 0x00000004
+#define MASK_LOG_CONFIG 0x00000008
+#define MASK_LOG_COPY 0x00000010
+#define MASK_LOG_ABSVAL 0x00000020
+#define MASK_LOG_BNLL 0x00000040
+#define MASK_LOG_CONV 0x00000080
+#define MASK_LOG_FC 0x00000100
+#define MASK_LOG_LRN 0x00000200
+#define MASK_LOG_POOLING 0x00000400
+#define MASK_LOG_RELU 0x00000800
+#define MASK_LOG_SIGMOID 0x00001000
+#define MASK_LOG_SOFTMAX 0x00002000
+#define MASK_LOG_TANH 0x00004000
+#define MASK_LOG_LC 0x00008000
+#define MASK_LOG_BN 0x00010000
+#define MASK_LOG_CONCAT 0x00020000
+#define APP_TIME_INFO MASK_LOG_APP_TIME, "time:       \t"
+#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE, "allocate:   \t\t"
+#define ACL_RUN_INFO MASK_LOG_RUN, "run:        \t\t\t"
+#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure:  \t\t\t\t"
+#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t"
+#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t"
+#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL   :\t\t\t\t\t\t\t"
+#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV   :\t\t\t\t\t\t\t\t"
+#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC     :\t\t\t\t\t\t\t\t\t"
+#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN    :\t\t\t\t\t\t\t\t\t\t"
+#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU   :\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_SIGMOID_INFO \
+  MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_SOFTMAX_INFO \
+  MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_TANH_INFO \
+  MASK_LOG_TANH, "ACL_TANH   :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC     :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_BN_INFO \
+  MASK_LOG_BN, "ACL_BN     :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_CONCAT_INFO \
+  MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+extern unsigned int acl_log_flags;
+class logtime_util {
+ public:
+  logtime_util() { mask = 0; }
+  logtime_util(int mask_, const char *information_) {
+    setlogtime_info(mask_, information_);
+  }
+  void setlogtime_info(int mask_, const char *information_) {
+    mask = mask_;
+    if (acl_log_flags & mask) {
+      strncpy(information, information_, 255);
+      gettimeofday(&tv[0], NULL);
+    }
+  }
+  ~logtime_util() {
+    if (acl_log_flags & mask) {
+      int time[2];
+      gettimeofday(&tv[1], NULL);
+      time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec;
+      time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec;
+      printf("%s %.6lf\n", information,
+             (((double)time[1] - time[0]) / NANO_SEC_CONV));
+    }
+  }
+  void log_time(bool start) {
+    if (acl_log_flags & mask) {
+      if (start) {
+        gettimeofday(&tv[0], NULL);
+      } else {
+        int time[2];
+        gettimeofday(&tv[1], NULL);
+        time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec;
+        time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec;
+        printf("%s %.6lf\n", information,
+               (((double)time[1] - time[0]) / NANO_SEC_CONV));
+      }
+    }
+  }
+ private:
+  struct timeval tv[2];
+  int mask;
+  char information[256];
+};
+#endif  // USE_PROFILING
+namespace paddle_mobile {
+namespace operators {
+namespace acl {
+class AclParameters {
+ public:
+  AclParameters() {
+    dilated = false;
+    dim = 2;
+    num_group = 1;
+  }
+  int batch;
+  int in_depth;
+  int in_rows;
+  int in_cols;
+  int out_depth;
+  int out_rows;
+  int out_cols;
+  int out_num;
+  int filter_rows;
+  int filter_cols;
+  int stride_rows;
+  int stride_cols;
+  int pad_rows;
+  int pad_cols;
+  int dilation_rows;
+  int dilation_cols;
+  int num_group;
+  bool dilated;
+  int dim;
+  int epsilon;
+  int nsize;
+  float alpha;
+  float beta;
+  float knorm;
+  void *input_data;
+  void *output_data;
+  void *weight_data;
+  void *biases_data;
+  void *mean_data;
+  void *var_data;
+  std::string pool_type;
+  std::string act_type;
+  std::string data_layout;
+  bool is_global_pool;
+  bool is_channel_concat;
+  std::vector<framework::LoDTensor *> in_tensor;
+};
+enum TensorType {
+  tensor_input,
+  tensor_output,
+  tensor_weights,
+  tensor_biases,
+  tensor_mean,
+  tensor_var,
+  tensor_beta,
+  tensor_gamma,
+  tensor_concat,
+  tensor_data,
+};
+enum OperatorState {
+  operator_not_init,
+  operator_init_done,
+  operator_reinit,
+};
+enum OperateType {
+  operate_type_pooling,
+  operate_type_activation,
+  operate_type_lrn,
+  operate_type_conv,
+  operate_type_lc,
+  operate_type_fc,
+  operate_type_bn,
+  operate_type_softmax,
+  operate_type_concat,
+};
+class BaseACLTensor {
+ public:
+  BaseACLTensor() : type_(tensor_input), allocate_(false) {}
+  virtual ~BaseACLTensor() {}
+  virtual void bindmem(void *mem) { mem_ = mem; }
+  virtual void settensortype(TensorType type) { type_ = type; }
+  virtual void map(bool blocking = true) {}
+  virtual void unmap() {}
+  virtual void commit(TensorType type = tensor_data) {}
+  int tensor_copy(arm_compute::ITensor *tensor, void *mem,
+                  bool toTensor = true);
+ protected:
+  void *mem_;
+  TensorType type_;
+  bool allocate_;
+};
+class ACLTensor : public BaseACLTensor, public Tensor {
+ public:
+  explicit ACLTensor(arm_compute::TensorInfo &&info) : Tensor(info) {}
+  virtual void map(bool blocking = true) {
+    if (!allocate_) {
+      Tensor::allocate();
+      allocate_ = true;
+    }
+    Tensor::map(blocking);
+  }
+  virtual int tensor_copy(void *mem, bool toTensor = true) {
+    auto acl_tensor = this;
+    arm_compute::ITensor *tensor = acl_tensor->tensor();
+    BaseACLTensor::tensor_copy(tensor, mem, toTensor);
+    return 0;
+  }
+  virtual void unmap() { Tensor::unmap(); }
+  virtual void commit(TensorType type = tensor_data);
+};
+class ACLSubTensor : public BaseACLTensor, public SubTensor {
+ public:
+  ACLSubTensor(std::unique_ptr<ACLTensor> &parent,
+               arm_compute::TensorShape &shape, arm_compute::Coordinates &coord)
+      : SubTensor(parent.get(), shape, coord) {}
+  virtual int tensor_copy(void *mem, bool toTensor = true) { return 0; }
+};
+template <typename T>
+class TensorPair {
+ public:
+  TensorPair() {}
+  ~TensorPair() {}
+  TensorType type;
+  std::unique_ptr<T> tensor;
+};
+template <typename T>
+std::unique_ptr<T> &tensor_item(
+    std::vector<std::unique_ptr<TensorPair<T>>> &pool, TensorType type,
+    int idx) {
+  int count = 0;
+  for (auto &item : pool) {
+    if (item.get()->type == type) {
+      ++count;
+    }
+    if (item.get()->type == type && idx == count - 1) {
+      return item.get()->tensor;
+    }
+  }
+  pool.push_back((std::unique_ptr<TensorPair<T>>)std::move(new TensorPair<T>));
+  auto item = pool[pool.size() - 1].get();
+  item->type = type;
+  item->tensor = NULL;
+  return item->tensor;
+}
+class ACLOperator {
+ public:
+  virtual void commit() {
+    for (auto &item : tensor_pool_) {
+      if (item.get()->tensor) item.get()->tensor->commit(item.get()->type);
+    }
+  }
+  inline void run() {
+    commit();
+#ifdef USE_PROFILING
+    logtime_util log_time(ACL_RUN_INFO);
+#endif  // USE_PROFILING
+    for (auto &c : funcs_) {
+      c->run();
+    }
+  }
+  inline std::vector<std::unique_ptr<arm_compute::IFunction>> &funcs() {
+    return funcs_;
+  }
+  inline std::unique_ptr<ACLSubTensor> &sinput(int idx = 0) {
+    return tensor_item(subtensor_pool_, tensor_input, idx);
+  }
+  inline std::unique_ptr<ACLSubTensor> &soutput(int idx = 0) {
+    return tensor_item(subtensor_pool_, tensor_output, idx);
+  }
+  inline std::unique_ptr<ACLSubTensor> &sweights(int idx = 0) {
+    return tensor_item(subtensor_pool_, tensor_weights, idx);
+  }
+  inline std::unique_ptr<ACLSubTensor> &sbiases(int idx = 0) {
+    return tensor_item(subtensor_pool_, tensor_biases, idx);
+  }
+  inline std::unique_ptr<ACLTensor> &cinput(int idx = 0) {
+    return tensor_item(tensor_pool_, tensor_concat, idx);
+  }
+  inline std::unique_ptr<ACLTensor> &input(int idx = 0) {
+    return tensor_item(tensor_pool_, tensor_input, idx);
+  }
+  inline std::unique_ptr<ACLTensor> &output(int idx = 0) {
+    return tensor_item(tensor_pool_, tensor_output, idx);
+  }
+  inline std::unique_ptr<ACLTensor> &weights(int idx = 0) {
+    return tensor_item(tensor_pool_, tensor_weights, idx);
+  }
+  inline std::unique_ptr<ACLTensor> &biases(int idx = 0) {
+    return tensor_item(tensor_pool_, tensor_biases, idx);
+  }
+  inline std::unique_ptr<ACLTensor> &mean(int idx = 0) {
+    return tensor_item(tensor_pool_, tensor_mean, idx);
+  }
+  inline std::unique_ptr<ACLTensor> &var(int idx = 0) {
+    return tensor_item(tensor_pool_, tensor_var, idx);
+  }
+  inline std::unique_ptr<ACLTensor> &beta(int idx = 0) {
+    return tensor_item(tensor_pool_, tensor_beta, idx);
+  }
+  inline std::unique_ptr<ACLTensor> &gamma(int idx = 0) {
+    return tensor_item(tensor_pool_, tensor_gamma, idx);
+  }
+  inline std::unique_ptr<ACLTensor> &tensor(TensorType type) {
+    switch (type) {
+      case tensor_biases:
+        return biases();
+        break;
+      case tensor_weights:
+        return weights();
+        break;
+      case tensor_output:
+        return output();
+        break;
+      default:
+      case tensor_input:
+        return input();
+        break;
+    }
+    return input();
+  }
+  explicit ACLOperator(bool is_gpu = false);
+  virtual ~ACLOperator();
+  inline TargetHint getTargetHint() {
+#ifdef USE_OPENCL
+    if (target_hint_ == TargetHint::DONT_CARE) {
+      if (is_gpu_) {
+        return TargetHint::OPENCL;
+      }
+      return TargetHint::NEON;
+    }
+    return target_hint_;
+#elif defined(USE_OPENGLES)
+    if (target_hint_ == TargetHint::DONT_CARE) {
+      if (is_gpu_) {
+        return TargetHint::OPENGLES;
+      }
+      return TargetHint::NEON;
+    }
+    return target_hint_;
+#else
+    return TargetHint::NEON;
+#endif
+  }
+  inline void setTargetHint(TargetHint hint) { target_hint_ = hint; }
+  inline ConvolutionMethodHint &getConvMethod() {
+    return convolution_method_hint_;
+  }
+  inline void setConvMethod() {
+    convolution_method_hint_ = ConvolutionMethodHint::DIRECT;
+  }
+  inline bool tensor_mem(std::unique_ptr<ACLTensor> &tensor, void *mem) {
+    tensor->bindmem(mem);
+    return true;
+  }
+  inline bool tensor_mem(void *mem, std::unique_ptr<ACLTensor> &tensor) {
+    tensor->tensor_copy(mem, false);
+    return true;
+  }
+  bool new_tensor(std::unique_ptr<ACLTensor> &tensor,
+                  arm_compute::TensorShape &shape, void *mem = nullptr,
+                  bool commit = false);
+  bool new_tensor(std::unique_ptr<ACLSubTensor> &tensor,
+                  std::unique_ptr<ACLTensor> &parent,
+                  arm_compute::TensorShape &shape,
+                  arm_compute::Coordinates &coord);
+  inline int &group() { return _group; }
+  inline void set_operator_property(OperateType type, const char *name) {
+    name_ = name;
+    type_ = type;
+  }
+  inline void acl_run(void *input_data, void *output_data) {
+    if (input_data) tensor_mem(input(), input_data);
+    run();
+    tensor_mem(output_data, output());
+  }
+  inline int &input_idx() { return input_idx_; }
+  inline int &output_idx() { return output_idx_; }
+ protected:
+  inline bool isGPUMode() {
+#ifdef USE_OPENCL
+    if (!support_opencl_) return false;
+    return getTargetHint() == TargetHint::OPENCL;
+#elif defined(USE_OPENGLES)
+    if (!support_opengles_) return false;
+    return getTargetHint() == TargetHint::OPENGLES;
+#endif
+    return false;
+  }
+  inline OperatorState &opstate() { return operator_state_; }
+  inline bool is_operator_init_done(arm_compute::TensorShape shape,
+                                    TensorType type = tensor_input) {
+    checkreshape(shape, type);
+    return operator_state_ == operator_init_done;
+  }
+  inline void set_operator_init_done() {
+    opstate() = operator_init_done;
+    set_bypass_state(false);
+  }
+  inline void set_bypass_state(bool state = false) {
+    force_bypass_acl_path_ = state;
+  }
+  inline OperatorState checkreshape(arm_compute::TensorShape shape,
+                                    TensorType type = tensor_input) {
+    opstate() = reshape(shape, type);
+    if (opstate() == operator_reinit) {
+      freeres();
+    }
+    return opstate();
+  }
+  inline OperatorState reshape(arm_compute::TensorShape &shape,
+                               TensorType type) {
+    arm_compute::TensorShape _shape;
+    std::unique_ptr<ACLTensor> &acl_tensor = tensor(type);
+    if (!acl_tensor.get()) return operator_not_init;
+    _shape = acl_tensor->info().tensor_shape();
+    if (_shape.total_size() == shape.total_size() && _shape[0] == shape[0] &&
+        _shape[1] == shape[1]) {
+      return operator_init_done;
+    }
+    return operator_reinit;
+  }
+  inline void freeres() {
+    tensor_pool_.clear();
+    subtensor_pool_.clear();
+    funcs_.clear();
+  }
+  inline const char *&name() { return name_; }
+  inline void set_in_out_index(int indata_idx, int outdata_idx) {
+    input_idx() = indata_idx;
+    output_idx() = outdata_idx;
+  }
+ protected:
+  std::vector<std::unique_ptr<TensorPair<ACLTensor>>> tensor_pool_;
+  std::vector<std::unique_ptr<TensorPair<ACLSubTensor>>> subtensor_pool_;
+  std::vector<std::unique_ptr<arm_compute::IFunction>> funcs_;
+  OperatorState operator_state_;
+  bool force_bypass_acl_path_;
+  TargetHint target_hint_;
+  ConvolutionMethodHint convolution_method_hint_;
+  static bool support_opengles_;
+  static bool support_opencl_;
+  static bool init_gpu_env;
+  int _group;
+  const char *name_;
+  OperateType type_;
+  int input_idx_, output_idx_;
+  bool is_gpu_;
+};
+int isScheduleEnable();
+template <typename OperatorType, typename TensorType>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(
+    arm_compute::ITensor *input, arm_compute::ITensor *output) {
+  auto op = cpp14::make_unique<OperatorType>();
+  op->configure(dynamic_cast<TensorType *>(input),
+                dynamic_cast<TensorType *>(output));
+  return std::move(op);
+}
+template <typename OperatorType, typename TensorType>
+std::unique_ptr<arm_compute::IFunction> instantiate(
+    arm_compute::ITensor *input, arm_compute::ITensor *output) {
+  return instantiate_function<OperatorType, TensorType>(input, output);
+}
+template <typename OpType, typename OpTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
+    TargetHint &hint) {
+  std::unique_ptr<arm_compute::IFunction> func;
+  func = instantiate<OpType, OpTensor>(input->tensor(), output->tensor());
+  return func;
+}
+template <typename OperatorType, typename TensorType, typename VectorTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(
+    VectorTensor inputs, arm_compute::ITensor *output) {
+  auto op = cpp14::make_unique<OperatorType>();
+  op->configure(inputs, dynamic_cast<TensorType *>(output));
+  return std::move(op);
+}
+template <typename OperatorType, typename TensorType, typename VectorTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate(
+    VectorTensor inputs, arm_compute::ITensor *output) {
+  return instantiate_function<OperatorType, TensorType, VectorTensor>(inputs,
+                                                                      output);
+}
+template <typename OpType, typename OpTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func_lists(
+    ACLOperator *&acl_op, std::unique_ptr<ACLTensor> &output, int num,
+    TargetHint &hint) {
+  std::unique_ptr<arm_compute::IFunction> func;
+  static std::vector<OpTensor *> tensors;
+  tensors.clear();
+  for (int i = 0; i < num; ++i) {
+    tensors.push_back(
+        dynamic_cast<OpTensor *>(acl_op->cinput(i).get()->tensor()));
+  }
+  func = instantiate<OpType, OpTensor, std::vector<OpTensor *>>(
+      tensors, output->tensor());
+  return func;
+}
+template <typename OperatorType, typename TensorType, typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(
+    arm_compute::ITensor *input, arm_compute::ITensor *output,
+    const OperatorInfo &info) {
+  auto op = cpp14::make_unique<OperatorType>();
+  op->configure(dynamic_cast<TensorType *>(input),
+                dynamic_cast<TensorType *>(output), info);
+  return std::move(op);
+}
+template <typename OperatorType, typename TensorType, typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate(
+    arm_compute::ITensor *input, arm_compute::ITensor *output,
+    const OperatorInfo &info) {
+  return instantiate_function<OperatorType, TensorType, OperatorInfo>(
+      input, output, info);
+}
+template <typename OpType, typename OpTensor, typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
+    const OperatorInfo &info, TargetHint &hint) {
+  std::unique_ptr<arm_compute::IFunction> func;
+  func = instantiate<OpType, OpTensor, OperatorInfo>(input->tensor(),
+                                                     output->tensor(), info);
+  return func;
+}
+template <typename OperatorType, typename TensorType, typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(
+    arm_compute::ITensor *input, arm_compute::ITensor *weights,
+    arm_compute::ITensor *biases, arm_compute::ITensor *output,
+    const OperatorInfo &info) {
+  auto op = cpp14::make_unique<OperatorType>();
+  op->configure(dynamic_cast<TensorType *>(input),
+                dynamic_cast<TensorType *>(weights),
+                dynamic_cast<TensorType *>(biases),
+                dynamic_cast<TensorType *>(output), info);
+  return std::move(op);
+}
+template <typename OperatorType, typename TensorType, typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate(
+    arm_compute::ITensor *input, arm_compute::ITensor *weights,
+    arm_compute::ITensor *biases, arm_compute::ITensor *output,
+    const OperatorInfo &info) {
+  return instantiate_function<OperatorType, TensorType, OperatorInfo>(
+      input, weights, biases, output, info);
+}
+template <typename OpType, typename OpTensor, typename OperatorInfo,
+          typename ACLTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &weights,
+    std::unique_ptr<ACLTensor> &biases, std::unique_ptr<ACLTensor> &output,
+    const OperatorInfo &info, TargetHint &hint) {
+  std::unique_ptr<arm_compute::IFunction> func;
+  arm_compute::ITensor *biases_tensor = NULL;
+  if (biases.get()) {
+    biases_tensor = biases->tensor();
+  }
+  func = instantiate<OpType, OpTensor, OperatorInfo>(
+      input->tensor(), weights->tensor(), biases_tensor, output->tensor(),
+      info);
+  return func;
+}
+template <typename Dtype, typename OperatorType, typename TensorType>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(
+    arm_compute::ITensor *input, arm_compute::ITensor *output,
+    arm_compute::ITensor *mean, arm_compute::ITensor *var,
+    arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype &eps) {
+  auto op = cpp14::make_unique<OperatorType>();
+  op->configure(
+      dynamic_cast<TensorType *>(input), dynamic_cast<TensorType *>(output),
+      dynamic_cast<TensorType *>(mean), dynamic_cast<TensorType *>(var),
+      dynamic_cast<TensorType *>(beta), dynamic_cast<TensorType *>(gamma), eps);
+  return std::move(op);
+}
+template <typename Dtype, typename OperatorType, typename TensorType>
+std::unique_ptr<arm_compute::IFunction> instantiate(
+    arm_compute::ITensor *input, arm_compute::ITensor *output,
+    arm_compute::ITensor *mean, arm_compute::ITensor *var,
+    arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype eps) {
+  return instantiate_function<Dtype, OperatorType, TensorType>(
+      input, output, mean, var, beta, gamma, eps);
+}
+template <typename Dtype, typename OpType, typename OpTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
+    std::unique_ptr<ACLTensor> &mean, std::unique_ptr<ACLTensor> &var,
+    std::unique_ptr<ACLTensor> &beta, std::unique_ptr<ACLTensor> &gamma,
+    Dtype eps, TargetHint hint) {
+  std::unique_ptr<arm_compute::IFunction> func;
+  func = instantiate<Dtype, OpType, OpTensor>(
+      input->tensor(), output->tensor(), mean->tensor(), var->tensor(),
+      beta->tensor(), gamma->tensor(), eps);
+  return func;
+}
+template <typename OperatorInfo>
+bool instantiate_op_pooling(
+    ACLOperator *acl_op,
+    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
+    TargetHint hint, const OperatorInfo &info) {
+#ifdef USE_OPENCL
+  if (hint == TargetHint::OPENCL) {
+    func.push_back(
+        instantiate_op_func<arm_compute::CLPoolingLayer, arm_compute::ICLTensor,
+                            arm_compute::PoolingLayerInfo>(input, output, info,
+                                                           hint));
+    return true;
+  }
+#elif defined(USE_OPENGLES)
+  if (hint == TargetHint::OPENGLES) {
+    func.push_back(
+        instantiate_op_func<arm_compute::GCPoolingLayer, arm_compute::IGCTensor,
+                            arm_compute::PoolingLayerInfo>(input, output, info,
+                                                           hint));
+    return true;
+  }
+#endif
+  {
+    func.push_back(
+        instantiate_op_func<arm_compute::NEPoolingLayer, arm_compute::ITensor,
+                            arm_compute::PoolingLayerInfo>(input, output, info,
+                                                           hint));
+  }
+  return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_activation(
+    ACLOperator *acl_op,
+    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
+    TargetHint hint, const OperatorInfo &info) {
+#ifdef USE_OPENCL
+  if (hint == TargetHint::OPENCL) {
+    func.push_back(instantiate_op_func<arm_compute::CLActivationLayer,
+                                       arm_compute::ICLTensor,
+                                       arm_compute::ActivationLayerInfo>(
+        input, output, info, hint));
+    return true;
+  }
+#elif defined(USE_OPENGLES)
+  if (hint == TargetHint::OPENGLES) {
+    func.push_back(instantiate_op_func<arm_compute::GCActivationLayer,
+                                       arm_compute::IGCTensor,
+                                       arm_compute::ActivationLayerInfo>(
+        input, output, info, hint));
+    return true;
+  }
+#endif
+  {
+    func.push_back(instantiate_op_func<arm_compute::NEActivationLayer,
+                                       arm_compute::ITensor,
+                                       arm_compute::ActivationLayerInfo>(
+        input, output, info, hint));
+  }
+  return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_lrn(
+    ACLOperator *acl_op,
+    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
+    TargetHint hint, const OperatorInfo &info) {
+#ifdef USE_OPENCL
+  if (hint == TargetHint::OPENCL) {
+    func.push_back(instantiate_op_func<arm_compute::CLNormalizationLayer,
+                                       arm_compute::ICLTensor,
+                                       arm_compute::NormalizationLayerInfo>(
+        input, output, info, hint));
+    return true;
+  }
+#elif defined(USE_OPENGLES)
+  if (hint == TargetHint::OPENGLES) {
+    func.push_back(instantiate_op_func<arm_compute::GCNormalizationLayer,
+                                       arm_compute::IGCTensor,
+                                       arm_compute::NormalizationLayerInfo>(
+        input, output, info, hint));
+    return true;
+  }
+#endif
+  {
+    func.push_back(instantiate_op_func<arm_compute::NENormalizationLayer,
+                                       arm_compute::ITensor,
+                                       arm_compute::NormalizationLayerInfo>(
+        input, output, info, hint));
+  }
+  return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_conv(
+    ACLOperator *acl_op,
+    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
+    TargetHint hint, const OperatorInfo &info) {
+  std::unique_ptr<ACLTensor> &weights = acl_op->weights();
+  std::unique_ptr<ACLTensor> &biases = acl_op->biases();
+  ConvolutionMethodHint &conv_method = acl_op->getConvMethod();
+  bool has_biases = biases.get() ? true : false;
+  int &groups = acl_op->group();
+  arm_compute::TensorShape input_shape = input->info().tensor_shape();
+  arm_compute::TensorShape weights_shape = weights->info().tensor_shape();
+  arm_compute::TensorShape biases_shape;
+  if (has_biases) {
+    biases_shape = biases->info().tensor_shape();
+  }
+  arm_compute::TensorShape output_shape = output->info().tensor_shape();
+  if (groups == 1) {
+    if (conv_method == ConvolutionMethodHint::GEMM) {
+#ifdef USE_OPENCL
+      if (hint == TargetHint::OPENCL) {
+        func.push_back(instantiate_op_func<arm_compute::CLConvolutionLayer,
+                                           arm_compute::ICLTensor,
+                                           arm_compute::PadStrideInfo>(
+            acl_op->input(), acl_op->weights(), acl_op->biases(),
+            acl_op->output(), info, hint));
+        return true;
+      }
+#elif defined(USE_OPENGLES)
+      if (hint == TargetHint::OPENGLES) {
+        func.push_back(instantiate_op_func<arm_compute::GCConvolutionLayer,
+                                           arm_compute::IGCTensor,
+                                           arm_compute::PadStrideInfo>(
+            acl_op->input(), acl_op->weights(), acl_op->biases(),
+            acl_op->output(), info, hint));
+        return true;
+      }
+#endif
+      {
+        func.push_back(instantiate_op_func<arm_compute::NEConvolutionLayer,
+                                           arm_compute::ITensor,
+                                           arm_compute::PadStrideInfo>(
+            acl_op->input(), acl_op->weights(), acl_op->biases(),
+            acl_op->output(), info, hint));
+      }
+    } else {
+#ifdef USE_OPENCL
+      if (hint == TargetHint::OPENCL) {
+        func.push_back(
+            instantiate_op_func<arm_compute::CLDirectConvolutionLayer,
+                                arm_compute::ICLTensor,
+                                arm_compute::PadStrideInfo>(
+                acl_op->input(), acl_op->weights(), acl_op->biases(),
+                acl_op->output(), info, hint));
+        return true;
+      }
+#elif defined(USE_OPENGLES)
+      if (hint == TargetHint::OPENGLES) {
+        func.push_back(
+            instantiate_op_func<arm_compute::GCDirectConvolutionLayer,
+                                arm_compute::IGCTensor,
+                                arm_compute::PadStrideInfo>(
+                acl_op->input(), acl_op->weights(), acl_op->biases(),
+                acl_op->output(), info, hint));
+        return true;
+      }
+#endif
+      {
+        func.push_back(
+            instantiate_op_func<arm_compute::NEDirectConvolutionLayer,
+                                arm_compute::ITensor,
+                                arm_compute::PadStrideInfo>(
+                acl_op->input(), acl_op->weights(), acl_op->biases(),
+                acl_op->output(), info, hint));
+      }
+    }
+    return true;
+  }
+  // Calculate sub-tensor splits
+  const int input_split = input_shape.z() / groups;
+  const int output_split = output_shape.z() / groups;
+  const int weights_split = weights_shape[3] / groups;
+  const int biases_split = biases_shape.x() / groups;
+  // Calculate sub-tensor shapes
+  input_shape.set(2, input_split);
+  output_shape.set(2, output_split);
+  weights_shape.set(3, weights_split);
+  biases_shape.set(0, biases_split);
+  for (auto i = 0; i < groups; ++i) {
+    // Calculate sub-tensors starting coordinates
+    arm_compute::Coordinates input_coord(0, 0, input_split * i);
+    arm_compute::Coordinates output_coord(0, 0, output_split * i);
+    arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i);
+    arm_compute::Coordinates biases_coord(biases_split * i);
+    // Create sub-tensors for input, output, weights and bias
+    acl_op->new_tensor(acl_op->sinput(i), acl_op->input(), input_shape,
+                       input_coord);
+    acl_op->new_tensor(acl_op->soutput(i), acl_op->output(), output_shape,
+                       output_coord);
+    acl_op->new_tensor(acl_op->sweights(i), acl_op->weights(), weights_shape,
+                       weights_coord);
+    if (has_biases) {
+      acl_op->new_tensor(acl_op->sbiases(i), acl_op->biases(), biases_shape,
+                         biases_coord);
+    }
+    bool use_opencl = false;
+    if (conv_method == ConvolutionMethodHint::GEMM) {
+#ifdef USE_OPENCL
+      if (hint == TargetHint::OPENCL) {
+        use_opencl = true;
+        func.push_back(
+            instantiate_op_func<arm_compute::CLConvolutionLayer,
+                                arm_compute::ICLTensor,
+                                arm_compute::PadStrideInfo, ACLSubTensor>(
+                acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
+                acl_op->soutput(i), info, hint));
+      }
+#endif
+      if (!use_opencl) {
+        func.push_back(
+            instantiate_op_func<arm_compute::NEConvolutionLayer,
+                                arm_compute::ITensor,
+                                arm_compute::PadStrideInfo, ACLSubTensor>(
+                acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
+                acl_op->soutput(i), info, hint));
+      }
+    } else {
+#ifdef USE_OPENCL
+      if (hint == TargetHint::OPENCL) {
+        use_opencl = true;
+        func.push_back(
+            instantiate_op_func<arm_compute::CLDirectConvolutionLayer,
+                                arm_compute::ICLTensor,
+                                arm_compute::PadStrideInfo, ACLSubTensor>(
+                acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
+                acl_op->soutput(i), info, hint));
+      }
+#endif
+      if (!use_opencl) {
+        func.push_back(
+            instantiate_op_func<arm_compute::NEDirectConvolutionLayer,
+                                arm_compute::ITensor,
+                                arm_compute::PadStrideInfo, ACLSubTensor>(
+                acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
+                acl_op->soutput(i), info, hint));
+      }
+    }
+  }
+  return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_lc(
+    ACLOperator *acl_op,
+    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
+    TargetHint hint, const OperatorInfo &info) {
+  std::unique_ptr<ACLTensor> &weights = acl_op->weights();
+  std::unique_ptr<ACLTensor> &biases = acl_op->biases();
+#ifdef USE_OPENCL
+  if (hint == TargetHint::OPENCL) {
+    func.push_back(
+        instantiate_op_func<arm_compute::CLLocallyConnectedLayer,
+                            arm_compute::ICLTensor, arm_compute::PadStrideInfo>(
+            input, weights, biases, output, info, hint));
+    return true;
+  }
+#endif
+  {
+    func.push_back(
+        instantiate_op_func<arm_compute::NELocallyConnectedLayer,
+                            arm_compute::ITensor, arm_compute::PadStrideInfo>(
+            input, weights, biases, output, info, hint));
+  }
+  return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_fc(
+    ACLOperator *acl_op,
+    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
+    TargetHint hint, const OperatorInfo &info) {
+  std::unique_ptr<ACLTensor> &weights = acl_op->weights();
+  std::unique_ptr<ACLTensor> &biases = acl_op->biases();
+#ifdef USE_OPENCL
+  if (hint == TargetHint::OPENCL) {
+    func.push_back(instantiate_op_func<arm_compute::CLFullyConnectedLayer,
+                                       arm_compute::ICLTensor, bool>(
+        input, weights, biases, output, info, hint));
+    return true;
+  }
+#elif defined(USE_OPENGLES)
+  if (hint == TargetHint::OPENGLES) {
+    func.push_back(instantiate_op_func<arm_compute::GCFullyConnectedLayer,
+                                       arm_compute::IGCTensor, bool>(
+        input, weights, biases, output, info, hint));
+    return true;
+  }
+#endif
+  {
+    func.push_back(instantiate_op_func<arm_compute::NEFullyConnectedLayer,
+                                       arm_compute::ITensor, bool>(
+        input, weights, biases, output, info, hint));
+  }
+  return true;
+}
+template <typename Dtype>
+bool instantiate_op_bn(
+    ACLOperator *acl_op,
+    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
+    TargetHint hint, Dtype eps) {
+  std::unique_ptr<ACLTensor> &mean = acl_op->mean();
+  std::unique_ptr<ACLTensor> &var = acl_op->var();
+  std::unique_ptr<ACLTensor> &beta = acl_op->beta();
+  std::unique_ptr<ACLTensor> &gamma = acl_op->gamma();
+#ifdef USE_OPENCL
+  if (hint == TargetHint::OPENCL) {
+    func.push_back(
+        instantiate_op_func<Dtype, arm_compute::CLBatchNormalizationLayer,
+                            arm_compute::ICLTensor>(input, output, mean, var,
+                                                    beta, gamma, eps, hint));
+    return true;
+  }
+#elif defined(USE_OPENGLES)
+  if (hint == TargetHint::OPENGLES) {
+    func.push_back(
+        instantiate_op_func<Dtype, arm_compute::GCBatchNormalizationLayer,
+                            arm_compute::IGCTensor>(input, output, mean, var,
+                                                    beta, gamma, eps, hint));
+    return true;
+  }
+#endif
+  {
+    func.push_back(
+        instantiate_op_func<Dtype, arm_compute::NEBatchNormalizationLayer,
+                            arm_compute::ITensor>(input, output, mean, var,
+                                                  beta, gamma, eps, hint));
+  }
+  return true;
+}
+inline bool instantiate_op_softmax(
+    ACLOperator *acl_op,
+    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
+    TargetHint hint, void *data) {
+#ifdef USE_OPENCL
+  if (hint == TargetHint::OPENCL) {
+    func.push_back(
+        instantiate_op_func<arm_compute::CLSoftmaxLayer,
+                            arm_compute::ICLTensor>(input, output, hint));
+    return true;
+  }
+#elif defined(USE_OPENGLES)
+  if (hint == TargetHint::OPENGLES) {
+    func.push_back(
+        instantiate_op_func<arm_compute::GCSoftmaxLayer,
+                            arm_compute::IGCTensor>(input, output, hint));
+    return true;
+  }
+#endif
+  {
+    func.push_back(
+        instantiate_op_func<arm_compute::NESoftmaxLayer, arm_compute::ITensor>(
+            input, output, hint));
+  }
+  return true;
+}
+inline bool instantiate_op_concat(
+    ACLOperator *acl_op,
+    std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
+    std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
+    TargetHint hint, int num) {
+#ifdef USE_OPENCL
+  if (hint == TargetHint::OPENCL) {
+    func.push_back(
+        instantiate_op_func_lists<arm_compute::CLDepthConcatenateLayer,
+                                  arm_compute::ICLTensor>(acl_op, output, num,
+                                                          hint));
+    return true;
+  }
+#elif defined(USE_OPENGLES)
+  if (hint == TargetHint::OPENGLES) {
+    func.push_back(
+        instantiate_op_func_lists<arm_compute::GCDepthConcatenateLayer,
+                                  arm_compute::IGCTensor>(acl_op, output, num,
+                                                          hint));
+    return true;
+  }
+#endif
+  {
+    func.push_back(
+        instantiate_op_func_lists<arm_compute::NEDepthConcatenateLayer,
+                                  arm_compute::ITensor>(acl_op, output, num,
+                                                        hint));
+  }
+  return true;
+}
+template <typename Dtype>
+void *InputdataPtr(ACLOperator *op,
+                   const std::vector<framework::LoDTensor *> &input_data,
+                   Dtype type, int index = -1) {
+  if (index == -1) index = 0;
+  return (void *)(input_data[index]->mutable_data<Dtype>());
+}
+template <typename Dtype>
+void acl_run(ACLOperator *op,
+             const std::vector<framework::LoDTensor *> &in_data, void *out_data,
+             Dtype type, bool multi_input_run = true) {
+  for (int i = 0; i < in_data.size(); ++i) {
+    op->tensor_mem(op->cinput(i), InputdataPtr(op, in_data, type, i));
+  }
+  op->acl_run(NULL, out_data);
+}
+}  // namespace acl
+}  // namespace operators
+}  // namespace paddle_mobile
+#ifdef USE_PROFILING
+#define acl_configure(opname, acl_op, args...)                                \
+  {                                                                           \
+    set_operator_property(acl::operate_type_##opname, #opname);               \
+    logtime_util log_time(ACL_CONFIG_INFO);                                   \
+    instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(),         \
+                            acl_op->output(), acl_op->getTargetHint(), args); \
+  }
+#else
+#define acl_configure(opname, acl_op, args...)                                \
+  {                                                                           \
+    set_operator_property(acl::operate_type_##opname, #opname);               \
+    instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(),         \
+                            acl_op->output(), acl_op->getTargetHint(), args); \
+  }
+#endif
+#define ACLOp_Ptr(a) dynamic_cast<ACLOperator *>(a)
+#endif  // USE_ACL
+#endif  // ACL_OPERATOR_H_
--- a/src/operators/kernel/mali/acl_tensor.cc
+++ b/src/operators/kernel/mali/acl_tensor.cc
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file acl_tensor.cc
+ * \brief
+ * \author Joey
+ */
+#include "acl_tensor.h"
+namespace paddle_mobile {
+namespace operators {
+namespace acl {
+#ifdef USE_ACL
+template <typename TensorType>
+std::unique_ptr<arm_compute::ITensor> initialise_tensor(
+    arm_compute::TensorInfo &info) {
+  auto tensor = cpp14::make_unique<TensorType>();
+  tensor->allocator()->init(info);
+  return std::move(tensor);
+}
+template <typename TensorType>
+void tensor_allocate(arm_compute::ITensor &tensor) {
+  auto itensor = dynamic_cast<TensorType *>(&tensor);
+  itensor->allocator()->allocate();
+}
+Tensor::Tensor(arm_compute::TensorInfo &info) noexcept
+    : _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr) {}
+Tensor::Tensor(Tensor &&src) noexcept
+    : _target(src._target),
+      _info(std::move(src._info)),
+      _tensor(std::move(src._tensor)) {}
+arm_compute::ITensor *Tensor::set_target(TargetHint target) {
+  switch (target) {
+#ifdef USE_OPENCL
+    case TargetHint::OPENCL:
+      _tensor = initialise_tensor<arm_compute::CLTensor>(_info);
+      break;
+#elif defined(USE_OPENGLES)
+    case TargetHint::OPENGLES:
+      _tensor = initialise_tensor<arm_compute::GCTensor>(_info);
+      break;
+#endif
+    case TargetHint::NEON:
+      _tensor = initialise_tensor<arm_compute::Tensor>(_info);
+      break;
+    default:
+      break;
+  }
+  _target = target;
+  return _tensor.get();
+}
+void Tensor::allocate() {
+  switch (_target) {
+#ifdef USE_OPENCL
+    case TargetHint::OPENCL:
+      tensor_allocate<arm_compute::CLTensor>(*_tensor);
+      break;
+#elif defined(USE_OPENGLES)
+    case TargetHint::OPENGLES:
+      tensor_allocate<arm_compute::GCTensor>(*_tensor);
+      break;
+#endif
+    case TargetHint::NEON:
+      tensor_allocate<arm_compute::Tensor>(*_tensor);
+      break;
+    default:
+      break;
+  }
+}
+void Tensor::map(bool blocking) {
+#ifdef USE_OPENCL
+  if (_target == TargetHint::OPENCL)
+    dynamic_cast<arm_compute::CLTensor *>(tensor())->map(blocking);
+#elif defined(USE_OPENGLES)
+  if (_target == TargetHint::OPENGLES)
+    dynamic_cast<arm_compute::GCTensor *>(tensor())->map(blocking);
+#endif
+}
+void Tensor::unmap() {
+#ifdef USE_OPENCL
+  if (_target == TargetHint::OPENCL)
+    dynamic_cast<arm_compute::CLTensor *>(tensor())->unmap();
+#elif defined(USE_OPENGLES)
+  if (_target == TargetHint::OPENGLES)
+    dynamic_cast<arm_compute::GCTensor *>(tensor())->unmap();
+#endif
+}
+template <typename SubTensorType, typename ParentTensorType>
+std::unique_ptr<arm_compute::ITensor> initialise_subtensor(
+    arm_compute::ITensor *parent, arm_compute::TensorShape shape,
+    arm_compute::Coordinates coords) {
+  auto ptensor = dynamic_cast<ParentTensorType *>(parent);
+  auto subtensor = cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
+  return std::move(subtensor);
+}
+SubTensor::SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
+                     arm_compute::Coordinates &coords) noexcept
+    : _target(TargetHint::DONT_CARE),
+      _tensor_shape(tensor_shape),
+      _coords(coords),
+      _parent(nullptr),
+      _subtensor(nullptr) {
+  _parent = parent->tensor();
+  _target = parent->target();
+  instantiate_subtensor();
+}
+arm_compute::ITensor *SubTensor::set_target(TargetHint target) {
+  return (target == _target) ? _subtensor.get() : nullptr;
+}
+arm_compute::ITensor *SubTensor::tensor() { return _subtensor.get(); }
+const arm_compute::ITensor *SubTensor::tensor() const {
+  return _subtensor.get();
+}
+TargetHint SubTensor::target() const { return _target; }
+void SubTensor::allocate() {
+  // NOP for sub-tensors
+}
+void SubTensor::instantiate_subtensor() {
+  switch (_target) {
+#ifdef USE_OPENCL
+    case TargetHint::OPENCL:
+      _subtensor = initialise_subtensor<arm_compute::CLSubTensor,
+                                        arm_compute::ICLTensor>(
+          _parent, _tensor_shape, _coords);
+      break;
+#endif
+    default:
+    case TargetHint::NEON:
+      _subtensor =
+          initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(
+              _parent, _tensor_shape, _coords);
+      break;
+  }
+}
+#endif
+}  // namespace acl
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/mali/acl_tensor.h
+++ b/src/operators/kernel/mali/acl_tensor.h
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file acl_tensor.h
+ * \brief
+ * \author Joey
+ */
+#ifndef ACL_TENSOR_H_
+#define ACL_TENSOR_H_
+#ifdef USE_ACL
+#ifdef USE_OPENCL
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#elif defined(USE_OPENGLES)
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#endif
+#include "arm_compute/runtime/SubTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include <memory>
+namespace paddle_mobile {
+namespace operators {
+namespace acl {
+enum class TargetHint {
+  DONT_CARE,
+  OPENCL,
+  OPENGLES,
+  NEON,
+};
+enum class ConvolutionMethodHint {
+  GEMM,
+  DIRECT,
+};
+namespace cpp14 {
+template <class T>
+struct _Unique_if {
+  typedef std::unique_ptr<T> _Single_object;
+};
+template <class T>
+struct _Unique_if<T[]> {
+  typedef std::unique_ptr<T[]> _Unknown_bound;
+};
+template <class T, size_t N>
+struct _Unique_if<T[N]> {
+  typedef void _Known_bound;
+};
+template <class T, class... Args>
+typename _Unique_if<T>::_Single_object make_unique(Args &&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+template <class T>
+typename _Unique_if<T>::_Unknown_bound make_unique(size_t n) {
+  typedef typename std::remove_extent<T>::type U;
+  return std::unique_ptr<T>(new U[n]());
+}
+template <class T, class... Args>
+typename _Unique_if<T>::_Known_bound make_unique(Args &&...);
+}  // namespace cpp14
+class Tensor {
+ public:
+  explicit Tensor(arm_compute::TensorInfo &info) noexcept;
+  virtual ~Tensor() {}
+  Tensor(Tensor &&src) noexcept;
+  void set_info(arm_compute::TensorInfo &&info) { _info = info; }
+  arm_compute::ITensor *set_target(TargetHint target);
+  const arm_compute::TensorInfo &info() const { return _info; }
+  arm_compute::ITensor *tensor() { return _tensor.get(); }
+  void allocate();
+  void init() {}
+  TargetHint target() const { return _target; }
+  virtual void map(bool blocking = true);
+  virtual void unmap();
+ private:
+  TargetHint _target;
+  arm_compute::TensorInfo _info;
+  std::unique_ptr<arm_compute::ITensor> _tensor;
+};
+class SubTensor {
+ public:
+  SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
+            arm_compute::Coordinates &coords) noexcept;
+  ~SubTensor() {}
+  arm_compute::ITensor *tensor();
+  const arm_compute::ITensor *tensor() const;
+  TargetHint target() const;
+  void allocate();
+  arm_compute::ITensor *set_target(TargetHint target);
+ private:
+  /** Instantiates a sub-tensor */
+  void instantiate_subtensor();
+ private:
+  /**< Target that this tensor is pinned on */
+  TargetHint _target;
+  /**< SubTensor shape */
+  arm_compute::TensorShape _tensor_shape;
+  /**< SubTensor Coordinates */
+  arm_compute::Coordinates _coords;
+  /**< Parent tensor */
+  arm_compute::ITensor *_parent;
+  /**< SubTensor */
+  std::unique_ptr<arm_compute::ITensor> _subtensor;
+};
+}  // namespace acl
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif  // ACL_TENSOR_H_
--- a/src/operators/kernel/mali/batchnorm_kernel.cpp
+++ b/src/operators/kernel/mali/batchnorm_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,20 +17,152 @@ limitations under the License. */
 #pragma once
 #include "operators/kernel/batchnorm_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
+template <typename DeviceType, typename T>
+class AclBatchNormOp : public acl::ACLOperator {
+ public:
+  AclBatchNormOp() {
+    this->force_bypass_acl_path_ = bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN;
+  }
+  ~AclBatchNormOp() = default;
+  AclBatchNormOp(const AclBatchNormOp&) = delete;
+  AclBatchNormOp& operator=(const AclBatchNormOp&) = delete;
+  AclBatchNormOp(AclBatchNormOp&&) = delete;
+  AclBatchNormOp& operator=(AclBatchNormOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const BatchNormParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth, args.batch);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.out_num);
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    arm_compute::TensorShape mean_shape(args.in_depth);
+    arm_compute::TensorShape var_shape = mean_shape;
+    arm_compute::TensorShape beta_shape = mean_shape;
+    arm_compute::TensorShape gamma_shape = mean_shape;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    new_tensor(mean(), mean_shape, args.mean_data);
+    new_tensor(var(), var_shape, args.var_data);
+    new_tensor(beta(), beta_shape, args.biases_data);
+    new_tensor(gamma(), gamma_shape, args.weight_data);
+    acl_configure(bn, this, args.epsilon);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const BatchNormParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const BatchNormParam& param) {
+    const Tensor* in_x = param.InputX();
+    Tensor* out = param.OutputY();
+    const Tensor* scale = param.InputScale();
+    const Tensor* bias = param.InputBias();
+    const Tensor* saved_mean = param.InputMean();
+    const Tensor* saved_variance = param.InputVariance();
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    const T* weight_data = scale->data<T>();
+    const T* bias_data = bias->data<T>();
+    const T* mean_data = saved_mean->data<T>();
+    const T* var_data = saved_variance->data<T>();
+    float epsilon = param.Epsilon();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    // args.weight_data = (void*)weight_data;
+    // args.biases_data = (void*)bias_data;
+    args.mean_data = (void*)mean_data;
+    args.var_data = (void*)var_data;
+    args.epsilon = epsilon;
+    args.dim = in_x->dims().size();
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.in_rows = in_x->dims()[2];
+    args.in_cols = in_x->dims()[3];
+    args.out_num = out->dims()[0];
+    args.out_depth = out->dims()[1];
+    args.out_rows = out->dims()[2];
+    args.out_cols = out->dims()[3];
+    args.weight_data = (void*)weight_data;
+    args.biases_data = (void*)bias_data;
+    // std::cout
+    //  << "Out C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+  }
+  acl::AclParameters args;
+};
 template <>
-bool BatchNormKernel<GPU_MALI, float>::Init(const BatchNormParam &para) const {
+bool BatchNormKernel<GPU_MALI, float>::Init(const BatchNormParam& param) const {
+  AclBatchNormOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclBatchNormOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
  return true;
 }
 template <>
 void BatchNormKernel<GPU_MALI, float>::Compute(
-    const BatchNormParam &param) const {}
+    const BatchNormParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclBatchNormOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
+}
+template class BatchNormKernel<GPU_MALI, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
 #endif
+#endif
--- a/src/operators/kernel/mali/concat_kernel.cpp
+++ b/src/operators/kernel/mali/concat_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef CONCAT_OP
+#include "operators/kernel/concat_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclConcatOp : public acl::ACLOperator {
+ public:
+  AclConcatOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT;
+  }
+  ~AclConcatOp() = default;
+  AclConcatOp(const AclConcatOp&) = delete;
+  AclConcatOp& operator=(const AclConcatOp&) = delete;
+  AclConcatOp(AclConcatOp&&) = delete;
+  AclConcatOp& operator=(AclConcatOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const ConcatParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    const std::vector<framework::LoDTensor*>* input_data = &args.in_tensor;
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.batch);
+    if (is_operator_init_done(output_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    T type;
+    for (int i = 0; i < input_data->size(); i++) {
+      const T* idata = (*input_data)[i]->data<T>();
+      const T* pdata = (*input_data)[i]->data<T>();
+      int in_batch = (*input_data)[i]->dims()[0];
+      int in_channels = (*input_data)[i]->dims()[1];
+      int in_width = (*input_data)[i]->dims()[2];
+      int in_height = (*input_data)[i]->dims()[3];
+      arm_compute::TensorShape in_shape(in_width, in_height, in_channels);
+      new_tensor(cinput(i), in_shape,
+                 acl::InputdataPtr(this, args.in_tensor, type, i));
+    }
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(concat, this, input_data->size());
+  }
+  void RunAcl(const std::vector<framework::LoDTensor*>& input, void* output) {
+    T type;
+    acl::acl_run(this, input, output, type);
+  }
+  bool Bypass_acl(const ConcatParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_ || !args.is_channel_concat) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const ConcatParam& param) {
+    auto inputs = param.Inputs();
+    auto* output = param.Out();
+    int64_t axis = param.Axis();
+    T* output_data = output->mutable_data<T>();
+    args.is_channel_concat = (axis == 1);
+    args.in_tensor = inputs;
+    args.output_data = (void*)output_data;
+    args.batch = output->dims()[0];
+    args.out_depth = output->dims()[1];
+    args.out_rows = output->dims()[2];
+    args.out_cols = output->dims()[3];
+  }
+  acl::AclParameters args;
+};
+template <>
+bool ConcatKernel<GPU_MALI, float>::Init(const ConcatParam& param) const {
+  AclConcatOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclConcatOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void ConcatKernel<GPU_MALI, float>::Compute(const ConcatParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclConcatOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  std::vector<framework::LoDTensor*> temp_data = args.in_tensor;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl(temp_data, (void*)output_data);
+}
+template class ConcatKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/conv_add_kernel.cpp
+++ b/src/operators/kernel/mali/conv_add_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVADD_OP
+#include "operators/kernel/conv_add_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclConvAddOp : public acl::ACLOperator {
+ public:
+  AclConvAddOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
+  }
+  ~AclConvAddOp() = default;
+  AclConvAddOp(const AclConvAddOp&) = delete;
+  AclConvAddOp& operator=(const AclConvAddOp&) = delete;
+  AclConvAddOp(AclConvAddOp&&) = delete;
+  AclConvAddOp& operator=(AclConvAddOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const FusionConvAddParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth, args.batch);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.out_num);
+    arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
+                                           args.in_depth / args.num_group,
+                                           args.out_depth);
+    arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::PadStrideInfo conv_info(
+        args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
+        arm_compute::DimensionRoundingType::FLOOR);
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    check_direct_conv();
+    //[kernel_x, kernel_y, IFM, OFM]
+    new_tensor(weights(), weights_shape, args.weight_data);
+    //[OFM]
+    if (args.biases_data) {
+      new_tensor(biases(), biases_shape, args.biases_data);
+    }
+    group() = args.num_group;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(conv, this, conv_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const FusionConvAddParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_ || args.num_group >= 5) {
+      bypass_acl = true;
+    }
+    if (args.dim > 2) {
+      bypass_acl = true;
+    }
+    if (args.dilated) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void check_direct_conv() {
+    bool use_direct_conv = false;
+    const char* pDirectConv;
+    pDirectConv = getenv("DIRECTCONV");
+    if (pDirectConv) {
+      unsigned int bdirectconv;
+      sscanf(pDirectConv, "%i", &bdirectconv);
+      if (bdirectconv != use_direct_conv) {
+        use_direct_conv = bdirectconv;
+        printf("DIRECTCONV<%s>\n", pDirectConv);
+        printf("DIRECTCONV: %x\n", use_direct_conv);
+      }
+    }
+    int pad_data[2], kernel[2];
+    pad_data[1] = args.pad_rows;
+    pad_data[0] = args.pad_cols;
+    kernel[1] = args.filter_rows;
+    kernel[0] = args.filter_cols;
+    if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
+                             pad_data[0] == 0 && pad_data[1] == 0) ||
+                            (kernel[0] == 3 && kernel[1] == 3 &&
+                             pad_data[0] <= 1 && pad_data[1] <= 1))) {
+      setConvMethod();  // NEDirectConvolutionLayer only for 1x1 and 3x3
+    }
+  }
+  void AclParametersByContext(const FusionConvAddParam& param) {
+    const Tensor* input = param.Input();
+    Tensor filter = *param.Filter();
+    Tensor* output = param.Output();
+    Tensor* bias;
+    int groups = param.Groups();
+    std::vector<int> strides = param.Strides();
+    std::vector<int> paddings = param.Paddings();
+    std::vector<int> dilations = param.Dilations();
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>();
+    const T* weight_data = filter.data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.weight_data = (void*)weight_data;
+    args.biases_data = nullptr;
+    try {
+      bias = param.Bias();
+    } catch (const std::exception& e) {
+    }
+    if (bias) {
+      const T* biases_data = bias->data<T>();
+      args.biases_data = (void*)biases_data;
+    }
+    args.num_group = groups;
+    args.dilation_rows = dilations[0];
+    args.dilation_cols = dilations[1];
+    if (dilations[0] != 1 || dilations[1] != 1) {
+      args.dilated = true;
+    }
+    // NCHW
+    // std::cout << "In dims: " << (input->dims()).size() << std::endl;
+    args.batch = input->dims()[0];
+    args.in_depth = input->dims()[1];
+    args.in_rows = input->dims()[2];
+    args.in_cols = input->dims()[3];
+    // std::cout <<"In N: " << args.batch << " C: " <<  args.in_depth
+    //  << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
+    // NCHW
+    // std::cout << "Out dims: " << (output->dims()).size() << std::endl;
+    args.out_num = output->dims()[0];
+    args.out_depth = output->dims()[1];
+    args.out_rows = output->dims()[2];
+    args.out_cols = output->dims()[3];
+    // std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
+    //  << " C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+    // MCHW = OIHW
+    args.filter_rows = filter.dims()[2];
+    args.filter_cols = filter.dims()[3];
+    // std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
+    //  << " I: " <<  static_cast<int>(filter.dims()[1])
+    //  << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
+    // strides(h_stride, w_stride)
+    args.stride_rows = strides[0];
+    args.stride_cols = strides[1];
+    // std::cout <<"Stride H: " << args.stride_rows << " W: " <<
+    // args.stride_cols << "\n";
+    // paddings(h_pad, w_pad)
+    args.pad_rows = paddings[0];
+    args.pad_cols = paddings[1];
+    // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
+    // "\n";
+  }
+  acl::AclParameters args;
+};
+template <>
+bool ConvAddKernel<GPU_MALI, float>::Init(
+    const FusionConvAddParam& param) const {
+  AclConvAddOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclConvAddOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void ConvAddKernel<GPU_MALI, float>::Compute(
+    const FusionConvAddParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclConvAddOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
+}
+template class ConvAddKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/conv_kernel.cpp
+++ b/src/operators/kernel/mali/conv_kernel.cpp
@@ -15,20 +15,213 @@ limitations under the License. */
 #ifdef CONV_OP
 #include "operators/kernel/conv_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
+template <typename DeviceType, typename T>
+class AclConvOp : public acl::ACLOperator {
+ public:
+  AclConvOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
+  }
+  ~AclConvOp() = default;
+  AclConvOp(const AclConvOp&) = delete;
+  AclConvOp& operator=(const AclConvOp&) = delete;
+  AclConvOp(AclConvOp&&) = delete;
+  AclConvOp& operator=(AclConvOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const ConvParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth, args.batch);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.out_num);
+    arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
+                                           args.in_depth / args.num_group,
+                                           args.out_depth);
+    // arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::PadStrideInfo conv_info(
+        args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
+        arm_compute::DimensionRoundingType::FLOOR);
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    check_direct_conv();
+    //[kernel_x, kernel_y, IFM, OFM]
+    new_tensor(weights(), weights_shape, args.weight_data);
+    //[OFM]
+    // if (args.biases_data) {
+    //    new_tensor(biases(),biases_shape,args.biases_data);
+    //}
+    group() = args.num_group;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(conv, this, conv_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const ConvParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_ || args.num_group >= 5) {
+      bypass_acl = true;
+    }
+    if (args.dim > 2) {
+      bypass_acl = true;
+    }
+    if (args.dilated) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void check_direct_conv() {
+    bool use_direct_conv = false;
+    const char* pDirectConv;
+    pDirectConv = getenv("DIRECTCONV");
+    if (pDirectConv) {
+      unsigned int bdirectconv;
+      sscanf(pDirectConv, "%i", &bdirectconv);
+      if (bdirectconv != use_direct_conv) {
+        use_direct_conv = bdirectconv;
+        printf("DIRECTCONV<%s>\n", pDirectConv);
+        printf("DIRECTCONV: %x\n", use_direct_conv);
+      }
+    }
+    int pad_data[2], kernel[2];
+    pad_data[1] = args.pad_rows;
+    pad_data[0] = args.pad_cols;
+    kernel[1] = args.filter_rows;
+    kernel[0] = args.filter_cols;
+    if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
+                             pad_data[0] == 0 && pad_data[1] == 0) ||
+                            (kernel[0] == 3 && kernel[1] == 3 &&
+                             pad_data[0] <= 1 && pad_data[1] <= 1))) {
+      setConvMethod();  // NEDirectConvolutionLayer only for 1x1 and 3x3
+    }
+  }
+  void AclParametersByContext(const ConvParam& param) {
+    const Tensor* input = param.Input();
+    Tensor filter = *param.Filter();
+    Tensor* output = param.Output();
+    int groups = param.Groups();
+    std::vector<int> strides = param.Strides();
+    std::vector<int> paddings = param.Paddings();
+    std::vector<int> dilations = param.Dilations();
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>();
+    const T* weight_data = filter.data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.weight_data = (void*)weight_data;
+    args.biases_data = nullptr;
+    // try {
+    //     bias = context.Input<framework::Tensor>("Bias");
+    // } catch (const std::exception& e) {
+    // }
+    // if (bias) {
+    //     const T* biases_data = bias->data<T>();
+    //     args.biases_data = (void*)biases_data;
+    // }
+    args.num_group = groups;
+    args.dilation_rows = dilations[0];
+    args.dilation_cols = dilations[1];
+    if (dilations[0] != 1 || dilations[1] != 1) {
+      args.dilated = true;
+    }
+    // NCHW
+    // std::cout << "In dims: " << (input->dims()).size() << std::endl;
+    args.batch = input->dims()[0];
+    args.in_depth = input->dims()[1];
+    args.in_rows = input->dims()[2];
+    args.in_cols = input->dims()[3];
+    std::cout << "In N: " << args.batch << " C: " << args.in_depth
+              << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
+    // NCHW
+    // std::cout << "Out dims: " << (output->dims()).size() << std::endl;
+    args.out_num = output->dims()[0];
+    args.out_depth = output->dims()[1];
+    args.out_rows = output->dims()[2];
+    args.out_cols = output->dims()[3];
+    // std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
+    //  << " C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+    // MCHW = OIHW
+    args.filter_rows = filter.dims()[2];
+    args.filter_cols = filter.dims()[3];
+    // std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
+    //  << " I: " <<  static_cast<int>(filter.dims()[1])
+    //  << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
+    // strides(h_stride, w_stride)
+    args.stride_rows = strides[0];
+    args.stride_cols = strides[1];
+    // std::cout <<"Stride H: " << args.stride_rows << " W: " <<
+    // args.stride_cols << "\n";
+    // paddings(h_pad, w_pad)
+    args.pad_rows = paddings[0];
+    args.pad_cols = paddings[1];
+    // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
+    // "\n";
+  }
+  acl::AclParameters args;
+};
 template <>
-bool ConvKernel<GPU_MALI, float>::Init(const ConvParam &para) const {
+bool ConvKernel<GPU_MALI, float>::Init(const ConvParam& param) const {
+  AclConvOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclConvOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
  return true;
 }
 template <>
-void ConvKernel<GPU_MALI, float>::Compute(const ConvParam &param) const {
+void ConvKernel<GPU_MALI, float>::Compute(const ConvParam& param) const {
-  //  ArmConvImplement imp;
+  std::cout << "init acl" << std::endl;
-  //  imp.Compute(param);
+  AclConvOp<GPU_MALI, float>* acl_op =
-  param.Output()->mutable_data<float>()[0] = 100.0;
+      reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
 }
 template class ConvKernel<GPU_MALI, float>;
@@ -36,3 +229,4 @@ template class ConvKernel<GPU_MALI, float>;
 }  // namespace paddle_mobile
 #endif
+#endif
--- a/src/operators/kernel/mali/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef ELEMENTWISEADD_OP
+#pragma once
+#include "operators/kernel/elementwise_add_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename T>
+struct AddFunctor {
+  inline T operator()(T a, T b) const { return a + b; }
+};
+template <>
+bool ElementwiseAddKernel<GPU_MALI, float>::Init(
+    const ElementwiseAddParam &para) const {
+  return true;
+}
+template <>
+void ElementwiseAddKernel<GPU_MALI, float>::Compute(
+    const ElementwiseAddParam &param) const {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  Tensor *Out = param.Out();
+  Out->mutable_data<float>();
+  int axis = param.Axis();
+  ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
+                                                 AddFunctor<float>(), Out);
+}
+template class ElementwiseAddKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/fushion_fc_kernel.cpp
+++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_FC_OP
+#pragma once
+#include "operators/kernel/fusion_fc_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool FusionFcKernel<GPU_MALI, float>::Init(const FusionFcParam &para) const {
+  return true;
+}
+template <>
+void FusionFcKernel<GPU_MALI, float>::Compute(
+    const FusionFcParam &param) const {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  const Tensor *input_z = param.InputZ();
+  auto *input_z_data = input_z->data<float>();
+  int axis = param.Axis();
+  Tensor *out = param.Out();
+  auto *out_data = out->mutable_data<float>();
+  const Tensor x_matrix =
+      input_x->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
+          : *input_x;
+  const Tensor y_matrix =
+      input_y->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
+          : *input_y;
+  auto out_dim = out->dims();
+  if (out_dim.size() != 2) {
+    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+  }
+  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
+  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "inpu_z size must be 1");
+  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
+                        " out_dim.size must be 2.");
+  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
+  PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ")
+  int64_t classes = input_z->numel();
+  for (int i = 0; i < out_dim[0]; i++) {
+    memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
+  }
+  for (int i = 0; i < out->numel(); i++) {
+    DLOG << out_data[i];
+  }
+  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+                      out, static_cast<float>(1));
+  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
+  //            if (out_dim.size() != 2) {
+  //                out->Resize(out_dim);
+  //            }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/lrn_kernel.cpp
+++ b/src/operators/kernel/mali/lrn_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef LRN_OP
+#pragma once
+#include "operators/kernel/lrn_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclLrnOp : public acl::ACLOperator {
+ public:
+  AclLrnOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN;
+  }
+  ~AclLrnOp() = default;
+  AclLrnOp(const AclLrnOp&) = delete;
+  AclLrnOp& operator=(const AclLrnOp&) = delete;
+  AclLrnOp(AclLrnOp&&) = delete;
+  AclLrnOp& operator=(AclLrnOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const LrnParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape shape(args.in_cols, args.in_rows, args.in_depth);
+    if (is_operator_init_done(shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    arm_compute::NormalizationLayerInfo norm_info(
+        arm_compute::NormType::CROSS_MAP, args.nsize, args.alpha, args.beta,
+        args.knorm);
+    //[width, height, IFM]
+    new_tensor(input(), shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), shape, args.output_data);
+    acl_configure(lrn, this, norm_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const LrnParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const LrnParam& param) {
+    const Tensor* in_x = param.InputX();
+    Tensor* out = param.Out();
+    int n = param.N();
+    T alpha = param.Alpha();
+    T beta = param.Beta();
+    T k = param.K();
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.nsize = n;
+    args.alpha = alpha;
+    args.beta = beta;
+    args.knorm = k;
+    // NCHW
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.in_rows = in_x->dims()[2];
+    args.in_cols = in_x->dims()[3];
+    // std::cout
+    //  << "Out C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+  }
+  acl::AclParameters args;
+};
+template <>
+bool LrnKernel<GPU_MALI, float>::Init(const LrnParam& param) const {
+  AclLrnOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclLrnOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void LrnKernel<GPU_MALI, float>::Compute(const LrnParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclLrnOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  for (int n = 0; n < args.batch; ++n) {
+    acl_op->RunAcl((void*)input_data, (void*)output_data);
+    input_data += args.in_depth * args.in_cols * args.in_rows;
+    output_data += args.in_depth * args.in_cols * args.in_rows;
+  }
+}
+template class LrnKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/mul_kernel.cpp
+++ b/src/operators/kernel/mali/mul_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef MUL_OP
+#pragma once
+#include "operators/kernel/mul_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool MulKernel<GPU_MALI, float>::Init(const MulParam &para) const {
+  return true;
+}
+template <>
+void MulKernel<GPU_MALI, float>::Compute(const MulParam &param) const {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  Tensor *out = param.Out();
+  out->mutable_data<float>();
+  const Tensor x_matrix =
+      input_x->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
+          : *input_x;
+  const Tensor y_matrix =
+      input_y->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
+          : *input_y;
+  auto out_dim = out->dims();
+  if (out_dim.size() != 2) {
+    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+  }
+  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+                      out, static_cast<float>(0));
+  if (out_dim.size() != 2) {
+    out->Resize(out_dim);
+  }
+}
+template class MulKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/pool_kernel.cpp
+++ b/src/operators/kernel/mali/pool_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POOL_OP
+#pragma once
+#include "operators/kernel/pool_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclPoolOp : public acl::ACLOperator {
+ public:
+  AclPoolOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING;
+  }
+  ~AclPoolOp() = default;
+  AclPoolOp(const AclPoolOp&) = delete;
+  AclPoolOp& operator=(const AclPoolOp&) = delete;
+  AclPoolOp(AclPoolOp&&) = delete;
+  AclPoolOp& operator=(AclPoolOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const PoolParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth);
+    // arm_compute::TensorShape weights_shape(
+    // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
+    // arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::PoolingLayerInfo pool_info;
+    if (args.pool_type == "max") {
+      pool_info = arm_compute::PoolingLayerInfo(
+          arm_compute::PoolingType::MAX, args.filter_rows,
+          arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
+                                     args.pad_cols, args.pad_rows,
+                                     arm_compute::DimensionRoundingType::CEIL));
+    } else {
+      pool_info = arm_compute::PoolingLayerInfo(
+          arm_compute::PoolingType::AVG, args.filter_rows,
+          arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
+                                     args.pad_cols, args.pad_rows,
+                                     arm_compute::DimensionRoundingType::CEIL));
+    }
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(pooling, this, pool_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const PoolParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    if (args.pool_type != "max" && args.pool_type != "avg") {
+      bypass_acl = true;
+    }
+    if (args.filter_rows != args.filter_cols) {
+      bypass_acl = true;
+    }
+    // if (args.filter_rows!=2 && args.filter_rows!=3) {
+    //     bypass_acl = true;
+    // }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const PoolParam& param) {
+    const Tensor* in_x = param.Input();
+    Tensor* out = param.Output();
+    std::string pooling_type = param.PoolingType();
+    std::vector<int> ksize = param.Ksize();
+    std::vector<int> strides = param.Strides();
+    std::vector<int> paddings = param.Paddings();
+    bool is_global_pooling = param.isGlobalPooling();
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.is_global_pool = is_global_pooling;
+    args.pool_type = pooling_type;
+    args.filter_rows = ksize[0];
+    args.filter_cols = ksize[1];
+    args.dim = ksize.size();
+    // NCHW
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.in_rows = in_x->dims()[2];
+    args.in_cols = in_x->dims()[3];
+    // std::cout <<"In N: " << args.batch << " C: " <<  args.in_depth
+    //  << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
+    // NCHW
+    // std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
+    //  << " C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+    // MCHW = OIHW
+    // std::cout <<"Filter O: " << static_cast<int>(filter->dims()[0])
+    //  << " I: " <<  static_cast<int>(filter->dims()[1])
+    //  << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
+    // strides(h_stride, w_stride)
+    args.stride_rows = strides[0];
+    args.stride_cols = strides[1];
+    // std::cout <<"PoolingType: " << args.pool_type << "\n";
+    // std::cout <<"Stride H: " << args.stride_rows << " W: " <<
+    // args.stride_cols << "\n";
+    // paddings(h_pad, w_pad)
+    args.pad_rows = paddings[0];
+    args.pad_cols = paddings[1];
+    // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
+    // "\n";
+    args.out_depth = args.in_depth;
+    // args.out_rows = out->dims()[2];
+    // args.out_cols = out->dims()[3];
+    args.out_rows = static_cast<int>(ceil(static_cast<float>(args.in_rows +
+                                                             2 * args.pad_rows -
+                                                             args.filter_rows) /
+                                          args.stride_rows)) +
+                    1;
+    args.out_cols = static_cast<int>(ceil(static_cast<float>(args.in_cols +
+                                                             2 * args.pad_cols -
+                                                             args.filter_cols) /
+                                          args.stride_cols)) +
+                    1;
+    if (is_global_pooling) {
+      args.filter_rows = args.in_rows;
+      args.filter_cols = args.in_cols;
+      args.pad_rows = 0;
+      args.pad_cols = 0;
+    }
+  }
+  acl::AclParameters args;
+};
+template <>
+bool PoolKernel<GPU_MALI, float>::Init(const PoolParam& param) const {
+  AclPoolOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclPoolOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void PoolKernel<GPU_MALI, float>::Compute(const PoolParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclPoolOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  for (int n = 0; n < args.batch; ++n) {
+    acl_op->RunAcl((void*)input_data, (void*)output_data);
+    input_data += args.in_depth * args.in_cols * args.in_rows;
+    output_data += args.in_depth * args.out_cols * args.out_rows;
+  }
+}
+template class PoolKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/relu_kernel.cpp
+++ b/src/operators/kernel/mali/relu_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef RELU_OP
+#pragma once
+#include "operators/kernel/relu_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclReluOp : public acl::ACLOperator {
+ public:
+  AclReluOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU;
+  }
+  ~AclReluOp() = default;
+  AclReluOp(const AclReluOp&) = delete;
+  AclReluOp& operator=(const AclReluOp&) = delete;
+  AclReluOp(AclReluOp&&) = delete;
+  AclReluOp& operator=(AclReluOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const ReluParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols * args.in_rows *
+                                         args.in_depth * args.batch);
+    arm_compute::TensorShape output_shape(args.in_cols * args.in_rows *
+                                          args.in_depth * args.out_num);
+    // arm_compute::TensorShape weights_shape(
+    // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
+    // arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::ActivationLayerInfo::ActivationFunction type;
+    type = arm_compute::ActivationLayerInfo::ActivationFunction::RELU;
+    arm_compute::ActivationLayerInfo act_info(type);
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(activation, this, act_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const ReluParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const ReluParam& param) {
+    const auto* input_x = param.InputX();
+    auto* out = param.Out();
+    const T* input_data = input_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.batch = input_x->dims()[0];
+    args.in_depth = input_x->dims()[1];
+    args.in_rows = input_x->dims()[2];
+    args.in_cols = input_x->dims()[3];
+    args.out_num = out->dims()[0];
+  }
+  acl::AclParameters args;
+};
+template <>
+bool ReluKernel<GPU_MALI, float>::Init(const ReluParam& param) const {
+  AclReluOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclReluOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void ReluKernel<GPU_MALI, float>::Compute(const ReluParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclReluOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
+}
+template class ReluKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/reshape_kernel.cpp
+++ b/src/operators/kernel/mali/reshape_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef RESHAPE_OP
+#pragma once
+#include "operators/kernel/reshape_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ReshapeKernel<GPU_MALI, float>::Init(const ReshapeParam &para) const {
+  return true;
+}
+template <>
+void ReshapeKernel<GPU_MALI, float>::Compute(const ReshapeParam &param) const {
+  const auto *input_x = param.InputX();
+  const auto &input_x_dims = input_x->dims();
+  auto *out = param.Out();
+  framework::DDim out_dims = out->dims();
+  const auto *input_shape = param.InputShape();
+  if (input_shape) {
+    auto *shape_data = input_shape->data<int>();
+    framework::Tensor cpu_shape_tensor;
+    auto shape =
+        std::vector<int>(shape_data, shape_data + input_shape->numel());
+    out_dims = ValidateShape(shape, input_x->dims());
+  }
+  bool inplace = param.Inplace();
+  out->Resize(out_dims);
+  if (!inplace) {
+    out->mutable_data<float>();
+    framework::TensorCopy(*input_x, out);
+    out->Resize(out_dims);
+  } else {
+    out->ShareDataWith(*input_x);
+    out->Resize(out_dims);
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/softmax_kernel.cpp
+++ b/src/operators/kernel/mali/softmax_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SOFTMAX_OP
+#pragma once
+#include "operators/kernel/softmax_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclSoftmaxOp : public acl::ACLOperator {
+ public:
+  AclSoftmaxOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX;
+  }
+  ~AclSoftmaxOp() = default;
+  AclSoftmaxOp(const AclSoftmaxOp&) = delete;
+  AclSoftmaxOp& operator=(const AclSoftmaxOp&) = delete;
+  AclSoftmaxOp(AclSoftmaxOp&&) = delete;
+  AclSoftmaxOp& operator=(AclSoftmaxOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const SoftmaxParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape shape(args.in_depth, args.batch);
+    if (is_operator_init_done(shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    //[width, height, IFM]
+    new_tensor(input(), shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), shape, args.output_data);
+    acl_configure(softmax, this, NULL);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const SoftmaxParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const SoftmaxParam& param) {
+    const framework::Tensor* in_x = param.InputX();
+    framework::Tensor* out = param.Out();
+    auto x_dims = in_x->dims();
+    out->Resize(x_dims);
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    // NCHW
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.out_num = out->dims()[0];
+    // std::cout
+    //  << "Out C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+  }
+  acl::AclParameters args;
+};
+template <>
+bool SoftmaxKernel<GPU_MALI, float>::Init(const SoftmaxParam& param) const {
+  AclSoftmaxOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclSoftmaxOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void SoftmaxKernel<GPU_MALI, float>::Compute(const SoftmaxParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclSoftmaxOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  for (int n = 0; n < args.out_num; ++n) {
+    acl_op->RunAcl((void*)input_data, (void*)output_data);
+    input_data += args.in_depth;
+    output_data += args.in_depth;
+  }
+}
+template class SoftmaxKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/lrn_op.cpp
+++ b/src/operators/lrn_op.cpp
@@ -34,6 +34,8 @@ USE_OP_CPU(lrn);
 REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(lrn);
+REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/mul_op.cpp
+++ b/src/operators/mul_op.cpp
@@ -60,6 +60,8 @@ USE_OP_CPU(mul);
 REGISTER_OPERATOR_CPU(mul, ops::MulOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(mul);
+REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/pool_op.cpp
+++ b/src/operators/pool_op.cpp
@@ -64,6 +64,8 @@ USE_OP_CPU(pool2d);
 REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(pool2d);
+REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/relu_op.cpp
+++ b/src/operators/relu_op.cpp
@@ -38,6 +38,8 @@ USE_OP_CPU(relu);
 REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(relu);
+REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
@@ -37,6 +37,8 @@ USE_OP_CPU(reshape);
 REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(reshape);
+REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -32,6 +32,8 @@ USE_OP_CPU(softmax);
 REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(softmax);
+REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/tools/build.sh
+++ b/tools/build.sh
@@ -15,6 +15,7 @@ build_for_mac() {
    fi
    PLATFORM="x86"
    MODE="Release"
+    CXX_FLAGS="-std=c++11 -O3 -s"
    BUILD_DIR=../build/release/"${PLATFORM}"
    mkdir -p ${BUILD_DIR}/build
@@ -24,6 +25,7 @@ build_for_mac() {
    cmake .. \
        -B"${BUILD_DIR}" \
    	-DCMAKE_BUILD_TYPE="${MODE}" \
+    	-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
    	-DIS_MAC=true
    cd ${BUILD_DIR}
@@ -31,6 +33,8 @@ build_for_mac() {
 }
 build_for_android() {
+    export ANDROID_NDK=/home/halsay/android-ndk-r16b
+    export NDK_ROOT=/home/halsay/android-ndk-r16b
    rm -rf "../build"
    if [ -z "${ANDROID_NDK}" ]; then
        echo "ANDROID_NDK not found!"
@@ -44,11 +48,11 @@ build_for_android() {
    if [ "${PLATFORM}" = "arm-v7a" ]; then
        ABI="armeabi-v7a with NEON"
        ARM_PLATFORM="V7"
-        CXX_FLAGS="-march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security"
+        CXX_FLAGS="-O3 -std=c++11 -s -march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security"
    elif [ "${PLATFORM}" = "arm-v8a" ]; then
        ABI="arm64-v8a"
        ARM_PLATFORM="V8"
-        CXX_FLAGS="-march=armv8-a  -pie -fPIE -w -Wno-error=format-security -llog"
+        CXX_FLAGS="-std=c++11 -march=armv8-a  -pie -fPIE -w -Wno-error=format-security -llog -O0 -ggdb3  -fno-inline -g"
    else
        echo "unknown platform!"
        exit -1
@@ -56,7 +60,7 @@ build_for_android() {
    MODE="Release"
-    ANDROID_PLATFORM_VERSION="android-15"
+    ANDROID_PLATFORM_VERSION="android-22"
    TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
    ANDROID_ARM_MODE="arm"
    if [ $# -eq 1 ]; then
@@ -171,4 +175,4 @@ else
 		    build_error
 	    fi
 	fi
 fi
\ No newline at end of file