ready for PR

bec8987d · sharper · 0342c83c · bec8987d · bec8987d · bec8987d
34 changed file
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "src/operators/kernel/mali/ACL_Android"]
+	path = src/operators/kernel/mali/ACL_Android
+	url = https://github.com/halsay/ACL_Android.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ option(USE_OPENMP "openmp support" OFF)
 option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build
-option(CPU "cpu" ON)
+option(CPU "cpu" OFF)
 option(MALI_GPU "mali gpu" ON)
 option(FPGA "fpga" OFF)
@@ -16,6 +16,17 @@ endif()
 if (MALI_GPU)
    add_definitions(-DPADDLE_MOBILE_MALI_GPU)
+    add_definitions(-DUSE_ACL=1)
+    add_definitions(-DUSE_OPENCL)
+    set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android)
+    include_directories(${ACL_ROOT} ${ACL_ROOT}/include)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
 endif()
 if(FPGA)

--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -138,9 +138,21 @@ class OpKernelBase {
   * @p para 这个参数为 kernel 运算时所需要用到参数组成的一个结构体,
   *    所有结构体存在与: paddle-mobile/src/operators/op_param.h
   * */
+#ifdef PADDLE_MOBILE_MALI_GPU
+  OpKernelBase() { acl_op_ = nullptr; }
+  void *GetAclOp() const { return acl_op_; }
+  void SetAclOp(void *op, void *ob) const {
+    reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
+  }
+#endif
  virtual void Compute(const P &para) const = 0;
  virtual bool Init(const P &para) const { return true; };
  virtual ~OpKernelBase() = default;
+ private:
+#ifdef PADDLE_MOBILE_MALI_GPU
+  void *acl_op_;
+#endif
 };
 #define DEFINE_OP_CONSTRUCTOR(cls, parent_cls)                                 \

--- a/src/operators/batchnorm_op.cpp
+++ b/src/operators/batchnorm_op.cpp
@@ -36,6 +36,8 @@ USE_OP_CPU(batch_norm);
 REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(batch_norm);
+REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
@@ -67,6 +67,8 @@ USE_OP_CPU(concat);
 REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(concat);
+REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/elementwise_add_op.cpp
+++ b/src/operators/elementwise_add_op.cpp
@@ -34,6 +34,8 @@ USE_OP_CPU(elementwise_add);
 REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(elementwise_add);
+REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -50,6 +50,8 @@ USE_OP_CPU(feed);
 REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(feed);
+REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/fetch_op.h
+++ b/src/operators/fetch_op.h
@@ -50,6 +50,8 @@ USE_OP_CPU(fetch);
 REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(fetch);
+REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/fusion_conv_add.cpp
+++ b/src/operators/fusion_conv_add.cpp
@@ -54,6 +54,8 @@ USE_OP_CPU(conv_add);
 REGISTER_OPERATOR_CPU(conv_add, ops::FusionConvAddOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(conv_add);
+REGISTER_OPERATOR_MALI_GPU(conv_add, ops::FusionConvAddOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/fusion_fc_op.cpp
+++ b/src/operators/fusion_fc_op.cpp
@@ -59,6 +59,8 @@ USE_OP_CPU(fc);
 REGISTER_OPERATOR_CPU(fc, ops::FusionFcOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(fc);
+REGISTER_OPERATOR_MALI_GPU(fc, ops::FusionFcOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/ACL_Android @ 591027fc
+++ b/ACL_Android @ 591027fc
+Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5
--- a/src/operators/kernel/mali/acl_operator.cc
+++ b/src/operators/kernel/mali/acl_operator.cc
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file acl_operator.cc
+ * \brief
+ * \author Joey
+ */
+#if USE_ACL == 1
+#include "acl_operator.h"
+unsigned int bypass_acl_class_layer =
+    (0 | FLAGS_ENABLE_ACL_CONCAT |
+     /*0xffffffff |*/ /*FLAGS_ENABLE_ACL_FC |*/ /*FLAGS_ENABLE_ACL_LRN
+                                                   |*/
+     0);
+int enable_schedule = 0;
+#ifdef USE_PROFILING
+#include "arm_neon.h"
+unsigned int acl_log_flags =
+    (0 | MASK_LOG_APP_TIME | /*MASK_LOG_ALLOCATE | */  /*MASK_LOG_ALLOCATE | */
+     /*MASK_LOG_RUN      | */ /*MASK_LOG_CONFIG   | */ /*MASK_LOG_COPY     | */
+     MASK_LOG_ABSVAL | MASK_LOG_BNLL | MASK_LOG_CONV | MASK_LOG_FC |
+     MASK_LOG_LRN | MASK_LOG_POOLING | MASK_LOG_RELU | MASK_LOG_SIGMOID |
+     MASK_LOG_SOFTMAX | MASK_LOG_TANH | MASK_LOG_LC | MASK_LOG_BN |
+     MASK_LOG_CONCAT | 0);
+#include <stdio.h>  /* printf */
+#include <stdlib.h> /* getenv */
+#endif              // USE_PROFILING
+static bool force_enable_gpu = false;
+bool AclEnableSchedule(int enable) {
+  enable_schedule = enable;
+  if (enable) {
+    force_enable_gpu = true;
+  }
+  return true;
+}
+int isScheduleEnable() { return enable_schedule; }
+namespace paddle_mobile {
+namespace operators {
+namespace acl {
+bool ACLOperator::init_gpu_env = true;
+#ifdef USE_OPENCL
+bool ACLOperator::support_opencl_ = false;
+bool opencl_is_available() { return arm_compute::opencl_is_available(); }
+#elif defined(USE_OPENGLES)
+bool ACLOperator::support_opengles_ = false;
+#endif
+ACLOperator::ACLOperator(bool is_gpu)
+    : operator_state_(operator_not_init),
+      force_bypass_acl_path_(false),
+      target_hint_(TargetHint::DONT_CARE),
+      convolution_method_hint_(ConvolutionMethodHint::GEMM),
+      _group(1),
+      name_(""),
+      input_idx_(0),
+      output_idx_(0),
+      is_gpu_(is_gpu) {
+  const char* pBypassACL;
+  if (init_gpu_env) {
+#ifdef USE_OPENCL
+    try {
+      if (opencl_is_available()) {
+        arm_compute::CLScheduler::get().default_init();
+        support_opencl_ = true;
+      }
+    } catch (std::exception& e) {
+      support_opencl_ = false;
+    }
+#elif defined(USE_OPENGLES)
+    try {
+      arm_compute::GCScheduler::get().default_init();
+      support_opengles_ = true;
+    } catch (std::exception& e) {
+      support_opengles_ = false;
+    }
+#endif
+    init_gpu_env = false;
+  }
+  if (force_enable_gpu) is_gpu_ = true;
+  pBypassACL = getenv("BYPASSACL");
+  if (pBypassACL) {
+    unsigned int bacl;
+    sscanf(pBypassACL, "%i", &bacl);
+    if (bacl != bypass_acl_class_layer) {
+      bypass_acl_class_layer = bacl;
+      printf("BYPASSACL<%s>\n", pBypassACL);
+      printf("BYPASSACL: %x\n", bypass_acl_class_layer);
+    }
+  }
+#ifdef USE_PROFILING
+  const char* pLogACL;
+  pLogACL = getenv("LOGACL");
+  if (pLogACL) {
+    unsigned int alf;
+    sscanf(pLogACL, "%i", &alf);
+    if (alf != acl_log_flags) {
+      acl_log_flags = alf;
+      printf("LOGACL<%s>\n", pLogACL);
+      printf("LOGACL: %x\n", acl_log_flags);
+    }
+  }
+#endif  // USE_PROFILING
+  const char* pEnableSchedule;
+  pEnableSchedule = getenv("ENABLESCHEDULE");
+  if (pEnableSchedule) {
+    int bshedule;
+    sscanf(pEnableSchedule, "%i", &bshedule);
+    if (bshedule != enable_schedule) {
+      enable_schedule = bshedule;
+      printf("ENABLESCHEDULE<%s>\n", pEnableSchedule);
+      printf("ENABLESCHEDULE: %x\n", enable_schedule);
+    }
+    if (enable_schedule) {
+      AclEnableSchedule(1);
+    }
+  }
+}
+ACLOperator::~ACLOperator() {}
+bool ACLOperator::new_tensor(std::unique_ptr<ACLTensor>& tensor,
+                             arm_compute::TensorShape& shape, void* mem,
+                             bool commit) {
+  auto acl_tensor =
+      new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32));
+  acl_tensor->set_target(getTargetHint());
+  acl_tensor->bindmem(mem);
+  if (commit) acl_tensor->commit();
+  tensor = (std::unique_ptr<ACLTensor>)std::move(acl_tensor);
+  return true;
+}
+bool ACLOperator::new_tensor(std::unique_ptr<ACLSubTensor>& tensor,
+                             std::unique_ptr<ACLTensor>& parent,
+                             arm_compute::TensorShape& shape,
+                             arm_compute::Coordinates& coord) {
+  auto acl_tensor = new ACLSubTensor(parent, shape, coord);
+  acl_tensor->set_target(getTargetHint());
+  tensor = (std::unique_ptr<ACLSubTensor>)std::move(acl_tensor);
+  return true;
+}
+void ACLTensor::commit(TensorType type) {
+  settensortype(type);
+  if (mem_) {
+    if (!allocate_) {
+#ifdef USE_PROFILING
+      logtime_util log_time(ACL_ALLOCATE_INFO);
+#endif  // USE_PROFILING
+      allocate();
+      allocate_ = true;
+    }
+    if (type_ != tensor_output) {
+      tensor_copy(mem_);
+    }
+    mem_ = nullptr;
+  }
+}
+int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor, void* mem,
+                               bool toTensor) {
+#ifdef USE_PROFILING
+  logtime_util log_time(ACL_COPY_INFO);
+#endif  // USE_PROFILING
+  arm_compute::Window window;
+  // Iterate through the rows (not each element)
+  window.use_tensor_dimensions(tensor->info()->tensor_shape(),
+                               /* first_dimension =*/arm_compute::Window::DimY);
+  int width = tensor->info()->tensor_shape()[0];
+  int height = tensor->info()->tensor_shape()[1];
+  int deepth = tensor->info()->tensor_shape()[2];
+  map();
+  // Create an iterator:
+  arm_compute::Iterator it(tensor, window);
+  // Except it works for an arbitrary number of dimensions
+  if (toTensor) {  // mem->tensor
+    arm_compute::execute_window_loop(
+        window,
+        [&](const arm_compute::Coordinates& id) {
+          memcpy(it.ptr(),
+                 ((char*)mem) +
+                     ((id[3] * (width * height * deepth) +
+                       id.z() * (width * height) + id.y() * width + id.x()) *
+                      tensor->info()->element_size()),
+                 width * tensor->info()->element_size());
+        },
+        it);
+  } else {  // tensor-->mem
+    arm_compute::execute_window_loop(
+        window,
+        [&](const arm_compute::Coordinates& id) {
+          memcpy(((char*)mem) + ((id[3] * (width * height * deepth) +
+                                  id.z() * (width * height) + id.y() * width) *
+                                 tensor->info()->element_size()),
+                 it.ptr(), width * tensor->info()->element_size());
+        },
+        it);
+  }
+  unmap();
+  return 0;
+}
+}  // namespace acl
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/acl_operator.h
+++ b/src/operators/kernel/mali/acl_operator.h
--- a/src/operators/kernel/mali/acl_tensor.cc
+++ b/src/operators/kernel/mali/acl_tensor.cc
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file acl_tensor.cc
+ * \brief
+ * \author Joey
+ */
+#include "acl_tensor.h"
+namespace paddle_mobile {
+namespace operators {
+namespace acl {
+#ifdef USE_ACL
+template <typename TensorType>
+std::unique_ptr<arm_compute::ITensor> initialise_tensor(
+    arm_compute::TensorInfo &info) {
+  auto tensor = cpp14::make_unique<TensorType>();
+  tensor->allocator()->init(info);
+  return std::move(tensor);
+}
+template <typename TensorType>
+void tensor_allocate(arm_compute::ITensor &tensor) {
+  auto itensor = dynamic_cast<TensorType *>(&tensor);
+  itensor->allocator()->allocate();
+}
+Tensor::Tensor(arm_compute::TensorInfo &info) noexcept
+    : _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr) {}
+Tensor::Tensor(Tensor &&src) noexcept
+    : _target(src._target),
+      _info(std::move(src._info)),
+      _tensor(std::move(src._tensor)) {}
+arm_compute::ITensor *Tensor::set_target(TargetHint target) {
+  switch (target) {
+#ifdef USE_OPENCL
+    case TargetHint::OPENCL:
+      _tensor = initialise_tensor<arm_compute::CLTensor>(_info);
+      break;
+#elif defined(USE_OPENGLES)
+    case TargetHint::OPENGLES:
+      _tensor = initialise_tensor<arm_compute::GCTensor>(_info);
+      break;
+#endif
+    case TargetHint::NEON:
+      _tensor = initialise_tensor<arm_compute::Tensor>(_info);
+      break;
+    default:
+      break;
+  }
+  _target = target;
+  return _tensor.get();
+}
+void Tensor::allocate() {
+  switch (_target) {
+#ifdef USE_OPENCL
+    case TargetHint::OPENCL:
+      tensor_allocate<arm_compute::CLTensor>(*_tensor);
+      break;
+#elif defined(USE_OPENGLES)
+    case TargetHint::OPENGLES:
+      tensor_allocate<arm_compute::GCTensor>(*_tensor);
+      break;
+#endif
+    case TargetHint::NEON:
+      tensor_allocate<arm_compute::Tensor>(*_tensor);
+      break;
+    default:
+      break;
+  }
+}
+void Tensor::map(bool blocking) {
+#ifdef USE_OPENCL
+  if (_target == TargetHint::OPENCL)
+    dynamic_cast<arm_compute::CLTensor *>(tensor())->map(blocking);
+#elif defined(USE_OPENGLES)
+  if (_target == TargetHint::OPENGLES)
+    dynamic_cast<arm_compute::GCTensor *>(tensor())->map(blocking);
+#endif
+}
+void Tensor::unmap() {
+#ifdef USE_OPENCL
+  if (_target == TargetHint::OPENCL)
+    dynamic_cast<arm_compute::CLTensor *>(tensor())->unmap();
+#elif defined(USE_OPENGLES)
+  if (_target == TargetHint::OPENGLES)
+    dynamic_cast<arm_compute::GCTensor *>(tensor())->unmap();
+#endif
+}
+template <typename SubTensorType, typename ParentTensorType>
+std::unique_ptr<arm_compute::ITensor> initialise_subtensor(
+    arm_compute::ITensor *parent, arm_compute::TensorShape shape,
+    arm_compute::Coordinates coords) {
+  auto ptensor = dynamic_cast<ParentTensorType *>(parent);
+  auto subtensor = cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
+  return std::move(subtensor);
+}
+SubTensor::SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
+                     arm_compute::Coordinates &coords) noexcept
+    : _target(TargetHint::DONT_CARE),
+      _tensor_shape(tensor_shape),
+      _coords(coords),
+      _parent(nullptr),
+      _subtensor(nullptr) {
+  _parent = parent->tensor();
+  _target = parent->target();
+  instantiate_subtensor();
+}
+arm_compute::ITensor *SubTensor::set_target(TargetHint target) {
+  return (target == _target) ? _subtensor.get() : nullptr;
+}
+arm_compute::ITensor *SubTensor::tensor() { return _subtensor.get(); }
+const arm_compute::ITensor *SubTensor::tensor() const {
+  return _subtensor.get();
+}
+TargetHint SubTensor::target() const { return _target; }
+void SubTensor::allocate() {
+  // NOP for sub-tensors
+}
+void SubTensor::instantiate_subtensor() {
+  switch (_target) {
+#ifdef USE_OPENCL
+    case TargetHint::OPENCL:
+      _subtensor = initialise_subtensor<arm_compute::CLSubTensor,
+                                        arm_compute::ICLTensor>(
+          _parent, _tensor_shape, _coords);
+      break;
+#endif
+    default:
+    case TargetHint::NEON:
+      _subtensor =
+          initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(
+              _parent, _tensor_shape, _coords);
+      break;
+  }
+}
+#endif
+}  // namespace acl
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/kernel/mali/acl_tensor.h
+++ b/src/operators/kernel/mali/acl_tensor.h
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file acl_tensor.h
+ * \brief
+ * \author Joey
+ */
+#ifndef ACL_TENSOR_H_
+#define ACL_TENSOR_H_
+#ifdef USE_ACL
+#ifdef USE_OPENCL
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#elif defined(USE_OPENGLES)
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#endif
+#include "arm_compute/runtime/SubTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+#include <memory>
+namespace paddle_mobile {
+namespace operators {
+namespace acl {
+enum class TargetHint {
+  DONT_CARE,
+  OPENCL,
+  OPENGLES,
+  NEON,
+};
+enum class ConvolutionMethodHint {
+  GEMM,
+  DIRECT,
+};
+namespace cpp14 {
+template <class T>
+struct _Unique_if {
+  typedef std::unique_ptr<T> _Single_object;
+};
+template <class T>
+struct _Unique_if<T[]> {
+  typedef std::unique_ptr<T[]> _Unknown_bound;
+};
+template <class T, size_t N>
+struct _Unique_if<T[N]> {
+  typedef void _Known_bound;
+};
+template <class T, class... Args>
+typename _Unique_if<T>::_Single_object make_unique(Args &&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+template <class T>
+typename _Unique_if<T>::_Unknown_bound make_unique(size_t n) {
+  typedef typename std::remove_extent<T>::type U;
+  return std::unique_ptr<T>(new U[n]());
+}
+template <class T, class... Args>
+typename _Unique_if<T>::_Known_bound make_unique(Args &&...);
+}  // namespace cpp14
+class Tensor {
+ public:
+  explicit Tensor(arm_compute::TensorInfo &info) noexcept;
+  virtual ~Tensor() {}
+  Tensor(Tensor &&src) noexcept;
+  void set_info(arm_compute::TensorInfo &&info) { _info = info; }
+  arm_compute::ITensor *set_target(TargetHint target);
+  const arm_compute::TensorInfo &info() const { return _info; }
+  arm_compute::ITensor *tensor() { return _tensor.get(); }
+  void allocate();
+  void init() {}
+  TargetHint target() const { return _target; }
+  virtual void map(bool blocking = true);
+  virtual void unmap();
+ private:
+  TargetHint _target;
+  arm_compute::TensorInfo _info;
+  std::unique_ptr<arm_compute::ITensor> _tensor;
+};
+class SubTensor {
+ public:
+  SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
+            arm_compute::Coordinates &coords) noexcept;
+  ~SubTensor() {}
+  arm_compute::ITensor *tensor();
+  const arm_compute::ITensor *tensor() const;
+  TargetHint target() const;
+  void allocate();
+  arm_compute::ITensor *set_target(TargetHint target);
+ private:
+  /** Instantiates a sub-tensor */
+  void instantiate_subtensor();
+ private:
+  /**< Target that this tensor is pinned on */
+  TargetHint _target;
+  /**< SubTensor shape */
+  arm_compute::TensorShape _tensor_shape;
+  /**< SubTensor Coordinates */
+  arm_compute::Coordinates _coords;
+  /**< Parent tensor */
+  arm_compute::ITensor *_parent;
+  /**< SubTensor */
+  std::unique_ptr<arm_compute::ITensor> _subtensor;
+};
+}  // namespace acl
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif  // ACL_TENSOR_H_
--- a/src/operators/kernel/mali/batchnorm_kernel.cpp
+++ b/src/operators/kernel/mali/batchnorm_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,20 +17,152 @@ limitations under the License. */
 #pragma once
 #include "operators/kernel/batchnorm_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
+template <typename DeviceType, typename T>
+class AclBatchNormOp : public acl::ACLOperator {
+ public:
+  AclBatchNormOp() {
+    this->force_bypass_acl_path_ = bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN;
+  }
+  ~AclBatchNormOp() = default;
+  AclBatchNormOp(const AclBatchNormOp&) = delete;
+  AclBatchNormOp& operator=(const AclBatchNormOp&) = delete;
+  AclBatchNormOp(AclBatchNormOp&&) = delete;
+  AclBatchNormOp& operator=(AclBatchNormOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const BatchNormParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth, args.batch);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.out_num);
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    arm_compute::TensorShape mean_shape(args.in_depth);
+    arm_compute::TensorShape var_shape = mean_shape;
+    arm_compute::TensorShape beta_shape = mean_shape;
+    arm_compute::TensorShape gamma_shape = mean_shape;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    new_tensor(mean(), mean_shape, args.mean_data);
+    new_tensor(var(), var_shape, args.var_data);
+    new_tensor(beta(), beta_shape, args.biases_data);
+    new_tensor(gamma(), gamma_shape, args.weight_data);
+    acl_configure(bn, this, args.epsilon);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const BatchNormParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const BatchNormParam& param) {
+    const Tensor* in_x = param.InputX();
+    Tensor* out = param.OutputY();
+    const Tensor* scale = param.InputScale();
+    const Tensor* bias = param.InputBias();
+    const Tensor* saved_mean = param.InputMean();
+    const Tensor* saved_variance = param.InputVariance();
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    const T* weight_data = scale->data<T>();
+    const T* bias_data = bias->data<T>();
+    const T* mean_data = saved_mean->data<T>();
+    const T* var_data = saved_variance->data<T>();
+    float epsilon = param.Epsilon();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    // args.weight_data = (void*)weight_data;
+    // args.biases_data = (void*)bias_data;
+    args.mean_data = (void*)mean_data;
+    args.var_data = (void*)var_data;
+    args.epsilon = epsilon;
+    args.dim = in_x->dims().size();
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.in_rows = in_x->dims()[2];
+    args.in_cols = in_x->dims()[3];
+    args.out_num = out->dims()[0];
+    args.out_depth = out->dims()[1];
+    args.out_rows = out->dims()[2];
+    args.out_cols = out->dims()[3];
+    args.weight_data = (void*)weight_data;
+    args.biases_data = (void*)bias_data;
+    // std::cout
+    //  << "Out C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+  }
+  acl::AclParameters args;
+};
 template <>
-bool BatchNormKernel<GPU_MALI, float>::Init(const BatchNormParam &para) const {
+bool BatchNormKernel<GPU_MALI, float>::Init(const BatchNormParam& param) const {
+  AclBatchNormOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclBatchNormOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
  return true;
 }
 template <>
 void BatchNormKernel<GPU_MALI, float>::Compute(
-    const BatchNormParam &param) const {}
+    const BatchNormParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclBatchNormOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
+}
+template class BatchNormKernel<GPU_MALI, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
 #endif
+#endif
--- a/src/operators/kernel/mali/concat_kernel.cpp
+++ b/src/operators/kernel/mali/concat_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef CONCAT_OP
+#include "operators/kernel/concat_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclConcatOp : public acl::ACLOperator {
+ public:
+  AclConcatOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT;
+  }
+  ~AclConcatOp() = default;
+  AclConcatOp(const AclConcatOp&) = delete;
+  AclConcatOp& operator=(const AclConcatOp&) = delete;
+  AclConcatOp(AclConcatOp&&) = delete;
+  AclConcatOp& operator=(AclConcatOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const ConcatParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    const std::vector<framework::LoDTensor*>* input_data = &args.in_tensor;
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.batch);
+    if (is_operator_init_done(output_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    T type;
+    for (int i = 0; i < input_data->size(); i++) {
+      const T* idata = (*input_data)[i]->data<T>();
+      const T* pdata = (*input_data)[i]->data<T>();
+      int in_batch = (*input_data)[i]->dims()[0];
+      int in_channels = (*input_data)[i]->dims()[1];
+      int in_width = (*input_data)[i]->dims()[2];
+      int in_height = (*input_data)[i]->dims()[3];
+      arm_compute::TensorShape in_shape(in_width, in_height, in_channels);
+      new_tensor(cinput(i), in_shape,
+                 acl::InputdataPtr(this, args.in_tensor, type, i));
+    }
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(concat, this, input_data->size());
+  }
+  void RunAcl(const std::vector<framework::LoDTensor*>& input, void* output) {
+    T type;
+    acl::acl_run(this, input, output, type);
+  }
+  bool Bypass_acl(const ConcatParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_ || !args.is_channel_concat) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const ConcatParam& param) {
+    auto inputs = param.Inputs();
+    auto* output = param.Out();
+    int64_t axis = param.Axis();
+    T* output_data = output->mutable_data<T>();
+    args.is_channel_concat = (axis == 1);
+    args.in_tensor = inputs;
+    args.output_data = (void*)output_data;
+    args.batch = output->dims()[0];
+    args.out_depth = output->dims()[1];
+    args.out_rows = output->dims()[2];
+    args.out_cols = output->dims()[3];
+  }
+  acl::AclParameters args;
+};
+template <>
+bool ConcatKernel<GPU_MALI, float>::Init(const ConcatParam& param) const {
+  AclConcatOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclConcatOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void ConcatKernel<GPU_MALI, float>::Compute(const ConcatParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclConcatOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  std::vector<framework::LoDTensor*> temp_data = args.in_tensor;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl(temp_data, (void*)output_data);
+}
+template class ConcatKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/conv_add_kernel.cpp
+++ b/src/operators/kernel/mali/conv_add_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVADD_OP
+#include "operators/kernel/conv_add_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclConvAddOp : public acl::ACLOperator {
+ public:
+  AclConvAddOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
+  }
+  ~AclConvAddOp() = default;
+  AclConvAddOp(const AclConvAddOp&) = delete;
+  AclConvAddOp& operator=(const AclConvAddOp&) = delete;
+  AclConvAddOp(AclConvAddOp&&) = delete;
+  AclConvAddOp& operator=(AclConvAddOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const FusionConvAddParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth, args.batch);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.out_num);
+    arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
+                                           args.in_depth / args.num_group,
+                                           args.out_depth);
+    arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::PadStrideInfo conv_info(
+        args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
+        arm_compute::DimensionRoundingType::FLOOR);
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    check_direct_conv();
+    //[kernel_x, kernel_y, IFM, OFM]
+    new_tensor(weights(), weights_shape, args.weight_data);
+    //[OFM]
+    if (args.biases_data) {
+      new_tensor(biases(), biases_shape, args.biases_data);
+    }
+    group() = args.num_group;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(conv, this, conv_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const FusionConvAddParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_ || args.num_group >= 5) {
+      bypass_acl = true;
+    }
+    if (args.dim > 2) {
+      bypass_acl = true;
+    }
+    if (args.dilated) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void check_direct_conv() {
+    bool use_direct_conv = false;
+    const char* pDirectConv;
+    pDirectConv = getenv("DIRECTCONV");
+    if (pDirectConv) {
+      unsigned int bdirectconv;
+      sscanf(pDirectConv, "%i", &bdirectconv);
+      if (bdirectconv != use_direct_conv) {
+        use_direct_conv = bdirectconv;
+        printf("DIRECTCONV<%s>\n", pDirectConv);
+        printf("DIRECTCONV: %x\n", use_direct_conv);
+      }
+    }
+    int pad_data[2], kernel[2];
+    pad_data[1] = args.pad_rows;
+    pad_data[0] = args.pad_cols;
+    kernel[1] = args.filter_rows;
+    kernel[0] = args.filter_cols;
+    if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
+                             pad_data[0] == 0 && pad_data[1] == 0) ||
+                            (kernel[0] == 3 && kernel[1] == 3 &&
+                             pad_data[0] <= 1 && pad_data[1] <= 1))) {
+      setConvMethod();  // NEDirectConvolutionLayer only for 1x1 and 3x3
+    }
+  }
+  void AclParametersByContext(const FusionConvAddParam& param) {
+    const Tensor* input = param.Input();
+    Tensor filter = *param.Filter();
+    Tensor* output = param.Output();
+    Tensor* bias;
+    int groups = param.Groups();
+    std::vector<int> strides = param.Strides();
+    std::vector<int> paddings = param.Paddings();
+    std::vector<int> dilations = param.Dilations();
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>();
+    const T* weight_data = filter.data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.weight_data = (void*)weight_data;
+    args.biases_data = nullptr;
+    try {
+      bias = param.Bias();
+    } catch (const std::exception& e) {
+    }
+    if (bias) {
+      const T* biases_data = bias->data<T>();
+      args.biases_data = (void*)biases_data;
+    }
+    args.num_group = groups;
+    args.dilation_rows = dilations[0];
+    args.dilation_cols = dilations[1];
+    if (dilations[0] != 1 || dilations[1] != 1) {
+      args.dilated = true;
+    }
+    // NCHW
+    // std::cout << "In dims: " << (input->dims()).size() << std::endl;
+    args.batch = input->dims()[0];
+    args.in_depth = input->dims()[1];
+    args.in_rows = input->dims()[2];
+    args.in_cols = input->dims()[3];
+    // std::cout <<"In N: " << args.batch << " C: " <<  args.in_depth
+    //  << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
+    // NCHW
+    // std::cout << "Out dims: " << (output->dims()).size() << std::endl;
+    args.out_num = output->dims()[0];
+    args.out_depth = output->dims()[1];
+    args.out_rows = output->dims()[2];
+    args.out_cols = output->dims()[3];
+    // std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
+    //  << " C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+    // MCHW = OIHW
+    args.filter_rows = filter.dims()[2];
+    args.filter_cols = filter.dims()[3];
+    // std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
+    //  << " I: " <<  static_cast<int>(filter.dims()[1])
+    //  << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
+    // strides(h_stride, w_stride)
+    args.stride_rows = strides[0];
+    args.stride_cols = strides[1];
+    // std::cout <<"Stride H: " << args.stride_rows << " W: " <<
+    // args.stride_cols << "\n";
+    // paddings(h_pad, w_pad)
+    args.pad_rows = paddings[0];
+    args.pad_cols = paddings[1];
+    // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
+    // "\n";
+  }
+  acl::AclParameters args;
+};
+template <>
+bool ConvAddKernel<GPU_MALI, float>::Init(
+    const FusionConvAddParam& param) const {
+  AclConvAddOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclConvAddOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void ConvAddKernel<GPU_MALI, float>::Compute(
+    const FusionConvAddParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclConvAddOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
+}
+template class ConvAddKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/conv_kernel.cpp
+++ b/src/operators/kernel/mali/conv_kernel.cpp
@@ -15,20 +15,213 @@ limitations under the License. */
 #ifdef CONV_OP
 #include "operators/kernel/conv_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
 namespace paddle_mobile {
 namespace operators {
+template <typename DeviceType, typename T>
+class AclConvOp : public acl::ACLOperator {
+ public:
+  AclConvOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
+  }
+  ~AclConvOp() = default;
+  AclConvOp(const AclConvOp&) = delete;
+  AclConvOp& operator=(const AclConvOp&) = delete;
+  AclConvOp(AclConvOp&&) = delete;
+  AclConvOp& operator=(AclConvOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const ConvParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth, args.batch);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth, args.out_num);
+    arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
+                                           args.in_depth / args.num_group,
+                                           args.out_depth);
+    // arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::PadStrideInfo conv_info(
+        args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
+        arm_compute::DimensionRoundingType::FLOOR);
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    check_direct_conv();
+    //[kernel_x, kernel_y, IFM, OFM]
+    new_tensor(weights(), weights_shape, args.weight_data);
+    //[OFM]
+    // if (args.biases_data) {
+    //    new_tensor(biases(),biases_shape,args.biases_data);
+    //}
+    group() = args.num_group;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(conv, this, conv_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const ConvParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_ || args.num_group >= 5) {
+      bypass_acl = true;
+    }
+    if (args.dim > 2) {
+      bypass_acl = true;
+    }
+    if (args.dilated) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void check_direct_conv() {
+    bool use_direct_conv = false;
+    const char* pDirectConv;
+    pDirectConv = getenv("DIRECTCONV");
+    if (pDirectConv) {
+      unsigned int bdirectconv;
+      sscanf(pDirectConv, "%i", &bdirectconv);
+      if (bdirectconv != use_direct_conv) {
+        use_direct_conv = bdirectconv;
+        printf("DIRECTCONV<%s>\n", pDirectConv);
+        printf("DIRECTCONV: %x\n", use_direct_conv);
+      }
+    }
+    int pad_data[2], kernel[2];
+    pad_data[1] = args.pad_rows;
+    pad_data[0] = args.pad_cols;
+    kernel[1] = args.filter_rows;
+    kernel[0] = args.filter_cols;
+    if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
+                             pad_data[0] == 0 && pad_data[1] == 0) ||
+                            (kernel[0] == 3 && kernel[1] == 3 &&
+                             pad_data[0] <= 1 && pad_data[1] <= 1))) {
+      setConvMethod();  // NEDirectConvolutionLayer only for 1x1 and 3x3
+    }
+  }
+  void AclParametersByContext(const ConvParam& param) {
+    const Tensor* input = param.Input();
+    Tensor filter = *param.Filter();
+    Tensor* output = param.Output();
+    int groups = param.Groups();
+    std::vector<int> strides = param.Strides();
+    std::vector<int> paddings = param.Paddings();
+    std::vector<int> dilations = param.Dilations();
+    const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>();
+    const T* weight_data = filter.data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.weight_data = (void*)weight_data;
+    args.biases_data = nullptr;
+    // try {
+    //     bias = context.Input<framework::Tensor>("Bias");
+    // } catch (const std::exception& e) {
+    // }
+    // if (bias) {
+    //     const T* biases_data = bias->data<T>();
+    //     args.biases_data = (void*)biases_data;
+    // }
+    args.num_group = groups;
+    args.dilation_rows = dilations[0];
+    args.dilation_cols = dilations[1];
+    if (dilations[0] != 1 || dilations[1] != 1) {
+      args.dilated = true;
+    }
+    // NCHW
+    // std::cout << "In dims: " << (input->dims()).size() << std::endl;
+    args.batch = input->dims()[0];
+    args.in_depth = input->dims()[1];
+    args.in_rows = input->dims()[2];
+    args.in_cols = input->dims()[3];
+    std::cout << "In N: " << args.batch << " C: " << args.in_depth
+              << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
+    // NCHW
+    // std::cout << "Out dims: " << (output->dims()).size() << std::endl;
+    args.out_num = output->dims()[0];
+    args.out_depth = output->dims()[1];
+    args.out_rows = output->dims()[2];
+    args.out_cols = output->dims()[3];
+    // std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
+    //  << " C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+    // MCHW = OIHW
+    args.filter_rows = filter.dims()[2];
+    args.filter_cols = filter.dims()[3];
+    // std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
+    //  << " I: " <<  static_cast<int>(filter.dims()[1])
+    //  << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
+    // strides(h_stride, w_stride)
+    args.stride_rows = strides[0];
+    args.stride_cols = strides[1];
+    // std::cout <<"Stride H: " << args.stride_rows << " W: " <<
+    // args.stride_cols << "\n";
+    // paddings(h_pad, w_pad)
+    args.pad_rows = paddings[0];
+    args.pad_cols = paddings[1];
+    // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
+    // "\n";
+  }
+  acl::AclParameters args;
+};
 template <>
-bool ConvKernel<GPU_MALI, float>::Init(const ConvParam &para) const {
+bool ConvKernel<GPU_MALI, float>::Init(const ConvParam& param) const {
+  AclConvOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclConvOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
  return true;
 }
 template <>
-void ConvKernel<GPU_MALI, float>::Compute(const ConvParam &param) const {
+void ConvKernel<GPU_MALI, float>::Compute(const ConvParam& param) const {
-  //  ArmConvImplement imp;
+  std::cout << "init acl" << std::endl;
-  //  imp.Compute(param);
+  AclConvOp<GPU_MALI, float>* acl_op =
-  param.Output()->mutable_data<float>()[0] = 100.0;
+      reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
 }
 template class ConvKernel<GPU_MALI, float>;
@@ -36,3 +229,4 @@ template class ConvKernel<GPU_MALI, float>;
 }  // namespace paddle_mobile
 #endif
+#endif
--- a/src/operators/kernel/mali/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef ELEMENTWISEADD_OP
+#pragma once
+#include "operators/kernel/elementwise_add_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename T>
+struct AddFunctor {
+  inline T operator()(T a, T b) const { return a + b; }
+};
+template <>
+bool ElementwiseAddKernel<GPU_MALI, float>::Init(
+    const ElementwiseAddParam &para) const {
+  return true;
+}
+template <>
+void ElementwiseAddKernel<GPU_MALI, float>::Compute(
+    const ElementwiseAddParam &param) const {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  Tensor *Out = param.Out();
+  Out->mutable_data<float>();
+  int axis = param.Axis();
+  ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
+                                                 AddFunctor<float>(), Out);
+}
+template class ElementwiseAddKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/fushion_fc_kernel.cpp
+++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_FC_OP
+#pragma once
+#include "operators/kernel/fusion_fc_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool FusionFcKernel<GPU_MALI, float>::Init(const FusionFcParam &para) const {
+  return true;
+}
+template <>
+void FusionFcKernel<GPU_MALI, float>::Compute(
+    const FusionFcParam &param) const {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  const Tensor *input_z = param.InputZ();
+  auto *input_z_data = input_z->data<float>();
+  int axis = param.Axis();
+  Tensor *out = param.Out();
+  auto *out_data = out->mutable_data<float>();
+  const Tensor x_matrix =
+      input_x->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
+          : *input_x;
+  const Tensor y_matrix =
+      input_y->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
+          : *input_y;
+  auto out_dim = out->dims();
+  if (out_dim.size() != 2) {
+    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+  }
+  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
+  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "inpu_z size must be 1");
+  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
+                        " out_dim.size must be 2.");
+  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
+  PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ")
+  int64_t classes = input_z->numel();
+  for (int i = 0; i < out_dim[0]; i++) {
+    memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
+  }
+  for (int i = 0; i < out->numel(); i++) {
+    DLOG << out_data[i];
+  }
+  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+                      out, static_cast<float>(1));
+  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
+  //            if (out_dim.size() != 2) {
+  //                out->Resize(out_dim);
+  //            }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/lrn_kernel.cpp
+++ b/src/operators/kernel/mali/lrn_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef LRN_OP
+#pragma once
+#include "operators/kernel/lrn_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclLrnOp : public acl::ACLOperator {
+ public:
+  AclLrnOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN;
+  }
+  ~AclLrnOp() = default;
+  AclLrnOp(const AclLrnOp&) = delete;
+  AclLrnOp& operator=(const AclLrnOp&) = delete;
+  AclLrnOp(AclLrnOp&&) = delete;
+  AclLrnOp& operator=(AclLrnOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const LrnParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape shape(args.in_cols, args.in_rows, args.in_depth);
+    if (is_operator_init_done(shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    arm_compute::NormalizationLayerInfo norm_info(
+        arm_compute::NormType::CROSS_MAP, args.nsize, args.alpha, args.beta,
+        args.knorm);
+    //[width, height, IFM]
+    new_tensor(input(), shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), shape, args.output_data);
+    acl_configure(lrn, this, norm_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const LrnParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const LrnParam& param) {
+    const Tensor* in_x = param.InputX();
+    Tensor* out = param.Out();
+    int n = param.N();
+    T alpha = param.Alpha();
+    T beta = param.Beta();
+    T k = param.K();
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.nsize = n;
+    args.alpha = alpha;
+    args.beta = beta;
+    args.knorm = k;
+    // NCHW
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.in_rows = in_x->dims()[2];
+    args.in_cols = in_x->dims()[3];
+    // std::cout
+    //  << "Out C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+  }
+  acl::AclParameters args;
+};
+template <>
+bool LrnKernel<GPU_MALI, float>::Init(const LrnParam& param) const {
+  AclLrnOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclLrnOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void LrnKernel<GPU_MALI, float>::Compute(const LrnParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclLrnOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  for (int n = 0; n < args.batch; ++n) {
+    acl_op->RunAcl((void*)input_data, (void*)output_data);
+    input_data += args.in_depth * args.in_cols * args.in_rows;
+    output_data += args.in_depth * args.in_cols * args.in_rows;
+  }
+}
+template class LrnKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/mul_kernel.cpp
+++ b/src/operators/kernel/mali/mul_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef MUL_OP
+#pragma once
+#include "operators/kernel/mul_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool MulKernel<GPU_MALI, float>::Init(const MulParam &para) const {
+  return true;
+}
+template <>
+void MulKernel<GPU_MALI, float>::Compute(const MulParam &param) const {
+  const Tensor *input_x = param.InputX();
+  const Tensor *input_y = param.InputY();
+  Tensor *out = param.Out();
+  out->mutable_data<float>();
+  const Tensor x_matrix =
+      input_x->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
+          : *input_x;
+  const Tensor y_matrix =
+      input_y->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
+          : *input_y;
+  auto out_dim = out->dims();
+  if (out_dim.size() != 2) {
+    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+  }
+  math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
+                      out, static_cast<float>(0));
+  if (out_dim.size() != 2) {
+    out->Resize(out_dim);
+  }
+}
+template class MulKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/pool_kernel.cpp
+++ b/src/operators/kernel/mali/pool_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef POOL_OP
+#pragma once
+#include "operators/kernel/pool_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclPoolOp : public acl::ACLOperator {
+ public:
+  AclPoolOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING;
+  }
+  ~AclPoolOp() = default;
+  AclPoolOp(const AclPoolOp&) = delete;
+  AclPoolOp& operator=(const AclPoolOp&) = delete;
+  AclPoolOp(AclPoolOp&&) = delete;
+  AclPoolOp& operator=(AclPoolOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const PoolParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
+                                         args.in_depth);
+    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
+                                          args.out_depth);
+    // arm_compute::TensorShape weights_shape(
+    // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
+    // arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::PoolingLayerInfo pool_info;
+    if (args.pool_type == "max") {
+      pool_info = arm_compute::PoolingLayerInfo(
+          arm_compute::PoolingType::MAX, args.filter_rows,
+          arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
+                                     args.pad_cols, args.pad_rows,
+                                     arm_compute::DimensionRoundingType::CEIL));
+    } else {
+      pool_info = arm_compute::PoolingLayerInfo(
+          arm_compute::PoolingType::AVG, args.filter_rows,
+          arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
+                                     args.pad_cols, args.pad_rows,
+                                     arm_compute::DimensionRoundingType::CEIL));
+    }
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(pooling, this, pool_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const PoolParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    if (args.pool_type != "max" && args.pool_type != "avg") {
+      bypass_acl = true;
+    }
+    if (args.filter_rows != args.filter_cols) {
+      bypass_acl = true;
+    }
+    // if (args.filter_rows!=2 && args.filter_rows!=3) {
+    //     bypass_acl = true;
+    // }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const PoolParam& param) {
+    const Tensor* in_x = param.Input();
+    Tensor* out = param.Output();
+    std::string pooling_type = param.PoolingType();
+    std::vector<int> ksize = param.Ksize();
+    std::vector<int> strides = param.Strides();
+    std::vector<int> paddings = param.Paddings();
+    bool is_global_pooling = param.isGlobalPooling();
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.is_global_pool = is_global_pooling;
+    args.pool_type = pooling_type;
+    args.filter_rows = ksize[0];
+    args.filter_cols = ksize[1];
+    args.dim = ksize.size();
+    // NCHW
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.in_rows = in_x->dims()[2];
+    args.in_cols = in_x->dims()[3];
+    // std::cout <<"In N: " << args.batch << " C: " <<  args.in_depth
+    //  << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
+    // NCHW
+    // std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
+    //  << " C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+    // MCHW = OIHW
+    // std::cout <<"Filter O: " << static_cast<int>(filter->dims()[0])
+    //  << " I: " <<  static_cast<int>(filter->dims()[1])
+    //  << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
+    // strides(h_stride, w_stride)
+    args.stride_rows = strides[0];
+    args.stride_cols = strides[1];
+    // std::cout <<"PoolingType: " << args.pool_type << "\n";
+    // std::cout <<"Stride H: " << args.stride_rows << " W: " <<
+    // args.stride_cols << "\n";
+    // paddings(h_pad, w_pad)
+    args.pad_rows = paddings[0];
+    args.pad_cols = paddings[1];
+    // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
+    // "\n";
+    args.out_depth = args.in_depth;
+    // args.out_rows = out->dims()[2];
+    // args.out_cols = out->dims()[3];
+    args.out_rows = static_cast<int>(ceil(static_cast<float>(args.in_rows +
+                                                             2 * args.pad_rows -
+                                                             args.filter_rows) /
+                                          args.stride_rows)) +
+                    1;
+    args.out_cols = static_cast<int>(ceil(static_cast<float>(args.in_cols +
+                                                             2 * args.pad_cols -
+                                                             args.filter_cols) /
+                                          args.stride_cols)) +
+                    1;
+    if (is_global_pooling) {
+      args.filter_rows = args.in_rows;
+      args.filter_cols = args.in_cols;
+      args.pad_rows = 0;
+      args.pad_cols = 0;
+    }
+  }
+  acl::AclParameters args;
+};
+template <>
+bool PoolKernel<GPU_MALI, float>::Init(const PoolParam& param) const {
+  AclPoolOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclPoolOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void PoolKernel<GPU_MALI, float>::Compute(const PoolParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclPoolOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  for (int n = 0; n < args.batch; ++n) {
+    acl_op->RunAcl((void*)input_data, (void*)output_data);
+    input_data += args.in_depth * args.in_cols * args.in_rows;
+    output_data += args.in_depth * args.out_cols * args.out_rows;
+  }
+}
+template class PoolKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/relu_kernel.cpp
+++ b/src/operators/kernel/mali/relu_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef RELU_OP
+#pragma once
+#include "operators/kernel/relu_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclReluOp : public acl::ACLOperator {
+ public:
+  AclReluOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU;
+  }
+  ~AclReluOp() = default;
+  AclReluOp(const AclReluOp&) = delete;
+  AclReluOp& operator=(const AclReluOp&) = delete;
+  AclReluOp(AclReluOp&&) = delete;
+  AclReluOp& operator=(AclReluOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const ReluParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape input_shape(args.in_cols * args.in_rows *
+                                         args.in_depth * args.batch);
+    arm_compute::TensorShape output_shape(args.in_cols * args.in_rows *
+                                          args.in_depth * args.out_num);
+    // arm_compute::TensorShape weights_shape(
+    // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
+    // arm_compute::TensorShape biases_shape(args.out_depth);
+    arm_compute::ActivationLayerInfo::ActivationFunction type;
+    type = arm_compute::ActivationLayerInfo::ActivationFunction::RELU;
+    arm_compute::ActivationLayerInfo act_info(type);
+    if (is_operator_init_done(input_shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    //[width, height, IFM]
+    new_tensor(input(), input_shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), output_shape, args.output_data);
+    acl_configure(activation, this, act_info);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const ReluParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const ReluParam& param) {
+    const auto* input_x = param.InputX();
+    auto* out = param.Out();
+    const T* input_data = input_x->data<T>();
+    T* output_data = out->mutable_data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    args.batch = input_x->dims()[0];
+    args.in_depth = input_x->dims()[1];
+    args.in_rows = input_x->dims()[2];
+    args.in_cols = input_x->dims()[3];
+    args.out_num = out->dims()[0];
+  }
+  acl::AclParameters args;
+};
+template <>
+bool ReluKernel<GPU_MALI, float>::Init(const ReluParam& param) const {
+  AclReluOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclReluOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void ReluKernel<GPU_MALI, float>::Compute(const ReluParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclReluOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  acl_op->RunAcl((void*)input_data, (void*)output_data);
+}
+template class ReluKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/kernel/mali/reshape_kernel.cpp
+++ b/src/operators/kernel/mali/reshape_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef RESHAPE_OP
+#pragma once
+#include "operators/kernel/reshape_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool ReshapeKernel<GPU_MALI, float>::Init(const ReshapeParam &para) const {
+  return true;
+}
+template <>
+void ReshapeKernel<GPU_MALI, float>::Compute(const ReshapeParam &param) const {
+  const auto *input_x = param.InputX();
+  const auto &input_x_dims = input_x->dims();
+  auto *out = param.Out();
+  framework::DDim out_dims = out->dims();
+  const auto *input_shape = param.InputShape();
+  if (input_shape) {
+    auto *shape_data = input_shape->data<int>();
+    framework::Tensor cpu_shape_tensor;
+    auto shape =
+        std::vector<int>(shape_data, shape_data + input_shape->numel());
+    out_dims = ValidateShape(shape, input_x->dims());
+  }
+  bool inplace = param.Inplace();
+  out->Resize(out_dims);
+  if (!inplace) {
+    out->mutable_data<float>();
+    framework::TensorCopy(*input_x, out);
+    out->Resize(out_dims);
+  } else {
+    out->ShareDataWith(*input_x);
+    out->Resize(out_dims);
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/kernel/mali/softmax_kernel.cpp
+++ b/src/operators/kernel/mali/softmax_kernel.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SOFTMAX_OP
+#pragma once
+#include "operators/kernel/softmax_kernel.h"
+#ifdef PADDLE_MOBILE_MALI_GPU
+#include "acl_operator.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename DeviceType, typename T>
+class AclSoftmaxOp : public acl::ACLOperator {
+ public:
+  AclSoftmaxOp() {
+    this->force_bypass_acl_path_ =
+        bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX;
+  }
+  ~AclSoftmaxOp() = default;
+  AclSoftmaxOp(const AclSoftmaxOp&) = delete;
+  AclSoftmaxOp& operator=(const AclSoftmaxOp&) = delete;
+  AclSoftmaxOp(AclSoftmaxOp&&) = delete;
+  AclSoftmaxOp& operator=(AclSoftmaxOp&&) = delete;
+  acl::AclParameters& getargs() { return args; }
+  void InitAclLayer(const SoftmaxParam& param) {
+    setTargetHint(acl::TargetHint::OPENCL);
+    arm_compute::TensorShape shape(args.in_depth, args.batch);
+    if (is_operator_init_done(shape)) return;
+    set_operator_init_done();
+    this->force_bypass_acl_path_ = false;
+    //[width, height, IFM]
+    new_tensor(input(), shape, args.input_data);
+    //[width, height, OFM]
+    new_tensor(output(), shape, args.output_data);
+    acl_configure(softmax, this, NULL);
+  }
+  void RunAcl(void* input, void* output) {
+    acl::ACLOperator::acl_run(input, output);
+  }
+  bool Bypass_acl(const SoftmaxParam& param) {
+    bool bypass_acl = false;
+    AclParametersByContext(param);
+    // for performance, more groups impact GPU performance
+    if (this->force_bypass_acl_path_) {
+      bypass_acl = true;
+    }
+    return bypass_acl;
+  }
+ private:
+  void AclParametersByContext(const SoftmaxParam& param) {
+    const framework::Tensor* in_x = param.InputX();
+    framework::Tensor* out = param.Out();
+    auto x_dims = in_x->dims();
+    out->Resize(x_dims);
+    const T* input_data = in_x->data<T>();
+    T* output_data = out->data<T>();
+    args.input_data = (void*)input_data;
+    args.output_data = (void*)output_data;
+    // NCHW
+    args.batch = in_x->dims()[0];
+    args.in_depth = in_x->dims()[1];
+    args.out_num = out->dims()[0];
+    // std::cout
+    //  << "Out C: " <<  args.out_depth
+    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
+  }
+  acl::AclParameters args;
+};
+template <>
+bool SoftmaxKernel<GPU_MALI, float>::Init(const SoftmaxParam& param) const {
+  AclSoftmaxOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    acl_op = new AclSoftmaxOp<GPU_MALI, float>();
+    this->SetAclOp((void*)acl_op, (void*)this);
+  }
+  return true;
+}
+template <>
+void SoftmaxKernel<GPU_MALI, float>::Compute(const SoftmaxParam& param) const {
+  std::cout << "init acl" << std::endl;
+  AclSoftmaxOp<GPU_MALI, float>* acl_op =
+      reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
+  if (acl_op == nullptr) {
+    return;
+  }
+  if (acl_op->Bypass_acl(param)) {
+    std::cout << "init acl failed" << std::endl;
+    return;
+  }
+  acl::AclParameters& args = acl_op->getargs();
+  const float* input_data = (const float*)args.input_data;
+  const float* output_data = (const float*)args.output_data;
+  acl_op->InitAclLayer(param);
+  for (int n = 0; n < args.out_num; ++n) {
+    acl_op->RunAcl((void*)input_data, (void*)output_data);
+    input_data += args.in_depth;
+    output_data += args.in_depth;
+  }
+}
+template class SoftmaxKernel<GPU_MALI, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
+#endif
--- a/src/operators/lrn_op.cpp
+++ b/src/operators/lrn_op.cpp
@@ -34,6 +34,8 @@ USE_OP_CPU(lrn);
 REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(lrn);
+REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/mul_op.cpp
+++ b/src/operators/mul_op.cpp
@@ -60,6 +60,8 @@ USE_OP_CPU(mul);
 REGISTER_OPERATOR_CPU(mul, ops::MulOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(mul);
+REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/pool_op.cpp
+++ b/src/operators/pool_op.cpp
@@ -64,6 +64,8 @@ USE_OP_CPU(pool2d);
 REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(pool2d);
+REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/relu_op.cpp
+++ b/src/operators/relu_op.cpp
@@ -38,6 +38,8 @@ USE_OP_CPU(relu);
 REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(relu);
+REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
@@ -37,6 +37,8 @@ USE_OP_CPU(reshape);
 REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(reshape);
+REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -32,6 +32,8 @@ USE_OP_CPU(softmax);
 REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
+USE_OP_MALI_GPU(softmax);
+REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/tools/build.sh
+++ b/tools/build.sh
@@ -15,6 +15,7 @@ build_for_mac() {
    fi
    PLATFORM="x86"
    MODE="Release"
+    CXX_FLAGS="-std=c++11 -O3 -s"
    BUILD_DIR=../build/release/"${PLATFORM}"
    mkdir -p ${BUILD_DIR}/build
@@ -24,6 +25,7 @@ build_for_mac() {
    cmake .. \
        -B"${BUILD_DIR}" \
    	-DCMAKE_BUILD_TYPE="${MODE}" \
+    	-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
    	-DIS_MAC=true
    cd ${BUILD_DIR}
@@ -31,6 +33,8 @@ build_for_mac() {
 }
 build_for_android() {
+    export ANDROID_NDK=/home/halsay/android-ndk-r16b
+    export NDK_ROOT=/home/halsay/android-ndk-r16b
    rm -rf "../build"
    if [ -z "${ANDROID_NDK}" ]; then
        echo "ANDROID_NDK not found!"
@@ -44,11 +48,11 @@ build_for_android() {
    if [ "${PLATFORM}" = "arm-v7a" ]; then
        ABI="armeabi-v7a with NEON"
        ARM_PLATFORM="V7"
-        CXX_FLAGS="-march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security"
+        CXX_FLAGS="-O3 -std=c++11 -s -march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security"
    elif [ "${PLATFORM}" = "arm-v8a" ]; then
        ABI="arm64-v8a"
        ARM_PLATFORM="V8"
-        CXX_FLAGS="-march=armv8-a  -pie -fPIE -w -Wno-error=format-security -llog"
+        CXX_FLAGS="-std=c++11 -march=armv8-a  -pie -fPIE -w -Wno-error=format-security -llog -O0 -ggdb3  -fno-inline -g"
    else
        echo "unknown platform!"
        exit -1
@@ -56,7 +60,7 @@ build_for_android() {
    MODE="Release"
-    ANDROID_PLATFORM_VERSION="android-15"
+    ANDROID_PLATFORM_VERSION="android-22"
    TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
    ANDROID_ARM_MODE="arm"
    if [ $# -eq 1 ]; then
@@ -171,4 +175,4 @@ else
 		    build_error
 	    fi
 	fi
 fi
\ No newline at end of file