Commit a29db744 authored by xiebaiyuan

remove mali

Parent: ad3844d6
@@ -9,7 +9,6 @@ option(WITH_TEST "build with unit tests" ON)
# select the platform to build
option(CPU "build with arm CPU support" ON)
-option(GPU_MALI "build with arm mali GPU support" OFF)
option(GPU_CL "build with OpenCL support" OFF)
option(FPGA "build with FPGA support" OFF)
if(FPGA)
@@ -97,31 +96,6 @@ else()
endforeach()
endif()
-if (GPU_MALI)
-add_definitions(-DPADDLE_MOBILE_MALI_GPU)
-add_definitions(-DUSE_ACL=1)
-add_definitions(-DUSE_OPENCL)
-set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android)
-include_directories(${ACL_ROOT} ${ACL_ROOT}/include)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
-else()
-file(GLOB_RECURSE _tmp_list src/operators/kernel/mali/*.cpp src/operators/kernel/mali/*.cc)
-foreach(f ${_tmp_list})
-list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
-endforeach()
-file(GLOB_RECURSE _tmp_list_h src/operators/kernel/mali/*.h)
-foreach(f ${_tmp_list_h})
-list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
-endforeach()
-endif()
if(FPGA)
add_definitions(-DPADDLE_MOBILE_FPGA)
file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/kernel/fpga/*.cc)
......
@@ -46,7 +46,6 @@ root@5affd29d4fc5:/ # ccmake .
DEBUGING ON
FPGA OFF
LOG_PROFILE ON
-MALI_GPU OFF
NET googlenet
USE_EXCEPTION ON
USE_OPENMP OFF
......
@@ -109,11 +109,6 @@ USE_OP_CPU(conv2d);
REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(conv2d);
-REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
-#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(conv2d);
REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
......
@@ -23,15 +23,6 @@ limitations under the License. */
#define LOAD_CPU_OP(op_type)
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#define LOAD_MALI_GPU_OP(op_type) \
-extern int TouchOpRegistrar_##op_type##_##mali_gpu(); \
-static int use_op_itself_##op_type##_##mali_gpu __attribute__((unused)) = \
-TouchOpRegistrar_##op_type##_##mali_gpu()
-#else
-#define LOAD_MALI_GPU_OP(op_type)
-#endif
#ifdef PADDLE_MOBILE_FPGA
#define LOAD_FPGA_OP(op_type) \
extern int TouchOpRegistrar_##op_type##_##fpga(); \
@@ -46,9 +37,8 @@ limitations under the License. */
static int use_fusion_matcher_itself_##op_type __attribute__((unused)) = \
TouchFusionMatcherRegistrar_##op_type();
#define LOAD_OP(op_type) \
LOAD_CPU_OP(op_type); \
-LOAD_MALI_GPU_OP(op_type); \
LOAD_FPGA_OP(op_type);
#define LOAD_OP1(op_type, device_type) LOAD_##device_type##_OP(op_type);
@@ -68,7 +58,7 @@ LOAD_OP(fetch)
LOAD_OP(fill_constant)
#endif
#ifdef BATCHNORM_OP
-LOAD_OP2(batch_norm, CPU, MALI_GPU);
+LOAD_OP1(batch_norm, CPU);
#endif
#ifdef BILINEAR_INTERP_OP
LOAD_OP1(bilinear_interp, CPU);
@@ -77,40 +67,40 @@ LOAD_OP1(bilinear_interp, CPU);
LOAD_OP1(box_coder, CPU);
#endif
#ifdef CONCAT_OP
-LOAD_OP3(concat, CPU, MALI_GPU, FPGA);
+LOAD_OP2(concat, CPU, FPGA);
#endif
#ifdef CONV_OP
-LOAD_OP3(conv2d, CPU, MALI_GPU, FPGA);
+LOAD_OP2(conv2d, CPU, FPGA);
#endif
#ifdef LRN_OP
-LOAD_OP2(lrn, CPU, MALI_GPU);
+LOAD_OP1(lrn, CPU);
#endif
#ifdef SIGMOID_OP
LOAD_OP1(sigmoid, CPU);
#endif
#ifdef FUSION_FC_RELU_OP
-LOAD_OP3(fusion_fc_relu, CPU, MALI_GPU, FPGA);
+LOAD_OP3(fusion_fc_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_fc_relu);
#endif
#ifdef FUSION_ELEMENTWISEADDRELU_OP
-LOAD_OP3(fusion_elementwise_add_relu, CPU, MALI_GPU, FPGA);
+LOAD_OP3(fusion_elementwise_add_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_elementwise_add_relu);
#endif
#ifdef SPLIT_OP
LOAD_OP1(split, CPU);
#endif
#ifdef RESIZE_OP
-LOAD_OP2(resize, CPU, MALI_GPU);
+LOAD_OP1(resize, CPU);
#endif
#ifdef FUSION_CONVADDBNRELU_OP
LOAD_OP2(fusion_conv_add_bn_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_bn_relu);
#endif
#ifdef RESHAPE_OP
-LOAD_OP2(reshape, CPU, MALI_GPU);
+LOAD_OP1(reshape, CPU);
#endif
#ifdef RESHAPE2_OP
-LOAD_OP2(reshape2, CPU, MALI_GPU);
+LOAD_OP1(reshape2, CPU);
#endif
#ifdef TRANSPOSE_OP
LOAD_OP1(transpose, CPU);
@@ -126,11 +116,11 @@ LOAD_OP2(fusion_conv_add_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_relu);
#endif
#ifdef FUSION_CONVADD_OP
-LOAD_OP2(fusion_conv_add, CPU, MALI_GPU);
+LOAD_OP1(fusion_conv_add, CPU);
LOAD_FUSION_MATCHER(fusion_conv_add);
#endif
#ifdef SOFTMAX_OP
-LOAD_OP2(softmax, CPU, MALI_GPU);
+LOAD_OP1(softmax, CPU);
#endif
#ifdef SHAPE_OP
LOAD_OP1(shape, CPU);
@@ -142,13 +132,13 @@ LOAD_OP1(depthwise_conv2d, CPU);
LOAD_OP1(conv2d_transpose, CPU);
#endif
#ifdef SCALE_OP
-LOAD_OP2(scale, CPU, MALI_GPU);
+LOAD_OP1(scale, CPU);
#endif
#ifdef ELEMENTWISEADD_OP
-LOAD_OP2(elementwise_add, CPU, MALI_GPU);
+LOAD_OP1(elementwise_add, CPU);
#endif
#ifdef PRELU_OP
-LOAD_OP2(prelu, CPU, MALI_GPU);
+LOAD_OP1(prelu, CPU);
#endif
#ifdef FLATTEN_OP
LOAD_OP1(flatten, CPU);
@@ -182,13 +172,13 @@ LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu);
LOAD_OP1(crf_decoding, CPU);
#endif
#ifdef MUL_OP
-LOAD_OP2(mul, CPU, MALI_GPU);
+LOAD_OP1(mul, CPU);
#endif
#ifdef NORM_OP
LOAD_OP1(norm, CPU);
#endif
#ifdef RELU_OP
-LOAD_OP2(relu, CPU, MALI_GPU);
+LOAD_OP1(relu, CPU);
LOAD_OP1(relu6, CPU);
#endif
#ifdef IM2SEQUENCE_OP
@@ -198,11 +188,11 @@ LOAD_OP1(im2sequence, CPU);
LOAD_OP1(lookup_table, CPU);
#endif
#ifdef FUSION_FC_OP
-LOAD_OP3(fusion_fc, CPU, MALI_GPU, FPGA);
+LOAD_OP2(fusion_fc, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_fc);
#endif
#ifdef POOL_OP
-LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA);
+LOAD_OP2(pool2d, CPU, FPGA);
#endif
#ifdef MULTICLASSNMS_OP
LOAD_OP1(multiclass_nms, CPU);
@@ -217,7 +207,7 @@ LOAD_OP1(sum, CPU);
LOAD_OP1(elementwise_mul, CPU);
#endif
#ifdef SLICE_OP
-LOAD_OP2(slice, CPU, MALI_GPU);
+LOAD_OP1(slice, CPU);
#endif
#ifdef FUSION_CONVBN_OP
LOAD_OP2(fusion_conv_bn, CPU, FPGA);
......
@@ -115,9 +115,6 @@ class OpRegistry {
#define REGISTER_OPERATOR_CPU(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, cpu, paddle_mobile::CPU);
-#define REGISTER_OPERATOR_MALI_GPU(op_type, op_class) \
-REGISTER_OPERATOR(op_type, op_class, mali_gpu, paddle_mobile::GPU_MALI);
#define REGISTER_OPERATOR_FPGA(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);
......
@@ -130,13 +130,6 @@ class OpKernelBase {
}
#endif
-#ifdef PADDLE_McOBILE_MALI_GPU
-OpKernelBase() { acl_op_ = nullptr; }
-void *GetAclOp() const { return acl_op_; }
-void SetAclOp(void *op, void *ob) const {
-reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
-}
-#endif
virtual void Compute(const P &para) = 0;
virtual bool Init(P *para) { return true; }
virtual ~OpKernelBase() = default;
@@ -147,9 +140,6 @@ class OpKernelBase {
#endif
private:
-#ifdef PADDLE_MOBILE_MALI_GPU
-void *acl_op_;
-#endif
};
class FusionOpMatcher {
......
@@ -55,9 +55,6 @@ namespace ops = paddle_mobile::operators;
REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
REGISTER_OPERATOR_CPU(relu6, ops::Relu6Op);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(relu, ops::ReluOp);
#endif
......
@@ -34,9 +34,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
-#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
@@ -48,8 +48,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(bilinear_interp, ops::BilinearOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
@@ -58,8 +58,6 @@ REGISTER_OPERATOR_CPU(box_coder, ops::BoxCoderOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(box_coder, ops::BoxCoderOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
@@ -69,9 +69,7 @@ REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(concat, ops::ConcatOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp);
#endif
......
@@ -55,9 +55,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
#endif
......
@@ -24,8 +24,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(conv2d_transpose, ops::ConvOpTranspose);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(conv2d_transpose, ops::ConvOpTranspose);
#endif
......
@@ -48,8 +48,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(crf_decoding, ops::CrfOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
@@ -33,9 +33,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
-#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp);
......
@@ -32,9 +32,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(elementwise_mul, ops::ElementwiseMulOp);
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(elementwise_mul, ops::ElementwiseMulOp);
#endif
......
@@ -32,9 +32,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(elementwise_sub, ops::ElementwiseSubOp);
-#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
@@ -38,9 +38,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(feed, ops::FeedOp);
#endif
......
@@ -30,9 +30,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp);
#endif
......
@@ -54,9 +54,6 @@ REGISTER_FUSION_MATCHER(fusion_conv_add, ops::FusionConvAddMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp);
-#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp);
......
@@ -24,8 +24,7 @@ namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_deconv_add_bn, ops::FusionDeconvAddBNMatcher);
#ifdef PADDLE_MOBILE_CPU
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn, ops::FusionDeconvAddBNOp);
#endif
......
@@ -25,8 +25,7 @@ REGISTER_FUSION_MATCHER(fusion_deconv_add_bn_relu,
ops::FusionDeconvAddBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn_relu, ops::FusionDeconvAddBNReluOp);
#endif
......
@@ -24,8 +24,7 @@ namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_deconv_add, ops::FusionDeconvAddMatcher);
#ifdef PADDLE_MOBILE_CPU
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_add, ops::FusionDeconvAddOp);
#endif
......
@@ -25,8 +25,7 @@ REGISTER_FUSION_MATCHER(fusion_deconv_add_relu,
ops::FusionDeconvAddReluMatcher);
#ifdef PADDLE_MOBILE_CPU
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_add_relu, ops::FusionDeconvAddReluOp);
#endif
......
@@ -24,8 +24,7 @@ namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_deconv_bn_relu, ops::FusionDeconvBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_bn_relu, ops::FusionDeconvBNReluOp);
#endif
......
@@ -23,8 +23,7 @@ namespace operators {}
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_deconv_relu, ops::FusionDeconvReluOp);
#endif
......
@@ -36,10 +36,6 @@ REGISTER_FUSION_MATCHER(fusion_elementwise_add_relu,
// REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu,
// ops::FusionElementwiseAddReluOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-// REGISTER_OPERATOR_MALI_GPU(fusion_elementwise_add_relu,
-// ops::FusionElementwiseAddReluOp);
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_elementwise_add_relu,
ops::FusionElementwiseAddReluOp);
......
@@ -63,9 +63,6 @@ REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_fc, ops::FusionFcOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp);
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_fc, ops::FusionFcOp);
#endif
......
@@ -60,9 +60,6 @@ REGISTER_FUSION_MATCHER(fusion_fc_relu, ops::FusionFcReluMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_fc_relu, ops::FusionFcReluOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(fusion_fc_relu, ops::FusionFcReluOp);
-#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_fc_relu, ops::FusionFcReluOp);
#endif
......
@@ -59,8 +59,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(gru_unit, ops::GruUnitOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
@@ -37,8 +37,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(increment, ops::IncrementOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
@@ -34,8 +34,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(is_empty, ops::IsEmptyOp);
#endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
......
Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if USE_ACL == 1
#include "acl_operator.h"
unsigned int bypass_acl_class_layer =
(0 | FLAGS_ENABLE_ACL_CONCAT |
/*0xffffffff |*/ /*FLAGS_ENABLE_ACL_FC |*/ /*FLAGS_ENABLE_ACL_LRN
|*/
0);
int enable_schedule = 0;
#ifdef USE_PROFILING
#include "arm_neon.h"
unsigned int acl_log_flags =
(0 | MASK_LOG_APP_TIME | /*MASK_LOG_ALLOCATE | */ /*MASK_LOG_ALLOCATE | */
/*MASK_LOG_RUN | */ /*MASK_LOG_CONFIG | */ /*MASK_LOG_COPY | */
MASK_LOG_ABSVAL | MASK_LOG_BNLL | MASK_LOG_CONV | MASK_LOG_FC |
MASK_LOG_LRN | MASK_LOG_POOLING | MASK_LOG_RELU | MASK_LOG_SIGMOID |
MASK_LOG_SOFTMAX | MASK_LOG_TANH | MASK_LOG_LC | MASK_LOG_BN |
MASK_LOG_CONCAT | 0);
#include <stdio.h> /* printf */
#include <stdlib.h> /* getenv */
#endif // USE_PROFILING
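// Set via AclEnableSchedule(); when scheduling is enabled, operators are
// forced onto the GPU target (see the ENABLESCHEDULE handling below).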
static bool force_enable_gpu = false;
bool AclEnableSchedule(int enable) {
enable_schedule = enable;
if (enable) {
force_enable_gpu = true;
}
return true;
}
int isScheduleEnable() { return enable_schedule; }
namespace paddle_mobile {
namespace operators {
namespace acl {
bool ACLOperator::init_gpu_env = true;
#ifdef USE_OPENCL
bool ACLOperator::support_opencl_ = false;
bool opencl_is_available() { return arm_compute::opencl_is_available(); }
#elif defined(USE_OPENGLES)
bool ACLOperator::support_opengles_ = false;
#endif
ACLOperator::ACLOperator(bool is_gpu)
: operator_state_(operator_not_init),
force_bypass_acl_path_(false),
target_hint_(TargetHint::DONT_CARE),
convolution_method_hint_(ConvolutionMethodHint::GEMM),
_group(1),
name_(""),
input_idx_(0),
output_idx_(0),
is_gpu_(is_gpu) {
const char* pBypassACL;
if (init_gpu_env) {
#ifdef USE_OPENCL
try {
if (opencl_is_available()) {
arm_compute::CLScheduler::get().default_init();
support_opencl_ = true;
}
} catch (std::exception& e) {
support_opencl_ = false;
}
#elif defined(USE_OPENGLES)
try {
arm_compute::GCScheduler::get().default_init();
support_opengles_ = true;
} catch (std::exception& e) {
support_opengles_ = false;
}
#endif
init_gpu_env = false;
}
if (force_enable_gpu) is_gpu_ = true;
pBypassACL = getenv("BYPASSACL");
if (pBypassACL) {
unsigned int bacl;
sscanf(pBypassACL, "%i", &bacl);
if (bacl != bypass_acl_class_layer) {
bypass_acl_class_layer = bacl;
printf("BYPASSACL<%s>\n", pBypassACL);
printf("BYPASSACL: %x\n", bypass_acl_class_layer);
}
}
#ifdef USE_PROFILING
const char* pLogACL;
pLogACL = getenv("LOGACL");
if (pLogACL) {
unsigned int alf;
sscanf(pLogACL, "%i", &alf);
if (alf != acl_log_flags) {
acl_log_flags = alf;
printf("LOGACL<%s>\n", pLogACL);
printf("LOGACL: %x\n", acl_log_flags);
}
}
#endif // USE_PROFILING
const char* pEnableSchedule;
pEnableSchedule = getenv("ENABLESCHEDULE");
if (pEnableSchedule) {
int bshedule;
sscanf(pEnableSchedule, "%i", &bshedule);
if (bshedule != enable_schedule) {
enable_schedule = bshedule;
printf("ENABLESCHEDULE<%s>\n", pEnableSchedule);
printf("ENABLESCHEDULE: %x\n", enable_schedule);
}
if (enable_schedule) {
AclEnableSchedule(1);
}
}
}
ACLOperator::~ACLOperator() {}
bool ACLOperator::new_tensor(std::unique_ptr<ACLTensor>& tensor,
arm_compute::TensorShape& shape, void* mem,
bool commit) {
auto acl_tensor =
new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32));
acl_tensor->set_target(getTargetHint());
acl_tensor->bindmem(mem);
if (commit) acl_tensor->commit();
tensor = (std::unique_ptr<ACLTensor>)std::move(acl_tensor);
return true;
}
bool ACLOperator::new_tensor(std::unique_ptr<ACLSubTensor>& tensor,
std::unique_ptr<ACLTensor>& parent,
arm_compute::TensorShape& shape,
arm_compute::Coordinates& coord) {
auto acl_tensor = new ACLSubTensor(parent, shape, coord);
acl_tensor->set_target(getTargetHint());
tensor = (std::unique_ptr<ACLSubTensor>)std::move(acl_tensor);
return true;
}
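// Lazily allocates the underlying ACL tensor on first commit and, for
// non-output tensors, copies the bound host memory into it.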
void ACLTensor::commit(TensorType type) {
settensortype(type);
if (mem_) {
if (!allocate_) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_ALLOCATE_INFO);
#endif // USE_PROFILING
allocate();
allocate_ = true;
}
if (type_ != tensor_output) {
tensor_copy(mem_);
}
mem_ = nullptr;
}
}
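// Row-by-row copy between a host buffer and an ACL tensor using a window
// iterator; toTensor == true copies mem -> tensor, otherwise tensor -> mem.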
int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor, void* mem,
bool toTensor) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_COPY_INFO);
#endif // USE_PROFILING
arm_compute::Window window;
// Iterate through the rows (not each element)
window.use_tensor_dimensions(tensor->info()->tensor_shape(),
/* first_dimension =*/arm_compute::Window::DimY);
int width = tensor->info()->tensor_shape()[0];
int height = tensor->info()->tensor_shape()[1];
int deepth = tensor->info()->tensor_shape()[2];
map();
// Create an iterator:
arm_compute::Iterator it(tensor, window);
// Except it works for an arbitrary number of dimensions
if (toTensor) { // mem->tensor
arm_compute::execute_window_loop(
window,
[&](const arm_compute::Coordinates& id) {
memcpy(it.ptr(),
((char*)mem) +
((id[3] * (width * height * deepth) +
id.z() * (width * height) + id.y() * width + id.x()) *
tensor->info()->element_size()),
width * tensor->info()->element_size());
},
it);
} else { // tensor-->mem
arm_compute::execute_window_loop(
window,
[&](const arm_compute::Coordinates& id) {
memcpy(((char*)mem) + ((id[3] * (width * height * deepth) +
id.z() * (width * height) + id.y() * width) *
tensor->info()->element_size()),
it.ptr(), width * tensor->info()->element_size());
},
it);
}
unmap();
return 0;
}
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef ACL_OPERATOR_H_
#define ACL_OPERATOR_H_
#include <framework/tensor.h>
#include <operators/op_param.h>
#if USE_ACL == 1
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
#endif
#ifdef USE_OPENGLES
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
#endif
#include "acl_tensor.h"
#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001
#define FLAGS_ENABLE_ACL_BNLL 0x00000002
#define FLAGS_ENABLE_ACL_CONV 0x00000004
#define FLAGS_ENABLE_ACL_FC 0x00000008
#define FLAGS_ENABLE_ACL_LRN 0x00000010
#define FLAGS_ENABLE_ACL_POOLING 0x00000020
#define FLAGS_ENABLE_ACL_RELU 0x00000040
#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080
#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100
#define FLAGS_ENABLE_ACL_TANH 0x00000200
#define FLAGS_ENABLE_ACL_LC 0x00000400
#define FLAGS_ENABLE_ACL_BN 0x00000800
#define FLAGS_ENABLE_ACL_CONCAT 0x00001000
extern unsigned int bypass_acl_class_layer;
#ifdef USE_PROFILING
#include <sys/time.h>
#define NANO_SEC_CONV 1000000
#define MASK_LOG_APP_TIME 0x00000001
#define MASK_LOG_ALLOCATE 0x00000002
#define MASK_LOG_RUN 0x00000004
#define MASK_LOG_CONFIG 0x00000008
#define MASK_LOG_COPY 0x00000010
#define MASK_LOG_ABSVAL 0x00000020
#define MASK_LOG_BNLL 0x00000040
#define MASK_LOG_CONV 0x00000080
#define MASK_LOG_FC 0x00000100
#define MASK_LOG_LRN 0x00000200
#define MASK_LOG_POOLING 0x00000400
#define MASK_LOG_RELU 0x00000800
#define MASK_LOG_SIGMOID 0x00001000
#define MASK_LOG_SOFTMAX 0x00002000
#define MASK_LOG_TANH 0x00004000
#define MASK_LOG_LC 0x00008000
#define MASK_LOG_BN 0x00010000
#define MASK_LOG_CONCAT 0x00020000
#define APP_TIME_INFO MASK_LOG_APP_TIME, "time: \t"
#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE, "allocate: \t\t"
#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t"
#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t"
#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t"
#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t"
#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t"
#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t"
#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t"
#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t"
#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_SIGMOID_INFO \
MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_SOFTMAX_INFO \
MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_TANH_INFO \
MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_BN_INFO \
MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
#define ACL_CONCAT_INFO \
MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
extern unsigned int acl_log_flags;
class logtime_util {
public:
logtime_util() { mask = 0; }
logtime_util(int mask_, const char *information_) {
setlogtime_info(mask_, information_);
}
void setlogtime_info(int mask_, const char *information_) {
mask = mask_;
if (acl_log_flags & mask) {
strncpy(information, information_, 255);
gettimeofday(&tv[0], NULL);
}
}
~logtime_util() {
if (acl_log_flags & mask) {
int time[2];
gettimeofday(&tv[1], NULL);
time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec;
time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec;
printf("%s %.6lf\n", information,
(((double)time[1] - time[0]) / NANO_SEC_CONV));
}
}
void log_time(bool start) {
if (acl_log_flags & mask) {
if (start) {
gettimeofday(&tv[0], NULL);
} else {
int time[2];
gettimeofday(&tv[1], NULL);
time[0] = tv[0].tv_sec * NANO_SEC_CONV + tv[0].tv_usec;
time[1] = tv[1].tv_sec * NANO_SEC_CONV + tv[1].tv_usec;
printf("%s %.6lf\n", information,
(((double)time[1] - time[0]) / NANO_SEC_CONV));
}
}
}
private:
struct timeval tv[2];
int mask;
char information[256];
};
#endif // USE_PROFILING
namespace paddle_mobile {
namespace operators {
namespace acl {
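// Plain parameter bundle (shapes, strides, pads and raw data pointers) used to
// configure an ACL operator from a paddle-mobile kernel.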
class AclParameters {
public:
AclParameters() {
dilated = false;
dim = 2;
num_group = 1;
}
int batch;
int in_depth;
int in_rows;
int in_cols;
int out_depth;
int out_rows;
int out_cols;
int out_num;
int filter_rows;
int filter_cols;
int stride_rows;
int stride_cols;
int pad_rows;
int pad_cols;
int dilation_rows;
int dilation_cols;
int num_group;
bool dilated;
int dim;
int epsilon;
int nsize;
float alpha;
float beta;
float knorm;
void *input_data;
void *output_data;
void *weight_data;
void *biases_data;
void *mean_data;
void *var_data;
std::string pool_type;
std::string act_type;
std::string data_layout;
bool is_global_pool;
bool is_channel_concat;
bool is_bypass;
std::vector<framework::LoDTensor *> in_tensor;
};
enum TensorType {
tensor_input,
tensor_output,
tensor_weights,
tensor_biases,
tensor_mean,
tensor_var,
tensor_beta,
tensor_gamma,
tensor_concat,
tensor_data,
};
enum OperatorState {
operator_not_init,
operator_init_done,
operator_reinit,
};
enum OperateType {
operate_type_pooling,
operate_type_activation,
operate_type_lrn,
operate_type_conv,
operate_type_lc,
operate_type_fc,
operate_type_bn,
operate_type_softmax,
operate_type_concat,
};
class BaseACLTensor {
public:
BaseACLTensor() : type_(tensor_input), allocate_(false) {}
virtual ~BaseACLTensor() {}
virtual void bindmem(void *mem) { mem_ = mem; }
virtual void settensortype(TensorType type) { type_ = type; }
virtual void map(bool blocking = true) {}
virtual void unmap() {}
virtual void commit(TensorType type = tensor_data) {}
int tensor_copy(arm_compute::ITensor *tensor, void *mem,
bool toTensor = true);
protected:
void *mem_;
TensorType type_;
bool allocate_;
};
class ACLTensor : public BaseACLTensor, public Tensor {
public:
explicit ACLTensor(arm_compute::TensorInfo &&info) : Tensor(info) {}
virtual void map(bool blocking = true) {
if (!allocate_) {
Tensor::allocate();
allocate_ = true;
}
Tensor::map(blocking);
}
virtual int tensor_copy(void *mem, bool toTensor = true) {
auto acl_tensor = this;
arm_compute::ITensor *tensor = acl_tensor->tensor();
BaseACLTensor::tensor_copy(tensor, mem, toTensor);
return 0;
}
virtual void unmap() { Tensor::unmap(); }
virtual void commit(TensorType type = tensor_data);
};
class ACLSubTensor : public BaseACLTensor, public SubTensor {
public:
ACLSubTensor(std::unique_ptr<ACLTensor> &parent,
arm_compute::TensorShape &shape, arm_compute::Coordinates &coord)
: SubTensor(parent.get(), shape, coord) {}
virtual int tensor_copy(void *mem, bool toTensor = true) { return 0; }
};
template <typename T>
class TensorPair {
public:
TensorPair() {}
~TensorPair() {}
TensorType type;
std::unique_ptr<T> tensor;
};
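// Returns the idx-th tensor of the requested type from the pool, appending an
// empty slot when it does not exist yet.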
template <typename T>
std::unique_ptr<T> &tensor_item(
std::vector<std::unique_ptr<TensorPair<T>>> &pool, TensorType type,
int idx) {
int count = 0;
for (auto &item : pool) {
if (item.get()->type == type) {
++count;
}
if (item.get()->type == type && idx == count - 1) {
return item.get()->tensor;
}
}
pool.push_back((std::unique_ptr<TensorPair<T>>)std::move(new TensorPair<T>));
auto item = pool[pool.size() - 1].get();
item->type = type;
item->tensor = NULL;
return item->tensor;
}
class ACLOperator {
public:
virtual void commit() {
for (auto &item : tensor_pool_) {
if (item.get()->tensor) item.get()->tensor->commit(item.get()->type);
}
}
inline void run() {
commit();
#ifdef USE_PROFILING
logtime_util log_time(ACL_RUN_INFO);
#endif // USE_PROFILING
for (auto &c : funcs_) {
c->run();
}
}
inline std::vector<std::unique_ptr<arm_compute::IFunction>> &funcs() {
return funcs_;
}
inline std::unique_ptr<ACLSubTensor> &sinput(int idx = 0) {
return tensor_item(subtensor_pool_, tensor_input, idx);
}
inline std::unique_ptr<ACLSubTensor> &soutput(int idx = 0) {
return tensor_item(subtensor_pool_, tensor_output, idx);
}
inline std::unique_ptr<ACLSubTensor> &sweights(int idx = 0) {
return tensor_item(subtensor_pool_, tensor_weights, idx);
}
inline std::unique_ptr<ACLSubTensor> &sbiases(int idx = 0) {
return tensor_item(subtensor_pool_, tensor_biases, idx);
}
inline std::unique_ptr<ACLTensor> &cinput(int idx = 0) {
return tensor_item(tensor_pool_, tensor_concat, idx);
}
inline std::unique_ptr<ACLTensor> &input(int idx = 0) {
return tensor_item(tensor_pool_, tensor_input, idx);
}
inline std::unique_ptr<ACLTensor> &output(int idx = 0) {
return tensor_item(tensor_pool_, tensor_output, idx);
}
inline std::unique_ptr<ACLTensor> &weights(int idx = 0) {
return tensor_item(tensor_pool_, tensor_weights, idx);
}
inline std::unique_ptr<ACLTensor> &biases(int idx = 0) {
return tensor_item(tensor_pool_, tensor_biases, idx);
}
inline std::unique_ptr<ACLTensor> &mean(int idx = 0) {
return tensor_item(tensor_pool_, tensor_mean, idx);
}
inline std::unique_ptr<ACLTensor> &var(int idx = 0) {
return tensor_item(tensor_pool_, tensor_var, idx);
}
inline std::unique_ptr<ACLTensor> &beta(int idx = 0) {
return tensor_item(tensor_pool_, tensor_beta, idx);
}
inline std::unique_ptr<ACLTensor> &gamma(int idx = 0) {
return tensor_item(tensor_pool_, tensor_gamma, idx);
}
inline std::unique_ptr<ACLTensor> &tensor(TensorType type) {
switch (type) {
case tensor_biases:
return biases();
break;
case tensor_weights:
return weights();
break;
case tensor_output:
return output();
break;
default:
case tensor_input:
return input();
break;
}
return input();
}
explicit ACLOperator(bool is_gpu = false);
virtual ~ACLOperator();
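// Resolves the effective execution target: an explicit hint wins; otherwise
// GPU operators map to OpenCL (or OPENGLES in GLES builds) and everything
// else falls back to NEON.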
inline TargetHint getTargetHint() {
#ifdef USE_OPENCL
if (target_hint_ == TargetHint::DONT_CARE) {
if (is_gpu_) {
return TargetHint::OPENCL;
}
return TargetHint::NEON;
}
return target_hint_;
#elif defined(USE_OPENGLES)
if (target_hint_ == TargetHint::DONT_CARE) {
if (is_gpu_) {
return TargetHint::OPENGLES;
}
return TargetHint::NEON;
}
return target_hint_;
#else
return TargetHint::NEON;
#endif
}
inline void setTargetHint(TargetHint hint) { target_hint_ = hint; }
inline ConvolutionMethodHint &getConvMethod() {
return convolution_method_hint_;
}
inline void setConvMethod() {
convolution_method_hint_ = ConvolutionMethodHint::DIRECT;
}
inline bool tensor_mem(std::unique_ptr<ACLTensor> &tensor, void *mem) {
tensor->bindmem(mem);
return true;
}
inline bool tensor_mem(void *mem, std::unique_ptr<ACLTensor> &tensor) {
tensor->tensor_copy(mem, false);
return true;
}
bool new_tensor(std::unique_ptr<ACLTensor> &tensor,
arm_compute::TensorShape &shape, void *mem = nullptr,
bool commit = false);
bool new_tensor(std::unique_ptr<ACLSubTensor> &tensor,
std::unique_ptr<ACLTensor> &parent,
arm_compute::TensorShape &shape,
arm_compute::Coordinates &coord);
inline int &group() { return _group; }
inline void set_operator_property(OperateType type, const char *name) {
name_ = name;
type_ = type;
}
inline void acl_run(void *input_data, void *output_data) {
if (input_data) tensor_mem(input(), input_data);
run();
tensor_mem(output_data, output());
}
inline int &input_idx() { return input_idx_; }
inline int &output_idx() { return output_idx_; }
protected:
inline bool isGPUMode() {
#ifdef USE_OPENCL
if (!support_opencl_) return false;
return getTargetHint() == TargetHint::OPENCL;
#elif defined(USE_OPENGLES)
if (!support_opengles_) return false;
return getTargetHint() == TargetHint::OPENGLES;
#endif
return false;
}
inline OperatorState &opstate() { return operator_state_; }
inline bool is_operator_init_done(arm_compute::TensorShape shape,
TensorType type = tensor_input) {
checkreshape(shape, type);
return operator_state_ == operator_init_done;
}
inline void set_operator_init_done() {
opstate() = operator_init_done;
set_bypass_state(false);
}
inline void set_bypass_state(bool state = false) {
force_bypass_acl_path_ = state;
}
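// Compares the configured tensor shape with the incoming one; a mismatch
// yields operator_reinit, which drops all tensors and functions via freeres()
// so the operator is rebuilt.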
inline OperatorState checkreshape(arm_compute::TensorShape shape,
TensorType type = tensor_input) {
opstate() = reshape(shape, type);
if (opstate() == operator_reinit) {
freeres();
}
return opstate();
}
inline OperatorState reshape(arm_compute::TensorShape &shape,
TensorType type) {
arm_compute::TensorShape _shape;
std::unique_ptr<ACLTensor> &acl_tensor = tensor(type);
if (!acl_tensor.get()) return operator_not_init;
_shape = acl_tensor->info().tensor_shape();
if (_shape.total_size() == shape.total_size() && _shape[0] == shape[0] &&
_shape[1] == shape[1]) {
return operator_init_done;
}
return operator_reinit;
}
inline void freeres() {
tensor_pool_.clear();
subtensor_pool_.clear();
funcs_.clear();
}
inline const char *&name() { return name_; }
inline void set_in_out_index(int indata_idx, int outdata_idx) {
input_idx() = indata_idx;
output_idx() = outdata_idx;
}
protected:
std::vector<std::unique_ptr<TensorPair<ACLTensor>>> tensor_pool_;
std::vector<std::unique_ptr<TensorPair<ACLSubTensor>>> subtensor_pool_;
std::vector<std::unique_ptr<arm_compute::IFunction>> funcs_;
OperatorState operator_state_;
bool force_bypass_acl_path_;
TargetHint target_hint_;
ConvolutionMethodHint convolution_method_hint_;
static bool support_opengles_;
static bool support_opencl_;
static bool init_gpu_env;
int _group;
const char *name_;
OperateType type_;
int input_idx_, output_idx_;
bool is_gpu_;
};
int isScheduleEnable();
template <typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
arm_compute::ITensor *input, arm_compute::ITensor *output) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(dynamic_cast<TensorType *>(input),
dynamic_cast<TensorType *>(output));
return std::move(op);
}
template <typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate(
arm_compute::ITensor *input, arm_compute::ITensor *output) {
return instantiate_function<OperatorType, TensorType>(input, output);
}
template <typename OpType, typename OpTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
func = instantiate<OpType, OpTensor>(input->tensor(), output->tensor());
return func;
}
template <typename OperatorType, typename TensorType, typename VectorTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
VectorTensor inputs, arm_compute::ITensor *output) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(inputs, dynamic_cast<TensorType *>(output));
return std::move(op);
}
template <typename OperatorType, typename TensorType, typename VectorTensor>
std::unique_ptr<arm_compute::IFunction> instantiate(
VectorTensor inputs, arm_compute::ITensor *output) {
return instantiate_function<OperatorType, TensorType, VectorTensor>(inputs,
output);
}
template <typename OpType, typename OpTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func_lists(
ACLOperator *&acl_op, std::unique_ptr<ACLTensor> &output, int num,
TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
static std::vector<OpTensor *> tensors;
tensors.clear();
for (int i = 0; i < num; ++i) {
tensors.push_back(
dynamic_cast<OpTensor *>(acl_op->cinput(i).get()->tensor()));
}
func = instantiate<OpType, OpTensor, std::vector<OpTensor *>>(
tensors, output->tensor());
return func;
}
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
arm_compute::ITensor *input, arm_compute::ITensor *output,
const OperatorInfo &info) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(dynamic_cast<TensorType *>(input),
dynamic_cast<TensorType *>(output), info);
return std::move(op);
}
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate(
arm_compute::ITensor *input, arm_compute::ITensor *output,
const OperatorInfo &info) {
return instantiate_function<OperatorType, TensorType, OperatorInfo>(
input, output, info);
}
template <typename OpType, typename OpTensor, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
const OperatorInfo &info, TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
func = instantiate<OpType, OpTensor, OperatorInfo>(input->tensor(),
output->tensor(), info);
return func;
}
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
arm_compute::ITensor *input, arm_compute::ITensor *weights,
arm_compute::ITensor *biases, arm_compute::ITensor *output,
const OperatorInfo &info) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(dynamic_cast<TensorType *>(input),
dynamic_cast<TensorType *>(weights),
dynamic_cast<TensorType *>(biases),
dynamic_cast<TensorType *>(output), info);
return std::move(op);
}
template <typename OperatorType, typename TensorType, typename OperatorInfo>
std::unique_ptr<arm_compute::IFunction> instantiate(
arm_compute::ITensor *input, arm_compute::ITensor *weights,
arm_compute::ITensor *biases, arm_compute::ITensor *output,
const OperatorInfo &info) {
return instantiate_function<OperatorType, TensorType, OperatorInfo>(
input, weights, biases, output, info);
}
template <typename OpType, typename OpTensor, typename OperatorInfo,
typename ACLTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &weights,
std::unique_ptr<ACLTensor> &biases, std::unique_ptr<ACLTensor> &output,
const OperatorInfo &info, TargetHint &hint) {
std::unique_ptr<arm_compute::IFunction> func;
arm_compute::ITensor *biases_tensor = NULL;
if (biases.get()) {
biases_tensor = biases->tensor();
}
func = instantiate<OpType, OpTensor, OperatorInfo>(
input->tensor(), weights->tensor(), biases_tensor, output->tensor(),
info);
return func;
}
template <typename Dtype, typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate_function(
arm_compute::ITensor *input, arm_compute::ITensor *output,
arm_compute::ITensor *mean, arm_compute::ITensor *var,
arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype &eps) {
auto op = cpp14::make_unique<OperatorType>();
op->configure(
dynamic_cast<TensorType *>(input), dynamic_cast<TensorType *>(output),
dynamic_cast<TensorType *>(mean), dynamic_cast<TensorType *>(var),
dynamic_cast<TensorType *>(beta), dynamic_cast<TensorType *>(gamma), eps);
return std::move(op);
}
template <typename Dtype, typename OperatorType, typename TensorType>
std::unique_ptr<arm_compute::IFunction> instantiate(
arm_compute::ITensor *input, arm_compute::ITensor *output,
arm_compute::ITensor *mean, arm_compute::ITensor *var,
arm_compute::ITensor *beta, arm_compute::ITensor *gamma, Dtype eps) {
return instantiate_function<Dtype, OperatorType, TensorType>(
input, output, mean, var, beta, gamma, eps);
}
template <typename Dtype, typename OpType, typename OpTensor>
std::unique_ptr<arm_compute::IFunction> instantiate_op_func(
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
std::unique_ptr<ACLTensor> &mean, std::unique_ptr<ACLTensor> &var,
std::unique_ptr<ACLTensor> &beta, std::unique_ptr<ACLTensor> &gamma,
Dtype eps, TargetHint hint) {
std::unique_ptr<arm_compute::IFunction> func;
func = instantiate<Dtype, OpType, OpTensor>(
input->tensor(), output->tensor(), mean->tensor(), var->tensor(),
beta->tensor(), gamma->tensor(), eps);
return func;
}
template <typename OperatorInfo>
bool instantiate_op_pooling(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLPoolingLayer, arm_compute::ICLTensor,
arm_compute::PoolingLayerInfo>(input, output, info,
hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<arm_compute::GCPoolingLayer, arm_compute::IGCTensor,
arm_compute::PoolingLayerInfo>(input, output, info,
hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<arm_compute::NEPoolingLayer, arm_compute::ITensor,
arm_compute::PoolingLayerInfo>(input, output, info,
hint));
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_activation(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLActivationLayer,
arm_compute::ICLTensor,
arm_compute::ActivationLayerInfo>(
input, output, info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCActivationLayer,
arm_compute::IGCTensor,
arm_compute::ActivationLayerInfo>(
input, output, info, hint));
return true;
}
#endif
{
func.push_back(instantiate_op_func<arm_compute::NEActivationLayer,
arm_compute::ITensor,
arm_compute::ActivationLayerInfo>(
input, output, info, hint));
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_lrn(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLNormalizationLayer,
arm_compute::ICLTensor,
arm_compute::NormalizationLayerInfo>(
input, output, info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCNormalizationLayer,
arm_compute::IGCTensor,
arm_compute::NormalizationLayerInfo>(
input, output, info, hint));
return true;
}
#endif
{
func.push_back(instantiate_op_func<arm_compute::NENormalizationLayer,
arm_compute::ITensor,
arm_compute::NormalizationLayerInfo>(
input, output, info, hint));
}
return true;
}
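// Builds the convolution function for the selected backend; when groups > 1,
// input/weights/biases/output are split into per-group sub-tensors and one
// convolution function is instantiated per group.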
template <typename OperatorInfo>
bool instantiate_op_conv(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
std::unique_ptr<ACLTensor> &weights = acl_op->weights();
std::unique_ptr<ACLTensor> &biases = acl_op->biases();
ConvolutionMethodHint &conv_method = acl_op->getConvMethod();
bool has_biases = biases.get() ? true : false;
int &groups = acl_op->group();
arm_compute::TensorShape input_shape = input->info().tensor_shape();
arm_compute::TensorShape weights_shape = weights->info().tensor_shape();
arm_compute::TensorShape biases_shape;
if (has_biases) {
biases_shape = biases->info().tensor_shape();
}
arm_compute::TensorShape output_shape = output->info().tensor_shape();
if (groups == 1) {
if (conv_method == ConvolutionMethodHint::GEMM) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCConvolutionLayer,
arm_compute::IGCTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#endif
{
func.push_back(instantiate_op_func<arm_compute::NEConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
}
} else {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLDirectConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<arm_compute::GCDirectConvolutionLayer,
arm_compute::IGCTensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<arm_compute::NEDirectConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo>(
acl_op->input(), acl_op->weights(), acl_op->biases(),
acl_op->output(), info, hint));
}
}
return true;
}
// Calculate sub-tensor splits
const int input_split = input_shape.z() / groups;
const int output_split = output_shape.z() / groups;
const int weights_split = weights_shape[3] / groups;
const int biases_split = biases_shape.x() / groups;
// Calculate sub-tensor shapes
input_shape.set(2, input_split);
output_shape.set(2, output_split);
weights_shape.set(3, weights_split);
biases_shape.set(0, biases_split);
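  // Illustration (hypothetical sizes): with groups = 2, 8 input channels and
  // 16 output channels, input_split = 4 and output_split = 8, so group i below
  // is wired to input channels [4*i, 4*i + 4) and output channels
  // [8*i, 8*i + 8).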
for (auto i = 0; i < groups; ++i) {
// Calculate sub-tensors starting coordinates
arm_compute::Coordinates input_coord(0, 0, input_split * i);
arm_compute::Coordinates output_coord(0, 0, output_split * i);
arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i);
arm_compute::Coordinates biases_coord(biases_split * i);
// Create sub-tensors for input, output, weights and bias
acl_op->new_tensor(acl_op->sinput(i), acl_op->input(), input_shape,
input_coord);
acl_op->new_tensor(acl_op->soutput(i), acl_op->output(), output_shape,
output_coord);
acl_op->new_tensor(acl_op->sweights(i), acl_op->weights(), weights_shape,
weights_coord);
if (has_biases) {
acl_op->new_tensor(acl_op->sbiases(i), acl_op->biases(), biases_shape,
biases_coord);
}
bool use_opencl = false;
if (conv_method == ConvolutionMethodHint::GEMM) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
use_opencl = true;
func.push_back(
instantiate_op_func<arm_compute::CLConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
#endif
if (!use_opencl) {
func.push_back(
instantiate_op_func<arm_compute::NEConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
} else {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
use_opencl = true;
func.push_back(
instantiate_op_func<arm_compute::CLDirectConvolutionLayer,
arm_compute::ICLTensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
#endif
if (!use_opencl) {
func.push_back(
instantiate_op_func<arm_compute::NEDirectConvolutionLayer,
arm_compute::ITensor,
arm_compute::PadStrideInfo, ACLSubTensor>(
acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i),
acl_op->soutput(i), info, hint));
}
}
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_lc(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
std::unique_ptr<ACLTensor> &weights = acl_op->weights();
std::unique_ptr<ACLTensor> &biases = acl_op->biases();
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLLocallyConnectedLayer,
arm_compute::ICLTensor, arm_compute::PadStrideInfo>(
input, weights, biases, output, info, hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<arm_compute::NELocallyConnectedLayer,
arm_compute::ITensor, arm_compute::PadStrideInfo>(
input, weights, biases, output, info, hint));
}
return true;
}
template <typename OperatorInfo>
bool instantiate_op_fc(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, const OperatorInfo &info) {
std::unique_ptr<ACLTensor> &weights = acl_op->weights();
std::unique_ptr<ACLTensor> &biases = acl_op->biases();
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(instantiate_op_func<arm_compute::CLFullyConnectedLayer,
arm_compute::ICLTensor, bool>(
input, weights, biases, output, info, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(instantiate_op_func<arm_compute::GCFullyConnectedLayer,
arm_compute::IGCTensor, bool>(
input, weights, biases, output, info, hint));
return true;
}
#endif
{
func.push_back(instantiate_op_func<arm_compute::NEFullyConnectedLayer,
arm_compute::ITensor, bool>(
input, weights, biases, output, info, hint));
}
return true;
}
template <typename Dtype>
bool instantiate_op_bn(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, Dtype eps) {
std::unique_ptr<ACLTensor> &mean = acl_op->mean();
std::unique_ptr<ACLTensor> &var = acl_op->var();
std::unique_ptr<ACLTensor> &beta = acl_op->beta();
std::unique_ptr<ACLTensor> &gamma = acl_op->gamma();
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<Dtype, arm_compute::CLBatchNormalizationLayer,
arm_compute::ICLTensor>(input, output, mean, var,
beta, gamma, eps, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<Dtype, arm_compute::GCBatchNormalizationLayer,
arm_compute::IGCTensor>(input, output, mean, var,
beta, gamma, eps, hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<Dtype, arm_compute::NEBatchNormalizationLayer,
arm_compute::ITensor>(input, output, mean, var,
beta, gamma, eps, hint));
}
return true;
}
inline bool instantiate_op_softmax(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, void *data) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func<arm_compute::CLSoftmaxLayer,
arm_compute::ICLTensor>(input, output, hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func<arm_compute::GCSoftmaxLayer,
arm_compute::IGCTensor>(input, output, hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func<arm_compute::NESoftmaxLayer, arm_compute::ITensor>(
input, output, hint));
}
return true;
}
inline bool instantiate_op_concat(
ACLOperator *acl_op,
std::vector<std::unique_ptr<arm_compute::IFunction>> &func,
std::unique_ptr<ACLTensor> &input, std::unique_ptr<ACLTensor> &output,
TargetHint hint, int num) {
#ifdef USE_OPENCL
if (hint == TargetHint::OPENCL) {
func.push_back(
instantiate_op_func_lists<arm_compute::CLDepthConcatenateLayer,
arm_compute::ICLTensor>(acl_op, output, num,
hint));
return true;
}
#elif defined(USE_OPENGLES)
if (hint == TargetHint::OPENGLES) {
func.push_back(
instantiate_op_func_lists<arm_compute::GCDepthConcatenateLayer,
arm_compute::IGCTensor>(acl_op, output, num,
hint));
return true;
}
#endif
{
func.push_back(
instantiate_op_func_lists<arm_compute::NEDepthConcatenateLayer,
arm_compute::ITensor>(acl_op, output, num,
hint));
}
return true;
}
template <typename Dtype>
void *InputdataPtr(ACLOperator *op,
const std::vector<framework::LoDTensor *> &input_data,
Dtype type, int index = -1) {
if (index == -1) index = 0;
return (void *)(input_data[index]->mutable_data<Dtype>());
}
template <typename Dtype>
void acl_run(ACLOperator *op,
const std::vector<framework::LoDTensor *> &in_data, void *out_data,
Dtype type, bool multi_input_run = true) {
for (int i = 0; i < in_data.size(); ++i) {
op->tensor_mem(op->cinput(i), InputdataPtr(op, in_data, type, i));
}
op->acl_run(NULL, out_data);
}
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
#ifdef USE_PROFILING
#define acl_configure(opname, acl_op, args...) \
{ \
set_operator_property(acl::operate_type_##opname, #opname); \
logtime_util log_time(ACL_CONFIG_INFO); \
instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \
acl_op->output(), acl_op->getTargetHint(), args); \
}
#else
#define acl_configure(opname, acl_op, args...) \
{ \
set_operator_property(acl::operate_type_##opname, #opname); \
instantiate_op_##opname(acl_op, acl_op->funcs(), acl_op->input(), \
acl_op->output(), acl_op->getTargetHint(), args); \
}
#endif
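// Usage sketch: kernels call e.g. acl_configure(conv, this, conv_info), which
// expands to set_operator_property(acl::operate_type_conv, "conv") followed by
// instantiate_op_conv(this, this->funcs(), this->input(), this->output(),
// this->getTargetHint(), conv_info) (plus a logtime_util scope timer when
// USE_PROFILING is defined).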
#define ACLOp_Ptr(a) dynamic_cast<ACLOperator *>(a)
#endif // USE_ACL
#endif // ACL_OPERATOR_H_
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "acl_tensor.h"
namespace paddle_mobile {
namespace operators {
namespace acl {
#ifdef USE_ACL
template <typename TensorType>
std::unique_ptr<arm_compute::ITensor> initialise_tensor(
arm_compute::TensorInfo &info) {
auto tensor = cpp14::make_unique<TensorType>();
tensor->allocator()->init(info);
return std::move(tensor);
}
template <typename TensorType>
void tensor_allocate(arm_compute::ITensor &tensor) {
auto itensor = dynamic_cast<TensorType *>(&tensor);
itensor->allocator()->allocate();
}
Tensor::Tensor(arm_compute::TensorInfo &info) noexcept
: _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr) {}
Tensor::Tensor(Tensor &&src) noexcept
: _target(src._target),
_info(std::move(src._info)),
_tensor(std::move(src._tensor)) {}
arm_compute::ITensor *Tensor::set_target(TargetHint target) {
switch (target) {
#ifdef USE_OPENCL
case TargetHint::OPENCL:
_tensor = initialise_tensor<arm_compute::CLTensor>(_info);
break;
#elif defined(USE_OPENGLES)
case TargetHint::OPENGLES:
_tensor = initialise_tensor<arm_compute::GCTensor>(_info);
break;
#endif
case TargetHint::NEON:
_tensor = initialise_tensor<arm_compute::Tensor>(_info);
break;
default:
break;
}
_target = target;
return _tensor.get();
}
void Tensor::allocate() {
switch (_target) {
#ifdef USE_OPENCL
case TargetHint::OPENCL:
tensor_allocate<arm_compute::CLTensor>(*_tensor);
break;
#elif defined(USE_OPENGLES)
case TargetHint::OPENGLES:
tensor_allocate<arm_compute::GCTensor>(*_tensor);
break;
#endif
case TargetHint::NEON:
tensor_allocate<arm_compute::Tensor>(*_tensor);
break;
default:
break;
}
}
void Tensor::map(bool blocking) {
#ifdef USE_OPENCL
if (_target == TargetHint::OPENCL)
dynamic_cast<arm_compute::CLTensor *>(tensor())->map(blocking);
#elif defined(USE_OPENGLES)
if (_target == TargetHint::OPENGLES)
dynamic_cast<arm_compute::GCTensor *>(tensor())->map(blocking);
#endif
}
void Tensor::unmap() {
#ifdef USE_OPENCL
if (_target == TargetHint::OPENCL)
dynamic_cast<arm_compute::CLTensor *>(tensor())->unmap();
#elif defined(USE_OPENGLES)
if (_target == TargetHint::OPENGLES)
dynamic_cast<arm_compute::GCTensor *>(tensor())->unmap();
#endif
}
template <typename SubTensorType, typename ParentTensorType>
std::unique_ptr<arm_compute::ITensor> initialise_subtensor(
arm_compute::ITensor *parent, arm_compute::TensorShape shape,
arm_compute::Coordinates coords) {
auto ptensor = dynamic_cast<ParentTensorType *>(parent);
auto subtensor = cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
return std::move(subtensor);
}
SubTensor::SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
arm_compute::Coordinates &coords) noexcept
: _target(TargetHint::DONT_CARE),
_tensor_shape(tensor_shape),
_coords(coords),
_parent(nullptr),
_subtensor(nullptr) {
_parent = parent->tensor();
_target = parent->target();
instantiate_subtensor();
}
arm_compute::ITensor *SubTensor::set_target(TargetHint target) {
return (target == _target) ? _subtensor.get() : nullptr;
}
arm_compute::ITensor *SubTensor::tensor() { return _subtensor.get(); }
const arm_compute::ITensor *SubTensor::tensor() const {
return _subtensor.get();
}
TargetHint SubTensor::target() const { return _target; }
void SubTensor::allocate() {
// NOP for sub-tensors
}
void SubTensor::instantiate_subtensor() {
switch (_target) {
#ifdef USE_OPENCL
case TargetHint::OPENCL:
_subtensor = initialise_subtensor<arm_compute::CLSubTensor,
arm_compute::ICLTensor>(
_parent, _tensor_shape, _coords);
break;
#endif
default:
case TargetHint::NEON:
_subtensor =
initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(
_parent, _tensor_shape, _coords);
break;
}
}
#endif
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef ACL_TENSOR_H_
#define ACL_TENSOR_H_
#ifdef USE_ACL
#ifdef USE_OPENCL
#include "arm_compute/runtime/CL/CLSubTensor.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#elif defined(USE_OPENGLES)
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#endif
#include "arm_compute/runtime/SubTensor.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace paddle_mobile {
namespace operators {
namespace acl {
enum class TargetHint {
DONT_CARE,
OPENCL,
OPENGLES,
NEON,
};
enum class ConvolutionMethodHint {
GEMM,
DIRECT,
};
namespace cpp14 {
template <class T>
struct _Unique_if {
typedef std::unique_ptr<T> _Single_object;
};
template <class T>
struct _Unique_if<T[]> {
typedef std::unique_ptr<T[]> _Unknown_bound;
};
template <class T, size_t N>
struct _Unique_if<T[N]> {
typedef void _Known_bound;
};
template <class T, class... Args>
typename _Unique_if<T>::_Single_object make_unique(Args &&... args) {
return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}
template <class T>
typename _Unique_if<T>::_Unknown_bound make_unique(size_t n) {
typedef typename std::remove_extent<T>::type U;
return std::unique_ptr<T>(new U[n]());
}
template <class T, class... Args>
typename _Unique_if<T>::_Known_bound make_unique(Args &&...);
} // namespace cpp14
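// cpp14::make_unique is a local C++11 backport of std::make_unique (only
// provided by the standard library from C++14 on); it is used when creating
// the concrete ACL tensor and sub-tensor objects (see initialise_tensor and
// initialise_subtensor).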
class Tensor {
public:
explicit Tensor(arm_compute::TensorInfo &info) noexcept;
virtual ~Tensor() {}
Tensor(Tensor &&src) noexcept;
void set_info(arm_compute::TensorInfo &&info) { _info = info; }
arm_compute::ITensor *set_target(TargetHint target);
const arm_compute::TensorInfo &info() const { return _info; }
arm_compute::ITensor *tensor() { return _tensor.get(); }
void allocate();
void init() {}
TargetHint target() const { return _target; }
virtual void map(bool blocking = true);
virtual void unmap();
private:
TargetHint _target;
arm_compute::TensorInfo _info;
std::unique_ptr<arm_compute::ITensor> _tensor;
};
class SubTensor {
public:
SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
arm_compute::Coordinates &coords) noexcept;
~SubTensor() {}
arm_compute::ITensor *tensor();
const arm_compute::ITensor *tensor() const;
TargetHint target() const;
void allocate();
arm_compute::ITensor *set_target(TargetHint target);
private:
/** Instantiates a sub-tensor */
void instantiate_subtensor();
private:
/**< Target that this tensor is pinned on */
TargetHint _target;
/**< SubTensor shape */
arm_compute::TensorShape _tensor_shape;
/**< SubTensor Coordinates */
arm_compute::Coordinates _coords;
/**< Parent tensor */
arm_compute::ITensor *_parent;
/**< SubTensor */
std::unique_ptr<arm_compute::ITensor> _subtensor;
};
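// Note: Tensor owns its backing storage on the selected target, while
// SubTensor is a non-owning view into a parent tensor (its allocate() is a
// no-op); the grouped-convolution path in instantiate_op_conv uses such views
// to slice input, weights, biases and output, one per group.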
} // namespace acl
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // ACL_TENSOR_H_
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef BATCHNORM_OP
#include "operators/kernel/batchnorm_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclBatchNormOp : public acl::ACLOperator {
public:
AclBatchNormOp() {
this->force_bypass_acl_path_ = bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN;
}
~AclBatchNormOp() = default;
AclBatchNormOp(const AclBatchNormOp&) = delete;
AclBatchNormOp& operator=(const AclBatchNormOp&) = delete;
AclBatchNormOp(AclBatchNormOp&&) = delete;
AclBatchNormOp& operator=(AclBatchNormOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const BatchNormParam<DeviceType>& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth, args.batch);
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth, args.out_num);
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
arm_compute::TensorShape mean_shape(args.in_depth);
arm_compute::TensorShape var_shape = mean_shape;
arm_compute::TensorShape beta_shape = mean_shape;
arm_compute::TensorShape gamma_shape = mean_shape;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
new_tensor(mean(), mean_shape, args.mean_data);
new_tensor(var(), var_shape, args.var_data);
new_tensor(beta(), beta_shape, args.biases_data);
new_tensor(gamma(), gamma_shape, args.weight_data);
acl_configure(bn, this, args.epsilon);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const BatchNormParam<DeviceType>& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
    // bypass when the ACL path is disabled for batch norm
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const BatchNormParam<DeviceType>& param) {
const Tensor* in_x = param.InputX();
Tensor* out = param.OutputY();
const Tensor* scale = param.InputScale();
const Tensor* bias = param.InputBias();
const Tensor* saved_mean = param.InputMean();
const Tensor* saved_variance = param.InputVariance();
const T* input_data = in_x->data<T>();
T* output_data = out->mutable_data<T>();
const T* weight_data = scale->data<T>();
const T* bias_data = bias->data<T>();
const T* mean_data = saved_mean->data<T>();
const T* var_data = saved_variance->data<T>();
float epsilon = param.Epsilon();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
// args.weight_data = (void*)weight_data;
// args.biases_data = (void*)bias_data;
args.mean_data = (void*)mean_data;
args.var_data = (void*)var_data;
args.epsilon = epsilon;
args.dim = in_x->dims().size();
args.batch = in_x->dims()[0];
args.in_depth = in_x->dims()[1];
args.in_rows = in_x->dims()[2];
args.in_cols = in_x->dims()[3];
args.out_num = out->dims()[0];
args.out_depth = out->dims()[1];
args.out_rows = out->dims()[2];
args.out_cols = out->dims()[3];
args.weight_data = (void*)weight_data;
args.biases_data = (void*)bias_data;
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl::AclParameters args;
};
template <>
bool BatchNormKernel<GPU_MALI, float>::Init(BatchNormParam<GPU_MALI>* param) {
AclBatchNormOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclBatchNormOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
template <>
void BatchNormKernel<GPU_MALI, float>::Compute(
const BatchNormParam<GPU_MALI>& param) {
std::cout << "init acl" << std::endl;
AclBatchNormOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
acl::AclParameters& args = acl_op->getargs();
acl_op->RunAcl(args.input_data, args.output_data);
}
template class BatchNormKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONCAT_OP
#include "operators/kernel/concat_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclConcatOp : public acl::ACLOperator {
public:
AclConcatOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT;
}
~AclConcatOp() = default;
AclConcatOp(const AclConcatOp&) = delete;
AclConcatOp& operator=(const AclConcatOp&) = delete;
AclConcatOp(AclConcatOp&&) = delete;
AclConcatOp& operator=(AclConcatOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const ConcatParam<DeviceType>& param) {
setTargetHint(acl::TargetHint::OPENCL);
const std::vector<framework::LoDTensor*>* input_data = &args.in_tensor;
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth, args.batch);
if (is_operator_init_done(output_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
T type;
for (int i = 0; i < input_data->size(); i++) {
int in_batch = (*input_data)[i]->dims()[0];
int in_channels = (*input_data)[i]->dims()[1];
      int in_height = (*input_data)[i]->dims()[2];  // NCHW: dims()[2] is H
      int in_width = (*input_data)[i]->dims()[3];   // NCHW: dims()[3] is W
arm_compute::TensorShape in_shape(in_width, in_height, in_channels);
new_tensor(cinput(i), in_shape,
acl::InputdataPtr(this, args.in_tensor, type, i));
}
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(concat, this, input_data->size());
}
void RunAcl(const std::vector<framework::LoDTensor*>& input, void* output) {
T type;
acl::acl_run(this, input, output, type);
}
bool Bypass_acl(const ConcatParam<DeviceType>& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
    // bypass when ACL is disabled for concat or the concat axis is not the
    // channel axis (ACL depth-concatenation only supports axis == 1)
if (this->force_bypass_acl_path_ || !args.is_channel_concat) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const ConcatParam<DeviceType>& param) {
auto inputs = param.Inputs();
auto* output = param.Out();
int64_t axis = param.Axis();
T* output_data = output->mutable_data<T>();
args.is_channel_concat = (axis == 1);
args.in_tensor = inputs;
args.output_data = (void*)output_data;
args.batch = output->dims()[0];
args.out_depth = output->dims()[1];
args.out_rows = output->dims()[2];
args.out_cols = output->dims()[3];
}
acl::AclParameters args;
};
template <>
bool ConcatKernel<GPU_MALI, float>::Init(ConcatParam<GPU_MALI>* param) {
AclConcatOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclConcatOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
template <>
void ConcatKernel<GPU_MALI, float>::Compute(
const ConcatParam<GPU_MALI>& param) {
std::cout << "init acl" << std::endl;
AclConcatOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
acl::AclParameters& args = acl_op->getargs();
acl_op->RunAcl(args.in_tensor, args.output_data);
}
template class ConcatKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclConvAddOp : public acl::ACLOperator {
public:
AclConvAddOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
}
~AclConvAddOp() = default;
AclConvAddOp(const AclConvAddOp&) = delete;
AclConvAddOp& operator=(const AclConvAddOp&) = delete;
AclConvAddOp(AclConvAddOp&&) = delete;
AclConvAddOp& operator=(AclConvAddOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const FusionConvAddParam<DeviceType>& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth, args.batch);
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth, args.out_num);
arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
args.in_depth / args.num_group,
args.out_depth);
arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute::PadStrideInfo conv_info(
args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
arm_compute::DimensionRoundingType::FLOOR);
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
// check_direct_conv();
group() = args.num_group;
//[kernel_x, kernel_y, IFM, OFM]
new_tensor(weights(), weights_shape, args.weight_data);
//[OFM]
if (args.biases_data) {
new_tensor(biases(), biases_shape, args.biases_data);
}
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(conv, this, conv_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const FusionConvAddParam<DeviceType>& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
// for performance, more groups impact GPU performance
if (this->force_bypass_acl_path_ || args.num_group >= 5) {
bypass_acl = true;
}
if (args.dim > 2) {
bypass_acl = true;
}
if (args.dilated) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void check_direct_conv() {
bool use_direct_conv = false;
const char* pDirectConv;
pDirectConv = getenv("DIRECTCONV");
if (pDirectConv) {
unsigned int bdirectconv;
      sscanf(pDirectConv, "%u", &bdirectconv);
if (bdirectconv != use_direct_conv) {
use_direct_conv = bdirectconv;
printf("DIRECTCONV<%s>\n", pDirectConv);
printf("DIRECTCONV: %x\n", use_direct_conv);
}
}
int pad_data[2], kernel[2];
pad_data[1] = args.pad_rows;
pad_data[0] = args.pad_cols;
kernel[1] = args.filter_rows;
kernel[0] = args.filter_cols;
if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
pad_data[0] == 0 && pad_data[1] == 0) ||
(kernel[0] == 3 && kernel[1] == 3 &&
pad_data[0] <= 1 && pad_data[1] <= 1))) {
      setConvMethod();  // direct convolution is only used for 1x1 and 3x3 kernels
}
}
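  // Illustrative usage (assumed): exporting DIRECTCONV=1 in the environment
  // makes eligible convolutions (1x1 with pad 0, or 3x3 with pad <= 1) take
  // the direct-convolution path instead of GEMM.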
void AclParametersByContext(const FusionConvAddParam<DeviceType>& param) {
const Tensor* input = param.Input();
Tensor filter = *param.Filter();
Tensor* output = param.Output();
    Tensor* bias = nullptr;  // bias is optional for the fused op
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>();
const T* weight_data = filter.data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.weight_data = (void*)weight_data;
args.biases_data = nullptr;
try {
bias = param.Bias();
} catch (const std::exception& e) {
}
if (bias) {
const T* biases_data = bias->data<T>();
args.biases_data = (void*)biases_data;
}
args.num_group = groups;
args.dilation_rows = dilations[0];
args.dilation_cols = dilations[1];
if (dilations[0] != 1 || dilations[1] != 1) {
args.dilated = true;
}
// NCHW
// std::cout << "In dims: " << (input->dims()).size() << std::endl;
args.batch = input->dims()[0];
args.in_depth = input->dims()[1];
args.in_rows = input->dims()[2];
args.in_cols = input->dims()[3];
// std::cout <<"In N: " << args.batch << " C: " << args.in_depth
// << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout << "Out dims: " << (output->dims()).size() << std::endl;
args.out_num = output->dims()[0];
args.out_depth = output->dims()[1];
args.out_rows = output->dims()[2];
args.out_cols = output->dims()[3];
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
args.filter_rows = filter.dims()[2];
args.filter_cols = filter.dims()[3];
// std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
// << " I: " << static_cast<int>(filter.dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args.stride_rows = strides[0];
args.stride_cols = strides[1];
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args.pad_rows = paddings[0];
args.pad_cols = paddings[1];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
}
acl::AclParameters args;
};
template <>
bool ConvAddKernel<GPU_MALI, float>::Init(FusionConvAddParam<GPU_MALI>* param) {
AclConvAddOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclConvAddOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
template <>
void ConvAddKernel<GPU_MALI, float>::Compute(
const FusionConvAddParam<GPU_MALI>& param) {
std::cout << "init acl" << std::endl;
AclConvAddOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
acl::AclParameters& args = acl_op->getargs();
acl_op->RunAcl(args.input_data, args.output_data);
}
template class ConvAddKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclConvOp : public acl::ACLOperator {
public:
AclConvOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
}
~AclConvOp() = default;
AclConvOp(const AclConvOp&) = delete;
AclConvOp& operator=(const AclConvOp&) = delete;
AclConvOp(AclConvOp&&) = delete;
AclConvOp& operator=(AclConvOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const ConvParam<DeviceType>& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth, args.batch);
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth, args.out_num);
arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
args.in_depth / args.num_group,
args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute::PadStrideInfo conv_info(
args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
arm_compute::DimensionRoundingType::FLOOR);
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
check_direct_conv();
//[kernel_x, kernel_y, IFM, OFM]
new_tensor(weights(), weights_shape, args.weight_data);
//[OFM]
// if (args.biases_data) {
// new_tensor(biases(),biases_shape,args.biases_data);
//}
group() = args.num_group;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(conv, this, conv_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const ConvParam<DeviceType>& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
// for performance, more groups impact GPU performance
if (this->force_bypass_acl_path_ || args.num_group >= 5) {
bypass_acl = true;
}
if (args.dim > 2) {
bypass_acl = true;
}
if (args.dilated) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void check_direct_conv() {
bool use_direct_conv = false;
const char* pDirectConv;
pDirectConv = getenv("DIRECTCONV");
if (pDirectConv) {
unsigned int bdirectconv;
      sscanf(pDirectConv, "%u", &bdirectconv);
if (bdirectconv != use_direct_conv) {
use_direct_conv = bdirectconv;
printf("DIRECTCONV<%s>\n", pDirectConv);
printf("DIRECTCONV: %x\n", use_direct_conv);
}
}
int pad_data[2], kernel[2];
pad_data[1] = args.pad_rows;
pad_data[0] = args.pad_cols;
kernel[1] = args.filter_rows;
kernel[0] = args.filter_cols;
if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
pad_data[0] == 0 && pad_data[1] == 0) ||
(kernel[0] == 3 && kernel[1] == 3 &&
pad_data[0] <= 1 && pad_data[1] <= 1))) {
      setConvMethod();  // direct convolution is only used for 1x1 and 3x3 kernels
}
}
void AclParametersByContext(const ConvParam<DeviceType>& param) {
const Tensor* input = param.Input();
Tensor filter = *param.Filter();
Tensor* output = param.Output();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>();
const T* weight_data = filter.data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.weight_data = (void*)weight_data;
args.biases_data = nullptr;
// try {
// bias = context.Input<framework::Tensor>("Bias");
// } catch (const std::exception& e) {
// }
// if (bias) {
// const T* biases_data = bias->data<T>();
// args.biases_data = (void*)biases_data;
// }
args.num_group = groups;
args.dilation_rows = dilations[0];
args.dilation_cols = dilations[1];
if (dilations[0] != 1 || dilations[1] != 1) {
args.dilated = true;
}
// NCHW
// std::cout << "In dims: " << (input->dims()).size() << std::endl;
args.batch = input->dims()[0];
args.in_depth = input->dims()[1];
args.in_rows = input->dims()[2];
args.in_cols = input->dims()[3];
std::cout << "In N: " << args.batch << " C: " << args.in_depth
<< " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout << "Out dims: " << (output->dims()).size() << std::endl;
args.out_num = output->dims()[0];
args.out_depth = output->dims()[1];
args.out_rows = output->dims()[2];
args.out_cols = output->dims()[3];
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
args.filter_rows = filter.dims()[2];
args.filter_cols = filter.dims()[3];
// std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
// << " I: " << static_cast<int>(filter.dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args.stride_rows = strides[0];
args.stride_cols = strides[1];
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args.pad_rows = paddings[0];
args.pad_cols = paddings[1];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
}
acl::AclParameters args;
};
template <>
bool ConvKernel<GPU_MALI, float>::Init(ConvParam<GPU_MALI>* param) {
AclConvOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclConvOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
template <>
void ConvKernel<GPU_MALI, float>::Compute(const ConvParam<GPU_MALI>& param) {
std::cout << "init acl" << std::endl;
AclConvOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
acl::AclParameters& args = acl_op->getargs();
acl_op->RunAcl(args.input_data, args.output_data);
}
template class ConvKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISEADD_OP
#pragma once
#include "operators/kernel/elementwise_add_kernel.h"
namespace paddle_mobile {
namespace operators {
template <typename T>
struct AddFunctor {
inline T operator()(T a, T b) const { return a + b; }
};
template <>
bool ElementwiseAddKernel<GPU_MALI, float>::Init(
ElementwiseAddParam<GPU_MALI> *param) {
return true;
}
template <>
void ElementwiseAddKernel<GPU_MALI, float>::Compute(
const ElementwiseAddParam<GPU_MALI> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *Out = param.Out();
Out->mutable_data<float>();
int axis = param.Axis();
ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
AddFunctor<float>(), Out);
}
template class ElementwiseAddKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/feed_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FeedKernel<GPU_MALI, float>::Init(FeedParam<GPU_MALI> *param) {
return true;
}
template <>
void FeedKernel<GPU_MALI, float>::Compute(const FeedParam<GPU_MALI> &param) {
param.Out()->ShareDataWith(*(param.InputX()));
param.Out()->set_lod(param.InputX()->lod());
}
template class FeedKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/fetch_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FetchKernel<GPU_MALI, float>::Init(FetchParam<GPU_MALI> *param) {
return true;
}
template <>
void FetchKernel<GPU_MALI, float>::Compute(const FetchParam<GPU_MALI> &param) {
param.Out()->ShareDataWith(*(param.InputX()));
}
template class FetchKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_OP
#include "operators/kernel/fusion_fc_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FusionFcKernel<GPU_MALI, float>::Init(FusionFcParam<GPU_MALI> *param) {
return true;
}
template <>
void FusionFcKernel<GPU_MALI, float>::Compute(
const FusionFcParam<GPU_MALI> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
const Tensor *input_z = param.InputZ();
auto *input_z_data = input_z->data<float>();
int axis = param.Axis();
Tensor *out = param.Out();
auto *out_data = out->mutable_data<float>();
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
: *input_x;
const Tensor y_matrix =
input_y->dims().size() > 2
? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
: *input_y;
auto out_dim = out->dims();
if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "input_z dims size must be 1");
  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
                        " the bias length must match out_dim[1].");
  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
  PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
}
for (int i = 0; i < out->numel(); i++) {
DLOG << out_data[i];
}
math::MatMul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(1));
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LRN_OP
#pragma once
#include "operators/kernel/lrn_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/kernel/central-arm-func/lrn_arm_func.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclLrnOp : public acl::ACLOperator {
public:
AclLrnOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN;
}
~AclLrnOp() = default;
AclLrnOp(const AclLrnOp&) = delete;
AclLrnOp& operator=(const AclLrnOp&) = delete;
AclLrnOp(AclLrnOp&&) = delete;
AclLrnOp& operator=(AclLrnOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const LrnParam<DeviceType>& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape shape(args.in_cols, args.in_rows, args.in_depth);
if (is_operator_init_done(shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
arm_compute::NormalizationLayerInfo norm_info(
arm_compute::NormType::CROSS_MAP, args.nsize, args.alpha, args.beta,
args.knorm);
//[width, height, IFM]
new_tensor(input(), shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), shape, args.output_data);
acl_configure(lrn, this, norm_info);
}
void Set_bypass(bool bypass) { args.is_bypass = bypass; }
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const LrnParam<DeviceType>& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
    // fall back to the CPU LRN when the ACL path is disabled
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const LrnParam<DeviceType>& param) {
const Tensor* in_x = param.InputX();
Tensor* out = param.Out();
int n = param.N();
T alpha = param.Alpha();
T beta = param.Beta();
T k = param.K();
const T* input_data = in_x->data<T>();
T* output_data = out->mutable_data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.nsize = n;
args.alpha = alpha;
args.beta = beta;
args.knorm = k;
// NCHW
args.batch = in_x->dims()[0];
args.in_depth = in_x->dims()[1];
args.in_rows = in_x->dims()[2];
args.in_cols = in_x->dims()[3];
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl::AclParameters args;
};
template <>
bool LrnKernel<GPU_MALI, float>::Init(LrnParam<GPU_MALI>* param) {
AclLrnOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclLrnOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
acl_op->Set_bypass(true);
    std::cout << "bypass acl, fall back to the CPU LRN path" << std::endl;
return true;
}
return true;
}
template <>
void LrnKernel<GPU_MALI, float>::Compute(const LrnParam<GPU_MALI>& param) {
std::cout << "init acl" << std::endl;
AclLrnOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
acl::AclParameters& args = acl_op->getargs();
if (args.is_bypass) {
std::cout << "bypass op" << std::endl;
LrnCompute<float>(param);
return;
}
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
for (int n = 0; n < args.batch; ++n) {
acl_op->RunAcl((void*)input_data, (void*)output_data);
input_data += args.in_depth * args.in_cols * args.in_rows;
output_data += args.in_depth * args.in_cols * args.in_rows;
}
}
template class LrnKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#pragma once
#include "operators/kernel/mul_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool MulKernel<GPU_MALI, float>::Init(MulParam<GPU_MALI> *param) {
return true;
}
template <>
void MulKernel<GPU_MALI, float>::Compute(const MulParam<GPU_MALI> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *out = param.Out();
out->mutable_data<float>();
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
: *input_x;
const Tensor y_matrix =
input_y->dims().size() > 2
? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
: *input_y;
auto out_dim = out->dims();
if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
math::MatMul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(0));
if (out_dim.size() != 2) {
out->Resize(out_dim);
}
}
template class MulKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#pragma once
#include "operators/kernel/pool_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclPoolOp : public acl::ACLOperator {
public:
AclPoolOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING;
}
~AclPoolOp() = default;
AclPoolOp(const AclPoolOp&) = delete;
AclPoolOp& operator=(const AclPoolOp&) = delete;
AclPoolOp(AclPoolOp&&) = delete;
AclPoolOp& operator=(AclPoolOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const PoolParam<DeviceType>& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth);
arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
args.out_depth);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute::PoolingLayerInfo pool_info;
if (args.pool_type == "max") {
pool_info = arm_compute::PoolingLayerInfo(
arm_compute::PoolingType::MAX, args.filter_rows,
arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
args.pad_cols, args.pad_rows,
arm_compute::DimensionRoundingType::CEIL));
} else {
pool_info = arm_compute::PoolingLayerInfo(
arm_compute::PoolingType::AVG, args.filter_rows,
arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
args.pad_cols, args.pad_rows,
arm_compute::DimensionRoundingType::CEIL));
}
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(pooling, this, pool_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const PoolParam<DeviceType>& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
    // bypass when the ACL path is disabled for pooling
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
if (args.pool_type != "max" && args.pool_type != "avg") {
bypass_acl = true;
}
if (args.filter_rows != args.filter_cols) {
bypass_acl = true;
}
// if (args.filter_rows!=2 && args.filter_rows!=3) {
// bypass_acl = true;
// }
return bypass_acl;
}
private:
void AclParametersByContext(const PoolParam<DeviceType>& param) {
const Tensor* in_x = param.Input();
Tensor* out = param.Output();
std::string pooling_type = param.PoolingType();
std::vector<int> ksize = param.Ksize();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
bool is_global_pooling = param.isGlobalPooling();
const T* input_data = in_x->data<T>();
T* output_data = out->mutable_data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.is_global_pool = is_global_pooling;
args.pool_type = pooling_type;
args.filter_rows = ksize[0];
args.filter_cols = ksize[1];
args.dim = ksize.size();
// NCHW
args.batch = in_x->dims()[0];
args.in_depth = in_x->dims()[1];
args.in_rows = in_x->dims()[2];
args.in_cols = in_x->dims()[3];
// std::cout <<"In N: " << args.batch << " C: " << args.in_depth
// << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
// NCHW
// std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
// << " C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
// MCHW = OIHW
// std::cout <<"Filter O: " << static_cast<int>(filter->dims()[0])
// << " I: " << static_cast<int>(filter->dims()[1])
// << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
// strides(h_stride, w_stride)
args.stride_rows = strides[0];
args.stride_cols = strides[1];
// std::cout <<"PoolingType: " << args.pool_type << "\n";
// std::cout <<"Stride H: " << args.stride_rows << " W: " <<
// args.stride_cols << "\n";
// paddings(h_pad, w_pad)
args.pad_rows = paddings[0];
args.pad_cols = paddings[1];
// std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
// "\n";
args.out_depth = args.in_depth;
// args.out_rows = out->dims()[2];
// args.out_cols = out->dims()[3];
args.out_rows = static_cast<int>(ceil(static_cast<float>(args.in_rows +
2 * args.pad_rows -
args.filter_rows) /
args.stride_rows)) +
1;
args.out_cols = static_cast<int>(ceil(static_cast<float>(args.in_cols +
2 * args.pad_cols -
args.filter_cols) /
args.stride_cols)) +
1;
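    // Example (hypothetical sizes): in_rows = 32, pad_rows = 0,
    // filter_rows = 2, stride_rows = 2 gives
    // out_rows = ceil((32 + 0 - 2) / 2) + 1 = 16, which is intended to mirror
    // the DimensionRoundingType::CEIL used for pool_info above.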
if (is_global_pooling) {
args.filter_rows = args.in_rows;
args.filter_cols = args.in_cols;
args.pad_rows = 0;
args.pad_cols = 0;
}
}
acl::AclParameters args;
};
template <>
bool PoolKernel<GPU_MALI, float>::Init(PoolParam<GPU_MALI>* param) {
AclPoolOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclPoolOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
template <>
void PoolKernel<GPU_MALI, float>::Compute(const PoolParam<GPU_MALI>& param) {
std::cout << "init acl" << std::endl;
AclPoolOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
for (int n = 0; n < args.batch; ++n) {
acl_op->RunAcl((void*)input_data, (void*)output_data);
input_data += args.in_depth * args.in_cols * args.in_rows;
output_data += args.in_depth * args.out_cols * args.out_rows;
}
}
template class PoolKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
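For reference, the output-size arithmetic used by the pooling kernel above (ceil mode, with the global-pooling override that sets the window to the whole input and zeroes the padding) can be checked in isolation. The helper below is a hypothetical standalone sketch, not part of paddle-mobile:
#include <cmath>
#include <cstdio>
// out = ceil((in + 2 * pad - filter) / stride) + 1, as computed in
// AclParametersByContext() above (hypothetical standalone helper).
static int PoolOutDim(int in, int pad, int filter, int stride) {
  return static_cast<int>(std::ceil(
             static_cast<float>(in + 2 * pad - filter) / stride)) +
         1;
}
int main() {
  // 32x32 input, 2x2 window, stride 2, no padding -> 16x16 output
  std::printf("%d\n", PoolOutDim(32, 0, 2, 2));
  // global pooling: window = input size, pad = 0 -> 1x1 output
  std::printf("%d\n", PoolOutDim(32, 0, 32, 1));
  return 0;
}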
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP
#pragma once
#include "operators/kernel/relu_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclReluOp : public acl::ACLOperator {
public:
AclReluOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU;
}
~AclReluOp() = default;
AclReluOp(const AclReluOp&) = delete;
AclReluOp& operator=(const AclReluOp&) = delete;
AclReluOp(AclReluOp&&) = delete;
AclReluOp& operator=(AclReluOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const ReluParam<DeviceType>& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
args.in_depth, args.batch);
arm_compute::TensorShape output_shape(args.in_cols, args.in_rows,
args.in_depth, args.out_num);
// arm_compute::TensorShape weights_shape(
// args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
// arm_compute::TensorShape biases_shape(args.out_depth);
arm_compute::ActivationLayerInfo::ActivationFunction type;
type = arm_compute::ActivationLayerInfo::ActivationFunction::RELU;
arm_compute::ActivationLayerInfo act_info(type);
if (is_operator_init_done(input_shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
//[width, height, IFM]
new_tensor(input(), input_shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), output_shape, args.output_data);
acl_configure(activation, this, act_info);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const ReluParam<DeviceType>& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
// for performance, more groups impact GPU performance
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const ReluParam<DeviceType>& param) {
const auto* input_x = param.InputX();
auto* out = param.Out();
const T* input_data = input_x->data<T>();
T* output_data = out->mutable_data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
args.batch = input_x->dims()[0];
args.in_depth = input_x->dims()[1];
args.in_rows = input_x->dims()[2];
args.in_cols = input_x->dims()[3];
args.out_num = out->dims()[0];
}
acl::AclParameters args;
};
template <>
bool ReluKernel<GPU_MALI, float>::Init(ReluParam<GPU_MALI>* param) {
AclReluOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclReluOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
template <>
void ReluKernel<GPU_MALI, float>::Compute(const ReluParam<GPU_MALI>& param) {
std::cout << "init acl" << std::endl;
AclReluOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
acl::AclParameters& args = acl_op->getargs();
acl_op->RunAcl(args.input_data, args.output_data);
}
template class ReluKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
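Note how InitAclLayer() in the kernels above maps the framework's NCHW dims() onto arm_compute::TensorShape: the shape is built as (cols, rows, channels, batch), i.e. the NCHW values in reverse, since ACL orders dimensions from fastest-varying to slowest. A minimal sketch of that index reversal, using a plain array instead of the real ACL type:
#include <array>
#include <cstdio>
// dims() is NCHW: [batch, channels, rows, cols]; the TensorShape constructed
// in InitAclLayer() above uses the same values in reverse order
// (hypothetical helper, no arm_compute dependency).
static std::array<int, 4> ToAclShape(const std::array<int, 4>& nchw) {
  return {nchw[3], nchw[2], nchw[1], nchw[0]};
}
int main() {
  std::array<int, 4> nchw = {1, 64, 56, 56};  // N, C, H, W
  std::array<int, 4> acl = ToAclShape(nchw);
  std::printf("(%d, %d, %d, %d)\n", acl[0], acl[1], acl[2], acl[3]);
  return 0;
}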
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RESHAPE_OP
#pragma once
#include "operators/kernel/reshape_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ReshapeKernel<GPU_MALI, float>::Init(ReshapeParam<GPU_MALI> *param) {
return true;
}
template <>
void ReshapeKernel<GPU_MALI, float>::Compute(
const ReshapeParam<GPU_MALI> &param) {
const auto *input_x = param.InputX();
const auto &input_x_dims = input_x->dims();
auto *out = param.Out();
framework::DDim out_dims = out->dims();
const auto *input_shape = param.InputShape();
if (input_shape) {
auto *shape_data = input_shape->data<int>();
framework::Tensor cpu_shape_tensor;
auto shape =
std::vector<int>(shape_data, shape_data + input_shape->numel());
out_dims = ValidateShape(shape, input_x->dims());
}
bool inplace = param.Inplace();
out->Resize(out_dims);
if (!inplace) {
out->mutable_data<float>();
framework::TensorCopy(*input_x, out);
out->Resize(out_dims);
} else {
out->ShareDataWith(*input_x);
out->Resize(out_dims);
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#pragma once
#include "operators/kernel/softmax_kernel.h"
#ifdef PADDLE_MOBILE_MALI_GPU
#include "acl_operator.h"
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class AclSoftmaxOp : public acl::ACLOperator {
public:
AclSoftmaxOp() {
this->force_bypass_acl_path_ =
bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX;
}
~AclSoftmaxOp() = default;
AclSoftmaxOp(const AclSoftmaxOp&) = delete;
AclSoftmaxOp& operator=(const AclSoftmaxOp&) = delete;
AclSoftmaxOp(AclSoftmaxOp&&) = delete;
AclSoftmaxOp& operator=(AclSoftmaxOp&&) = delete;
acl::AclParameters& getargs() { return args; }
void InitAclLayer(const SoftmaxParam<DeviceType>& param) {
setTargetHint(acl::TargetHint::OPENCL);
arm_compute::TensorShape shape(args.in_depth, args.batch);
if (is_operator_init_done(shape)) return;
set_operator_init_done();
this->force_bypass_acl_path_ = false;
//[width, height, IFM]
new_tensor(input(), shape, args.input_data);
//[width, height, OFM]
new_tensor(output(), shape, args.output_data);
acl_configure(softmax, this, NULL);
}
void RunAcl(void* input, void* output) {
acl::ACLOperator::acl_run(input, output);
}
bool Bypass_acl(const SoftmaxParam<DeviceType>& param) {
bool bypass_acl = false;
AclParametersByContext(param);
InitAclLayer(param);
// for performance, more groups impact GPU performance
if (this->force_bypass_acl_path_) {
bypass_acl = true;
}
return bypass_acl;
}
private:
void AclParametersByContext(const SoftmaxParam<DeviceType>& param) {
const framework::Tensor* in_x = param.InputX();
framework::Tensor* out = param.Out();
auto x_dims = in_x->dims();
out->Resize(x_dims);
const T* input_data = in_x->data<T>();
T* output_data = out->data<T>();
args.input_data = (void*)input_data;
args.output_data = (void*)output_data;
// NCHW
args.batch = in_x->dims()[0];
args.in_depth = in_x->dims()[1];
args.out_num = out->dims()[0];
// std::cout
// << "Out C: " << args.out_depth
// << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
}
acl::AclParameters args;
};
template <>
bool SoftmaxKernel<GPU_MALI, float>::Init(SoftmaxParam<GPU_MALI>* param) {
AclSoftmaxOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
acl_op = new AclSoftmaxOp<GPU_MALI, float>();
this->SetAclOp((void*)acl_op, (void*)this);
}
if (acl_op->Bypass_acl(*param)) {
std::cout << "init acl failed" << std::endl;
return false;
}
return true;
}
template <>
void SoftmaxKernel<GPU_MALI, float>::Compute(
const SoftmaxParam<GPU_MALI>& param) {
std::cout << "init acl" << std::endl;
AclSoftmaxOp<GPU_MALI, float>* acl_op =
reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
if (acl_op == nullptr) {
return;
}
acl::AclParameters& args = acl_op->getargs();
const float* input_data = (const float*)args.input_data;
const float* output_data = (const float*)args.output_data;
for (int n = 0; n < args.out_num; ++n) {
acl_op->RunAcl((void*)input_data, (void*)output_data);
input_data += args.in_depth;
output_data += args.in_depth;
}
}
template class SoftmaxKernel<GPU_MALI, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
#endif
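Both the pooling and softmax Compute() methods above run the configured ACL operator once per sample, advancing raw float pointers by one sample's worth of elements between calls. The sketch below captures only that iteration pattern; RunPerSample and run_one are hypothetical names standing in for acl_op->RunAcl():
#include <cstddef>
// Walks contiguously laid-out batched buffers one sample at a time, the way
// the Compute() methods above do (hypothetical helper).
template <typename RunOne>
void RunPerSample(const float* in, float* out, int batch,
                  std::size_t in_sample_elems, std::size_t out_sample_elems,
                  RunOne run_one) {
  for (int n = 0; n < batch; ++n) {
    run_one(in, out);         // acl_op->RunAcl(...) in the kernels above
    in += in_sample_elems;    // pooling: in_depth * in_rows * in_cols
    out += out_sample_elems;  // softmax: both strides are in_depth
  }
}
int main() {
  float in[2 * 3] = {1, 2, 3, 4, 5, 6};  // 2 samples, 3 elements each
  float out[2 * 3] = {0};
  RunPerSample(in, out, 2, 3, 3,
               [](const float* src, float* dst) { dst[0] = src[0]; });
  return 0;
}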
...@@ -59,8 +59,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(lookup_table, ops::LookupOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...@@ -35,8 +35,5 @@ REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(lrn, ops::LrnOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
#endif
#endif
...@@ -61,9 +61,6 @@ REGISTER_OPERATOR_CPU(mul, ops::MulOp);
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(mul, ops::MulOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(mul, ops::MulOp);
#endif
...
...@@ -41,8 +41,7 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(norm, ops::NormOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...
...@@ -63,9 +63,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(pool2d, ops::PoolOp);
#endif
...
...@@ -36,8 +36,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(prelu, ops::PReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(prelu, ops::PReluOp);
#endif
#endif
...@@ -52,8 +52,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(prior_box, ops::PriorBoxOp);
#endif
...
...@@ -40,9 +40,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(reshape2, ops::Reshape2Op);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(reshape2, ops::Reshape2Op);
#endif
...
...@@ -35,9 +35,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(reshape, ops::ReshapeOp);
#endif
...
...@@ -32,8 +32,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(resize, ops::ResizeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(resize, ops::ResizeOp);
#endif
#endif
...@@ -32,8 +32,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(scale, ops::ScaleOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(scale, ops::ScaleOp);
#endif
#endif
...@@ -34,7 +34,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(shape, ops::ShapeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#endif
...@@ -31,9 +31,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(slice, ops::SliceOp);
#endif
...
...@@ -31,9 +31,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp);
#endif
...
...@@ -61,9 +61,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(sum, ops::SumOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(sum, ops::ConcatOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
...