remove mali

a29db744 · xiebaiyuan · ad3844d6 · a29db744 · a29db744 · a29db744
66 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,6 @@ option(WITH_TEST     "build with unit tests" ON)

 # select the platform to build
 option(CPU        "build with arm CPU support" ON)
-option(GPU_MALI   "build with arm mali GPU support" OFF)
 option(GPU_CL     "build with OpenCL support" OFF)
 option(FPGA       "build with FPGA support" OFF)
 if(FPGA)
@@ -97,31 +96,6 @@ else()
    endforeach()
 endif()

-if (GPU_MALI)
-    add_definitions(-DPADDLE_MOBILE_MALI_GPU)
-    add_definitions(-DUSE_ACL=1)
-    add_definitions(-DUSE_OPENCL)
-    set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android)
-    include_directories(${ACL_ROOT} ${ACL_ROOT}/include)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
-else()
-    file(GLOB_RECURSE _tmp_list src/operators/kernel/mali/*.cpp src/operators/kernel/mali/*.cc)
-    foreach(f ${_tmp_list})
-        list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
-    endforeach()
-
-    file(GLOB_RECURSE _tmp_list_h src/operators/kernel/mali/*.h)
-    foreach(f ${_tmp_list_h})
-        list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
-    endforeach()
-endif()
-
 if(FPGA)
    add_definitions(-DPADDLE_MOBILE_FPGA)
    file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/kernel/fpga/*.cc)

--- a/doc/build.md
+++ b/doc/build.md
@@ -46,7 +46,6 @@ root@5affd29d4fc5:/ # ccmake .
 DEBUGING                         ON
 FPGA                             OFF
 LOG_PROFILE                      ON
- MALI_GPU                         OFF
 NET                              googlenet
 USE_EXCEPTION                    ON
 USE_OPENMP                       OFF

--- a/doc/design_doc.md
+++ b/doc/design_doc.md
@@ -109,11 +109,6 @@ USE_OP_CPU(conv2d);
 REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
 #endif

-#ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(conv2d);
-REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
-#endif
-
 #ifdef PADDLE_MOBILE_FPGA
 USE_OP_FPGA(conv2d);
 REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);

--- a/src/framework/load_ops.h
+++ b/src/framework/load_ops.h
@@ -23,15 +23,6 @@ limitations under the License. */
 #define LOAD_CPU_OP(op_type)
 #endif

-#ifdef PADDLE_MOBILE_MALI_GPU
-#define LOAD_MALI_GPU_OP(op_type)                                           \
-  extern int TouchOpRegistrar_##op_type##_##mali_gpu();                     \
-  static int use_op_itself_##op_type##_##mali_gpu __attribute__((unused)) = \
-      TouchOpRegistrar_##op_type##_##mali_gpu()
-#else
-#define LOAD_MALI_GPU_OP(op_type)
-#endif
-
 #ifdef PADDLE_MOBILE_FPGA
 #define LOAD_FPGA_OP(op_type)                                           \
  extern int TouchOpRegistrar_##op_type##_##fpga();                     \
@@ -46,9 +37,8 @@ limitations under the License. */
  static int use_fusion_matcher_itself_##op_type __attribute__((unused)) = \
      TouchFusionMatcherRegistrar_##op_type();

-#define LOAD_OP(op_type)     \
-  LOAD_CPU_OP(op_type);      \
-  LOAD_MALI_GPU_OP(op_type); \
+#define LOAD_OP(op_type) \
+  LOAD_CPU_OP(op_type);  \
  LOAD_FPGA_OP(op_type);

 #define LOAD_OP1(op_type, device_type) LOAD_##device_type##_OP(op_type);
@@ -68,7 +58,7 @@ LOAD_OP(fetch)
 LOAD_OP(fill_constant)
 #endif
 #ifdef BATCHNORM_OP
-LOAD_OP2(batch_norm, CPU, MALI_GPU);
+LOAD_OP1(batch_norm, CPU);
 #endif
 #ifdef BILINEAR_INTERP_OP
 LOAD_OP1(bilinear_interp, CPU);
@@ -77,40 +67,40 @@ LOAD_OP1(bilinear_interp, CPU);
 LOAD_OP1(box_coder, CPU);
 #endif
 #ifdef CONCAT_OP
-LOAD_OP3(concat, CPU, MALI_GPU, FPGA);
+LOAD_OP2(concat, CPU, FPGA);
 #endif
 #ifdef CONV_OP
-LOAD_OP3(conv2d, CPU, MALI_GPU, FPGA);
+LOAD_OP2(conv2d, CPU, FPGA);
 #endif
 #ifdef LRN_OP
-LOAD_OP2(lrn, CPU, MALI_GPU);
+LOAD_OP1(lrn, CPU);
 #endif
 #ifdef SIGMOID_OP
 LOAD_OP1(sigmoid, CPU);
 #endif
 #ifdef FUSION_FC_RELU_OP
-LOAD_OP3(fusion_fc_relu, CPU, MALI_GPU, FPGA);
+LOAD_OP3(fusion_fc_relu, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_fc_relu);
 #endif
 #ifdef FUSION_ELEMENTWISEADDRELU_OP
-LOAD_OP3(fusion_elementwise_add_relu, CPU, MALI_GPU, FPGA);
+LOAD_OP3(fusion_elementwise_add_relu, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_elementwise_add_relu);
 #endif
 #ifdef SPLIT_OP
 LOAD_OP1(split, CPU);
 #endif
 #ifdef RESIZE_OP
-LOAD_OP2(resize, CPU, MALI_GPU);
+LOAD_OP1(resize, CPU);
 #endif
 #ifdef FUSION_CONVADDBNRELU_OP
 LOAD_OP2(fusion_conv_add_bn_relu, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_add_bn_relu);
 #endif
 #ifdef RESHAPE_OP
-LOAD_OP2(reshape, CPU, MALI_GPU);
+LOAD_OP1(reshape, CPU);
 #endif
 #ifdef RESHAPE2_OP
-LOAD_OP2(reshape2, CPU, MALI_GPU);
+LOAD_OP1(reshape2, CPU);
 #endif
 #ifdef TRANSPOSE_OP
 LOAD_OP1(transpose, CPU);
@@ -126,11 +116,11 @@ LOAD_OP2(fusion_conv_add_relu, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_add_relu);
 #endif
 #ifdef FUSION_CONVADD_OP
-LOAD_OP2(fusion_conv_add, CPU, MALI_GPU);
+LOAD_OP1(fusion_conv_add, CPU);
 LOAD_FUSION_MATCHER(fusion_conv_add);
 #endif
 #ifdef SOFTMAX_OP
-LOAD_OP2(softmax, CPU, MALI_GPU);
+LOAD_OP1(softmax, CPU);
 #endif
 #ifdef SHAPE_OP
 LOAD_OP1(shape, CPU);
@@ -142,13 +132,13 @@ LOAD_OP1(depthwise_conv2d, CPU);
 LOAD_OP1(conv2d_transpose, CPU);
 #endif
 #ifdef SCALE_OP
-LOAD_OP2(scale, CPU, MALI_GPU);
+LOAD_OP1(scale, CPU);
 #endif
 #ifdef ELEMENTWISEADD_OP
-LOAD_OP2(elementwise_add, CPU, MALI_GPU);
+LOAD_OP1(elementwise_add, CPU);
 #endif
 #ifdef PRELU_OP
-LOAD_OP2(prelu, CPU, MALI_GPU);
+LOAD_OP1(prelu, CPU);
 #endif
 #ifdef FLATTEN_OP
 LOAD_OP1(flatten, CPU);
@@ -182,13 +172,13 @@ LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu);
 LOAD_OP1(crf_decoding, CPU);
 #endif
 #ifdef MUL_OP
-LOAD_OP2(mul, CPU, MALI_GPU);
+LOAD_OP1(mul, CPU);
 #endif
 #ifdef NORM_OP
 LOAD_OP1(norm, CPU);
 #endif
 #ifdef RELU_OP
-LOAD_OP2(relu, CPU, MALI_GPU);
+LOAD_OP1(relu, CPU);
 LOAD_OP1(relu6, CPU);
 #endif
 #ifdef IM2SEQUENCE_OP
@@ -198,11 +188,11 @@ LOAD_OP1(im2sequence, CPU);
 LOAD_OP1(lookup_table, CPU);
 #endif
 #ifdef FUSION_FC_OP
-LOAD_OP3(fusion_fc, CPU, MALI_GPU, FPGA);
+LOAD_OP2(fusion_fc, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_fc);
 #endif
 #ifdef POOL_OP
-LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA);
+LOAD_OP2(pool2d, CPU, FPGA);
 #endif
 #ifdef MULTICLASSNMS_OP
 LOAD_OP1(multiclass_nms, CPU);
@@ -217,7 +207,7 @@ LOAD_OP1(sum, CPU);
 LOAD_OP1(elementwise_mul, CPU);
 #endif
 #ifdef SLICE_OP
-LOAD_OP2(slice, CPU, MALI_GPU);
+LOAD_OP1(slice, CPU);
 #endif
 #ifdef FUSION_CONVBN_OP
 LOAD_OP2(fusion_conv_bn, CPU, FPGA);

--- a/src/framework/op_registry.h
+++ b/src/framework/op_registry.h
@@ -115,9 +115,6 @@ class OpRegistry {
 #define REGISTER_OPERATOR_CPU(op_type, op_class) \
  REGISTER_OPERATOR(op_type, op_class, cpu, paddle_mobile::CPU);

-#define REGISTER_OPERATOR_MALI_GPU(op_type, op_class) \
-  REGISTER_OPERATOR(op_type, op_class, mali_gpu, paddle_mobile::GPU_MALI);
-
 #define REGISTER_OPERATOR_FPGA(op_type, op_class) \
  REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);


--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -130,13 +130,6 @@ class OpKernelBase {
  }
 #endif

-#ifdef PADDLE_McOBILE_MALI_GPU
-  OpKernelBase() { acl_op_ = nullptr; }
-  void *GetAclOp() const { return acl_op_; }
-  void SetAclOp(void *op, void *ob) const {
-    reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
-  }
-#endif
  virtual void Compute(const P &para) = 0;
  virtual bool Init(P *para) { return true; }
  virtual ~OpKernelBase() = default;
@@ -147,9 +140,6 @@ class OpKernelBase {
 #endif

 private:
-#ifdef PADDLE_MOBILE_MALI_GPU
-  void *acl_op_;
-#endif
 };

 class FusionOpMatcher {

--- a/src/operators/activation_op.cpp
+++ b/src/operators/activation_op.cpp
@@ -55,9 +55,6 @@ namespace ops = paddle_mobile::operators;
 REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
 REGISTER_OPERATOR_CPU(relu6, ops::Relu6Op);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(relu, ops::ReluOp);
 #endif

--- a/src/operators/batchnorm_op.cpp
+++ b/src/operators/batchnorm_op.cpp
@@ -34,9 +34,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/bilinear_interp_op.cpp
+++ b/src/operators/bilinear_interp_op.cpp
@@ -48,8 +48,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(bilinear_interp, ops::BilinearOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/box_coder_op.cpp
+++ b/src/operators/box_coder_op.cpp
@@ -58,8 +58,6 @@ REGISTER_OPERATOR_CPU(box_coder, ops::BoxCoderOp);
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(box_coder, ops::BoxCoderOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
@@ -69,9 +69,7 @@ REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(concat, ops::ConcatOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp);
 #endif

--- a/src/operators/conv_op.cpp
+++ b/src/operators/conv_op.cpp
@@ -55,9 +55,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
 #endif

--- a/src/operators/conv_transpose_op.cpp
+++ b/src/operators/conv_transpose_op.cpp
@@ -24,8 +24,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(conv2d_transpose, ops::ConvOpTranspose);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(conv2d_transpose, ops::ConvOpTranspose);
 #endif

--- a/src/operators/crf_op.cpp
+++ b/src/operators/crf_op.cpp
@@ -48,8 +48,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(crf_decoding, ops::CrfOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/elementwise_add_op.cpp
+++ b/src/operators/elementwise_add_op.cpp
@@ -33,9 +33,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
-#endif

 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp);

--- a/src/operators/elementwise_mul_op.cpp
+++ b/src/operators/elementwise_mul_op.cpp
@@ -32,9 +32,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(elementwise_mul, ops::ElementwiseMulOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(elementwise_mul, ops::ElementwiseMulOp);
 #endif

--- a/src/operators/elementwise_sub_op.cpp
+++ b/src/operators/elementwise_sub_op.cpp
@@ -32,9 +32,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(elementwise_sub, ops::ElementwiseSubOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/feed_op.cpp
+++ b/src/operators/feed_op.cpp
@@ -38,9 +38,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(feed, ops::FeedOp);
 #endif

--- a/src/operators/fetch_op.cpp
+++ b/src/operators/fetch_op.cpp
@@ -30,9 +30,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp);
 #endif

--- a/src/operators/fusion_conv_add_op.cpp
+++ b/src/operators/fusion_conv_add_op.cpp
@@ -54,9 +54,6 @@ REGISTER_FUSION_MATCHER(fusion_conv_add, ops::FusionConvAddMatcher);
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp);
-#endif

 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp);

--- a/src/operators/fusion_deconv_add_bn_op.cpp
+++ b/src/operators/fusion_deconv_add_bn_op.cpp
@@ -24,8 +24,7 @@ namespace ops = paddle_mobile::operators;
 REGISTER_FUSION_MATCHER(fusion_deconv_add_bn, ops::FusionDeconvAddBNMatcher);
 #ifdef PADDLE_MOBILE_CPU
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn, ops::FusionDeconvAddBNOp);
 #endif

--- a/src/operators/fusion_deconv_add_bn_relu_op.cpp
+++ b/src/operators/fusion_deconv_add_bn_relu_op.cpp
@@ -25,8 +25,7 @@ REGISTER_FUSION_MATCHER(fusion_deconv_add_bn_relu,
                        ops::FusionDeconvAddBNReluMatcher);
 #ifdef PADDLE_MOBILE_CPU
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn_relu, ops::FusionDeconvAddBNReluOp);
 #endif

--- a/src/operators/fusion_deconv_add_op.cpp
+++ b/src/operators/fusion_deconv_add_op.cpp
@@ -24,8 +24,7 @@ namespace ops = paddle_mobile::operators;
 REGISTER_FUSION_MATCHER(fusion_deconv_add, ops::FusionDeconvAddMatcher);
 #ifdef PADDLE_MOBILE_CPU
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_deconv_add, ops::FusionDeconvAddOp);
 #endif

--- a/src/operators/fusion_deconv_add_relu_op.cpp
+++ b/src/operators/fusion_deconv_add_relu_op.cpp
@@ -25,8 +25,7 @@ REGISTER_FUSION_MATCHER(fusion_deconv_add_relu,
                        ops::FusionDeconvAddReluMatcher);
 #ifdef PADDLE_MOBILE_CPU
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_deconv_add_relu, ops::FusionDeconvAddReluOp);
 #endif

--- a/src/operators/fusion_deconv_bn_relu_op.cpp
+++ b/src/operators/fusion_deconv_bn_relu_op.cpp
@@ -24,8 +24,7 @@ namespace ops = paddle_mobile::operators;
 REGISTER_FUSION_MATCHER(fusion_deconv_bn_relu, ops::FusionDeconvBNReluMatcher);
 #ifdef PADDLE_MOBILE_CPU
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_deconv_bn_relu, ops::FusionDeconvBNReluOp);
 #endif

--- a/src/operators/fusion_deconv_relu_op.cpp
+++ b/src/operators/fusion_deconv_relu_op.cpp
@@ -23,8 +23,7 @@ namespace operators {}
 namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_deconv_relu, ops::FusionDeconvReluOp);
 #endif

--- a/src/operators/fusion_elementwise_add_relu_op.cpp
+++ b/src/operators/fusion_elementwise_add_relu_op.cpp
@@ -36,10 +36,6 @@ REGISTER_FUSION_MATCHER(fusion_elementwise_add_relu,
 // REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu,
 //                      ops::FusionElementwiseAddReluOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-// REGISTER_OPERATOR_MALI_GPU(fusion_elementwise_add_relu,
-//                           ops::FusionElementwiseAddReluOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_elementwise_add_relu,
                       ops::FusionElementwiseAddReluOp);

--- a/src/operators/fusion_fc_op.cpp
+++ b/src/operators/fusion_fc_op.cpp
@@ -63,9 +63,6 @@ REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp);
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(fusion_fc, ops::FusionFcOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_fc, ops::FusionFcOp);
 #endif

--- a/src/operators/fusion_fc_relu_op.cpp
+++ b/src/operators/fusion_fc_relu_op.cpp
@@ -60,9 +60,6 @@ REGISTER_FUSION_MATCHER(fusion_fc_relu, ops::FusionFcReluMatcher);
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(fusion_fc_relu, ops::FusionFcReluOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(fusion_fc_relu, ops::FusionFcReluOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_fc_relu, ops::FusionFcReluOp);
 #endif

--- a/src/operators/gru_unit_op.cpp
+++ b/src/operators/gru_unit_op.cpp
@@ -59,8 +59,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(gru_unit, ops::GruUnitOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/increment_op.cpp
+++ b/src/operators/increment_op.cpp
@@ -37,8 +37,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(increment, ops::IncrementOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/is_empty_op.cpp
+++ b/src/operators/is_empty_op.cpp
@@ -34,8 +34,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(is_empty, ops::IsEmptyOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/ACL_Android @ 591027fc
+++ b/ACL_Android @ 591027fc
-Subproject commit 591027fcffea084100c756e48356e0f8a48e35e5
--- a/src/operators/kernel/mali/acl_operator.cc
+++ b/src/operators/kernel/mali/acl_operator.cc
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#if USE_ACL == 1
-#include "acl_operator.h"
-unsigned int bypass_acl_class_layer =
-    (0 | FLAGS_ENABLE_ACL_CONCAT |
-     /*0xffffffff |*/ /*FLAGS_ENABLE_ACL_FC |*/ /*FLAGS_ENABLE_ACL_LRN
-                                                   |*/
-     0);
-
-int enable_schedule = 0;
-
-#ifdef USE_PROFILING
-
-#include "arm_neon.h"
-
-unsigned int acl_log_flags =
-    (0 | MASK_LOG_APP_TIME | /*MASK_LOG_ALLOCATE | */  /*MASK_LOG_ALLOCATE | */
-     /*MASK_LOG_RUN      | */ /*MASK_LOG_CONFIG   | */ /*MASK_LOG_COPY     | */
-     MASK_LOG_ABSVAL | MASK_LOG_BNLL | MASK_LOG_CONV | MASK_LOG_FC |
-     MASK_LOG_LRN | MASK_LOG_POOLING | MASK_LOG_RELU | MASK_LOG_SIGMOID |
-     MASK_LOG_SOFTMAX | MASK_LOG_TANH | MASK_LOG_LC | MASK_LOG_BN |
-     MASK_LOG_CONCAT | 0);
-#include <stdio.h>  /* printf */
-#include <stdlib.h> /* getenv */
-#endif              // USE_PROFILING
-
-static bool force_enable_gpu = false;
-bool AclEnableSchedule(int enable) {
-  enable_schedule = enable;
-  if (enable) {
-    force_enable_gpu = true;
-  }
-  return true;
-}
-int isScheduleEnable() { return enable_schedule; }
-
-namespace paddle_mobile {
-namespace operators {
-namespace acl {
-
-bool ACLOperator::init_gpu_env = true;
-#ifdef USE_OPENCL
-bool ACLOperator::support_opencl_ = false;
-bool opencl_is_available() { return arm_compute::opencl_is_available(); }
-#elif defined(USE_OPENGLES)
-bool ACLOperator::support_opengles_ = false;
-#endif
-ACLOperator::ACLOperator(bool is_gpu)
-    : operator_state_(operator_not_init),
-      force_bypass_acl_path_(false),
-      target_hint_(TargetHint::DONT_CARE),
-      convolution_method_hint_(ConvolutionMethodHint::GEMM),
-      _group(1),
-      name_(""),
-      input_idx_(0),
-      output_idx_(0),
-      is_gpu_(is_gpu) {
-  const char* pBypassACL;
-  if (init_gpu_env) {
-#ifdef USE_OPENCL
-    try {
-      if (opencl_is_available()) {
-        arm_compute::CLScheduler::get().default_init();
-        support_opencl_ = true;
-      }
-    } catch (std::exception& e) {
-      support_opencl_ = false;
-    }
-#elif defined(USE_OPENGLES)
-    try {
-      arm_compute::GCScheduler::get().default_init();
-      support_opengles_ = true;
-    } catch (std::exception& e) {
-      support_opengles_ = false;
-    }
-#endif
-    init_gpu_env = false;
-  }
-  if (force_enable_gpu) is_gpu_ = true;
-  pBypassACL = getenv("BYPASSACL");
-  if (pBypassACL) {
-    unsigned int bacl;
-    sscanf(pBypassACL, "%i", &bacl);
-    if (bacl != bypass_acl_class_layer) {
-      bypass_acl_class_layer = bacl;
-      printf("BYPASSACL<%s>\n", pBypassACL);
-      printf("BYPASSACL: %x\n", bypass_acl_class_layer);
-    }
-  }
-
-#ifdef USE_PROFILING
-  const char* pLogACL;
-  pLogACL = getenv("LOGACL");
-  if (pLogACL) {
-    unsigned int alf;
-    sscanf(pLogACL, "%i", &alf);
-    if (alf != acl_log_flags) {
-      acl_log_flags = alf;
-      printf("LOGACL<%s>\n", pLogACL);
-      printf("LOGACL: %x\n", acl_log_flags);
-    }
-  }
-#endif  // USE_PROFILING
-  const char* pEnableSchedule;
-  pEnableSchedule = getenv("ENABLESCHEDULE");
-  if (pEnableSchedule) {
-    int bshedule;
-    sscanf(pEnableSchedule, "%i", &bshedule);
-    if (bshedule != enable_schedule) {
-      enable_schedule = bshedule;
-      printf("ENABLESCHEDULE<%s>\n", pEnableSchedule);
-      printf("ENABLESCHEDULE: %x\n", enable_schedule);
-    }
-    if (enable_schedule) {
-      AclEnableSchedule(1);
-    }
-  }
-}
-ACLOperator::~ACLOperator() {}
-
-bool ACLOperator::new_tensor(std::unique_ptr<ACLTensor>& tensor,
-                             arm_compute::TensorShape& shape, void* mem,
-                             bool commit) {
-  auto acl_tensor =
-      new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32));
-  acl_tensor->set_target(getTargetHint());
-  acl_tensor->bindmem(mem);
-  if (commit) acl_tensor->commit();
-  tensor = (std::unique_ptr<ACLTensor>)std::move(acl_tensor);
-  return true;
-}
-bool ACLOperator::new_tensor(std::unique_ptr<ACLSubTensor>& tensor,
-                             std::unique_ptr<ACLTensor>& parent,
-                             arm_compute::TensorShape& shape,
-                             arm_compute::Coordinates& coord) {
-  auto acl_tensor = new ACLSubTensor(parent, shape, coord);
-  acl_tensor->set_target(getTargetHint());
-  tensor = (std::unique_ptr<ACLSubTensor>)std::move(acl_tensor);
-  return true;
-}
-
-void ACLTensor::commit(TensorType type) {
-  settensortype(type);
-  if (mem_) {
-    if (!allocate_) {
-#ifdef USE_PROFILING
-      logtime_util log_time(ACL_ALLOCATE_INFO);
-#endif  // USE_PROFILING
-      allocate();
-      allocate_ = true;
-    }
-    if (type_ != tensor_output) {
-      tensor_copy(mem_);
-    }
-    mem_ = nullptr;
-  }
-}
-
-int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor, void* mem,
-                               bool toTensor) {
-#ifdef USE_PROFILING
-  logtime_util log_time(ACL_COPY_INFO);
-#endif  // USE_PROFILING
-  arm_compute::Window window;
-  // Iterate through the rows (not each element)
-  window.use_tensor_dimensions(tensor->info()->tensor_shape(),
-                               /* first_dimension =*/arm_compute::Window::DimY);
-
-  int width = tensor->info()->tensor_shape()[0];
-  int height = tensor->info()->tensor_shape()[1];
-  int deepth = tensor->info()->tensor_shape()[2];
-  map();
-  // Create an iterator:
-  arm_compute::Iterator it(tensor, window);
-  // Except it works for an arbitrary number of dimensions
-  if (toTensor) {  // mem->tensor
-    arm_compute::execute_window_loop(
-        window,
-        [&](const arm_compute::Coordinates& id) {
-          memcpy(it.ptr(),
-                 ((char*)mem) +
-                     ((id[3] * (width * height * deepth) +
-                       id.z() * (width * height) + id.y() * width + id.x()) *
-                      tensor->info()->element_size()),
-                 width * tensor->info()->element_size());
-        },
-        it);
-  } else {  // tensor-->mem
-    arm_compute::execute_window_loop(
-        window,
-        [&](const arm_compute::Coordinates& id) {
-          memcpy(((char*)mem) + ((id[3] * (width * height * deepth) +
-                                  id.z() * (width * height) + id.y() * width) *
-                                 tensor->info()->element_size()),
-                 it.ptr(), width * tensor->info()->element_size());
-        },
-        it);
-  }
-  unmap();
-
-  return 0;
-}
-
-}  // namespace acl
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif
--- a/src/operators/kernel/mali/acl_operator.h
+++ b/src/operators/kernel/mali/acl_operator.h
--- a/src/operators/kernel/mali/acl_tensor.cc
+++ b/src/operators/kernel/mali/acl_tensor.cc
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "acl_tensor.h"
-
-namespace paddle_mobile {
-namespace operators {
-namespace acl {
-
-#ifdef USE_ACL
-template <typename TensorType>
-std::unique_ptr<arm_compute::ITensor> initialise_tensor(
-    arm_compute::TensorInfo &info) {
-  auto tensor = cpp14::make_unique<TensorType>();
-  tensor->allocator()->init(info);
-  return std::move(tensor);
-}
-
-template <typename TensorType>
-void tensor_allocate(arm_compute::ITensor &tensor) {
-  auto itensor = dynamic_cast<TensorType *>(&tensor);
-  itensor->allocator()->allocate();
-}
-
-Tensor::Tensor(arm_compute::TensorInfo &info) noexcept
-    : _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr) {}
-
-Tensor::Tensor(Tensor &&src) noexcept
-    : _target(src._target),
-      _info(std::move(src._info)),
-      _tensor(std::move(src._tensor)) {}
-
-arm_compute::ITensor *Tensor::set_target(TargetHint target) {
-  switch (target) {
-#ifdef USE_OPENCL
-    case TargetHint::OPENCL:
-      _tensor = initialise_tensor<arm_compute::CLTensor>(_info);
-      break;
-#elif defined(USE_OPENGLES)
-    case TargetHint::OPENGLES:
-      _tensor = initialise_tensor<arm_compute::GCTensor>(_info);
-      break;
-#endif
-    case TargetHint::NEON:
-      _tensor = initialise_tensor<arm_compute::Tensor>(_info);
-      break;
-    default:
-      break;
-  }
-  _target = target;
-  return _tensor.get();
-}
-
-void Tensor::allocate() {
-  switch (_target) {
-#ifdef USE_OPENCL
-    case TargetHint::OPENCL:
-      tensor_allocate<arm_compute::CLTensor>(*_tensor);
-      break;
-#elif defined(USE_OPENGLES)
-    case TargetHint::OPENGLES:
-      tensor_allocate<arm_compute::GCTensor>(*_tensor);
-      break;
-#endif
-    case TargetHint::NEON:
-      tensor_allocate<arm_compute::Tensor>(*_tensor);
-      break;
-    default:
-      break;
-  }
-}
-void Tensor::map(bool blocking) {
-#ifdef USE_OPENCL
-  if (_target == TargetHint::OPENCL)
-    dynamic_cast<arm_compute::CLTensor *>(tensor())->map(blocking);
-#elif defined(USE_OPENGLES)
-  if (_target == TargetHint::OPENGLES)
-    dynamic_cast<arm_compute::GCTensor *>(tensor())->map(blocking);
-#endif
-}
-void Tensor::unmap() {
-#ifdef USE_OPENCL
-  if (_target == TargetHint::OPENCL)
-    dynamic_cast<arm_compute::CLTensor *>(tensor())->unmap();
-#elif defined(USE_OPENGLES)
-  if (_target == TargetHint::OPENGLES)
-    dynamic_cast<arm_compute::GCTensor *>(tensor())->unmap();
-#endif
-}
-
-template <typename SubTensorType, typename ParentTensorType>
-std::unique_ptr<arm_compute::ITensor> initialise_subtensor(
-    arm_compute::ITensor *parent, arm_compute::TensorShape shape,
-    arm_compute::Coordinates coords) {
-  auto ptensor = dynamic_cast<ParentTensorType *>(parent);
-  auto subtensor = cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
-  return std::move(subtensor);
-}
-SubTensor::SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
-                     arm_compute::Coordinates &coords) noexcept
-    : _target(TargetHint::DONT_CARE),
-      _tensor_shape(tensor_shape),
-      _coords(coords),
-      _parent(nullptr),
-      _subtensor(nullptr) {
-  _parent = parent->tensor();
-  _target = parent->target();
-
-  instantiate_subtensor();
-}
-arm_compute::ITensor *SubTensor::set_target(TargetHint target) {
-  return (target == _target) ? _subtensor.get() : nullptr;
-}
-
-arm_compute::ITensor *SubTensor::tensor() { return _subtensor.get(); }
-
-const arm_compute::ITensor *SubTensor::tensor() const {
-  return _subtensor.get();
-}
-
-TargetHint SubTensor::target() const { return _target; }
-
-void SubTensor::allocate() {
-  // NOP for sub-tensors
-}
-
-void SubTensor::instantiate_subtensor() {
-  switch (_target) {
-#ifdef USE_OPENCL
-    case TargetHint::OPENCL:
-      _subtensor = initialise_subtensor<arm_compute::CLSubTensor,
-                                        arm_compute::ICLTensor>(
-          _parent, _tensor_shape, _coords);
-      break;
-#endif
-    default:
-    case TargetHint::NEON:
-      _subtensor =
-          initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(
-              _parent, _tensor_shape, _coords);
-      break;
-  }
-}
-
-#endif
-
-}  // namespace acl
-}  // namespace operators
-}  // namespace paddle_mobile
--- a/src/operators/kernel/mali/acl_tensor.h
+++ b/src/operators/kernel/mali/acl_tensor.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef ACL_TENSOR_H_
-#define ACL_TENSOR_H_
-
-#ifdef USE_ACL
-#ifdef USE_OPENCL
-#include "arm_compute/runtime/CL/CLSubTensor.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#elif defined(USE_OPENGLES)
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
-#endif
-#include "arm_compute/runtime/SubTensor.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace paddle_mobile {
-namespace operators {
-namespace acl {
-enum class TargetHint {
-  DONT_CARE,
-  OPENCL,
-  OPENGLES,
-  NEON,
-};
-
-enum class ConvolutionMethodHint {
-  GEMM,
-  DIRECT,
-};
-namespace cpp14 {
-template <class T>
-struct _Unique_if {
-  typedef std::unique_ptr<T> _Single_object;
-};
-
-template <class T>
-struct _Unique_if<T[]> {
-  typedef std::unique_ptr<T[]> _Unknown_bound;
-};
-
-template <class T, size_t N>
-struct _Unique_if<T[N]> {
-  typedef void _Known_bound;
-};
-
-template <class T, class... Args>
-typename _Unique_if<T>::_Single_object make_unique(Args &&... args) {
-  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-}
-
-template <class T>
-typename _Unique_if<T>::_Unknown_bound make_unique(size_t n) {
-  typedef typename std::remove_extent<T>::type U;
-  return std::unique_ptr<T>(new U[n]());
-}
-
-template <class T, class... Args>
-typename _Unique_if<T>::_Known_bound make_unique(Args &&...);
-}  // namespace cpp14
-
-class Tensor {
- public:
-  explicit Tensor(arm_compute::TensorInfo &info) noexcept;
-  virtual ~Tensor() {}
-  Tensor(Tensor &&src) noexcept;
-  void set_info(arm_compute::TensorInfo &&info) { _info = info; }
-  arm_compute::ITensor *set_target(TargetHint target);
-  const arm_compute::TensorInfo &info() const { return _info; }
-  arm_compute::ITensor *tensor() { return _tensor.get(); }
-  void allocate();
-  void init() {}
-  TargetHint target() const { return _target; }
-  virtual void map(bool blocking = true);
-  virtual void unmap();
-
- private:
-  TargetHint _target;
-  arm_compute::TensorInfo _info;
-  std::unique_ptr<arm_compute::ITensor> _tensor;
-};
-
-class SubTensor {
- public:
-  SubTensor(Tensor *parent, arm_compute::TensorShape &tensor_shape,
-            arm_compute::Coordinates &coords) noexcept;
-  ~SubTensor() {}
-  arm_compute::ITensor *tensor();
-  const arm_compute::ITensor *tensor() const;
-  TargetHint target() const;
-  void allocate();
-  arm_compute::ITensor *set_target(TargetHint target);
-
- private:
-  /** Instantiates a sub-tensor */
-  void instantiate_subtensor();
-
- private:
-  /**< Target that this tensor is pinned on */
-  TargetHint _target;
-  /**< SubTensor shape */
-  arm_compute::TensorShape _tensor_shape;
-  /**< SubTensor Coordinates */
-  arm_compute::Coordinates _coords;
-  /**< Parent tensor */
-  arm_compute::ITensor *_parent;
-  /**< SubTensor */
-  std::unique_ptr<arm_compute::ITensor> _subtensor;
-};
-
-}  // namespace acl
-}  // namespace operators
-}  // namespace paddle_mobile
-#endif
-#endif  // ACL_TENSOR_H_
--- a/src/operators/kernel/mali/batchnorm_kernel.cpp
+++ b/src/operators/kernel/mali/batchnorm_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef BATCHNORM_OP
-
-#include "operators/kernel/batchnorm_kernel.h"
-#ifdef PADDLE_MOBILE_MALI_GPU
-#include "acl_operator.h"
-#include "framework/operator.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename DeviceType, typename T>
-class AclBatchNormOp : public acl::ACLOperator {
- public:
-  AclBatchNormOp() {
-    this->force_bypass_acl_path_ = bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN;
-  }
-  ~AclBatchNormOp() = default;
-  AclBatchNormOp(const AclBatchNormOp&) = delete;
-  AclBatchNormOp& operator=(const AclBatchNormOp&) = delete;
-  AclBatchNormOp(AclBatchNormOp&&) = delete;
-  AclBatchNormOp& operator=(AclBatchNormOp&&) = delete;
-
-  acl::AclParameters& getargs() { return args; }
-  void InitAclLayer(const BatchNormParam<DeviceType>& param) {
-    setTargetHint(acl::TargetHint::OPENCL);
-    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
-                                         args.in_depth, args.batch);
-    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
-                                          args.out_depth, args.out_num);
-
-    if (is_operator_init_done(input_shape)) return;
-    set_operator_init_done();
-    this->force_bypass_acl_path_ = false;
-
-    arm_compute::TensorShape mean_shape(args.in_depth);
-    arm_compute::TensorShape var_shape = mean_shape;
-    arm_compute::TensorShape beta_shape = mean_shape;
-    arm_compute::TensorShape gamma_shape = mean_shape;
-
-    //[width, height, IFM]
-    new_tensor(input(), input_shape, args.input_data);
-    //[width, height, OFM]
-    new_tensor(output(), output_shape, args.output_data);
-
-    new_tensor(mean(), mean_shape, args.mean_data);
-    new_tensor(var(), var_shape, args.var_data);
-    new_tensor(beta(), beta_shape, args.biases_data);
-    new_tensor(gamma(), gamma_shape, args.weight_data);
-
-    acl_configure(bn, this, args.epsilon);
-  }
-
-  void RunAcl(void* input, void* output) {
-    acl::ACLOperator::acl_run(input, output);
-  }
-  bool Bypass_acl(const BatchNormParam<DeviceType>& param) {
-    bool bypass_acl = false;
-    AclParametersByContext(param);
-    InitAclLayer(param);
-    // for performance, more groups impact GPU performance
-    if (this->force_bypass_acl_path_) {
-      bypass_acl = true;
-    }
-
-    return bypass_acl;
-  }
-
- private:
-  void AclParametersByContext(const BatchNormParam<DeviceType>& param) {
-    const Tensor* in_x = param.InputX();
-    Tensor* out = param.OutputY();
-    const Tensor* scale = param.InputScale();
-    const Tensor* bias = param.InputBias();
-    const Tensor* saved_mean = param.InputMean();
-    const Tensor* saved_variance = param.InputVariance();
-
-    const T* input_data = in_x->data<T>();
-    T* output_data = out->mutable_data<T>();
-    const T* weight_data = scale->data<T>();
-    const T* bias_data = bias->data<T>();
-    const T* mean_data = saved_mean->data<T>();
-    const T* var_data = saved_variance->data<T>();
-
-    float epsilon = param.Epsilon();
-
-    args.input_data = (void*)input_data;
-    args.output_data = (void*)output_data;
-    // args.weight_data = (void*)weight_data;
-    // args.biases_data = (void*)bias_data;
-    args.mean_data = (void*)mean_data;
-    args.var_data = (void*)var_data;
-    args.epsilon = epsilon;
-
-    args.dim = in_x->dims().size();
-
-    args.batch = in_x->dims()[0];
-    args.in_depth = in_x->dims()[1];
-    args.in_rows = in_x->dims()[2];
-    args.in_cols = in_x->dims()[3];
-
-    args.out_num = out->dims()[0];
-    args.out_depth = out->dims()[1];
-    args.out_rows = out->dims()[2];
-    args.out_cols = out->dims()[3];
-
-    args.weight_data = (void*)weight_data;
-    args.biases_data = (void*)bias_data;
-
-    // std::cout
-    //  << "Out C: " <<  args.out_depth
-    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
-  }
-  acl::AclParameters args;
-};
-
-template <>
-bool BatchNormKernel<GPU_MALI, float>::Init(BatchNormParam<GPU_MALI>* param) {
-  AclBatchNormOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    acl_op = new AclBatchNormOp<GPU_MALI, float>();
-    this->SetAclOp((void*)acl_op, (void*)this);
-  }
-  if (acl_op->Bypass_acl(*param)) {
-    std::cout << "init acl failed" << std::endl;
-    return false;
-  }
-  return true;
-}
-
-template <>
-void BatchNormKernel<GPU_MALI, float>::Compute(
-    const BatchNormParam<GPU_MALI>& param) {
-  std::cout << "init acl" << std::endl;
-  AclBatchNormOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclBatchNormOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    return;
-  }
-  acl::AclParameters& args = acl_op->getargs();
-  acl_op->RunAcl(args.input_data, args.output_data);
-}
-
-template class BatchNormKernel<GPU_MALI, float>;
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
-#endif
--- a/src/operators/kernel/mali/concat_kernel.cpp
+++ b/src/operators/kernel/mali/concat_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef CONCAT_OP
-
-#include "operators/kernel/concat_kernel.h"
-#ifdef PADDLE_MOBILE_MALI_GPU
-#include "acl_operator.h"
-#include "framework/operator.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename DeviceType, typename T>
-class AclConcatOp : public acl::ACLOperator {
- public:
-  AclConcatOp() {
-    this->force_bypass_acl_path_ =
-        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT;
-  }
-  ~AclConcatOp() = default;
-  AclConcatOp(const AclConcatOp&) = delete;
-  AclConcatOp& operator=(const AclConcatOp&) = delete;
-  AclConcatOp(AclConcatOp&&) = delete;
-  AclConcatOp& operator=(AclConcatOp&&) = delete;
-
-  acl::AclParameters& getargs() { return args; }
-
-  void InitAclLayer(const ConcatParam<DeviceType>& param) {
-    setTargetHint(acl::TargetHint::OPENCL);
-    const std::vector<framework::LoDTensor*>* input_data = &args.in_tensor;
-    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
-                                          args.out_depth, args.batch);
-
-    if (is_operator_init_done(output_shape)) return;
-    set_operator_init_done();
-    this->force_bypass_acl_path_ = false;
-    T type;
-
-    for (int i = 0; i < input_data->size(); i++) {
-      int in_batch = (*input_data)[i]->dims()[0];
-      int in_channels = (*input_data)[i]->dims()[1];
-      int in_width = (*input_data)[i]->dims()[2];
-      int in_height = (*input_data)[i]->dims()[3];
-      arm_compute::TensorShape in_shape(in_width, in_height, in_channels);
-
-      new_tensor(cinput(i), in_shape,
-                 acl::InputdataPtr(this, args.in_tensor, type, i));
-    }
-
-    //[width, height, OFM]
-    new_tensor(output(), output_shape, args.output_data);
-
-    acl_configure(concat, this, input_data->size());
-  }
-
-  void RunAcl(const std::vector<framework::LoDTensor*>& input, void* output) {
-    T type;
-    acl::acl_run(this, input, output, type);
-  }
-  bool Bypass_acl(const ConcatParam<DeviceType>& param) {
-    bool bypass_acl = false;
-    AclParametersByContext(param);
-    InitAclLayer(param);
-    // for performance, more groups impact GPU performance
-    if (this->force_bypass_acl_path_ || !args.is_channel_concat) {
-      bypass_acl = true;
-    }
-    return bypass_acl;
-  }
-
- private:
-  void AclParametersByContext(const ConcatParam<DeviceType>& param) {
-    auto inputs = param.Inputs();
-    auto* output = param.Out();
-    int64_t axis = param.Axis();
-
-    T* output_data = output->mutable_data<T>();
-
-    args.is_channel_concat = (axis == 1);
-    args.in_tensor = inputs;
-    args.output_data = (void*)output_data;
-
-    args.batch = output->dims()[0];
-    args.out_depth = output->dims()[1];
-    args.out_rows = output->dims()[2];
-    args.out_cols = output->dims()[3];
-  }
-  acl::AclParameters args;
-};
-
-template <>
-bool ConcatKernel<GPU_MALI, float>::Init(ConcatParam<GPU_MALI>* param) {
-  AclConcatOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    acl_op = new AclConcatOp<GPU_MALI, float>();
-    this->SetAclOp((void*)acl_op, (void*)this);
-  }
-  if (acl_op->Bypass_acl(*param)) {
-    std::cout << "init acl failed" << std::endl;
-    return false;
-  }
-  return true;
-}
-
-template <>
-void ConcatKernel<GPU_MALI, float>::Compute(
-    const ConcatParam<GPU_MALI>& param) {
-  std::cout << "init acl" << std::endl;
-  AclConcatOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclConcatOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    return;
-  }
-  acl::AclParameters& args = acl_op->getargs();
-  acl_op->RunAcl(args.in_tensor, args.output_data);
-}
-
-template class ConcatKernel<GPU_MALI, float>;
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
-#endif
--- a/src/operators/kernel/mali/conv_add_kernel.cpp
+++ b/src/operators/kernel/mali/conv_add_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADD_OP
-
-#include "operators/kernel/conv_add_kernel.h"
-#ifdef PADDLE_MOBILE_MALI_GPU
-#include "acl_operator.h"
-#include "framework/operator.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename DeviceType, typename T>
-class AclConvAddOp : public acl::ACLOperator {
- public:
-  AclConvAddOp() {
-    this->force_bypass_acl_path_ =
-        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
-  }
-  ~AclConvAddOp() = default;
-  AclConvAddOp(const AclConvAddOp&) = delete;
-  AclConvAddOp& operator=(const AclConvAddOp&) = delete;
-  AclConvAddOp(AclConvAddOp&&) = delete;
-  AclConvAddOp& operator=(AclConvAddOp&&) = delete;
-
-  acl::AclParameters& getargs() { return args; }
-  void InitAclLayer(const FusionConvAddParam<DeviceType>& param) {
-    setTargetHint(acl::TargetHint::OPENCL);
-    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
-                                         args.in_depth, args.batch);
-    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
-                                          args.out_depth, args.out_num);
-    arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
-                                           args.in_depth / args.num_group,
-                                           args.out_depth);
-    arm_compute::TensorShape biases_shape(args.out_depth);
-    arm_compute::PadStrideInfo conv_info(
-        args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
-        arm_compute::DimensionRoundingType::FLOOR);
-
-    if (is_operator_init_done(input_shape)) return;
-    set_operator_init_done();
-    this->force_bypass_acl_path_ = false;
-
-    // check_direct_conv();
-    group() = args.num_group;
-    //[kernel_x, kernel_y, IFM, OFM]
-    new_tensor(weights(), weights_shape, args.weight_data);
-    //[OFM]
-    if (args.biases_data) {
-      new_tensor(biases(), biases_shape, args.biases_data);
-    }
-
-    //[width, height, IFM]
-    new_tensor(input(), input_shape, args.input_data);
-    //[width, height, OFM]
-    new_tensor(output(), output_shape, args.output_data);
-
-    acl_configure(conv, this, conv_info);
-  }
-
-  void RunAcl(void* input, void* output) {
-    acl::ACLOperator::acl_run(input, output);
-  }
-  bool Bypass_acl(const FusionConvAddParam<DeviceType>& param) {
-    bool bypass_acl = false;
-    AclParametersByContext(param);
-    InitAclLayer(param);
-    // for performance, more groups impact GPU performance
-    if (this->force_bypass_acl_path_ || args.num_group >= 5) {
-      bypass_acl = true;
-    }
-    if (args.dim > 2) {
-      bypass_acl = true;
-    }
-    if (args.dilated) {
-      bypass_acl = true;
-    }
-    return bypass_acl;
-  }
-
- private:
-  void check_direct_conv() {
-    bool use_direct_conv = false;
-    const char* pDirectConv;
-    pDirectConv = getenv("DIRECTCONV");
-    if (pDirectConv) {
-      unsigned int bdirectconv;
-      sscanf(pDirectConv, "%i", &bdirectconv);
-      if (bdirectconv != use_direct_conv) {
-        use_direct_conv = bdirectconv;
-        printf("DIRECTCONV<%s>\n", pDirectConv);
-        printf("DIRECTCONV: %x\n", use_direct_conv);
-      }
-    }
-    int pad_data[2], kernel[2];
-    pad_data[1] = args.pad_rows;
-    pad_data[0] = args.pad_cols;
-    kernel[1] = args.filter_rows;
-    kernel[0] = args.filter_cols;
-    if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
-                             pad_data[0] == 0 && pad_data[1] == 0) ||
-                            (kernel[0] == 3 && kernel[1] == 3 &&
-                             pad_data[0] <= 1 && pad_data[1] <= 1))) {
-      setConvMethod();  // NEDirectConvolutionLayer only for 1x1 and 3x3
-    }
-  }
-
-  void AclParametersByContext(const FusionConvAddParam<DeviceType>& param) {
-    const Tensor* input = param.Input();
-    Tensor filter = *param.Filter();
-    Tensor* output = param.Output();
-    Tensor* bias;
-
-    int groups = param.Groups();
-    std::vector<int> strides = param.Strides();
-    std::vector<int> paddings = param.Paddings();
-    std::vector<int> dilations = param.Dilations();
-
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>();
-    const T* weight_data = filter.data<T>();
-
-    args.input_data = (void*)input_data;
-    args.output_data = (void*)output_data;
-    args.weight_data = (void*)weight_data;
-    args.biases_data = nullptr;
-
-    try {
-      bias = param.Bias();
-    } catch (const std::exception& e) {
-    }
-    if (bias) {
-      const T* biases_data = bias->data<T>();
-      args.biases_data = (void*)biases_data;
-    }
-
-    args.num_group = groups;
-
-    args.dilation_rows = dilations[0];
-    args.dilation_cols = dilations[1];
-    if (dilations[0] != 1 || dilations[1] != 1) {
-      args.dilated = true;
-    }
-
-    // NCHW
-    // std::cout << "In dims: " << (input->dims()).size() << std::endl;
-    args.batch = input->dims()[0];
-    args.in_depth = input->dims()[1];
-    args.in_rows = input->dims()[2];
-    args.in_cols = input->dims()[3];
-    // std::cout <<"In N: " << args.batch << " C: " <<  args.in_depth
-    //  << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
-    // NCHW
-    // std::cout << "Out dims: " << (output->dims()).size() << std::endl;
-    args.out_num = output->dims()[0];
-    args.out_depth = output->dims()[1];
-    args.out_rows = output->dims()[2];
-    args.out_cols = output->dims()[3];
-    // std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
-    //  << " C: " <<  args.out_depth
-    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
-    // MCHW = OIHW
-    args.filter_rows = filter.dims()[2];
-    args.filter_cols = filter.dims()[3];
-    // std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
-    //  << " I: " <<  static_cast<int>(filter.dims()[1])
-    //  << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
-
-    // strides(h_stride, w_stride)
-    args.stride_rows = strides[0];
-    args.stride_cols = strides[1];
-    // std::cout <<"Stride H: " << args.stride_rows << " W: " <<
-    // args.stride_cols << "\n";
-
-    // paddings(h_pad, w_pad)
-    args.pad_rows = paddings[0];
-    args.pad_cols = paddings[1];
-    // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
-    // "\n";
-  }
-  acl::AclParameters args;
-};
-
-template <>
-bool ConvAddKernel<GPU_MALI, float>::Init(FusionConvAddParam<GPU_MALI>* param) {
-  AclConvAddOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    acl_op = new AclConvAddOp<GPU_MALI, float>();
-    this->SetAclOp((void*)acl_op, (void*)this);
-  }
-  if (acl_op->Bypass_acl(*param)) {
-    std::cout << "init acl failed" << std::endl;
-    return false;
-  }
-  return true;
-}
-
-template <>
-void ConvAddKernel<GPU_MALI, float>::Compute(
-    const FusionConvAddParam<GPU_MALI>& param) {
-  std::cout << "init acl" << std::endl;
-  AclConvAddOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclConvAddOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    return;
-  }
-  acl::AclParameters& args = acl_op->getargs();
-
-  acl_op->RunAcl(args.input_data, args.output_data);
-}
-
-template class ConvAddKernel<GPU_MALI, float>;
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
-#endif
--- a/src/operators/kernel/mali/conv_kernel.cpp
+++ b/src/operators/kernel/mali/conv_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef CONV_OP
-
-#include "operators/kernel/conv_kernel.h"
-#ifdef PADDLE_MOBILE_MALI_GPU
-#include "acl_operator.h"
-#include "framework/operator.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename DeviceType, typename T>
-class AclConvOp : public acl::ACLOperator {
- public:
-  AclConvOp() {
-    this->force_bypass_acl_path_ =
-        bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
-  }
-  ~AclConvOp() = default;
-  AclConvOp(const AclConvOp&) = delete;
-  AclConvOp& operator=(const AclConvOp&) = delete;
-  AclConvOp(AclConvOp&&) = delete;
-  AclConvOp& operator=(AclConvOp&&) = delete;
-
-  acl::AclParameters& getargs() { return args; }
-  void InitAclLayer(const ConvParam<DeviceType>& param) {
-    setTargetHint(acl::TargetHint::OPENCL);
-    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
-                                         args.in_depth, args.batch);
-    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
-                                          args.out_depth, args.out_num);
-    arm_compute::TensorShape weights_shape(args.filter_cols, args.filter_rows,
-                                           args.in_depth / args.num_group,
-                                           args.out_depth);
-    // arm_compute::TensorShape biases_shape(args.out_depth);
-    arm_compute::PadStrideInfo conv_info(
-        args.stride_cols, args.stride_rows, args.pad_cols, args.pad_rows,
-        arm_compute::DimensionRoundingType::FLOOR);
-
-    if (is_operator_init_done(input_shape)) return;
-    set_operator_init_done();
-    this->force_bypass_acl_path_ = false;
-
-    check_direct_conv();
-    //[kernel_x, kernel_y, IFM, OFM]
-    new_tensor(weights(), weights_shape, args.weight_data);
-    //[OFM]
-    // if (args.biases_data) {
-    //    new_tensor(biases(),biases_shape,args.biases_data);
-    //}
-
-    group() = args.num_group;
-
-    //[width, height, IFM]
-    new_tensor(input(), input_shape, args.input_data);
-    //[width, height, OFM]
-    new_tensor(output(), output_shape, args.output_data);
-
-    acl_configure(conv, this, conv_info);
-  }
-
-  void RunAcl(void* input, void* output) {
-    acl::ACLOperator::acl_run(input, output);
-  }
-  bool Bypass_acl(const ConvParam<DeviceType>& param) {
-    bool bypass_acl = false;
-    AclParametersByContext(param);
-    InitAclLayer(param);
-    // for performance, more groups impact GPU performance
-    if (this->force_bypass_acl_path_ || args.num_group >= 5) {
-      bypass_acl = true;
-    }
-    if (args.dim > 2) {
-      bypass_acl = true;
-    }
-    if (args.dilated) {
-      bypass_acl = true;
-    }
-    return bypass_acl;
-  }
-
- private:
-  void check_direct_conv() {
-    bool use_direct_conv = false;
-    const char* pDirectConv;
-    pDirectConv = getenv("DIRECTCONV");
-    if (pDirectConv) {
-      unsigned int bdirectconv;
-      sscanf(pDirectConv, "%i", &bdirectconv);
-      if (bdirectconv != use_direct_conv) {
-        use_direct_conv = bdirectconv;
-        printf("DIRECTCONV<%s>\n", pDirectConv);
-        printf("DIRECTCONV: %x\n", use_direct_conv);
-      }
-    }
-    int pad_data[2], kernel[2];
-    pad_data[1] = args.pad_rows;
-    pad_data[0] = args.pad_cols;
-    kernel[1] = args.filter_rows;
-    kernel[0] = args.filter_cols;
-    if (use_direct_conv && ((kernel[0] == 1 && kernel[1] == 1 &&
-                             pad_data[0] == 0 && pad_data[1] == 0) ||
-                            (kernel[0] == 3 && kernel[1] == 3 &&
-                             pad_data[0] <= 1 && pad_data[1] <= 1))) {
-      setConvMethod();  // NEDirectConvolutionLayer only for 1x1 and 3x3
-    }
-  }
-
-  void AclParametersByContext(const ConvParam<DeviceType>& param) {
-    const Tensor* input = param.Input();
-    Tensor filter = *param.Filter();
-    Tensor* output = param.Output();
-
-    int groups = param.Groups();
-    std::vector<int> strides = param.Strides();
-    std::vector<int> paddings = param.Paddings();
-    std::vector<int> dilations = param.Dilations();
-
-    const T* input_data = input->data<T>();
-    T* output_data = output->mutable_data<T>();
-    const T* weight_data = filter.data<T>();
-
-    args.input_data = (void*)input_data;
-    args.output_data = (void*)output_data;
-    args.weight_data = (void*)weight_data;
-    args.biases_data = nullptr;
-
-    // try {
-    //     bias = context.Input<framework::Tensor>("Bias");
-    // } catch (const std::exception& e) {
-    // }
-    // if (bias) {
-    //     const T* biases_data = bias->data<T>();
-    //     args.biases_data = (void*)biases_data;
-    // }
-
-    args.num_group = groups;
-
-    args.dilation_rows = dilations[0];
-    args.dilation_cols = dilations[1];
-    if (dilations[0] != 1 || dilations[1] != 1) {
-      args.dilated = true;
-    }
-
-    // NCHW
-    // std::cout << "In dims: " << (input->dims()).size() << std::endl;
-    args.batch = input->dims()[0];
-    args.in_depth = input->dims()[1];
-    args.in_rows = input->dims()[2];
-    args.in_cols = input->dims()[3];
-    std::cout << "In N: " << args.batch << " C: " << args.in_depth
-              << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
-    // NCHW
-    // std::cout << "Out dims: " << (output->dims()).size() << std::endl;
-    args.out_num = output->dims()[0];
-    args.out_depth = output->dims()[1];
-    args.out_rows = output->dims()[2];
-    args.out_cols = output->dims()[3];
-    // std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
-    //  << " C: " <<  args.out_depth
-    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
-    // MCHW = OIHW
-    args.filter_rows = filter.dims()[2];
-    args.filter_cols = filter.dims()[3];
-    // std::cout <<"Filter O: " << static_cast<int>(filter.dims()[0])
-    //  << " I: " <<  static_cast<int>(filter.dims()[1])
-    //  << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
-
-    // strides(h_stride, w_stride)
-    args.stride_rows = strides[0];
-    args.stride_cols = strides[1];
-    // std::cout <<"Stride H: " << args.stride_rows << " W: " <<
-    // args.stride_cols << "\n";
-
-    // paddings(h_pad, w_pad)
-    args.pad_rows = paddings[0];
-    args.pad_cols = paddings[1];
-    // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
-    // "\n";
-  }
-  acl::AclParameters args;
-};
-
-template <>
-bool ConvKernel<GPU_MALI, float>::Init(ConvParam<GPU_MALI>* param) {
-  AclConvOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    acl_op = new AclConvOp<GPU_MALI, float>();
-    this->SetAclOp((void*)acl_op, (void*)this);
-  }
-  if (acl_op->Bypass_acl(*param)) {
-    std::cout << "init acl failed" << std::endl;
-    return false;
-  }
-  return true;
-}
-
-template <>
-void ConvKernel<GPU_MALI, float>::Compute(const ConvParam<GPU_MALI>& param) {
-  std::cout << "init acl" << std::endl;
-  AclConvOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclConvOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    return;
-  }
-  acl::AclParameters& args = acl_op->getargs();
-  acl_op->RunAcl(args.input_data, args.output_data);
-}
-
-template class ConvKernel<GPU_MALI, float>;
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
-#endif
--- a/src/operators/kernel/mali/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef ELEMENTWISEADD_OP
-
-#pragma once
-
-#include "operators/kernel/elementwise_add_kernel.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename T>
-struct AddFunctor {
-  inline T operator()(T a, T b) const { return a + b; }
-};
-
-template <>
-bool ElementwiseAddKernel<GPU_MALI, float>::Init(
-    ElementwiseAddParam<GPU_MALI> *param) {
-  return true;
-}
-
-template <>
-void ElementwiseAddKernel<GPU_MALI, float>::Compute(
-    const ElementwiseAddParam<GPU_MALI> &param) {
-  const Tensor *input_x = param.InputX();
-  const Tensor *input_y = param.InputY();
-  Tensor *Out = param.Out();
-  Out->mutable_data<float>();
-  int axis = param.Axis();
-  ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
-                                                 AddFunctor<float>(), Out);
-}
-
-template class ElementwiseAddKernel<GPU_MALI, float>;
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
--- a/src/operators/kernel/mali/feed_kernel.cpp
+++ b/src/operators/kernel/mali/feed_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "operators/kernel/feed_kernel.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool FeedKernel<GPU_MALI, float>::Init(FeedParam<GPU_MALI> *param) {
-  return true;
-}
-
-template <>
-void FeedKernel<GPU_MALI, float>::Compute(const FeedParam<GPU_MALI> &param) {
-  param.Out()->ShareDataWith(*(param.InputX()));
-  param.Out()->set_lod(param.InputX()->lod());
-}
-
-template class FeedKernel<GPU_MALI, float>;
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
--- a/src/operators/kernel/mali/fetch_kernel.cpp
+++ b/src/operators/kernel/mali/fetch_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef FUSION_CONVADD_OP
-
-#include "operators/kernel/fetch_kernel.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool FetchKernel<GPU_MALI, float>::Init(FetchParam<GPU_MALI> *param) {
-  return true;
-}
-
-template <>
-void FetchKernel<GPU_MALI, float>::Compute(const FetchParam<GPU_MALI> &param) {
-  param.Out()->ShareDataWith(*(param.InputX()));
-}
-
-template class FetchKernel<GPU_MALI, float>;
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
--- a/src/operators/kernel/mali/fushion_fc_kernel.cpp
+++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_FC_OP
-
-#include "operators/kernel/fusion_fc_kernel.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool FusionFcKernel<GPU_MALI, float>::Init(FusionFcParam<GPU_MALI> *param) {
-  return true;
-}
-
-template <>
-void FusionFcKernel<GPU_MALI, float>::Compute(
-    const FusionFcParam<GPU_MALI> &param) {
-  const Tensor *input_x = param.InputX();
-  const Tensor *input_y = param.InputY();
-  const Tensor *input_z = param.InputZ();
-  auto *input_z_data = input_z->data<float>();
-  int axis = param.Axis();
-  Tensor *out = param.Out();
-  auto *out_data = out->mutable_data<float>();
-  const Tensor x_matrix =
-      input_x->dims().size() > 2
-          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
-          : *input_x;
-  const Tensor y_matrix =
-      input_y->dims().size() > 2
-          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
-          : *input_y;
-  auto out_dim = out->dims();
-  if (out_dim.size() != 2) {
-    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
-  }
-  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
-  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "inpu_z size must be 1");
-  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
-                        " out_dim.size must be 2.");
-  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
-  PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ")
-
-  int64_t classes = input_z->numel();
-  for (int i = 0; i < out_dim[0]; i++) {
-    memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
-  }
-
-  for (int i = 0; i < out->numel(); i++) {
-    DLOG << out_data[i];
-  }
-  math::MatMul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
-                      out, static_cast<float>(1));
-  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
-  //            if (out_dim.size() != 2) {
-  //                out->Resize(out_dim);
-  //            }
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
--- a/src/operators/kernel/mali/lrn_kernel.cpp
+++ b/src/operators/kernel/mali/lrn_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef LRN_OP
-
-#pragma once
-
-#include "operators/kernel/lrn_kernel.h"
-#ifdef PADDLE_MOBILE_MALI_GPU
-#include "acl_operator.h"
-#include "framework/operator.h"
-#include "operators/kernel/central-arm-func/lrn_arm_func.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename DeviceType, typename T>
-class AclLrnOp : public acl::ACLOperator {
- public:
-  AclLrnOp() {
-    this->force_bypass_acl_path_ =
-        bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN;
-  }
-  ~AclLrnOp() = default;
-  AclLrnOp(const AclLrnOp&) = delete;
-  AclLrnOp& operator=(const AclLrnOp&) = delete;
-  AclLrnOp(AclLrnOp&&) = delete;
-  AclLrnOp& operator=(AclLrnOp&&) = delete;
-
-  acl::AclParameters& getargs() { return args; }
-  void InitAclLayer(const LrnParam<DeviceType>& param) {
-    setTargetHint(acl::TargetHint::OPENCL);
-    arm_compute::TensorShape shape(args.in_cols, args.in_rows, args.in_depth);
-
-    if (is_operator_init_done(shape)) return;
-    set_operator_init_done();
-    this->force_bypass_acl_path_ = false;
-
-    arm_compute::NormalizationLayerInfo norm_info(
-        arm_compute::NormType::CROSS_MAP, args.nsize, args.alpha, args.beta,
-        args.knorm);
-
-    //[width, height, IFM]
-    new_tensor(input(), shape, args.input_data);
-    //[width, height, OFM]
-    new_tensor(output(), shape, args.output_data);
-
-    acl_configure(lrn, this, norm_info);
-  }
-
-  void Set_bypass(bool bypass) { args.is_bypass = bypass; }
-
-  void RunAcl(void* input, void* output) {
-    acl::ACLOperator::acl_run(input, output);
-  }
-  bool Bypass_acl(const LrnParam<DeviceType>& param) {
-    bool bypass_acl = false;
-    AclParametersByContext(param);
-    InitAclLayer(param);
-    // for performance, more groups impact GPU performance
-    if (this->force_bypass_acl_path_) {
-      bypass_acl = true;
-    }
-
-    return bypass_acl;
-  }
-
- private:
-  void AclParametersByContext(const LrnParam<DeviceType>& param) {
-    const Tensor* in_x = param.InputX();
-    Tensor* out = param.Out();
-
-    int n = param.N();
-    T alpha = param.Alpha();
-    T beta = param.Beta();
-    T k = param.K();
-
-    const T* input_data = in_x->data<T>();
-    T* output_data = out->mutable_data<T>();
-
-    args.input_data = (void*)input_data;
-    args.output_data = (void*)output_data;
-
-    args.nsize = n;
-    args.alpha = alpha;
-    args.beta = beta;
-    args.knorm = k;
-
-    // NCHW
-    args.batch = in_x->dims()[0];
-    args.in_depth = in_x->dims()[1];
-    args.in_rows = in_x->dims()[2];
-    args.in_cols = in_x->dims()[3];
-    // std::cout
-    //  << "Out C: " <<  args.out_depth
-    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
-  }
-  acl::AclParameters args;
-};
-
-template <>
-bool LrnKernel<GPU_MALI, float>::Init(LrnParam<GPU_MALI>* param) {
-  AclLrnOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    acl_op = new AclLrnOp<GPU_MALI, float>();
-    this->SetAclOp((void*)acl_op, (void*)this);
-  }
-  if (acl_op->Bypass_acl(*param)) {
-    acl_op->Set_bypass(true);
-    std::cout << "init acl failed" << std::endl;
-    return true;
-  }
-  return true;
-}
-
-template <>
-void LrnKernel<GPU_MALI, float>::Compute(const LrnParam<GPU_MALI>& param) {
-  std::cout << "init acl" << std::endl;
-  AclLrnOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclLrnOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    return;
-  }
-  acl::AclParameters& args = acl_op->getargs();
-  if (args.is_bypass) {
-    std::cout << "bypass op" << std::endl;
-    LrnCompute<float>(param);
-    return;
-  }
-  const float* input_data = (const float*)args.input_data;
-  const float* output_data = (const float*)args.output_data;
-  for (int n = 0; n < args.batch; ++n) {
-    acl_op->RunAcl((void*)input_data, (void*)output_data);
-    input_data += args.in_depth * args.in_cols * args.in_rows;
-    output_data += args.in_depth * args.in_cols * args.in_rows;
-  }
-}
-
-template class LrnKernel<GPU_MALI, float>;
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
-#endif
--- a/src/operators/kernel/mali/mul_kernel.cpp
+++ b/src/operators/kernel/mali/mul_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef MUL_OP
-
-#pragma once
-
-#include "operators/kernel/mul_kernel.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool MulKernel<GPU_MALI, float>::Init(MulParam<GPU_MALI> *param) {
-  return true;
-}
-
-template <>
-void MulKernel<GPU_MALI, float>::Compute(const MulParam<GPU_MALI> &param) {
-  const Tensor *input_x = param.InputX();
-  const Tensor *input_y = param.InputY();
-  Tensor *out = param.Out();
-  out->mutable_data<float>();
-  const Tensor x_matrix =
-      input_x->dims().size() > 2
-          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
-          : *input_x;
-  const Tensor y_matrix =
-      input_y->dims().size() > 2
-          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
-          : *input_y;
-  auto out_dim = out->dims();
-  if (out_dim.size() != 2) {
-    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
-  }
-  math::MatMul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
-                      out, static_cast<float>(0));
-  if (out_dim.size() != 2) {
-    out->Resize(out_dim);
-  }
-}
-
-template class MulKernel<GPU_MALI, float>;
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
--- a/src/operators/kernel/mali/pool_kernel.cpp
+++ b/src/operators/kernel/mali/pool_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef POOL_OP
-
-#pragma once
-
-#include "operators/kernel/pool_kernel.h"
-#ifdef PADDLE_MOBILE_MALI_GPU
-#include "acl_operator.h"
-#include "framework/operator.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename DeviceType, typename T>
-class AclPoolOp : public acl::ACLOperator {
- public:
-  AclPoolOp() {
-    this->force_bypass_acl_path_ =
-        bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING;
-  }
-  ~AclPoolOp() = default;
-  AclPoolOp(const AclPoolOp&) = delete;
-  AclPoolOp& operator=(const AclPoolOp&) = delete;
-  AclPoolOp(AclPoolOp&&) = delete;
-  AclPoolOp& operator=(AclPoolOp&&) = delete;
-
-  acl::AclParameters& getargs() { return args; }
-  void InitAclLayer(const PoolParam<DeviceType>& param) {
-    setTargetHint(acl::TargetHint::OPENCL);
-    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
-                                         args.in_depth);
-    arm_compute::TensorShape output_shape(args.out_cols, args.out_rows,
-                                          args.out_depth);
-    // arm_compute::TensorShape weights_shape(
-    // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
-    // arm_compute::TensorShape biases_shape(args.out_depth);
-    arm_compute::PoolingLayerInfo pool_info;
-
-    if (args.pool_type == "max") {
-      pool_info = arm_compute::PoolingLayerInfo(
-          arm_compute::PoolingType::MAX, args.filter_rows,
-          arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
-                                     args.pad_cols, args.pad_rows,
-                                     arm_compute::DimensionRoundingType::CEIL));
-    } else {
-      pool_info = arm_compute::PoolingLayerInfo(
-          arm_compute::PoolingType::AVG, args.filter_rows,
-          arm_compute::PadStrideInfo(args.stride_cols, args.stride_rows,
-                                     args.pad_cols, args.pad_rows,
-                                     arm_compute::DimensionRoundingType::CEIL));
-    }
-
-    if (is_operator_init_done(input_shape)) return;
-    set_operator_init_done();
-    this->force_bypass_acl_path_ = false;
-
-    //[width, height, IFM]
-    new_tensor(input(), input_shape, args.input_data);
-    //[width, height, OFM]
-    new_tensor(output(), output_shape, args.output_data);
-
-    acl_configure(pooling, this, pool_info);
-  }
-
-  void RunAcl(void* input, void* output) {
-    acl::ACLOperator::acl_run(input, output);
-  }
-  bool Bypass_acl(const PoolParam<DeviceType>& param) {
-    bool bypass_acl = false;
-    AclParametersByContext(param);
-    InitAclLayer(param);
-    // for performance, more groups impact GPU performance
-    if (this->force_bypass_acl_path_) {
-      bypass_acl = true;
-    }
-    if (args.pool_type != "max" && args.pool_type != "avg") {
-      bypass_acl = true;
-    }
-    if (args.filter_rows != args.filter_cols) {
-      bypass_acl = true;
-    }
-    // if (args.filter_rows!=2 && args.filter_rows!=3) {
-    //     bypass_acl = true;
-    // }
-    return bypass_acl;
-  }
-
- private:
-  void AclParametersByContext(const PoolParam<DeviceType>& param) {
-    const Tensor* in_x = param.Input();
-    Tensor* out = param.Output();
-    std::string pooling_type = param.PoolingType();
-
-    std::vector<int> ksize = param.Ksize();
-
-    std::vector<int> strides = param.Strides();
-
-    std::vector<int> paddings = param.Paddings();
-
-    bool is_global_pooling = param.isGlobalPooling();
-
-    const T* input_data = in_x->data<T>();
-    T* output_data = out->mutable_data<T>();
-
-    args.input_data = (void*)input_data;
-    args.output_data = (void*)output_data;
-
-    args.is_global_pool = is_global_pooling;
-    args.pool_type = pooling_type;
-
-    args.filter_rows = ksize[0];
-    args.filter_cols = ksize[1];
-    args.dim = ksize.size();
-
-    // NCHW
-    args.batch = in_x->dims()[0];
-    args.in_depth = in_x->dims()[1];
-    args.in_rows = in_x->dims()[2];
-    args.in_cols = in_x->dims()[3];
-    // std::cout <<"In N: " << args.batch << " C: " <<  args.in_depth
-    //  << " H: " << args.in_rows << " W: " << args.in_cols << "\n";
-    // NCHW
-    // std::cout <<"Out N: " << static_cast<int>(output->dims()[0])
-    //  << " C: " <<  args.out_depth
-    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
-    // MCHW = OIHW
-    // std::cout <<"Filter O: " << static_cast<int>(filter->dims()[0])
-    //  << " I: " <<  static_cast<int>(filter->dims()[1])
-    //  << " H: " << args.filter_rows << " W: " << args.filter_cols << "\n";
-
-    // strides(h_stride, w_stride)
-    args.stride_rows = strides[0];
-    args.stride_cols = strides[1];
-    // std::cout <<"PoolingType: " << args.pool_type << "\n";
-    // std::cout <<"Stride H: " << args.stride_rows << " W: " <<
-    // args.stride_cols << "\n";
-
-    // paddings(h_pad, w_pad)
-    args.pad_rows = paddings[0];
-    args.pad_cols = paddings[1];
-    // std::cout <<"Pad H: " << args.pad_rows << " W: " << args.pad_cols <<
-    // "\n";
-
-    args.out_depth = args.in_depth;
-    // args.out_rows = out->dims()[2];
-    // args.out_cols = out->dims()[3];
-    args.out_rows = static_cast<int>(ceil(static_cast<float>(args.in_rows +
-                                                             2 * args.pad_rows -
-                                                             args.filter_rows) /
-                                          args.stride_rows)) +
-                    1;
-    args.out_cols = static_cast<int>(ceil(static_cast<float>(args.in_cols +
-                                                             2 * args.pad_cols -
-                                                             args.filter_cols) /
-                                          args.stride_cols)) +
-                    1;
-
-    if (is_global_pooling) {
-      args.filter_rows = args.in_rows;
-      args.filter_cols = args.in_cols;
-      args.pad_rows = 0;
-      args.pad_cols = 0;
-    }
-  }
-  acl::AclParameters args;
-};
-
-template <>
-bool PoolKernel<GPU_MALI, float>::Init(PoolParam<GPU_MALI>* param) {
-  AclPoolOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    acl_op = new AclPoolOp<GPU_MALI, float>();
-    this->SetAclOp((void*)acl_op, (void*)this);
-  }
-  if (acl_op->Bypass_acl(*param)) {
-    std::cout << "init acl failed" << std::endl;
-    return false;
-  }
-  return true;
-}
-
-template <>
-void PoolKernel<GPU_MALI, float>::Compute(const PoolParam<GPU_MALI>& param) {
-  std::cout << "init acl" << std::endl;
-  AclPoolOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclPoolOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    return;
-  }
-  acl::AclParameters& args = acl_op->getargs();
-  const float* input_data = (const float*)args.input_data;
-  const float* output_data = (const float*)args.output_data;
-  for (int n = 0; n < args.batch; ++n) {
-    acl_op->RunAcl((void*)input_data, (void*)output_data);
-    input_data += args.in_depth * args.in_cols * args.in_rows;
-    output_data += args.in_depth * args.out_cols * args.out_rows;
-  }
-}
-
-template class PoolKernel<GPU_MALI, float>;
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
-#endif
--- a/src/operators/kernel/mali/relu_kernel.cpp
+++ b/src/operators/kernel/mali/relu_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef RELU_OP
-
-#pragma once
-
-#include "operators/kernel/relu_kernel.h"
-#ifdef PADDLE_MOBILE_MALI_GPU
-#include "acl_operator.h"
-#include "framework/operator.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename DeviceType, typename T>
-class AclReluOp : public acl::ACLOperator {
- public:
-  AclReluOp() {
-    this->force_bypass_acl_path_ =
-        bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU;
-  }
-  ~AclReluOp() = default;
-  AclReluOp(const AclReluOp&) = delete;
-  AclReluOp& operator=(const AclReluOp&) = delete;
-  AclReluOp(AclReluOp&&) = delete;
-  AclReluOp& operator=(AclReluOp&&) = delete;
-
-  acl::AclParameters& getargs() { return args; }
-  void InitAclLayer(const ReluParam<DeviceType>& param) {
-    setTargetHint(acl::TargetHint::OPENCL);
-    arm_compute::TensorShape input_shape(args.in_cols, args.in_rows,
-                                         args.in_depth, args.batch);
-    arm_compute::TensorShape output_shape(args.in_cols, args.in_rows,
-                                          args.in_depth, args.out_num);
-    // arm_compute::TensorShape weights_shape(
-    // args.filter_cols, args.filter_rows, args.in_depth, args.out_depth);
-    // arm_compute::TensorShape biases_shape(args.out_depth);
-    arm_compute::ActivationLayerInfo::ActivationFunction type;
-    type = arm_compute::ActivationLayerInfo::ActivationFunction::RELU;
-
-    arm_compute::ActivationLayerInfo act_info(type);
-
-    if (is_operator_init_done(input_shape)) return;
-    set_operator_init_done();
-    this->force_bypass_acl_path_ = false;
-
-    //[width, height, IFM]
-    new_tensor(input(), input_shape, args.input_data);
-    //[width, height, OFM]
-    new_tensor(output(), output_shape, args.output_data);
-
-    acl_configure(activation, this, act_info);
-  }
-
-  void RunAcl(void* input, void* output) {
-    acl::ACLOperator::acl_run(input, output);
-  }
-  bool Bypass_acl(const ReluParam<DeviceType>& param) {
-    bool bypass_acl = false;
-    AclParametersByContext(param);
-    InitAclLayer(param);
-    // for performance, more groups impact GPU performance
-    if (this->force_bypass_acl_path_) {
-      bypass_acl = true;
-    }
-    return bypass_acl;
-  }
-
- private:
-  void AclParametersByContext(const ReluParam<DeviceType>& param) {
-    const auto* input_x = param.InputX();
-    auto* out = param.Out();
-
-    const T* input_data = input_x->data<T>();
-    T* output_data = out->mutable_data<T>();
-
-    args.input_data = (void*)input_data;
-    args.output_data = (void*)output_data;
-
-    args.batch = input_x->dims()[0];
-    args.in_depth = input_x->dims()[1];
-    args.in_rows = input_x->dims()[2];
-    args.in_cols = input_x->dims()[3];
-    args.out_num = out->dims()[0];
-  }
-  acl::AclParameters args;
-};
-
-template <>
-bool ReluKernel<GPU_MALI, float>::Init(ReluParam<GPU_MALI>* param) {
-  AclReluOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    acl_op = new AclReluOp<GPU_MALI, float>();
-    this->SetAclOp((void*)acl_op, (void*)this);
-  }
-  if (acl_op->Bypass_acl(*param)) {
-    std::cout << "init acl failed" << std::endl;
-    return false;
-  }
-  return true;
-}
-
-template <>
-void ReluKernel<GPU_MALI, float>::Compute(const ReluParam<GPU_MALI>& param) {
-  std::cout << "init acl" << std::endl;
-  AclReluOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclReluOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    return;
-  }
-  acl::AclParameters& args = acl_op->getargs();
-  acl_op->RunAcl(args.input_data, args.output_data);
-}
-
-template class ReluKernel<GPU_MALI, float>;
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
-#endif
--- a/src/operators/kernel/mali/reshape_kernel.cpp
+++ b/src/operators/kernel/mali/reshape_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef RESHAPE_OP
-
-#pragma once
-
-#include "operators/kernel/reshape_kernel.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool ReshapeKernel<GPU_MALI, float>::Init(ReshapeParam<GPU_MALI> *param) {
-  return true;
-}
-
-template <>
-void ReshapeKernel<GPU_MALI, float>::Compute(
-    const ReshapeParam<GPU_MALI> &param) {
-  const auto *input_x = param.InputX();
-  const auto &input_x_dims = input_x->dims();
-  auto *out = param.Out();
-  framework::DDim out_dims = out->dims();
-  const auto *input_shape = param.InputShape();
-
-  if (input_shape) {
-    auto *shape_data = input_shape->data<int>();
-    framework::Tensor cpu_shape_tensor;
-    auto shape =
-        std::vector<int>(shape_data, shape_data + input_shape->numel());
-    out_dims = ValidateShape(shape, input_x->dims());
-  }
-
-  bool inplace = param.Inplace();
-  out->Resize(out_dims);
-  if (!inplace) {
-    out->mutable_data<float>();
-    framework::TensorCopy(*input_x, out);
-    out->Resize(out_dims);
-  } else {
-    out->ShareDataWith(*input_x);
-    out->Resize(out_dims);
-  }
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
--- a/src/operators/kernel/mali/softmax_kernel.cpp
+++ b/src/operators/kernel/mali/softmax_kernel.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef SOFTMAX_OP
-
-#pragma once
-
-#include "operators/kernel/softmax_kernel.h"
-#ifdef PADDLE_MOBILE_MALI_GPU
-#include "acl_operator.h"
-#include "framework/operator.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename DeviceType, typename T>
-class AclSoftmaxOp : public acl::ACLOperator {
- public:
-  AclSoftmaxOp() {
-    this->force_bypass_acl_path_ =
-        bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX;
-  }
-  ~AclSoftmaxOp() = default;
-  AclSoftmaxOp(const AclSoftmaxOp&) = delete;
-  AclSoftmaxOp& operator=(const AclSoftmaxOp&) = delete;
-  AclSoftmaxOp(AclSoftmaxOp&&) = delete;
-  AclSoftmaxOp& operator=(AclSoftmaxOp&&) = delete;
-
-  acl::AclParameters& getargs() { return args; }
-  void InitAclLayer(const SoftmaxParam<DeviceType>& param) {
-    setTargetHint(acl::TargetHint::OPENCL);
-    arm_compute::TensorShape shape(args.in_depth, args.batch);
-
-    if (is_operator_init_done(shape)) return;
-    set_operator_init_done();
-    this->force_bypass_acl_path_ = false;
-
-    //[width, height, IFM]
-    new_tensor(input(), shape, args.input_data);
-    //[width, height, OFM]
-    new_tensor(output(), shape, args.output_data);
-
-    acl_configure(softmax, this, NULL);
-  }
-
-  void RunAcl(void* input, void* output) {
-    acl::ACLOperator::acl_run(input, output);
-  }
-  bool Bypass_acl(const SoftmaxParam<DeviceType>& param) {
-    bool bypass_acl = false;
-    AclParametersByContext(param);
-    InitAclLayer(param);
-    // for performance, more groups impact GPU performance
-    if (this->force_bypass_acl_path_) {
-      bypass_acl = true;
-    }
-
-    return bypass_acl;
-  }
-
- private:
-  void AclParametersByContext(const SoftmaxParam<DeviceType>& param) {
-    const framework::Tensor* in_x = param.InputX();
-    framework::Tensor* out = param.Out();
-    auto x_dims = in_x->dims();
-    out->Resize(x_dims);
-
-    const T* input_data = in_x->data<T>();
-    T* output_data = out->data<T>();
-
-    args.input_data = (void*)input_data;
-    args.output_data = (void*)output_data;
-
-    // NCHW
-    args.batch = in_x->dims()[0];
-    args.in_depth = in_x->dims()[1];
-
-    args.out_num = out->dims()[0];
-
-    // std::cout
-    //  << "Out C: " <<  args.out_depth
-    //  << " H: " << args.out_rows << " W: " << args.out_cols << "\n";
-  }
-  acl::AclParameters args;
-};
-
-template <>
-bool SoftmaxKernel<GPU_MALI, float>::Init(SoftmaxParam<GPU_MALI>* param) {
-  AclSoftmaxOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    acl_op = new AclSoftmaxOp<GPU_MALI, float>();
-    this->SetAclOp((void*)acl_op, (void*)this);
-  }
-  if (acl_op->Bypass_acl(*param)) {
-    std::cout << "init acl failed" << std::endl;
-    return false;
-  }
-  return true;
-}
-
-template <>
-void SoftmaxKernel<GPU_MALI, float>::Compute(
-    const SoftmaxParam<GPU_MALI>& param) {
-  std::cout << "init acl" << std::endl;
-  AclSoftmaxOp<GPU_MALI, float>* acl_op =
-      reinterpret_cast<AclSoftmaxOp<GPU_MALI, float>*>(this->GetAclOp());
-  if (acl_op == nullptr) {
-    return;
-  }
-  acl::AclParameters& args = acl_op->getargs();
-  const float* input_data = (const float*)args.input_data;
-  const float* output_data = (const float*)args.output_data;
-
-  for (int n = 0; n < args.out_num; ++n) {
-    acl_op->RunAcl((void*)input_data, (void*)output_data);
-    input_data += args.in_depth;
-    output_data += args.in_depth;
-  }
-}
-
-template class SoftmaxKernel<GPU_MALI, float>;
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
-#endif
--- a/src/operators/lookup_op.cpp
+++ b/src/operators/lookup_op.cpp
@@ -59,8 +59,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(lookup_table, ops::LookupOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/lrn_op.cpp
+++ b/src/operators/lrn_op.cpp
@@ -35,8 +35,5 @@ REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(lrn, ops::LrnOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
-#endif

 #endif
--- a/src/operators/mul_op.cpp
+++ b/src/operators/mul_op.cpp
@@ -61,9 +61,6 @@ REGISTER_OPERATOR_CPU(mul, ops::MulOp);
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(mul, ops::MulOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(mul, ops::MulOp);
 #endif

--- a/src/operators/norm_op.cpp
+++ b/src/operators/norm_op.cpp
@@ -41,8 +41,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(norm, ops::NormOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/pool_op.cpp
+++ b/src/operators/pool_op.cpp
@@ -63,9 +63,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(pool2d, ops::PoolOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(pool2d, ops::PoolOp);
 #endif

--- a/src/operators/prelu_op.cpp
+++ b/src/operators/prelu_op.cpp
@@ -36,8 +36,5 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(prelu, ops::PReluOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(prelu, ops::PReluOp);
-#endif

 #endif
--- a/src/operators/prior_box_op.cpp
+++ b/src/operators/prior_box_op.cpp
@@ -52,8 +52,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(prior_box, ops::PriorBoxOp);
 #endif

--- a/src/operators/reshape2_op.cpp
+++ b/src/operators/reshape2_op.cpp
@@ -40,9 +40,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(reshape2, ops::Reshape2Op);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(reshape2, ops::Reshape2Op);
 #endif

--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
@@ -35,9 +35,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(reshape, ops::ReshapeOp);
 #endif

--- a/src/operators/resize_op.cpp
+++ b/src/operators/resize_op.cpp
@@ -32,8 +32,5 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(resize, ops::ResizeOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(resize, ops::ResizeOp);
-#endif

 #endif
--- a/src/operators/scale_op.cpp
+++ b/src/operators/scale_op.cpp
@@ -32,8 +32,5 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(scale, ops::ScaleOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(scale, ops::ScaleOp);
-#endif

 #endif
--- a/src/operators/shape_op.cpp
+++ b/src/operators/shape_op.cpp
@@ -34,7 +34,5 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(shape, ops::ShapeOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif

 #endif
--- a/src/operators/slice_op.cpp
+++ b/src/operators/slice_op.cpp
@@ -31,9 +31,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(slice, ops::SliceOp);
 #endif

--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -31,9 +31,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp);
 #endif

--- a/src/operators/sum_op.cpp
+++ b/src/operators/sum_op.cpp
@@ -61,9 +61,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(sum, ops::SumOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(sum, ops::ConcatOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif