add conv_add_relu op

d4c0395f · liuruilong · e691f022 · d4c0395f · d4c0395f · d4c0395f
27 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,9 +12,13 @@ option(FPGA "fpga" OFF)

 if (CPU)
    add_definitions(-DPADDLE_MOBILE_CPU)
-elseif (MALI_GPU)
+endif()
+
+if (MALI_GPU)
    add_definitions(-DPADDLE_MOBILE_MALI_GPU)
-elseif(FPGA)
+endif()
+
+if(FPGA)
    add_definitions(-DPADDLE_MOBILE_FPGA)
 endif()

@@ -94,6 +98,7 @@ if (googlenet)
    add_definitions(-DPOOL_OP)
    add_definitions(-DRELU_OP)
    add_definitions(-DFUSION_CONVADD_OP)
+    add_definitions(-DFUSION_CONVADD_RELU_OP)
 elseif (mobilenet)
    add_definitions(-DCONV_OP)
    add_definitions(-DELEMENTWISEADD_OP)
@@ -145,6 +150,7 @@ else ()
    add_definitions(-DSIGMOID_OP)
    add_definitions(-DSOFTMAX_OP)
    add_definitions(-DTRANSPOSE_OP)
+    add_definitions(-DFUSION_CONVADD_RELU_OP)
 endif()

 if (IS_IOS)

--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -63,6 +63,7 @@ std::unordered_map<
        {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
        {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
        {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
-        {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}};
+        {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
+        {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}};

 }  // namespace paddle_mobile
--- a/src/operators/conv_op.cpp
+++ b/src/operators/conv_op.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "operators/conv_op.h"
 #include <vector>
 #include "framework/op_proto_maker.h"
+#include "operators/math/conv_func.h"
 #include "framework/op_registry.h"

 namespace paddle_mobile {
@@ -38,7 +39,7 @@ void ConvOp<Dtype, T>::InferShape() const {

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
                                          dilations[i], paddings[i],
                                          strides[i]));
  }

--- a/src/operators/conv_op.h
+++ b/src/operators/conv_op.h
@@ -43,13 +43,6 @@ class ConvOp
 private:
 };

-inline int ConvOutputSize(int input_size, int filter_size, int dilation,
-                          int padding, int stride) {
-  const int dkernel = dilation * (filter_size - 1) + 1;
-  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
-  return output_size;
-}
-
 }  // namespace operators
 }  // namespace paddle_mobile


--- a/src/operators/depthwise_conv_op.cpp
+++ b/src/operators/depthwise_conv_op.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "operators/depthwise_conv_op.h"
 #include <vector>
 #include "framework/op_proto_maker.h"
+#include "operators/math/conv_func.h"
 #include "framework/op_registry.h"
 #include "operators/conv_op.h"

@@ -39,7 +40,7 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
                                          dilations[i], paddings[i],
                                          strides[i]));
  }

--- a/src/operators/fusion_conv_add.cpp
+++ b/src/operators/fusion_conv_add.cpp
@@ -14,6 +14,7 @@ limitations under the License. */

 #ifdef FUSION_CONVADD_OP

+#include "operators/math/conv_func.h"
 #include "operators/fusion_conv_add.h"

 namespace paddle_mobile {
@@ -35,7 +36,7 @@ void FushionConvAddOp<Dtype, T>::InferShape() const {

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
                                          dilations[i], paddings[i],
                                          strides[i]));
  }

--- a/src/operators/fusion_conv_add.h
+++ b/src/operators/fusion_conv_add.h
@@ -18,10 +18,10 @@ limitations under the License. */

 #include <string>
 #include <vector>
-#include "framework/operator.h"
-#include "framework/program/program-optimize/fusion_op_register.h"
 #include "op_param.h"
+#include "framework/operator.h"
 #include "operators/kernel/conv_add_kernel.h"
+#include "framework/program/program-optimize/fusion_op_register.h"

 namespace paddle_mobile {
 namespace operators {
@@ -67,13 +67,6 @@ class FushionConvAddOp : public framework::OperatorWithKernel<
 protected:
 };

-inline int ConvOutputSize(int input_size, int filter_size, int dilation,
-                          int padding, int stride) {
-  const int dkernel = dilation * (filter_size - 1) + 1;
-  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
-  return output_size;
-}
-
 #ifdef PADDLE_MOBILE_CPU
 static framework::FusionOpRegistrar convadd_registrar(
    new FusionConvAddMatcher());

--- a/src/operators/fusion_conv_add_relu_op.cpp
+++ b/src/operators/fusion_conv_add_relu_op.cpp
@@ -15,5 +15,46 @@ limitations under the License. */
 #ifdef CONVADDRELU_OP

 #include "fusion_conv_add_relu_op.h"
+#include "operators/math/conv_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+/**
+ * Computes the output tensor shape of the fused conv + add + relu operator
+ * and resizes the output tensor accordingly.
+ *
+ * The result is {in_dims[0], filter_dims[0], spatial...}, where every spatial
+ * extent is obtained from math::ConvOutputSize using the matching stride,
+ * padding and dilation. Input and filter are indexed from dimension 2 onward
+ * for the spatial extents.
+ */
+template <typename Dtype, typename T>
+void FusionConvAddReluOp<Dtype, T>::InferShape() const {
+  const auto in_dims = this->param_.Input()->dims();
+  const auto filter_dims = this->param_.Filter()->dims();
+  // The attribute vectors are only read here, so bind them by const reference
+  // instead of copying them.
+  const std::vector<int> &strides = this->param_.Strides();
+  const std::vector<int> &paddings = this->param_.Paddings();
+  const std::vector<int> &dilations = this->param_.Dilations();
+
+  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
+                         dilations.size() == paddings.size() &&
+                         paddings.size() == strides.size()),
+                        "ConvParam is not suitable");
+
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  output_shape.reserve(strides.size() + 2);
+  for (size_t i = 0; i < strides.size(); ++i) {
+    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
+  }
+  this->param_.Output()->Resize(framework::make_ddim(output_shape));
+}
+
+}
+}
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(fusion_conv_add_relu);
+REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif

 #endif
--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
@@ -17,6 +17,8 @@ limitations under the License. */
 #pragma once

 #include "framework/operator.h"
+#include "operators/op_param.h"
+#include "operators/kernel/conv_add_relu_kernel.h"
 #include "framework/program/program-optimize/fusion_op_register.h"

 namespace paddle_mobile {
@@ -33,22 +35,34 @@ class FushionConvAddReluOpMatcher : public framework::FusionOpMatcher {
  void FolderNodes(
      framework::Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
-    std::vector<std::shared_ptr<framework::OpDesc>> origin_descs =
-        node->OpDescs(node_.Depth());
    node->Folder(node_.Depth(), Type(),
-                 {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Z"}}}, removed_nodes);
+                 {{G_OP_TYPE_ELEMENTWISE_ADD, {"Y", "Y"}}}, removed_nodes);
  }
  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_RELU; }
 };

-class ConvAddReluOp {
+template <typename DeviceType, typename T>
+class FusionConvAddReluOp: public framework::OperatorWithKernel<
+        DeviceType, FushionConvAddReluParam,
+        operators::ConvAddReluKernel<DeviceType, T>> {
 public:
- private:
+  FusionConvAddReluOp(const string &type, const VariableNameMap &inputs,
+                   const VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs,
+                   std::shared_ptr<framework::Scope> scope)
+          : framework::OperatorWithKernel<DeviceType, FushionConvAddReluParam,
+          operators::ConvAddReluKernel<DeviceType, T>>(
+          type, inputs, outputs, attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+          DeviceType, FushionConvAddReluParam,
+          operators::ConvAddReluKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+ protected:
 };

 #ifdef PADDLE_MOBILE_CPU
-// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
-//        new FushionConvAddReluOpMatcher());
+//static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(new FushionConvAddReluOpMatcher());
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
 #endif

--- a/src/operators/fusion_fc_op.h
+++ b/src/operators/fusion_fc_op.h
@@ -70,7 +70,7 @@ class FushionFcOp : public framework::OperatorWithKernel<
 static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
 #endif
 #ifdef PADDLE_MOBILE_MALI_GPU
-static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
+//static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
 #endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif

--- a/src/operators/kernel/arm/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_kernel.cpp
@@ -26,7 +26,7 @@ void ConvAddKernel<CPU, float>::Compute(
  Tensor bias = *param.Bias();
  int axis = param.Axis();
  Tensor *output = param.Output();
-  expand_bias(bias, axis, output->dims());
+  math::expand_bias(bias, axis, output->dims());
  output->ShareDataWith(bias);
  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
@@ -50,7 +50,7 @@ void ConvAddKernel<CPU, float>::Compute(
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, data_dim + 1);

-  bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
+  bool is_expand = math::IsExpand(filter_shape_vec, strides, paddings, dilations);
  Tensor col;
  Tensor col_matrix;
  if (is_expand) {

--- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_relu_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVADD_RELU_OP
+
+#include "operators/kernel/conv_add_relu_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+/**
+ * CPU/float implementation of the fused convolution + bias add + ReLU kernel.
+ *
+ * The bias is first broadcast to the output shape and the output tensor is
+ * made to share that buffer. Each (batch, group) slice is then unfolded with
+ * im2col / vol2col when required and multiplied with the matching filter
+ * slice through math::matmul, writing into the bias-initialised output.
+ */
+template <>
+void ConvAddReluKernel<CPU, float>::Compute(
+        const FushionConvAddReluParam &param) const {
+  const Tensor *in_tensor = param.Input();
+  Tensor weights = *param.Filter();
+  Tensor expanded_bias = *param.Bias();
+  Tensor *out_tensor = param.Output();
+
+  // Broadcast the bias to the output shape, then let the output alias it.
+  // The order of these two statements matters.
+  math::expand_bias(expanded_bias, param.Axis(), out_tensor->dims());
+  out_tensor->ShareDataWith(expanded_bias);
+
+  int group_count = param.Groups();
+  std::vector<int> strides = param.Strides();
+  std::vector<int> paddings = param.Paddings();
+  std::vector<int> dilations = param.Dilations();
+
+  const int num_batches = static_cast<int>(in_tensor->dims()[0]);
+
+  std::vector<int64_t> filter_dims_vec(framework::vectorize(weights.dims()));
+  std::vector<int64_t> out_dims_vec(framework::vectorize(out_tensor->dims()));
+
+  // Number of spatial dimensions (filter rank minus the two leading dims).
+  const size_t data_dim = filter_dims_vec.size() - 2;
+
+  // Column buffer shape: {in_channels / groups, filter spatial..., output
+  // spatial...}.
+  std::vector<int64_t> col_dims_vec(1 + 2 * data_dim);
+  col_dims_vec[0] = in_tensor->dims()[1] / group_count;
+  for (size_t d = 0; d < data_dim; ++d) {
+    col_dims_vec[1 + d] = filter_dims_vec[2 + d];
+    col_dims_vec[1 + data_dim + d] = out_dims_vec[2 + d];
+  }
+  framework::DDim col_shape(framework::make_ddim(col_dims_vec));
+  framework::DDim col_matrix_shape =
+          framework::flatten_to_2d(col_shape, data_dim + 1);
+
+  // When IsExpand reports false the input slice is used directly as the
+  // column matrix and no scratch buffer is allocated.
+  const bool needs_unfold =
+          math::IsExpand(filter_dims_vec, strides, paddings, dilations);
+  Tensor col;
+  Tensor col_matrix;
+  if (needs_unfold) {
+    col.mutable_data<float>(col_shape);
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+  }
+
+  // Per-sample input shape: the input dims without the leading batch dim.
+  framework::DDim input_shape = framework::slice_ddim(
+          in_tensor->dims(), 1, static_cast<int>(in_tensor->dims().size()));
+
+  // View the filter as a 2-D matrix {out_channels, everything else}.
+  framework::DDim filter_matrix_shape = {weights.dims()[0],
+                                         weights.numel() / weights.dims()[0]};
+  weights.Resize(filter_matrix_shape);
+
+  // View each output sample as {out_channels, remaining elements}.
+  framework::DDim output_matrix_shape = {
+          out_tensor->dims()[1],
+          out_tensor->numel() /
+                  (out_tensor->dims()[0] * out_tensor->dims()[1])};
+
+  // Channels handled by one group on the input / output side.
+  const int in_channels_per_group =
+          static_cast<int>(in_tensor->dims()[1]) / group_count;
+  const int out_channels_per_group =
+          static_cast<int>(out_tensor->dims()[1]) / group_count;
+
+  math::Vol2ColFunctor<CPU, float> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+
+  for (int batch = 0; batch < num_batches; ++batch) {
+    Tensor in_batch = in_tensor->Slice(batch, batch + 1).Resize(input_shape);
+    Tensor out_batch =
+            out_tensor->Slice(batch, batch + 1).Resize(output_matrix_shape);
+
+    for (int group = 0; group < group_count; ++group) {
+      const int in_begin = group * in_channels_per_group;
+      const int out_begin = group * out_channels_per_group;
+      Tensor in_slice =
+              in_batch.Slice(in_begin, in_begin + in_channels_per_group);
+
+      if (needs_unfold) {
+        if (data_dim == 2U) {
+          // 2-D convolution: im2col
+          im2col(in_slice, dilations, strides,
+                 std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                  paddings[1]},
+                 &col);
+        } else if (data_dim == 3U) {
+          // 3-D convolution: vol2col
+          vol2col(in_slice, dilations, strides, paddings, &col);
+        }
+      } else {
+        col.ShareDataWith(in_slice);
+        col_matrix.ShareDataWith(col);
+        col_matrix.Resize(col_matrix_shape);
+      }
+
+      // GEMM into the bias-initialised output slice. The trailing arguments
+      // are presumably (alpha, out, beta, relu) - confirm against
+      // math::matmul.
+      Tensor out_slice =
+              out_batch.Slice(out_begin, out_begin + out_channels_per_group);
+      Tensor filter_slice =
+              weights.Slice(out_begin, out_begin + out_channels_per_group);
+      math::matmul<float>(filter_slice, false, col_matrix, false,
+                          static_cast<float>(1), &out_slice,
+                          static_cast<float>(1), true);
+    }
+  }
+}
+template class ConvAddReluKernel<CPU, float>;
+
+}
+}
+
+#endif
--- a/src/operators/kernel/conv_add_kernel.h
+++ b/src/operators/kernel/conv_add_kernel.h
@@ -21,11 +21,12 @@ limitations under the License. */
 #include <arm_neon.h>
 #endif
 #include "framework/ddim.h"
+#include "operators/op_param.h"
 #include "framework/operator.h"
 #include "operators/math/im2col.h"
-#include "operators/math/math_function.h"
 #include "operators/math/vol2col.h"
-#include "operators/op_param.h"
+#include "operators/math/conv_func.h"
+#include "operators/math/math_function.h"

 namespace paddle_mobile {
 namespace operators {
@@ -33,75 +34,13 @@ namespace operators {
 using framework::DDim;
 using framework::OpKernelBase;

+
 template <typename DeviceType, typename T>
 class ConvAddKernel : public OpKernelBase<DeviceType, FushionConvAddParam> {
 public:
  void Compute(const FushionConvAddParam &param) const;
 };

-inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
-  auto bias_ptr = bias.data<float>();
-  const DDim bias_ddim = bias.dims();
-  PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1,
-                        "the bias tensor's dims size != 1")
-  DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1);
-  DDim inner_ddim =
-      paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size());
-  int outer_size = paddle_mobile::framework::product(outer_ddim);
-  int inner_size = paddle_mobile::framework::product(inner_ddim);
-  bias.Resize(dDim);
-  auto new_ptr = bias.mutable_data<float>();
-  int axis_size = dDim[axis];
-
-#if __ARM_NEON
-  for (int i = 0; i < outer_size; ++i) {
-    int inner_num = inner_size >> 4;
-    int remain = inner_size - (inner_num << 4);
-    float v_bias = bias_ptr[i * axis_size / outer_size];
-    for (; inner_num > 0; inner_num--) {
-      float32x4_t v_newptr1 = vdupq_n_f32(v_bias);
-      float32x4_t v_newptr2 = vdupq_n_f32(v_bias);
-      float32x4_t v_newptr3 = vdupq_n_f32(v_bias);
-      float32x4_t v_newptr4 = vdupq_n_f32(v_bias);
-      vst1q_f32(new_ptr, v_newptr1);
-      new_ptr += 4;
-      vst1q_f32(new_ptr, v_newptr2);
-      new_ptr += 4;
-      vst1q_f32(new_ptr, v_newptr3);
-      new_ptr += 4;
-      vst1q_f32(new_ptr, v_newptr4);
-      new_ptr += 4;
-    }
-    for (; remain > 0; remain--) {
-      *new_ptr = v_bias;
-      new_ptr++;
-    }
-  }
-#else
-  for (int i = 0; i < outer_size; ++i) {
-    float v_bias = bias_ptr[i * axis_size / outer_size];
-    for (int j = 0; j < inner_size; ++j) {
-      new_ptr[i * inner_size + j] = v_bias;
-    }
-  }
-#endif
-}
-
-inline bool IsExpand(const std::vector<int64_t> &filter_dim,
-                     const std::vector<int> &strides,
-                     const std::vector<int> &paddings,
-                     const std::vector<int> &dilations) {
-  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
-  for (size_t j = 0; j < strides.size(); ++j) {
-    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
-    strides_1 = strides_1 && (strides[j] == 1);
-    padding_0 = padding_0 && (paddings[j] == 0);
-    dilation_1 = dilation_1 && (dilations[j] == 1);
-  }
-
-  return !(filter_1 && strides_1 && padding_0 && dilation_1);
-}
-
 }  // namespace operators
 }  // namespace paddle_mobile


--- a/src/operators/kernel/conv_add_relu_kernel.h
+++ b/src/operators/kernel/conv_add_relu_kernel.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef FUSION_CONVADD_RELU_OP
+
+#include <vector>
+#include "framework/ddim.h"
+#include "framework/operator.h"
+#include "operators/op_param.h"
+#include "operators/math/im2col.h"
+#include "operators/math/vol2col.h"
+#include "operators/math/conv_func.h"
+#include "operators/math/math_function.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::DDim;
+using framework::OpKernelBase;
+
+/**
+ * Kernel interface of the fused conv + add + relu operator.
+ *
+ * Only declares Compute(); the per-device behaviour is supplied by explicit
+ * specialisations defined in the device-specific kernel sources.
+ */
+template <typename DeviceType, typename T>
+class ConvAddReluKernel : public OpKernelBase<DeviceType, FushionConvAddReluParam> {
+ public:
+  // Runs the kernel using the tensors and attributes carried by `param`.
+  void Compute(const FushionConvAddReluParam &param) const;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/mali/batchnorm_kernel.cpp
+++ b/src/operators/kernel/mali/batchnorm_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef BATCHNORM_OP
+
+#pragma once
+
+#include "operators/kernel/batchnorm_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Mali GPU specialisation of the batch-norm kernel.
+// NOTE(review): the body is empty, so this kernel currently computes nothing
+// - presumably a placeholder until a real implementation lands.
+template <>
+void BatchNormKernel<GPU_MALI, float>::Compute(const BatchNormParam &param) const {
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/kernel/mali/conv_kernel.cpp
+++ b/src/operators/kernel/mali/conv_kernel.cpp
@@ -23,6 +23,7 @@ template <>
 void ConvKernel<GPU_MALI, float>::Compute(const ConvParam &param) const {
  //  ArmConvImplement imp;
  //  imp.Compute(param);
+  param.Output()->mutable_data<float>()[0] = 100.0;
 }

 template class ConvKernel<GPU_MALI, float>;

--- a/src/operators/math/conv_func.h
+++ b/src/operators/math/conv_func.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+#include "framework/ddim.h"
+#include "framework/tensor.h"
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+using framework::DDim;
+using framework::Tensor;
+
+/**
+ * Output extent of a convolution along a single dimension.
+ *
+ * @param input_size  extent of the input along the dimension
+ * @param filter_size extent of the filter along the dimension
+ * @param dilation    spacing between filter taps
+ * @param padding     padding applied on each side of the input
+ * @param stride      step between consecutive filter positions
+ * @return (input_size + 2 * padding - dilated_filter) / stride + 1, using
+ *         integer division
+ */
+inline int ConvOutputSize(int input_size, int filter_size, int dilation,
+                          int padding, int stride) {
+  // Extent spanned by the filter once dilation is taken into account.
+  const int dilated_filter = dilation * (filter_size - 1) + 1;
+  return (input_size + 2 * padding - dilated_filter) / stride + 1;
+}
+
+/**
+ * Broadcasts a 1-D bias tensor to the shape `dDim`, in place.
+ *
+ * After the call `bias` has shape `dDim`, and every element whose index
+ * along `axis` is c holds the original bias[c].
+ *
+ * @param bias  1-D bias tensor; resized to `dDim` and refilled
+ * @param axis  dimension of `dDim` that the bias is aligned with
+ * @param dDim  target shape
+ *
+ * NOTE(review): the source values are read through a pointer taken before
+ * the tensor is resized. This looks like it relies on mutable_data()
+ * returning a fresh buffer while the old one is kept alive by another
+ * tensor sharing it - confirm against Tensor's implementation.
+ */
+inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {
+  auto bias_ptr = bias.data<float>();
+  PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1,
+                        "the bias tensor's dims size != 1")
+  DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1);
+  DDim inner_ddim =
+          paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size());
+  // outer_size counts every index combination up to and including `axis`;
+  // inner_size counts the elements that share one bias value.
+  const int outer_size = paddle_mobile::framework::product(outer_ddim);
+  const int inner_size = paddle_mobile::framework::product(inner_ddim);
+  bias.Resize(dDim);
+  auto new_ptr = bias.mutable_data<float>();
+  const int axis_size = static_cast<int>(dDim[axis]);
+
+  for (int i = 0; i < outer_size; ++i) {
+    // `i` enumerates (leading dims..., axis) in row-major order, so the
+    // position along `axis` is i % axis_size. The former expression
+    // i * axis_size / outer_size only matched this when the leading
+    // dimensions multiplied to 1 (e.g. batch size 1).
+    const float v_bias = bias_ptr[i % axis_size];
+    float *dst = new_ptr + i * inner_size;
+    int j = 0;
+#if __ARM_NEON
+    // Vectorised fill, four floats per store.
+    const float32x4_t v_fill = vdupq_n_f32(v_bias);
+    for (; j + 4 <= inner_size; j += 4) {
+      vst1q_f32(dst + j, v_fill);
+    }
+#endif
+    // Scalar fill (entire row without NEON, tail otherwise).
+    for (; j < inner_size; ++j) {
+      dst[j] = v_bias;
+    }
+  }
+}
+
+/**
+ * Tells whether the convolution input must be unfolded (im2col / vol2col).
+ *
+ * Unfolding can be skipped only when, for every spatial dimension, the
+ * filter extent is 1, the stride is 1, the padding is 0 and the dilation
+ * is 1.
+ *
+ * @param filter_dim filter shape; spatial extents start at index 2
+ * @param strides    stride per spatial dimension
+ * @param paddings   padding per spatial dimension
+ * @param dilations  dilation per spatial dimension
+ * @return true when unfolding is required, false otherwise
+ */
+inline bool IsExpand(const std::vector<int64_t> &filter_dim,
+                     const std::vector<int> &strides,
+                     const std::vector<int> &paddings,
+                     const std::vector<int> &dilations) {
+  for (size_t dim = 0; dim < strides.size(); ++dim) {
+    const bool is_pointwise = static_cast<int>(filter_dim[dim + 2]) == 1 &&
+                              strides[dim] == 1 && paddings[dim] == 0 &&
+                              dilations[dim] == 1;
+    if (!is_pointwise) {
+      return true;
+    }
+  }
+  return false;
+}
+
+}
+}
+}
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -175,7 +175,48 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
  }
 }

-// 计算一个更小的 4 * 4 的 C 矩阵分块
+// Blocked matrix multiplication on one (m x k) * (k x n) panel. Each
+// MR x NR tile is delegated to AddDot4x4_relu, and `relu` is forwarded to it.
+//
+// B is packed only when `first_time` is non-zero; the packed copy lives in a
+// function-local static buffer and is reused by later calls. A is packed on
+// every call.
+void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda,
+                 const float *B, int ldb, float beta, float *C, int ldc,
+                 int first_time, bool relu = false) {
+  // Rows / columns left over after the last full MR / NR tile.
+  const int m_tail = m % MR;
+  const int n_tail = n % NR;
+
+  // Loop bounds rounded up to whole tiles.
+  const int padded_m = (m_tail != 0) ? m + (MR - m_tail) : m;
+  const int padded_n = (n_tail != 0) ? n + (NR - n_tail) : n;
+
+  float packedA[MC * KC];
+  static float packedB[KC * NC];
+
+  if (first_time) {
+    PackMatrixB_(k, n, n_tail, B, ldb, packedB);
+  }
+  PackMatrixA_(m, k, m_tail, A, lda, packedA);
+
+  // Walk B in strips of NR packed columns.
+  for (int col = 0; col < padded_n; col += NR) {
+    const int tile_cols = ((n - col) < NR) ? n_tail : NR;
+    // Walk A in strips of MR packed rows.
+    for (int row = 0; row < padded_m; row += MR) {
+      const int tile_rows = ((m - row) < MR) ? m_tail : MR;
+      AddDot4x4_relu(k, alpha, &packedA[row * k], 4, &packedB[col * k], k,
+                beta, &C(row, col), ldc, tile_rows, tile_cols, relu);
+    }
+  }
+}
+
+
+// Compute a smaller 4 * 4 block of matrix C
 #if defined(IOS)
 void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
               int ldb, float beta, float *C, int ldc, int mc, int nc) {
@@ -226,6 +267,60 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
    }
  }
 }
+
+// NEON-intrinsics micro-kernel: accumulates a 4 x 4 tile of A * B over `k`
+// steps and merges it into C as C = alpha * (A * B) + beta * C. Only the
+// top-left mc x nc part of the tile is written. When `relu` is true,
+// negative results are clamped to zero.
+//
+// `a` is advanced by MR and `b` by NR floats per step. `lda` and `ldb` are
+// not used by this variant.
+void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
+               int ldb, float beta, float *C, int ldc, int mc, int nc, bool relu = false) {
+  // One accumulator register per output row.
+  float32x4_t cv0 = vdupq_n_f32(0.0);
+  float32x4_t cv1 = vdupq_n_f32(0.0);
+  float32x4_t cv2 = vdupq_n_f32(0.0);
+  float32x4_t cv3 = vdupq_n_f32(0.0);
+
+  for (int p = 0; p < k; ++p) {
+    const float32x4_t av = vld1q_f32(a);
+    const float32x4_t bv = vld1q_f32(b);
+
+    // Row r of the tile accumulates a[r] * b[0..3].
+    const float32x2_t av01 = vget_low_f32(av);
+    cv0 = vmlaq_lane_f32(cv0, bv, av01, 0);
+    cv1 = vmlaq_lane_f32(cv1, bv, av01, 1);
+    const float32x2_t av23 = vget_high_f32(av);
+    cv2 = vmlaq_lane_f32(cv2, bv, av23, 0);
+    cv3 = vmlaq_lane_f32(cv3, bv, av23, 1);
+
+    a += MR;
+    b += NR;
+  }
+
+  // Spill the accumulators to memory: lane extraction needs a compile-time
+  // lane number, whereas the write-back below indexes by runtime (i, j).
+  float acc[16];
+  vst1q_f32(acc, cv0);
+  vst1q_f32(acc + 4, cv1);
+  vst1q_f32(acc + 8, cv2);
+  vst1q_f32(acc + 12, cv3);
+
+  for (int i = 0; i < mc; ++i) {
+    for (int j = 0; j < nc; ++j) {
+      if (beta == 0.0) {
+        C(i, j) = 0.0;
+      } else if (beta != 1.0) {
+        C(i, j) *= beta;
+      }
+      C(i, j) += alpha * acc[i * 4 + j];
+      // Apply the activation only when requested. Previously the clamp ran
+      // unconditionally and ignored `relu`.
+      if (relu && C(i, j) < 0) {
+        C(i, j) = 0;
+      }
+    }
+  }
+}
+
 #elif defined(ARMV7)
 void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
               int ldb, float beta, float *C, int ldc, int mc, int nc) {
@@ -361,6 +456,155 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
    }
  }
 }
+
+// ARMv7 inline-assembly micro-kernel: accumulates a 4 x 4 tile of A * B over
+// `k` steps (rows in q10-q13) and merges it into C.
+//
+// Full tiles (mc == 4 and nc == 4) are scaled, combined with C, clamped
+// against zero and stored entirely inside the assembly. Any other tile size
+// is dumped to the scratch buffer `ab` and finished by the scalar loop after
+// the asm statement.
+//
+// NOTE(review): `ab` is not declared in this function; presumably it is a
+// file-level buffer of at least 16 floats - confirm.
+// NOTE(review): the full-tile path executes vmax against zero regardless of
+// `relu`, whereas the scalar edge path honours `relu`.
+// NOTE(review): r4, r5 and r6 are written by the assembly but are not in the
+// clobber list, and several operands declared as inputs (a, b, ab, kc1, kc2)
+// are modified - verify against the GCC extended-asm rules.
+void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
+               int ldb, float beta, float *C, int ldc, int mc, int nc, bool relu = false) {
+  // kc1: number of two-step iterations; kc2: 1 when one odd step remains.
+  int kc1 = k / 2, kc2 = k % 2;
+  // Row stride of C in bytes (4 bytes per float).
+  int bytes_ldc = 4 * ldc;
+  int flag_alpha = (alpha == 1.0) ? 1 : 2;
+  // flag_beta: 0 -> ignore C, 1 -> add C as is, 2 -> add beta * C.
+  int flag_beta;
+  if (beta == 0.0) {
+    flag_beta = 0;
+  } else if (beta == 1.0) {
+    flag_beta = 1;
+  } else {
+    flag_beta = 2;
+  }
+
+  asm volatile(
+      // Zero the four row accumulators (q10-q13) and the clamp constant q14.
+      "vmov.f32   q10,    #0.0        \n\t"
+      "vmov.f32   q11,    #0.0        \n\t"
+      "vmov.f32   q12,    #0.0        \n\t"
+      "vmov.f32   q13,    #0.0        \n\t"
+      "vmov.f32   q14,    #0.0          \n\t"
+      // Main loop: two k-steps per iteration (8 floats of a and of b).
+      "subs       %[kc1], %[kc1], #1  \n\t"
+      "blt        end_kc1_%=          \n\t"
+      "loop_kc1_%=:                   \n\t"
+      "vld1.32    {q0, q1}, [%[a]]!   \n\t"
+      "vld1.32    {q2, q3}, [%[b]]!   \n\t"
+      "vmla.f32   q10, q2, d0[0]      \n\t"
+      "vmla.f32   q11, q2, d0[1]      \n\t"
+      "vmla.f32   q12, q2, d1[0]      \n\t"
+      "vmla.f32   q13, q2, d1[1]      \n\t"
+      "vmla.f32   q10, q3, d2[0]      \n\t"
+      "vmla.f32   q11, q3, d2[1]      \n\t"
+      "vmla.f32   q12, q3, d3[0]      \n\t"
+      "vmla.f32   q13, q3, d3[1]      \n\t"
+      "subs       %[kc1], %[kc1], #1  \n\t"
+      "bge        loop_kc1_%=         \n\t"
+      "end_kc1_%=:                    \n\t"
+
+      // Remainder: a single k-step when k is odd.
+      "subs       %[kc2], %[kc2], #1  \n\t"
+      "blt        end_kc2_%=          \n\t"
+      "vld1.32    {q0}, [%[a]]!       \n\t"
+      "vld1.32    {q1}, [%[b]]!       \n\t"
+      "vmla.f32   q10, q1, d0[0]      \n\t"
+      "vmla.f32   q11, q1, d0[1]      \n\t"
+      "vmla.f32   q12, q1, d1[0]      \n\t"
+      "vmla.f32   q13, q1, d1[1]      \n\t"
+      "end_kc2_%=:                    \n\t"
+
+      // Anything other than a full 4 x 4 tile goes to the scratch buffer.
+      "cmp        %[mc],      #4      \n\t"
+      "bne        temp_%=             \n\t"
+      "cmp        %[nc],      #4      \n\t"
+      "bne        temp_%=             \n\t"
+
+      "vmov.f32   d8[0],    %[alpha]  \n\t"
+      "vmov.f32   d8[1],    %[beta]   \n\t"
+
+      // The branch target is the very next label, so the multiplication by
+      // alpha below is executed for every value of flag_alpha.
+      "cmp        %[flag_alpha],  #1  \n\t"
+      "bne        alpha_%=            \n\t"
+
+      "alpha_%=:                      \n\t"
+      "vmul.f32   q10, q10, d8[0]     \n\t"
+      "vmul.f32   q11, q11, d8[0]     \n\t"
+      "vmul.f32   q12, q12, d8[0]     \n\t"
+      "vmul.f32   q13, q13, d8[0]     \n\t"
+
+      // flag_beta == 0: skip reading C altogether.
+      "beta_%=:                       \n\t"
+      "cmp        %[flag_beta],   #0            \n\t"
+      "beq        memory_%=           \n\t"
+
+      // Load the four rows of the C tile into q0-q3.
+      "mov        r4,     %[C]        \n\t"
+      "mov        r6,     %[bytes_ldc]\n\t"
+      "vld1.32    {q0}, [r4], r6      \n\t"
+      "vld1.32    {q1}, [r4], r6      \n\t"
+      "vld1.32    {q2}, [r4], r6      \n\t"
+      "vld1.32    {q3}, [r4]          \n\t"
+      "cmp         %[flag_beta],   #1            \n\t"
+      "beq        beta_eq1_%=         \n\t"
+      "bne        beta_ne1_%=         \n\t"
+
+      // beta == 1: plain addition of C.
+      "beta_eq1_%=:                   \n\t"
+      "vadd.f32   q10, q10, q0        \n\t"
+      "vadd.f32   q11, q11, q1        \n\t"
+      "vadd.f32   q12, q12, q2        \n\t"
+      "vadd.f32   q13, q13, q3        \n\t"
+      "b          memory_%=           \n\t"
+
+      // Other beta: accumulate beta * C.
+      "beta_ne1_%=:                   \n\t"
+      "vmla.f32   q10, q0, d8[1]      \n\t"
+      "vmla.f32   q11, q1, d8[1]      \n\t"
+      "vmla.f32   q12, q2, d8[1]      \n\t"
+      "vmla.f32   q13, q3, d8[1]      \n\t"
+
+      // Clamp against zero (q14) and store the tile back to C.
+      "memory_%=:                     \n\t"
+      "vmax.f32 q10, q10, q14           \n\t"
+      "vmax.f32 q11, q11, q14           \n\t"
+      "vmax.f32 q12, q12, q14           \n\t"
+      "vmax.f32 q13, q13, q14           \n\t"
+      "mov        r5,     %[C]        \n\t"
+      "mov        r6,     %[bytes_ldc]\n\t"
+
+      "vst1.32    {q10}, [r5], r6     \n\t"
+      "vst1.32    {q11}, [r5], r6     \n\t"
+      "vst1.32    {q12}, [r5], r6     \n\t"
+      "vst1.32    {q13}, [r5]         \n\t"
+      "b          end_%=              \n\t"
+
+      // Partial tile: dump the raw accumulators (no alpha/beta/clamp) to ab.
+      "temp_%=:                       \n\t"
+      "vst1.32    {q10, q11}, [%[ab]]!\n\t"
+      "vst1.32    {q12, q13}, [%[ab]] \n\t"
+      "end_%=:                        \n\t"
+      :
+      : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1),
+        [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
+        [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
+        [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
+      : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13", "q14");
+
+
+  // Edge tiles: finish alpha/beta scaling and the optional ReLU in scalar
+  // code, reading the raw products from ab.
+  if (mc != MR || nc != NR) {
+    int i, j;
+    for (i = 0; i < mc; ++i) {
+      for (j = 0; j < nc; ++j) {
+        if (beta == 0.0) {
+          if (alpha != 1.0) {
+            C(i, j) = alpha * ab[i * MR + j];
+          } else {
+            C(i, j) = ab[i * MR + j];
+          }
+        } else {
+          if (beta != 1.0) {
+            C(i, j) *= beta;
+          }
+          if (alpha != 1.0) {
+            C(i, j) += alpha * ab[i * MR + j];
+          } else {
+            C(i, j) += ab[i * MR + j];
+          }
+        }
+        if (relu) {
+          if (C(i, j) < 0) {
+            C(i, j) = 0;
+          }
+        }
+
+      }
+    }
+  }
+}
+
 #else
 void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
               int ldb, float beta, float *C, int ldc, int mc, int nc) {
@@ -418,6 +662,70 @@ void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
    }
  }
 }
+// Plain C++ 4x4 micro-kernel (the #else counterpart of the inline-assembly version): C = alpha * (a * b) + beta * C over the first mc x nc entries, then clamps negatives to 0 when relu is set.
+void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
+               int ldb, float beta, float *C, int ldc, int mc, int nc, bool relu) {  // lda and ldb are not referenced in this body
+  float c[16] = {0};  // 4x4 accumulator; row r occupies c[4*r] .. c[4*r+3]
+  float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3;
+
+  for (int p = 0; p < k; p += 1) {  // each step consumes 4 consecutive floats from b and 4 from a
+    reg_b0 = *b++;
+    reg_b1 = *b++;
+    reg_b2 = *b++;
+    reg_b3 = *b++;
+
+    reg_a0 = *a++;
+    reg_a1 = *a++;
+    reg_a2 = *a++;
+    reg_a3 = *a++;
+
+    // first row
+    c[0] += reg_a0 * reg_b0;
+    c[1] += reg_a0 * reg_b1;
+    c[2] += reg_a0 * reg_b2;
+    c[3] += reg_a0 * reg_b3;
+
+    // second row
+    c[4] += reg_a1 * reg_b0;
+    c[5] += reg_a1 * reg_b1;
+    c[6] += reg_a1 * reg_b2;
+    c[7] += reg_a1 * reg_b3;
+
+    // third row
+    c[8] += reg_a2 * reg_b0;
+    c[9] += reg_a2 * reg_b1;
+    c[10] += reg_a2 * reg_b2;
+    c[11] += reg_a2 * reg_b3;
+
+    // fourth row
+    c[12] += reg_a3 * reg_b0;
+    c[13] += reg_a3 * reg_b1;
+    c[14] += reg_a3 * reg_b2;
+    c[15] += reg_a3 * reg_b3;
+  }
+  int i, j;
+  for (i = 0; i < mc; ++i) {  // only the mc x nc part of the 4x4 tile is written back
+    for (j = 0; j < nc; ++j) {
+      if (beta == 0.0) {
+        C(i, j) = 0.0;  // overwrite instead of scaling, so the previous contents of C are ignored
+      } else if (beta != 1.0) {
+        C(i, j) *= beta;
+      }
+      if (alpha != 1.0) {
+        C(i, j) += alpha * c[i * MR + j];  // NOTE(review): indexing matches the accumulator layout only if MR == 4 (MR is defined elsewhere) -- confirm
+      } else {
+        C(i, j) += c[i * MR + j];
+      }
+      if (relu) {  // fused ReLU: clamp the final value at zero
+        if (C(i, j) < 0) {
+          C(i, j) = 0;
+        }
+      }
+    }
+  }
+}
+
+
 #endif

 // 32位 float 矩阵乘法
@@ -443,6 +751,34 @@ void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
  }
 }

+void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
+                const float *B, int ldb, float beta, float *C, int ldc) {  // blocked float GEMM driver that routes the last k-panel through InnerKernel_relu
+  int i, j, p, mc, nc, kc;  // block origins (i, j, p) and clipped block sizes (mc, nc, kc)
+  float beta_;  // beta actually handed to the kernel for the current k-panel
+  for (j = 0; j < n; j += NC) {
+    nc = s_min(n - j, NC);
+    for (p = 0; p < k; p += KC) {
+      kc = s_min(k - p, KC);
+      for (i = 0; i < m; i += MC) {
+        mc = s_min(m - i, MC);
+        if (p != 0) {
+          beta_ = 1.0;  // later k-panels accumulate onto the partial result already in C
+        } else {
+          beta_ = beta;  // the caller's beta is applied once, on the first k-panel
+        }
+        // Only the final k-panel (p + KC >= k) uses the _relu kernel; presumably so ReLU sees the completed sum -- confirm against InnerKernel_relu.
+        if (p + KC >= k) {
+          InnerKernel_relu(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_,
+                           &C(i, j), ldc, i == 0, true);  // NOTE(review): meaning of the `i == 0` and `true` arguments is defined by InnerKernel_relu (not shown here) -- verify
+        } else {
+          InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_,
+                           &C(i, j), ldc, i == 0);
+        }
+      }
+    }
+  }
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -57,10 +57,16 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
 void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,
               int ldb, float beta, float *C, int ldc, int mc, int nc);

+void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
+                    int ldb, float beta, float *C, int ldc, int mc, int nc, bool relu);  // AddDot4x4 counterpart taking an extra relu flag; when set, negative results in C are clamped to 0
+
 // 32位 float 矩阵乘法
 void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
           const float *B, int ldb, float beta, float *C, int ldc);

+void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
+           const float *B, int ldb, float beta, float *C, int ldc);  // same parameter list as sgemm; the final k-panel is dispatched to InnerKernel_relu
+
 // 64位 double 矩阵乘法
 void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
           const double *B, int ldb, float beta, double *C, int ldc);

--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
@@ -22,7 +22,7 @@ namespace math {
 template <>
 void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
                   const framework::Tensor &matrix_b, bool trans_b, float alpha,
-                   framework::Tensor *matrix_out, float beta) {
+                   framework::Tensor *matrix_out, float beta, bool relu) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -41,14 +41,20 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
  int N = dim_out[1];
  int K = (trans_a == false) ? dim_a[1] : dim_a[0];

-  sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
-        beta, matrix_out->data<float>(), N);
+  if (relu) {
+    sgemm_relu(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
+               beta, matrix_out->data<float>(), N);
+  } else {
+    sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
+          beta, matrix_out->data<float>(), N);
+  }
+
 }

 template <>
 void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
                    const framework::Tensor &matrix_b, bool trans_b,
-                    double alpha, framework::Tensor *matrix_out, double beta) {
+                    double alpha, framework::Tensor *matrix_out, double beta, bool relu) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -68,6 +74,8 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
 }

+
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/math/math_function.h
+++ b/src/operators/math/math_function.h
@@ -25,7 +25,7 @@ namespace math {
 template <typename T>
 void matmul(const framework::Tensor &matrix_a, bool trans_a,
            const framework::Tensor &matrix_b, bool trans_b, T alpha,
-            framework::Tensor *matrix_out, T beta);
+            framework::Tensor *matrix_out, T beta, bool relu = false);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -823,7 +823,7 @@ class FushionConvAddParam : public OpParam {

  const int &Groups() const { return groups; }

- private:
+ protected:
  Tensor *bias_;
  int axis_;
  Tensor *input_;
@@ -838,5 +838,15 @@ class FushionConvAddParam : public OpParam {
 Print &operator<<(Print &printer, const FushionConvAddParam &conv_param);
 #endif

+#ifdef FUSION_CONVADD_RELU_OP  // compiled only when the conv_add_relu fusion op is enabled in the build
+class FushionConvAddReluParam: public FushionConvAddParam {  // parameter type for the conv_add_relu op; declares no members of its own
+ public:
+  FushionConvAddReluParam(const VariableNameMap &inputs,
+                      const VariableNameMap &outputs, const AttributeMap &attrs,
+                      const Scope &scope): FushionConvAddParam(inputs, outputs, attrs, scope) {  // forwards all arguments unchanged to the base-class constructor
+  }
+};
+#endif
+
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -141,6 +141,10 @@ else ()
    ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-mobilenet paddle-mobile)

+    # gen test
+    ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-conv-add-relu-op paddle-mobile)
+
    #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)

 endif()
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
@@ -42,8 +42,9 @@ using std::vector;
 template <typename DeviceType, typename OpType>
 class Executor4Test : public Executor<DeviceType> {
 public:
-  Executor4Test(Program<DeviceType> p, string op_type)
+  Executor4Test(Program<DeviceType> p, string op_type, bool use_optimize = false)
      : Executor<DeviceType>() {
+    this->use_optimize_ = use_optimize;
    this->program_ = p;
    if (this->use_optimize_) {
      this->to_predict_program_ = this->program_.optimizeProgram;
@@ -61,10 +62,13 @@ class Executor4Test : public Executor<DeviceType> {
      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
      for (std::shared_ptr<OpDesc> op : ops) {
        if (op->Type() == op_type) {
+
+          DLOG << "匹配到: " << op->Type();
+
          /// test first meeting op in program
          std::shared_ptr<paddle_mobile::framework::OperatorBase<DeviceType>>
              op_ptr = paddle_mobile::framework::OpRegistry<
-                  paddle_mobile::CPU>::CreateOp(op->Type(), op->GetInputs(),
+                  DeviceType>::CreateOp(op->Type(), op->GetInputs(),
                                                op->GetOutputs(),
                                                op->GetAttrMap(),
                                                this->program_.scope);

--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -20,8 +20,9 @@ int main() {
  //  ../../../test/models/googlenet
  //  ../../../test/models/mobilenet
  auto program = loader.Load(g_mobilenet_ssd, false, false);
-  //  loader.Load(g_googlenet_combine + "/model", g_googlenet_combine +
-  //  "/params", true);
-  program.originProgram->Description("program desc: ");
+//  auto program = loader.Load(g_googlenet_combine + "/model", g_googlenet_combine +
+//    "/params", true);
+  
+  //  program.originProgram->Description("program desc: ");
  return 0;
 }
--- a/test/operators/test_conv_add_relu_op.cpp
+++ b/test/operators/test_conv_add_relu_op.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/fusion_conv_add_relu_op.h"
+// Smoke test: runs one "fusion_conv_add_relu" op from the GoogLeNet program through Executor4Test and logs a few output values (nothing is asserted on them).
+int main() {
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  //  ../models/image_classification_resnet.inference.model
+  auto program = loader.Load(g_googlenet, true);  // NOTE(review): second argument presumably requests the optimized (fused) program -- confirm against Loader::Load
+
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::CPU,
+          paddle_mobile::operators::FusionConvAddReluOp<paddle_mobile::CPU, float>>
+          executor(program, "fusion_conv_add_relu", true);  // third argument is use_optimize: the executor then works on program.optimizeProgram
+
+  paddle_mobile::framework::Tensor input;
+  GetInput<float>(g_test_image_1x3x224x224, &input, {1, 3, 224, 224});
+  //  // use SetupTensor if not has local input image .
+  //  SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
+  //                     static_cast<float>(1));
+
+  auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112});
+  auto output = executor.Predict(input, "data", "conv2d_0.tmp_2", out_ddim);  // NOTE(review): "data" / "conv2d_0.tmp_2" look like the input and output variable names -- confirm against Executor4Test::Predict
+
+  auto output_ptr = output->data<float>();
+  for (int j = 0; j < 25; ++j) {  // log only the first 25 output values
+    DLOG << " value of output: " << output_ptr[j];
+  }
+  return 0;
+}
--- a/test/operators/test_cov_op.cpp
+++ b/test/operators/test_cov_op.cpp
@@ -16,15 +16,15 @@ limitations under the License. */
 #include "operators/conv_op.h"

 int main() {
-  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  paddle_mobile::Loader<paddle_mobile::GPU_MALI> loader;
  //  ../models/image_classification_resnet.inference.model
  auto program = loader.Load(g_googlenet);

  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                        "program file read fail");

-  Executor4Test<paddle_mobile::CPU,
-                paddle_mobile::operators::ConvOp<paddle_mobile::CPU, float>>
+  Executor4Test<paddle_mobile::GPU_MALI,
+                paddle_mobile::operators::ConvOp<paddle_mobile::GPU_MALI, float>>
      executor(program, "conv2d");

  paddle_mobile::framework::Tensor input;
@@ -37,7 +37,7 @@ int main() {
  auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim);

  auto output_ptr = output->data<float>();
-  for (int j = 0; j < output->numel(); ++j) {
+  for (int j = 0; j < 20; ++j) {
    DLOG << " value of output: " << output_ptr[j];
  }
  return 0;