Commit 497bf326 authored by Yanzhan Yang, committed by StarryRain

fuse conv add batch relu when using faster depthwise conv (#1749)

Parent b42f3d49
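The change below teaches the fused conv+add+BN+ReLU kernel to hand the whole epilogue to the fast NEON depthwise-conv path. The enabling observation (sketched here in notation of my own, with s = NewScale and b = NewBias as folded during Init()): batch norm at inference is a per-channel affine map, so its scale can be folded into the filter weights once, ahead of time:

$$\mathrm{ReLU}\big(s_c \,(W_c * x) + b_c\big) \;=\; \mathrm{ReLU}\big((s_c W_c) * x + b_c\big)$$

What remains for the conv kernel is a per-channel bias add plus a ReLU, exactly the flag_bias/flag_relu epilogue the depthwise routines already support.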
@@ -61,16 +61,61 @@ bool ConvAddBNReluKernel<CPU, float>::Init(
   param->SetNewBias(new_bias);
   InitBaseConvKernel(param);
+  // try to use faster depthwise conv
+  switch (param->ExecMode()) {
+    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
+    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
+      const std::vector<int> &paddings = param->Paddings();
+      const std::vector<int> &strides = param->Strides();
+      if (paddings.size() == 2 && paddings[0] == paddings[1] &&
+          strides.size() == 2 && strides[0] == strides[1]) {
+        int pad = paddings[0];
+        int stride = strides[0];
+        const int hin = param->Input()->dims()[2];
+        if (pad == 0 && hin > 2) {
+          could_use_faster_depthwise_conv_ = true;
+        } else if (pad == 1) {
+          could_use_faster_depthwise_conv_ = true;
+        }
+      }
+      break;
+  }
+  if (could_use_faster_depthwise_conv_) {
+    auto filter_data = param->Filter()->data<float>();
+    auto filter_dim = param->Filter()->dims();
+    int len = 1;
+    for (int i = 0; i < filter_dim.size(); i++) {
+      len *= filter_dim[i];
+    }
+    int batch = filter_dim[0];
+    int step = len / batch;
+    for (int i = 0; i < batch; i++) {
+      for (int k = 0; k < step; k++) {
+        filter_data[i * step + k] =
+            filter_data[i * step + k] * new_scale_ptr[i];
+      }
+    }
+  }
   return true;
 }

 template <>
 void ConvAddBNReluKernel<CPU, float>::Compute(
     const FusionConvAddBNReluParam<CPU> &param) {
+  bool fusion_has_been_computed = false;
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      DepthwiseConv3x3<float, float>(param);
+      if (could_use_faster_depthwise_conv_) {
+        FasterDepthwiseConv3x3_bias_relu(param, param.NewBias()->data<float>(),
+                                         true);
+        fusion_has_been_computed = true;
+      } else {
+        DepthwiseConv3x3<float, float>(param);
+      }
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
@@ -89,8 +134,10 @@ void ConvAddBNReluKernel<CPU, float>::Compute(
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
-  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                  param.NewBias(), param.Output());
+  if (!fusion_has_been_computed) {
+    math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                    param.NewBias(), param.Output());
+  }
 }

 template class ConvAddBNReluKernel<CPU, float>;
...
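A note on the Init() hunk above: the weight pre-scaling is nothing more than a per-output-channel multiply over the flattened filter, done once so that Compute() can skip the separate ScaleAddChannelWise pass. A minimal self-contained sketch of that folding step (hypothetical FoldScaleIntoFilter helper over plain std::vector storage, not the framework's Tensor API):

#include <vector>

// Fold per-output-channel batch-norm scales into the filter weights.
// `filter` is flattened as [channels x step], where step is the number
// of weights per channel (3 * 3 = 9 for a depthwise 3x3 filter).
void FoldScaleIntoFilter(std::vector<float> &filter,
                         const std::vector<float> &scale, int step) {
  const int channels = static_cast<int>(scale.size());
  for (int c = 0; c < channels; ++c) {
    for (int k = 0; k < step; ++k) {
      filter[c * step + k] *= scale[c];
    }
  }
}

After the fold, the convolution output is already batch-norm scaled; only the NewBias add and the ReLU remain, which FasterDepthwiseConv3x3_bias_relu applies in-kernel.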
@@ -212,8 +212,8 @@ void DepthwiseConv3x3(const ConvParam<CPU> &param) {
   }
 }

-template <>
-void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param) {
+void FasterDepthwiseConv3x3_bias_relu(const ConvParam<CPU> &param,
+                                      const float *bias, bool flag_relu) {
   const Tensor *input = param.Input();
   const Tensor *filter = param.Filter();
   const std::vector<int> &paddings = param.Paddings();
@@ -222,52 +222,27 @@ void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param) {
   Tensor *output = param.Output();
   output->mutable_data<float>();
-  if (paddings.size() == 2 && paddings[0] == paddings[1] &&
-      strides.size() == 2 && strides[0] == strides[1]) {
-    int pad = paddings[0];
-    int stride = strides[0];
-    const float *din = input->data<float>();
-    float *dout = output->mutable_data<float>();
-    const float *weights = filter->data<float>();
-    const float *bias = nullptr;
-    const int num = input->dims()[0];
-    const int chin = input->dims()[1];
-    const int hin = input->dims()[2];
-    const int win = input->dims()[3];
-    const int chout = output->dims()[1];
-    const int hout = output->dims()[2];
-    const int wout = output->dims()[3];
-    bool flag_relu = false;
-    bool flag_bias = bias != nullptr;
-    if (pad == 0 && hin > 2) {
-      math::depthwise::conv_depthwise_3x3p0(din, dout, num, chout, hout, wout,
-                                            chin, hin, win, weights, bias,
-                                            stride, flag_bias, flag_relu);
-    } else if (pad == 1) {
-      math::depthwise::conv_depthwise_3x3p1(din, dout, num, chout, hout, wout,
-                                            chin, hin, win, weights, bias,
-                                            stride, flag_bias, flag_relu);
-    } else {
-      GemmConv<float, float>(param);
-    }
-  } else {
-    if (strides[0] == 1) {
-      for (int i = 0; i < batch_size; i++) {
-        Tensor in_batch = input->Slice(i, i + 1);
-        Tensor out_batch = output->Slice(i, i + 1);
-        math::DepthwiseConv3x3S1<float, float>(in_batch, *filter, paddings,
-                                               &out_batch);
-      }
-    } else if (strides[0] == 2) {
-      for (int i = 0; i < batch_size; i++) {
-        Tensor in_batch = input->Slice(i, i + 1);
-        Tensor out_batch = output->Slice(i, i + 1);
-        math::DepthwiseConv3x3S2<float, float>(in_batch, *filter, paddings,
-                                               &out_batch);
-      }
-    } else {
-      GemmConv<float, float>(param);
-    }
-  }
+  int pad = paddings[0];
+  int stride = strides[0];
+  const float *din = input->data<float>();
+  float *dout = output->mutable_data<float>();
+  const float *weights = filter->data<float>();
+  const int num = input->dims()[0];
+  const int chin = input->dims()[1];
+  const int hin = input->dims()[2];
+  const int win = input->dims()[3];
+  const int chout = output->dims()[1];
+  const int hout = output->dims()[2];
+  const int wout = output->dims()[3];
+  bool flag_bias = bias != nullptr;
+  if (pad == 0 && hin > 2) {
+    math::depthwise::conv_depthwise_3x3p0(din, dout, num, chout, hout, wout,
+                                          chin, hin, win, weights, bias, stride,
+                                          flag_bias, flag_relu);
+  } else if (pad == 1) {
+    math::depthwise::conv_depthwise_3x3p1(din, dout, num, chout, hout, wout,
+                                          chin, hin, win, weights, bias, stride,
+                                          flag_bias, flag_relu);
   }
 }
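The NEON kernels conv_depthwise_3x3p0/p1 themselves are outside this diff. As a mental model only, here is a naive scalar reference for the pad == 1, stride == 1 case with the same fused bias + ReLU epilogue (assumed NCHW layout and a single image; the real kernels also cover stride 2, padding 0, and batching):

#include <algorithm>

// Reference 3x3 depthwise conv, pad = 1, stride = 1: each channel is
// convolved with its own 3x3 kernel, and bias + ReLU are applied at the
// point of the output write instead of in a separate elementwise pass.
void DepthwiseConv3x3P1Ref(const float *din, float *dout, int ch, int h,
                           int w, const float *weights, const float *bias,
                           bool flag_bias, bool flag_relu) {
  for (int c = 0; c < ch; ++c) {
    const float *in = din + c * h * w;
    const float *wgt = weights + c * 9;
    float *out = dout + c * h * w;
    for (int oh = 0; oh < h; ++oh) {
      for (int ow = 0; ow < w; ++ow) {
        float sum = flag_bias ? bias[c] : 0.f;
        for (int kh = 0; kh < 3; ++kh) {
          for (int kw = 0; kw < 3; ++kw) {
            const int ih = oh + kh - 1;  // pad == 1 shifts the window
            const int iw = ow + kw - 1;
            if (ih >= 0 && ih < h && iw >= 0 && iw < w) {
              sum += in[ih * w + iw] * wgt[kh * 3 + kw];
            }
          }
        }
        out[oh * w + ow] = flag_relu ? std::max(sum, 0.f) : sum;
      }
    }
  }
}

Fusing the epilogue this way saves one full read-modify-write sweep over the output tensor, which is what the fusion_has_been_computed guard in Compute() is protecting.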
...

@@ -44,6 +44,9 @@ void DepthwiseConv5x5(const ConvParam<CPU> &param);
 template <typename Itype, typename Otype>
 void SlidingwindowConv3x3(const ConvParam<CPU> &param);
+
+void FasterDepthwiseConv3x3_bias_relu(const ConvParam<CPU> &param,
+                                      const float *bias, bool flag_relu);
 } // namespace operators
 } // namespace paddle_mobile
...
@@ -36,6 +36,9 @@ class ConvAddBNReluKernel
  public:
   void Compute(const FusionConvAddBNReluParam<DeviceType> &param);
   bool Init(FusionConvAddBNReluParam<DeviceType> *param);
+
+ private:
+  bool could_use_faster_depthwise_conv_ = false;
 };

 } // namespace operators
...
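Taken together, the two flags split the bookkeeping across the kernel's lifetime: could_use_faster_depthwise_conv_ is decided once in Init(), where the filter weights are pre-scaled, while fusion_has_been_computed is per-call state in Compute() that keeps the fallback ScaleAddChannelWise pass from re-applying the batch-norm scale and bias to output the fused kernel has already finished. Pre-scaling in Init() is safe here because, once the flag is set, the 3x3 depthwise execution modes always take the fused path with those scaled weights.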