提交 68e8dc4a 编写于 作者: H hjchen2

Support winograd algo to speed up 3x3 convolution operator

上级 4af571c4
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include "operators/math/pad.h" #include "operators/math/pad.h"
#include "operators/math/vol2col.h" #include "operators/math/vol2col.h"
#include "operators/math/winograd/winograd.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -116,6 +117,34 @@ inline void ConvBasic(const ConvParam<CPU> &param) { ...@@ -116,6 +117,34 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
} }
} }
// Runs a winograd F(6x6, 3x3) convolution one batch at a time.
// Each input slice is zero-padded (when paddings are non-zero) and then fed
// to the winograd kernel. The caller guarantees: 3x3 square filter,
// stride 1, dilation 1, NCHW layout (see the dispatch in ConvCompute).
inline void BatchConv3x3Winograd(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor *filter = param.Filter();
  Tensor *output = param.Output();
  output->mutable_data<float>();
  int batch_size = input->dims()[0];
  const std::vector<int> &paddings = param.Paddings();
  math::PadFunctor<CPU, float> pad;
  Tensor input_pad;
  for (int i = 0; i < batch_size; ++i) {
    Tensor in_batch = input->Slice(i, i + 1);
    Tensor out_batch = output->Slice(i, i + 1);
    if (paddings[0] == 0 && paddings[1] == 0) {
      // no padding needed: operate on the input slice directly
      input_pad = in_batch;
    } else {
      // grow height by paddings[0] and width by paddings[1] on both sides,
      // then copy the slice into the zero-filled padded buffer
      framework::DDim pad_shape = in_batch.dims();
      pad_shape[2] += 2 * paddings[0];
      pad_shape[3] += 2 * paddings[1];
      input_pad.mutable_data<float>(pad_shape);
      pad(in_batch, paddings[0], paddings[0], paddings[1], paddings[1],
          &input_pad);
    }
    // NOTE(review): winograd_f6k3 re-transforms the weights on every
    // iteration; hoisting the weight transform out of this loop would
    // avoid redundant work when batch_size > 1.
    math::winograd_f6k3(input_pad, *filter, &out_batch);
  }
}
template <typename P> template <typename P>
void ConvCompute(const ConvParam<CPU> &param) { void ConvCompute(const ConvParam<CPU> &param) {
if (param.Input()->type() == typeid(int8_t)) { if (param.Input()->type() == typeid(int8_t)) {
...@@ -133,6 +162,12 @@ void ConvCompute(const ConvParam<CPU> &param) { ...@@ -133,6 +162,12 @@ void ConvCompute(const ConvParam<CPU> &param) {
param.Filter()->dims()[2] == 3) { param.Filter()->dims()[2] == 3) {
math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
param.Filter(), nullptr, param.Output(), false); param.Filter(), nullptr, param.Output(), false);
} else if (param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Strides()[0] == param.Strides()[1] &&
param.Dilations()[0] == param.Dilations()[1] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 &&
param.Dilations()[0] == 1 && param.Input()->dims()[1] > 16) {
BatchConv3x3Winograd(param);
} else { } else {
ConvBasic<float, float>(param); ConvBasic<float, float>(param);
} }
......
...@@ -249,7 +249,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, ...@@ -249,7 +249,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
#if __ARM_NEON #if __ARM_NEON
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>(); const float *filter_data = filter->data<float>();
float *output_data = output->data<float>(); float *output_data = output->mutable_data<float>();
const float *bias_data; const float *bias_data;
if (if_bias) { if (if_bias) {
bias_data = bias->data<float>(); bias_data = bias->data<float>();
......
...@@ -21,10 +21,12 @@ namespace math { ...@@ -21,10 +21,12 @@ namespace math {
template <typename T> template <typename T>
class PadFunctor<CPU, T> { class PadFunctor<CPU, T> {
public: public:
void operator()(const framework::Tensor &input, const int pad_h, void operator()(const framework::Tensor &input, const int pad_top,
const int pad_w, framework::Tensor *output) { const int pad_bottom, const int pad_left, const int pad_right,
framework::Tensor *output) {
const T *in_data = input.data<T>(); const T *in_data = input.data<T>();
T *out_data = output->mutable_data<T>(); T *out_data = output->mutable_data<T>();
// should check output shape is valid for such pad parameters
const framework::DDim &input_shape = input.dims(); const framework::DDim &input_shape = input.dims();
const framework::DDim &output_shape = output->dims(); const framework::DDim &output_shape = output->dims();
// fill output with 0 // fill output with 0
...@@ -32,13 +34,13 @@ class PadFunctor<CPU, T> { ...@@ -32,13 +34,13 @@ class PadFunctor<CPU, T> {
// should make sure the shape of output is match with input // should make sure the shape of output is match with input
for (int i = 0; i < input_shape[0]; ++i) { for (int i = 0; i < input_shape[0]; ++i) {
for (int c = 0; c < input_shape[1]; ++c) { for (int c = 0; c < input_shape[1]; ++c) {
out_data += pad_h * output_shape[3]; out_data += pad_top * output_shape[3];
for (int h = 0; h < input_shape[2]; ++h) { for (int h = 0; h < input_shape[2]; ++h) {
memcpy(out_data + pad_w, in_data, sizeof(T) * input_shape[3]); memcpy(out_data + pad_left, in_data, sizeof(T) * input_shape[3]);
out_data += output_shape[3]; out_data += output_shape[3];
in_data += input_shape[3]; in_data += input_shape[3];
} }
out_data += pad_h * output_shape[3]; out_data += pad_bottom * output_shape[3];
} }
} }
} }
......
...@@ -22,8 +22,9 @@ namespace math { ...@@ -22,8 +22,9 @@ namespace math {
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class PadFunctor { class PadFunctor {
public: public:
void operator()(const framework::Tensor &input, const int pad_h, void operator()(const framework::Tensor &input, const int pad_top,
const int pad_w, framework::Tensor *output); const int pad_bottom, const int pad_left, const int pad_right,
framework::Tensor *output);
}; };
} // namespace math } // namespace math
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#include "operators/math/winograd/winograd.h"
#include "operators/math/winograd/winograd_transform.h"
namespace paddle_mobile {
namespace operators {
namespace math {
// F(2X2, 3X3): winograd variant producing a 2x2 output tile per 3x3 kernel.
// TODO(review): intentionally left unimplemented in this commit — the body
// is an empty stub; callers must not dispatch to it until it is filled in.
void winograd_f2k3(const framework::Tensor &input,
                   const framework::Tensor &weight, framework::Tensor *output) {
}
// F(6X6, 3X3): winograd convolution that produces 6x6 output tiles from a
// 3x3 kernel via an 8x8 transform tile (tile = 8, kernel = 3).
void winograd_f6k3(const framework::Tensor &input,
                   const framework::Tensor &weight, framework::Tensor *output) {
  framework::Tensor input_in_winograd_domain;
  framework::Tensor weight_in_winograd_domain;
  // move the 3x3 kernels into the winograd domain
  winograd_transform_weight<8, 3>(weight, &weight_in_winograd_domain);
  // tile the padded input and move the tiles into the winograd domain
  winograd_transform_input<8, 3>(input, &input_in_winograd_domain);
  // element-wise multiply in the transformed domain, then calculate the
  // output by applying the inverse transform
  winograd_transform_output<8, 3>(input_in_winograd_domain,
                                  weight_in_winograd_domain, output);
}
// F(4X4, 5X5): winograd variant producing a 4x4 output tile per 5x5 kernel.
// TODO(review): intentionally left unimplemented in this commit — the body
// is an empty stub; callers must not dispatch to it until it is filled in.
void winograd_f4k5(const framework::Tensor &input,
                   const framework::Tensor &weight, framework::Tensor *output) {
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Header guard must come before the feature gate: if `#pragma once` sits
// inside `#ifdef CONV_OP`, the pragma is never processed in builds where
// CONV_OP is undefined, leaving the file unguarded against re-inclusion.
#pragma once

#ifdef CONV_OP

#include "framework/tensor.h"

namespace paddle_mobile {
namespace operators {
namespace math {

// Winograd convolution entry points, named F(m X m, r X r): m is the
// output tile size and r the kernel size. All tensors are float, NCHW.

// F(2X2, 3X3)
void winograd_f2k3(const framework::Tensor &input,
                   const framework::Tensor &weight, framework::Tensor *output);
// F(6X6, 3X3)
void winograd_f6k3(const framework::Tensor &input,
                   const framework::Tensor &weight, framework::Tensor *output);
// F(4X4, 5X5)
void winograd_f4k5(const framework::Tensor &input,
                   const framework::Tensor &weight, framework::Tensor *output);

}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile

#endif  // CONV_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Header guard must come before the feature gate: if `#pragma once` sits
// inside `#ifdef CONV_OP`, the pragma is never processed in builds where
// CONV_OP is undefined, leaving the file unguarded against re-inclusion.
#pragma once

#ifdef CONV_OP

#include "framework/tensor.h"

namespace paddle_mobile {
namespace operators {
namespace math {

// Winograd domain transforms, specialized per (tile, kernel) pair
// (e.g. <8, 3> for F(6x6, 3x3)). Definitions live in the corresponding
// architecture-specific translation units.

// Transforms convolution weights into the winograd domain.
template <int tile, int kernel>
void winograd_transform_weight(const framework::Tensor &weight,
                               framework::Tensor *output);

// Tiles the input feature map and transforms it into the winograd domain.
template <int tile, int kernel>
void winograd_transform_input(const framework::Tensor &input,
                              framework::Tensor *output);

// Multiplies transformed input and weight, then applies the inverse
// transform to produce the output feature map.
template <int tile, int kernel>
void winograd_transform_output(const framework::Tensor &input,
                               const framework::Tensor &weight,
                               framework::Tensor *output);

}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile

#endif  // CONV_OP
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册