提交 74f5a329 编写于 作者: B Bin Li

Optimize 7x7 convolution for strides 1, 2 and 3 using ARMv7 NEON kernels

上级 6ad006d0
......@@ -51,6 +51,39 @@ extern void Conv2dNeonK3x3S2(const float *input,
const index_t out_channels,
float *output);
// 7x7, stride-1 convolution kernel (ARMv7 NEON); definition lives in a
// separate NEON implementation file.  Dimension arguments mirror the
// Conv2dNeonK3x3S2 declaration above.
// NOTE(review): tensor memory layout (NCHW vs NHWC) is not visible from
// this declaration — it must match the other Conv2dNeonK* kernels.
extern void Conv2dNeonK7x7S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output);
// 7x7, stride-2 convolution kernel (ARMv7 NEON); definition lives in a
// separate NEON implementation file.  Same argument contract as
// Conv2dNeonK7x7S1.
extern void Conv2dNeonK7x7S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output);
// 7x7, stride-3 convolution kernel (ARMv7 NEON); definition lives in a
// separate NEON implementation file.  Same argument contract as
// Conv2dNeonK7x7S1.
extern void Conv2dNeonK7x7S3(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output);
} // namespace kernels
} // namespace mace
......
此差异已折叠。
......@@ -224,6 +224,12 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
&& stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1;
bool use_neon_1x1_s1 = filter_h == 1 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
// Dispatch flags for the specialized 7x7 NEON kernels: only selected for
// square 7x7 filters with unit dilation and matching square strides
// (1, 2 or 3).  Any other configuration falls through to the generic
// convolution path (Conv2dGeneral) further below.
bool use_neon_7x7_s1 = filter_h == 7 && filter_w == 7
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
bool use_neon_7x7_s2 = filter_h == 7 && filter_w == 7
&& stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1;
bool use_neon_7x7_s3 = filter_h == 7 && filter_w == 7
&& stride_h == 3 && stride_w == 3 && dilation_h == 1 && dilation_w == 1;
std::vector<index_t> transformed_input_shape;
std::vector<index_t> transformed_output_shape;
......@@ -288,6 +294,44 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
} else if (use_neon_7x7_s1) {
// Stride 1: required input extent per dimension is out + (7 - 1).
// Output width is rounded up to a multiple of 4 — presumably so the
// NEON kernel can process 4 float lanes per iteration (TODO confirm
// against the kernel implementation).  Any extra extent beyond the
// already-padded input is added as bottom/right padding.
extra_output_height = height;
extra_input_height =
std::max(padded_input_height, extra_output_height + 6);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width = std::max(padded_input_width, extra_output_width + 6);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
} else if (use_neon_7x7_s2) {
// Stride 2: required input extent is (out - 1) * 2 + 7 per dimension,
// the standard convolution input-size formula for kernel size 7.
extra_output_height = height;
extra_input_height =
std::max(padded_input_height, (extra_output_height - 1) * 2 + 7);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width =
std::max(padded_input_width, (extra_output_width - 1) * 2 + 7);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
} else if (use_neon_7x7_s3) {
// Stride 3: required input extent is (out - 1) * 3 + 7 per dimension.
extra_output_height = height;
extra_input_height =
std::max(padded_input_height, (extra_output_height - 1) * 3 + 7);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width =
std::max(padded_input_width, (extra_output_width - 1) * 3 + 7);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
}
// decide scratch size before allocate it
......@@ -413,6 +457,45 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
channels,
pad_output);
};
} else if (use_neon_7x7_s1) {
// Bind the 7x7 stride-1 NEON kernel.  Note it is called with the
// extra_* (padded) dimensions computed earlier, not the raw tensor
// sizes; the [=] capture copies those index_t values into the lambda.
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK7x7S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
pad_output);
};
} else if (use_neon_7x7_s2) {
// Bind the 7x7 stride-2 NEON kernel (same argument pattern).
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK7x7S2(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
pad_output);
};
} else if (use_neon_7x7_s3) {
// Bind the 7x7 stride-3 NEON kernel (same argument pattern).
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK7x7S3(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
pad_output);
};
} else {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dGeneral(pad_input,
......
......@@ -152,6 +152,9 @@ BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 15, 1, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 1, 15, 1, 1, SAME, 128);
// Benchmarks exercising the new 7x7 NEON kernels at strides 1, 2 and 3
// (presumably the third-from-last argument is the stride, matching the
// 5x5/15x1 cases above — verify against the BM_CONV_2D macro definition).
BM_CONV_2D(1, 64, 32, 31, 7, 7, 1, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 7, 7, 2, 1, SAME, 128);
BM_CONV_2D(1, 64, 32, 31, 7, 7, 3, 1, SAME, 128);
// 3 channels input
BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, 1, VALID, 3);
......
......@@ -878,7 +878,7 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
1e-4, 1e-4);
};
for (int kernel_size : {3, 5}) {
for (int kernel_size : {3, 5, 7}) {
for (int stride : {2, 3}) {
func(kernel_size, kernel_size, stride, stride);
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册