From 60695ebdef71b92ae39deff2a1823fc5b77c9f71 Mon Sep 17 00:00:00 2001
From: Bin Li <libin11@xiaomi.com>
Date: Mon, 21 May 2018 17:22:14 +0800
Subject: [PATCH] Optimize conv1x15 conv15x1

---
 mace/kernels/arm/conv_2d_neon.h       |  12 ++
 mace/kernels/arm/conv_2d_neon_15x1.cc | 163 ++++++++++++++++++++++++++
 mace/kernels/arm/conv_2d_neon_1x15.cc | 149 +++++++++++++++++++++++
 mace/kernels/conv_2d.h                |  50 +++++---
 mace/ops/conv_2d_benchmark.cc         |   6 +-
 mace/ops/conv_2d_test.cc              |   6 +
 6 files changed, 371 insertions(+), 15 deletions(-)
 create mode 100644 mace/kernels/arm/conv_2d_neon_15x1.cc
 create mode 100644 mace/kernels/arm/conv_2d_neon_1x15.cc

diff --git a/mace/kernels/arm/conv_2d_neon.h b/mace/kernels/arm/conv_2d_neon.h
index 5d2d5f9a..b35429ba 100644
--- a/mace/kernels/arm/conv_2d_neon.h
+++ b/mace/kernels/arm/conv_2d_neon.h
@@ -65,6 +65,18 @@ extern void Conv2dNeonK7x7S3(const float *input,
                              const index_t *out_shape,
                              float *output);
 
+extern void Conv2dNeonK1x15S1(const float *input,
+                              const float *filter,
+                              const index_t *in_shape,
+                              const index_t *out_shape,
+                              float *output);
+
+extern void Conv2dNeonK15x1S1(const float *input,
+                              const float *filter,
+                              const index_t *in_shape,
+                              const index_t *out_shape,
+                              float *output);
+
 }  // namespace kernels
 }  // namespace mace
 
diff --git a/mace/kernels/arm/conv_2d_neon_15x1.cc b/mace/kernels/arm/conv_2d_neon_15x1.cc
new file mode 100644
index 00000000..80dda314
--- /dev/null
+++ b/mace/kernels/arm/conv_2d_neon_15x1.cc
@@ -0,0 +1,163 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(MACE_ENABLE_NEON)
+#include <arm_neon.h>
+#endif
+
+#include "mace/kernels/arm/conv_2d_neon.h"
+#include "mace/utils/utils.h"
+
+namespace mace {
+namespace kernels {
+
+inline void Conv2dCPUK15x1Calc(const float *in_ptr,
+                               const float *filter_ptr,
+                               const index_t in_width,
+                               const index_t in_channels,
+                               const index_t out_height,
+                               const index_t out_width,
+                               const index_t w,
+                               const index_t tile_width,
+                               const index_t out_image_size,
+                               float *out_ptr,
+                               const index_t io,
+                               const int stride) {
+  for (index_t ih = 0; ih < out_height; ++ih) {
+    for (index_t iw = 0; iw < tile_width && w + iw < out_width; ++iw) {
+      for (int i = 0; i < 15; ++i) {
+        for (int j = 0; j < 1; ++j) {
+          out_ptr[io * out_image_size + ih * out_width + w + iw]
+              += in_ptr[(ih * stride + i) * in_width + ((w + iw) * stride + j)]
+              * filter_ptr[io * in_channels * 15 + i * 1 + j];
+        }
+      }
+    }
+  }
+}
+
+
+// Ho = 4, Wo = 1, Co = 1
+void Conv2dNeonK15x1S1(const float *input,
+                       const float *filter,
+                       const index_t *in_shape,
+                       const index_t *out_shape,
+                       float *output) {
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
+  const index_t tile_width =
+      out_shape[1] < 4 ?  RoundUpDiv4(out_shape[3]) : out_shape[3];
+
+#pragma omp parallel for collapse(3)
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; ++m) {
+      for (index_t w = 0; w < out_shape[3]; w += tile_width) {
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
+        const index_t in_channels = in_shape[1];
+        const index_t in_width = in_shape[3];
+        float *out_ptr_base =
+            output + b * out_batch_size + m * out_image_size;
+        for (index_t c = 0; c < in_channels; ++c) {
+          const float *in_ptr_base =
+              input + b * in_batch_size + c * in_image_size;
+          const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
+#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
+          /* load filter (1 outch x 1 height x 4 width) */
+          float32x4_t vf0, vf1, vf2, vf3;
+          vf0 = vld1q_f32(filter_ptr);
+          vf1 = vld1q_f32(filter_ptr + 4);
+          vf2 = vld1q_f32(filter_ptr + 8);
+          vf3 = vld1q_f32(filter_ptr + 11);
+
+          for (index_t h = 0; h + 3 < out_height; h += 4) {
+            for (index_t wt = 0; wt < tile_width && w + wt < out_width; ++wt) {
+              // load output
+              index_t out_offset = h * out_width + w + wt;
+              // output (1 outch x 1 height x 4 width): vo_outch_height
+              float32x4_t vo = {out_ptr_base[out_offset],
+                                out_ptr_base[out_offset + out_width],
+                                out_ptr_base[out_offset + 2 * out_width],
+                                out_ptr_base[out_offset + 3 * out_width]};
+
+              // input offset
+              index_t in_offset = h * in_width + w + wt;
+              // input (3 slide)
+              float32x4_t vi0 = {in_ptr_base[in_offset],
+                                 in_ptr_base[in_offset + in_width],
+                                 in_ptr_base[in_offset + 2 * in_width],
+                                 in_ptr_base[in_offset + 3 * in_width]};
+              float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width],
+                                 in_ptr_base[in_offset + 5 * in_width],
+                                 in_ptr_base[in_offset + 6 * in_width],
+                                 in_ptr_base[in_offset + 7 * in_width]};
+              float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width],
+                                 in_ptr_base[in_offset + 9 * in_width],
+                                 in_ptr_base[in_offset + 10 * in_width],
+                                 in_ptr_base[in_offset + 11 * in_width]};
+              float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width],
+                                  in_ptr_base[in_offset + 13 * in_width],
+                                  in_ptr_base[in_offset + 14 * in_width],
+                                  in_ptr_base[in_offset + 15 * in_width]};
+              float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width],
+                                  in_ptr_base[in_offset + 17 * in_width]};
+              float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
+              float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
+              float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
+              float32x4_t vi5 = vextq_f32(vi4, vi8, 1);
+              float32x4_t vi6 = vextq_f32(vi4, vi8, 2);
+              float32x4_t vi7 = vextq_f32(vi4, vi8, 3);
+              float32x4_t vi9 = vextq_f32(vi8, vi12, 1);
+              float32x4_t vi10 = vextq_f32(vi8, vi12, 2);
+              float32x4_t vi11 = vextq_f32(vi8, vi12, 3);
+              float32x4_t vi13 = vextq_f32(vi12, vi16, 1);
+              float32x4_t vi14 = vextq_f32(vi12, vi16, 2);
+
+              vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
+              vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
+              vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
+              vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
+              vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
+              vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
+              vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
+              vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
+              vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
+              vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
+              vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
+              vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
+              vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
+              vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
+              vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
+
+              out_ptr_base[out_offset] = vo[0];
+              out_ptr_base[out_offset + out_width] = vo[1];
+              out_ptr_base[out_offset + 2 * out_width] = vo[2];
+              out_ptr_base[out_offset + 3 * out_width] = vo[3];
+            }  // wt
+          }  // h
+#else
+          Conv2dCPUK15x1Calc(in_ptr_base, filter_ptr, in_width, in_channels,
+                             out_height, out_width, w, tile_width,
+                             out_image_size, out_ptr_base, 0, 1);
+#endif
+        }  // c
+      }  // w
+    }  // m
+  }  // b
+}
+
+}  // namespace kernels
+}  // namespace mace
diff --git a/mace/kernels/arm/conv_2d_neon_1x15.cc b/mace/kernels/arm/conv_2d_neon_1x15.cc
new file mode 100644
index 00000000..0dd39fba
--- /dev/null
+++ b/mace/kernels/arm/conv_2d_neon_1x15.cc
@@ -0,0 +1,149 @@
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if defined(MACE_ENABLE_NEON)
+#include <arm_neon.h>
+#endif
+
+#include "mace/kernels/arm/conv_2d_neon.h"
+#include "mace/utils/utils.h"
+#include "mace/utils/logging.h"
+
+namespace mace {
+namespace kernels {
+
+inline void Conv2dCPUK1x15Calc(const float *in_ptr,
+                               const float *filter_ptr,
+                               const index_t in_width,
+                               const index_t in_channels,
+                               const index_t out_height,
+                               const index_t h,
+                               const index_t tile_height,
+                               const index_t out_width,
+                               const index_t out_image_size,
+                               float *out_ptr,
+                               const index_t io,
+                               const int stride) {
+  for (index_t ih = 0; ih < tile_height && h + ih < out_height; ++ih) {
+    for (index_t iw = 0; iw < out_width; ++iw) {
+      for (int i = 0; i < 1; ++i) {
+        for (int j = 0; j < 15; ++j) {
+          out_ptr[io * out_image_size + (h + ih) * out_width + iw]
+              += in_ptr[((h + ih) * stride + i) * in_width + (iw * stride + j)]
+              * filter_ptr[io * in_channels * 15 + i * 15 + j];
+        }
+      }
+    }
+  }
+}
+
+
+// Ho = 1, Wo = 4, Co = 1
+void Conv2dNeonK1x15S1(const float *input,
+                       const float *filter,
+                       const index_t *in_shape,
+                       const index_t *out_shape,
+                       float *output) {
+  const index_t in_image_size = in_shape[2] * in_shape[3];
+  const index_t out_image_size = out_shape[2] * out_shape[3];
+  const index_t in_batch_size = in_shape[1] * in_image_size;
+  const index_t out_batch_size = out_shape[1] * out_image_size;
+  const index_t tile_height =
+      out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2];
+
+#pragma omp parallel for collapse(3)
+  for (index_t b = 0; b < out_shape[0]; ++b) {
+    for (index_t m = 0; m < out_shape[1]; ++m) {
+      for (index_t h = 0; h < out_shape[2]; h += tile_height) {
+        const index_t out_height = out_shape[2];
+        const index_t out_width = out_shape[3];
+        const index_t in_channels = in_shape[1];
+        const index_t in_width = in_shape[3];
+        float *out_ptr_base =
+            output + b * out_batch_size + m * out_image_size;
+        for (index_t c = 0; c < in_channels; ++c) {
+          const float *in_ptr_base =
+              input + b * in_batch_size + c * in_image_size;
+          const float *filter_ptr = filter + m * in_channels * 15 + c * 15;
+#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__)
+          /* load filter (1 outch x 4 height x 1 width) */
+          float32x4_t vf0, vf1, vf2, vf3;
+          vf0 = vld1q_f32(filter_ptr);
+          vf1 = vld1q_f32(filter_ptr + 4);
+          vf2 = vld1q_f32(filter_ptr + 8);
+          vf3 = vld1q_f32(filter_ptr + 11);
+
+          for (index_t ht = 0; ht < tile_height && h + ht < out_height; ++ht) {
+            for (index_t w = 0; w + 3 < out_width; w += 4) {
+              // output (1 outch x 1 height x 4 width): vo_outch_height
+              float32x4_t vo;
+              // load output
+              index_t out_offset = (h + ht) * out_width + w;
+              vo = vld1q_f32(out_ptr_base + out_offset);
+
+              // input (3 slide)
+              float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9,
+                  vi10, vi11, vi12, vi13, vi14, vi16;
+              // input offset
+              index_t in_offset = (h + ht) * in_width + w;
+              // load input
+              vi0 = vld1q_f32(in_ptr_base + in_offset);
+              vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
+              vi8 = vld1q_f32(in_ptr_base + in_offset + 8);
+              vi12 = vld1q_f32(in_ptr_base + in_offset + 12);
+              vi16 = vld1q_f32(in_ptr_base + in_offset + 16);
+              vi1 = vextq_f32(vi0, vi4, 1);
+              vi2 = vextq_f32(vi0, vi4, 2);
+              vi3 = vextq_f32(vi0, vi4, 3);
+              vi5 = vextq_f32(vi4, vi8, 1);
+              vi6 = vextq_f32(vi4, vi8, 2);
+              vi7 = vextq_f32(vi4, vi8, 3);
+              vi9 = vextq_f32(vi8, vi12, 1);
+              vi10 = vextq_f32(vi8, vi12, 2);
+              vi11 = vextq_f32(vi8, vi12, 3);
+              vi13 = vextq_f32(vi12, vi16, 1);
+              vi14 = vextq_f32(vi12, vi16, 2);
+
+              vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf0), 0);
+              vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf0), 1);
+              vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf0), 0);
+              vo = vmlaq_lane_f32(vo, vi3, vget_high_f32(vf0), 1);
+              vo = vmlaq_lane_f32(vo, vi4, vget_low_f32(vf1), 0);
+              vo = vmlaq_lane_f32(vo, vi5, vget_low_f32(vf1), 1);
+              vo = vmlaq_lane_f32(vo, vi6, vget_high_f32(vf1), 0);
+              vo = vmlaq_lane_f32(vo, vi7, vget_high_f32(vf1), 1);
+              vo = vmlaq_lane_f32(vo, vi8, vget_low_f32(vf2), 0);
+              vo = vmlaq_lane_f32(vo, vi9, vget_low_f32(vf2), 1);
+              vo = vmlaq_lane_f32(vo, vi10, vget_high_f32(vf2), 0);
+              vo = vmlaq_lane_f32(vo, vi11, vget_high_f32(vf2), 1);
+              vo = vmlaq_lane_f32(vo, vi12, vget_low_f32(vf3), 1);
+              vo = vmlaq_lane_f32(vo, vi13, vget_high_f32(vf3), 0);
+              vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
+
+              vst1q_f32(out_ptr_base + out_offset, vo);
+            }  // w
+          }  // ht
+#else
+          Conv2dCPUK1x15Calc(in_ptr_base, filter_ptr, in_width, in_channels,
+                             out_height, h, tile_height, out_width,
+                             out_image_size, out_ptr_base, 0, 1);
+#endif
+        }  // c
+      }  // h
+    }  // m
+  }  // b
+}
+
+}  // namespace kernels
+}  // namespace mace
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index eb374960..7a0b8328 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -363,6 +363,10 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
         && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1;
     bool use_neon_7x7_s3 = filter_h == 7 && filter_w == 7
         && stride_h == 3 && stride_w == 3 && dilation_h == 1 && dilation_w == 1;
+    bool use_neon_1x15_s1 = filter_h == 1 && filter_w == 15
+        && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
+    bool use_neon_15x1_s1 = filter_h == 15 && filter_w == 1
+        && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
 
     std::vector<index_t> transformed_input_shape;
     std::vector<index_t> transformed_output_shape;
@@ -402,24 +406,26 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                                        tile_count});
       transformed_filter_shape.insert(transformed_filter_shape.end(),
                                       {in_tile_area, channels, input_channels});
-    } else if (use_neon_3x3_s1) {
-      extra_output_height = RoundUp<index_t>(height, 2);
-      extra_input_height =
-        std::max(padded_input_height, extra_output_height + 2);
-      extra_output_width = RoundUp<index_t>(width, 4);
-      extra_input_width = std::max(padded_input_width, extra_output_width + 2);
-      if (extra_input_height != padded_input_height) {
-        pad_bottom += (extra_input_height - padded_input_height);
-      }
-      if (extra_input_width != padded_input_width) {
-        pad_right += (extra_input_width - padded_input_width);
+    } else {
+      index_t tile_h, tile_w;
+      if (use_neon_1x1_s1) {
+        tile_h = 1;
+        tile_w = 1;
+      } else if (use_neon_3x3_s1) {
+        tile_h = 2;
+        tile_w = 4;
+      } else if (use_neon_15x1_s1) {
+        tile_h = 4;
+        tile_w = 1;
+      } else {
+        tile_h = 1;
+        tile_w = 4;
       }
-    } else if (!use_neon_1x1_s1) {
-      extra_output_height = height;
+      extra_output_height = RoundUp<index_t>(height, tile_h);
       extra_input_height =
           std::max(padded_input_height, (extra_output_height - 1) * stride_h
               + (filter_h - 1) * dilation_h + 1);
-      extra_output_width = RoundUp<index_t>(width, 4);
+      extra_output_width = RoundUp<index_t>(width, tile_w);
       extra_input_width =
           std::max(padded_input_width, (extra_output_width - 1) * stride_w
               + (filter_w - 1) * dilation_w + 1);
@@ -584,6 +590,22 @@ struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
                          extra_output_shape,
                          pad_output);
       };
+    } else if (use_neon_1x15_s1) {
+      conv_func = [=](const float *pad_input, float *pad_output) {
+        Conv2dNeonK1x15S1(pad_input,
+                         filter_data,
+                         extra_input_shape,
+                         extra_output_shape,
+                         pad_output);
+      };
+    } else if (use_neon_15x1_s1) {
+      conv_func = [=](const float *pad_input, float *pad_output) {
+        Conv2dNeonK15x1S1(pad_input,
+                          filter_data,
+                          extra_input_shape,
+                          extra_output_shape,
+                          pad_output);
+      };
     } else {
       conv_func = [=](const float *pad_input, float *pad_output) {
         Conv2dGeneral(pad_input,
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index b795e127..d935c503 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -165,10 +165,14 @@ BM_CONV_2D(1, 32, 256, 256, 3, 3, 1, 4, VALID, 32);
 BM_CONV_2D(1, 128, 56, 56, 1, 1, 1, 1, SAME, 128);
 BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, 1, SAME, 1024);
 
-
 BM_CONV_2D(64, 32, 34, 34, 3, 3, 1, 1, VALID, 32);
 BM_CONV_2D(1, 32, 34, 34, 3, 3, 1, 1, VALID, 32);
 
+// bokeh
+BM_CONV_2D(1, 32, 256, 256, 1, 15, 1, 1, SAME, 2);
+BM_CONV_2D(1, 32, 256, 256, 15, 1, 1, 1, SAME, 2);
+BM_CONV_2D(1, 64, 64, 64, 15, 1, 1, 1, SAME, 2);
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index ea50b0c1..543e2ac9 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -779,11 +779,17 @@ TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv15x1S12) {
   TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {15, 1, 256, 2},
                                                 {1, 1});
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({64, 64}, {15, 1, 64, 2},
+                                             {1, 1});
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({256, 256}, {15, 1, 32, 2},
+                                             {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv1x15S12) {
   TestHalfComplexConvNxNS12<DeviceType::GPU>({32, 32}, {1, 15, 256, 2},
                                                 {1, 1});
+  TestHalfComplexConvNxNS12<DeviceType::GPU>({256, 256}, {1, 15, 32, 2},
+                                             {1, 1});
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv7x75S12) {
-- 
GitLab