Commit 8bb5716a authored by Liangliang He

Merge branch 'conv2d-neon' into 'master'

Neon conv2d 3x3 stride 2 kernel.

See merge request !44
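The merge adds a specialized NEON fast path for 3x3 filters at stride 2. As a reference for what that kernel computes, here is a minimal scalar sketch, assuming NCHW input, OIHW (c_out, c_in, kernel_h, kernel_w) filters, per-output-channel bias, and an input already padded for the requested output shape; names and layout handling here are illustrative, not the commit's code. NEON versions of this loop typically de-interleave the stride-2 columns with vld2q_f32 and keep the nine filter taps in registers.

    #include <cstdint>

    typedef int64_t index_t;

    // Reference semantics of a 3x3 stride-2 convolution over NCHW data
    // with OIHW filters and per-output-channel bias. A NEON kernel
    // produces the same result, vectorized over output columns.
    void Conv2dK3x3S2Reference(const float *input, const index_t *input_shape,
                               const float *filter, const float *bias,
                               float *output, const index_t *output_shape) {
      const index_t batch = output_shape[0], out_c = output_shape[1];
      const index_t out_h = output_shape[2], out_w = output_shape[3];
      const index_t in_c = input_shape[1];
      const index_t in_h = input_shape[2], in_w = input_shape[3];
      for (index_t n = 0; n < batch; ++n) {
        for (index_t oc = 0; oc < out_c; ++oc) {
          for (index_t oh = 0; oh < out_h; ++oh) {
            for (index_t ow = 0; ow < out_w; ++ow) {
              float sum = bias ? bias[oc] : 0.0f;
              for (index_t ic = 0; ic < in_c; ++ic) {
                for (index_t kh = 0; kh < 3; ++kh) {
                  for (index_t kw = 0; kw < 3; ++kw) {
                    const index_t ih = oh * 2 + kh;  // stride 2 in height
                    const index_t iw = ow * 2 + kw;  // stride 2 in width
                    sum += input[((n * in_c + ic) * in_h + ih) * in_w + iw] *
                           filter[((oc * in_c + ic) * 3 + kh) * 3 + kw];
                  }
                }
              }
              output[((n * out_c + oc) * out_h + oh) * out_w + ow] = sum;
            }
          }
        }
      }
    }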
@@ -22,6 +22,13 @@ extern void Conv2dNeonK3x3S1(const float *input,
                               float *output,
                               const index_t *output_shape);
+extern void Conv2dNeonK3x3S2(const float *input,
+                             const index_t *input_shape,
+                             const float *filter,
+                             const float *bias,
+                             float *output,
+                             const index_t *output_shape);
 extern void Conv2dNeonK5x5S1(const float *input,
                              const index_t *input_shape,
                              const float *filter,
@@ -30,27 +37,25 @@ extern void Conv2dNeonK5x5S1(const float *input,
                              const index_t *output_shape);
 
 template <>
-void Conv2dFunctor<DeviceType::NEON,
-                   float>::
-operator()(const float *input,  // NCHW
-           const index_t *input_shape,
-           const float *filter,  // c_out, c_in, kernel_h, kernel_w
-           const index_t *filter_shape,
-           const float *bias,  // c_out
-           float *output,  // NCHW
-           const index_t *output_shape) {
+void Conv2dFunctor<DeviceType::NEON, float>::operator()(const float *input,
+                                                        const index_t *input_shape,
+                                                        const float *filter,
+                                                        const index_t *filter_shape,
+                                                        const float *bias,
+                                                        float *output,
+                                                        const index_t *output_shape) {
   typedef void (*Conv2dNeonFunction)(
-      const float *input,  // NCHW
+      const float *input,
       const index_t *input_shape,
-      const float *filter,  // c_out, c_in, kernel_h, kernel_w
-      const float *bias,  // c_out
-      float *output,  // NCHW
+      const float *filter,
+      const float *bias,
+      float *output,
       const index_t *output_shape);
   // Selection matrix: kernel_size x stride_size
   static const Conv2dNeonFunction selector[5][2] = {
       {Conv2dNeonK1x1S1, nullptr},
       {nullptr, nullptr},
-      {Conv2dNeonK3x3S1, nullptr},
+      {Conv2dNeonK3x3S1, Conv2dNeonK3x3S2},
       {nullptr, nullptr},
       {Conv2dNeonK5x5S1, nullptr}};
   // not implement yet
@@ -59,7 +64,10 @@ operator()(const float *input,  // NCHW
   if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] ||
       strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 ||
       selector[kernel_h - 1][strides_[0] - 1] == nullptr) {
-    LOG(WARNING) << "NEON conv2d kernel not implementated, using slow vesion";
+    LOG(WARNING) << "NEON conv2d kernel with "
+                 << "filter" << kernel_h << "x" << kernel_w << ","
+                 << " stride " << strides_[0] << "x" << strides_[1]
+                 << " is not implemented yet, using slow version";
     Conv2dFunctor<DeviceType::CPU, float>(strides_, paddings_, dilations_)(
         input, input_shape, filter, filter_shape, bias, output, output_shape);
     return;
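The functor picks a fast path by indexing a (kernel size) x (stride) table of function pointers, falling back to the generic CPU functor when the slot is nullptr. A stand-alone illustration of the same dispatch pattern, with hypothetical kernel names:

    #include <cstdio>

    // A (kernel size) x (stride) table of function pointers; nullptr marks
    // combinations that fall back to the generic implementation.
    typedef void (*KernelFn)();
    void K1x1S1() { std::puts("1x1 s1 fast path"); }
    void K3x3S1() { std::puts("3x3 s1 fast path"); }
    void K3x3S2() { std::puts("3x3 s2 fast path"); }
    void GenericConv2d() { std::puts("generic slow path"); }

    static const KernelFn kSelector[5][2] = {
        {K1x1S1, nullptr},
        {nullptr, nullptr},
        {K3x3S1, K3x3S2},
        {nullptr, nullptr},
        {nullptr, nullptr}};

    void Dispatch(int kernel, int stride) {
      // Guard the table bounds as the functor does, then fall back when no
      // specialized kernel is registered for this (kernel, stride) pair.
      if (kernel > 5 || stride > 2 ||
          kSelector[kernel - 1][stride - 1] == nullptr) {
        GenericConv2d();
        return;
      }
      kSelector[kernel - 1][stride - 1]();
    }

    int main() {
      Dispatch(3, 2);  // takes the new 3x3 s2 fast path
      Dispatch(5, 2);  // no 5x5 s2 kernel: generic slow path
    }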
......
@@ -61,8 +61,7 @@ static void Conv2d(int iters,
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
     mace::testing::ItemsProcessed(tot); \
     mace::testing::BytesProcessed(tot*(sizeof(TYPE))); \
-    Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \
-                         OC); \
+    Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, OC); \
   } \
   BENCHMARK( \
       BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_OC##_##TYPE##_##DEVICE)
@@ -77,6 +76,10 @@ BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
 BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
+BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
+BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
+BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
+BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
 BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
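The four new cases exercise the stride-2 path under both paddings, including the non-aligned 33x31 inputs. For orientation, the standard conv2d output-size formulas (not necessarily MACE's exact helper) give 15x15 for VALID and 16x16 for SAME on the 32x32 input:

    #include <cstdio>

    // VALID: out = (in - kernel) / stride + 1
    // SAME:  out = ceil(in / stride) = (in + stride - 1) / stride
    int main() {
      const int in_h = 32, in_w = 32, k = 3, s = 2;
      std::printf("VALID: %dx%d\n", (in_h - k) / s + 1, (in_w - k) / s + 1);  // 15x15
      std::printf("SAME:  %dx%d\n", (in_h + s - 1) / s, (in_w + s - 1) / s);  // 16x16
    }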
......
@@ -174,8 +174,8 @@ TEST_F(Conv2dOpTest, ConvNxNS12) {
     // generate random input
     index_t batch = 1 + rand() % 10;
     index_t input_channels = 1 + rand() % 50;
-    index_t height = 7 + rand() % 100;
-    index_t width = 7 + rand() % 100;
+    index_t height = 11 + rand() % 100;
+    index_t width = 11 + rand() % 100;
     index_t output_channels = 1 + rand() % 50;
     // Construct graph
     auto& net = test_net();
......
@@ -155,9 +155,9 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
   net.RunOp(DeviceType::NEON);
 
   // Check
-  Tensor expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});
-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
 
 TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
@@ -183,7 +183,7 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
   net.RunOp(DeviceType::NEON);
 
   // Check
-  Tensor expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});
-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
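Both pooling tests switch from binding a Tensor by value to `auto` plus a dereference at the comparison, which suggests the CreateTensor test helper now returns a smart pointer. A minimal sketch of that reading, using a stand-in Tensor type rather than MACE's real one; the signature is an assumption, not the commit's code:

    #include <cstdint>
    #include <memory>
    #include <vector>

    typedef int64_t index_t;

    // Stand-in for the test Tensor type; the real MACE Tensor differs.
    struct Tensor {
      std::vector<index_t> shape;
      std::vector<float> data;
    };

    // If CreateTensor returns std::unique_ptr<Tensor> instead of a Tensor
    // by value, the tests must bind it with `auto` and pass `*expected`
    // when comparing, exactly as the diff shows.
    std::unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
                                         const std::vector<float> &data) {
      std::unique_ptr<Tensor> t(new Tensor);
      t->shape = shape;
      t->data = data;
      return t;
    }

    int main() {
      auto expected = CreateTensor({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});
      // ExpectTensorNear<float>(*expected, actual, 0.001) would receive
      // the dereferenced Tensor here.
      (void)expected;
    }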