Commit 52cc0540 authored by liuqi

Finish conv 3x3 with stride 1 and 2.

Parent 0461beb5
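The new conv_2d_3x3 kernel below computes up to 4 output channels and 4 horizontally adjacent output pixels per work item, with separate stride-1 and stride-2 row helpers. As a rough host-side sketch of the work decomposition, assuming NCHW tensors (illustrative only; the names here are hypothetical and not part of the commit):

#include <cstdint>

// One work item per (batch, 4-channel block, 4-pixel block along the width).
struct GlobalWorkSize3x3 {
  uint32_t batch;
  uint32_t channel_blocks;  // (out_channels + 3) / 4
  uint32_t pixel_blocks;    // (out_width + 3) / 4 * out_height
};

inline GlobalWorkSize3x3 ComputeGlobalWorkSize3x3(uint32_t batch,
                                                  uint32_t out_channels,
                                                  uint32_t out_height,
                                                  uint32_t out_width) {
  return {batch, (out_channels + 3) / 4, (out_width + 3) / 4 * out_height};
}

This mirrors channel_blocks and pixel_blocks in the host-side launch code further down (InnerConv2dK3x3S12); inside the kernel, get_global_id(1) selects the channel block and get_global_id(2) selects the pixel block.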
float4 conv1x3_s1(const float *input_ptr,
                  const float *filter_ptr);
float4 conv1x3_s2(const float *input_ptr,
                  const float *filter_ptr);
float conv3x3(const float *input_ptr,
              const float *filter_ptr,
              const int row_width);

void kernel conv_2d_3x3(global const float *input,
                        global const float *filter,
                        global const float *bias,
                        global float *output,
                        private const uint in_chan_num,
                        private const uint out_chan_num,
                        private const uint in_height,
                        private const uint in_width,
                        private const uint out_height,
                        private const uint out_width,
                        private const uint stride_h,
                        private const uint stride_w) {
  int batch = get_global_id(0);
  int out_chan_blk = get_global_id(1);
  int out_pixel_blk = get_global_id(2);

  const uint in_pixel = in_height * in_width;
  const uint out_pixel = out_height * out_width;

  // Each work item covers up to 4 output channels and 4 adjacent output pixels.
  const uint round_out_width = (out_width + 3) / 4;
  const uint out_pixel_height = out_pixel_blk / round_out_width;
  const uint out_pixel_width = out_pixel_blk % round_out_width;

  const uint out_chan_begin = out_chan_blk * 4;
  const uint out_chan_end = min(out_chan_begin + 4, out_chan_num);
  const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
  const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
  const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;

  const uint in_offset = batch * in_chan_num * in_pixel;
  const uint out_offset = batch * out_chan_num * out_pixel;
  const float *input_base = input + in_offset + in_pixel_begin;
  float *output_base = output + out_offset + out_pixel_begin;

  uint pixels = out_pixel_end - out_pixel_begin;

  for (uint i = out_chan_begin; i < out_chan_end; ++i) {
    float4 res = (float4)bias[i];
    float *output_ptr = output_base + i * out_pixel;
    const float *filter_base = filter + i * in_chan_num * 9;
    if (pixels == 4) {
      // Fast path: 4 full output pixels, accumulated with vectorized 1x3 rows.
      for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
        const float *input_ptr = input_base + in_chan_idx * in_pixel;
        const float *filter_ptr = filter_base + in_chan_idx * 9;
        if (stride_w == 1) {
          res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
          res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
          res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
        } else {
          res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
          res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
          res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
        }
      }
      vstore4(res, 0, output_ptr);
    } else {
      // Tail path: fewer than 4 pixels remain at the end of the row; compute them one by one.
      for (uint p = 0; p < pixels; ++p) {
        float res = bias[i];
        for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
          const float *input_ptr = input_base + in_chan_idx * in_pixel + p * stride_w;
          const float *filter_ptr = filter_base + in_chan_idx * 9;
          res += conv3x3(input_ptr, filter_ptr, in_width);
        }
        output_ptr[p] = res;
      }
    }
  }
}
float4 conv1x3_s1(const float *input_ptr,
                  const float *filter_ptr) {
  // Stride 1: four overlapping 1x3 windows starting at columns 0..3 of the row.
  float4 row0 = vload4(0, input_ptr);
  float2 input1 = vload2(0, input_ptr + 4);
  float4 row1 = (float4)(row0.s123, input1.s0);
  float4 row2 = (float4)(row0.s23, input1.s01);
  float3 filter_values = vload3(0, filter_ptr);
  return (float4)filter_values.s0 * row0 +
         (float4)filter_values.s1 * row1 +
         (float4)filter_values.s2 * row2;
}

float4 conv1x3_s2(const float *input_ptr,
                  const float *filter_ptr) {
  // Stride 2: even lanes are the window starts, odd lanes the next column.
  float8 input = vload8(0, input_ptr);
  float4 row0 = input.even;
  float4 row1 = input.odd;
  float4 row2 = (float4)(row0.s123, input_ptr[8]);
  float3 filter_values = vload3(0, filter_ptr);
  return (float4)filter_values.s0 * row0 +
         (float4)filter_values.s1 * row1 +
         (float4)filter_values.s2 * row2;
}

float conv3x3(const float *input_ptr,
              const float *filter_ptr,
              const int row_width) {
  // Scalar 3x3 accumulation used for the unaligned tail pixels.
  float3 input_value = vload3(0, input_ptr);
  float3 filter_value = vload3(0, filter_ptr);
  float3 res = input_value * filter_value;

  input_ptr += row_width;
  input_value = vload3(0, input_ptr);
  filter_value = vload3(1, filter_ptr);
  res += input_value * filter_value;

  input_ptr += row_width;
  input_value = vload3(0, input_ptr);
  filter_value = vload3(2, filter_ptr);
  res += input_value * filter_value;

  return res.s0 + res.s1 + res.s2;
}
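For readers checking the lane math in conv1x3_s1 and conv1x3_s2 above, here is a plain scalar reference written as a hedged C++ sketch (the function name is hypothetical and meant only for illustration or testing):

// out[i] = input[i * stride + 0] * filter[0] + input[i * stride + 1] * filter[1]
//        + input[i * stride + 2] * filter[2], for i = 0..3.
void Conv1x3Reference(const float *input, const float *filter, int stride,
                      float out[4]) {
  for (int i = 0; i < 4; ++i) {
    out[i] = input[i * stride + 0] * filter[0] +
             input[i * stride + 1] * filter[1] +
             input[i * stride + 2] * filter[2];
  }
}

With stride == 1 this matches the overlapping windows of conv1x3_s1; with stride == 2 it matches the even/odd lane split of conv1x3_s2.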
inline float4 conv1x3(const float *input_ptr,
                      const float *filter_ptr) {
  float8 input = vload8(0, input_ptr);
  float4 row0 = convert_float4(input.s0123);
  float4 row1 = convert_float4(input.s1234);
  float4 row2 = convert_float4(input.s2345);
  return (float4)filter_ptr[0] * row0 + (float4)filter_ptr[1] * row1
      + (float4)filter_ptr[2] * row2;
}

inline float4 conv3x3x4(const float *input_ptr,
                        const float *filter_ptr,
                        const int row_width) {
  float4 res;
  res = conv1x3(input_ptr + 0 * row_width, filter_ptr + 0 * 3);
  res += conv1x3(input_ptr + 1 * row_width, filter_ptr + 1 * 3);
  res += conv1x3(input_ptr + 2 * row_width, filter_ptr + 2 * 3);
  return res;
}
float4 conv1x3_s1(const float *input_ptr,
                  const float *filter_ptr);
float conv3x3(const float *input_ptr,
              const float *filter_ptr,
              const int row_width);

inline float conv3x3(const float *input_ptr,
                     const float *filter_ptr,
                     const int row_width) {
  float res = input_ptr[0] * filter_ptr[0] + input_ptr[1] * filter_ptr[1] + input_ptr[2] * filter_ptr[2];
  input_ptr += row_width;
  filter_ptr += 3;
  res += input_ptr[0] * filter_ptr[0] + input_ptr[1] * filter_ptr[1] + input_ptr[2] * filter_ptr[2];
  input_ptr += row_width;
  filter_ptr += 3;
  res += input_ptr[0] * filter_ptr[0] + input_ptr[1] * filter_ptr[1] + input_ptr[2] * filter_ptr[2];
  return res;
}
void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
global const float *filter, /* m, i, kh, kw */
......@@ -80,8 +51,10 @@ void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
input_ptr += 1;
}
} else {
float4 res = conv3x3x4(input_ptr, filter_ptr, in_width);
res += (float4)bias_value;
float4 res = (float4)bias_value;
res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
vstore4(res, 0, output_ptr);
}
}
......
......@@ -3,7 +3,6 @@
//
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace mace {
namespace kernels {
......@@ -11,6 +10,11 @@ namespace kernels {
extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
const Tensor *bias, Tensor *output);
template <>
void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
const Tensor *filter,
......@@ -22,7 +26,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
static const Conv2dOpenclFunction selector[5][2] = {
{Conv2dOpenclK1x1S1, nullptr},
{nullptr, nullptr},
{nullptr, nullptr},
{Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
{nullptr, nullptr},
{nullptr, nullptr}};
......@@ -40,11 +44,16 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
input, filter, bias, output);
return;
}
MACE_CHECK(paddings_[0] == 0 && paddings_[1] == 0, "Padding not supported");
auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
conv2d_func(input, filter, bias, output);
  if (paddings_[0] > 0 || paddings_[1] > 0) {
    // The OpenCL kernels do not handle padding themselves, so pad the input on
    // the host before launching.
    Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
    Tensor::MappingGuard input_mapper(input);
    ConstructInputWithPadding(input->data<float>(), input->shape().data(), paddings_.data(),
                              &padded_input);
    conv2d_func(&padded_input, filter, bias, output);
  } else {
    conv2d_func(input, filter, bias, output);
  }
}
} // namespace kernels
......
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"

namespace mace {
namespace kernels {

static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
                               const Tensor *bias, const uint32_t stride,
                               Tensor *output) {
  const index_t channels = output->shape()[1];
  const index_t height = output->shape()[2];
  const index_t width = output->shape()[3];

  MACE_CHECK(input->dim(0) == output->dim(0));

  // One work item per 4 output channels and per 4 output pixels along the width.
  const index_t channel_blocks = (channels + 3) / 4;
  const index_t pixel_blocks = (width + 3) / 4 * height;

  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();
  auto bm_kernel = cl::Kernel(program, "conv_2d_3x3");

  uint32_t idx = 0;
  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));  // in_chan_num
  bm_kernel.setArg(idx++, static_cast<uint32_t>(channels));       // out_chan_num
  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));  // in_height
  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));  // in_width
  bm_kernel.setArg(idx++, static_cast<uint32_t>(height));         // out_height
  bm_kernel.setArg(idx++, static_cast<uint32_t>(width));          // out_width
  bm_kernel.setArg(idx++, stride);                                // stride_h
  bm_kernel.setArg(idx++, stride);                                // stride_w

  const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                           static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(pixel_blocks)};
  const uint32_t lws[3] = {static_cast<uint32_t>(1),
                           static_cast<uint32_t>(1),
                           static_cast<uint32_t>(256)};
  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
      bm_kernel, cl::NullRange,
      cl::NDRange(gws[0], gws[1], gws[2]),
      cl::NDRange(lws[0], lws[1], lws[2]));
  MACE_CHECK(error == CL_SUCCESS);
}

void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                        const Tensor *bias, Tensor *output) {
  InnerConv2dK3x3S12(input, filter, bias, 1, output);
}

void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
                        const Tensor *bias, Tensor *output) {
  InnerConv2dK3x3S12(input, filter, bias, 2, output);
}

} // namespace kernels
} // namespace mace
......@@ -3,7 +3,6 @@
//
#include <algorithm>
#include <sstream>
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
......@@ -14,7 +13,6 @@ namespace mace {
template <DeviceType D, typename T>
static void Conv2d(int iters,
int iters_to_sync,
int batch,
int channels,
int height,
......@@ -32,37 +30,32 @@ static void Conv2d(int iters,
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
.Finalize(net.operator_def());
// Add args
net.AddIntsArg("strides", {stride, stride});
net.AddIntArg("padding", padding);
net.AddIntsArg("dilations", {1, 1});
// Add input data
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h, kernel_w});
{output_channels, channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
net.Sync();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
if (iters % iters_to_sync == 0) {
net.Sync();
}
}
net.Sync();
}
// A typical network has more than one layer; syncing only every few iterations
// approximates the amortized per-layer latency. The OpenCL runtime for
// Mali/Adreno is in-order.
constexpr int kItersToSync = 10;
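As a generic illustration of the amortized-timing pattern described in the comment above (the names below are hypothetical and not the MACE benchmark API):

#include <chrono>

// Enqueue `iters` runs but only block on the in-order queue every
// `iters_to_sync` iterations, so the measured time amortizes per-enqueue
// overhead the way a multi-layer network would.
template <typename RunFn, typename SyncFn>
double AmortizedLatencyMs(int iters, int iters_to_sync, RunFn run, SyncFn sync) {
  auto start = std::chrono::steady_clock::now();
  for (int i = 1; i <= iters; ++i) {
    run();                               // e.g. enqueue one layer's kernels
    if (i % iters_to_sync == 0) sync();  // e.g. finish the command queue
  }
  sync();  // drain whatever is still in flight
  auto end = std::chrono::steady_clock::now();
  return std::chrono::duration<double, std::milli>(end - start).count() / iters;
}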
#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, DEVICE) \
static void \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
......@@ -70,8 +63,8 @@ constexpr int kItersToSync = 10;
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<DEVICE, TYPE>(iters, kItersToSync, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, OC); \
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \
OC); \
} \
BENCHMARK( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
......
......@@ -3,16 +3,15 @@
//
#include "mace/ops/conv_2d.h"
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
using namespace mace;
class Conv2dOpTest : public OpsTestBase {};
TEST_F(Conv2dOpTest, Simple_VALID) {
// Construct graph
auto &net = test_net();
template <DeviceType D>
void TestSimple3x3VALID() {
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
......@@ -26,27 +25,28 @@ TEST_F(Conv2dOpTest, Simple_VALID) {
// Add args
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Input", {1, 2, 3, 3},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<DeviceType::CPU, float>("Bias", {1}, {0.1f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
// Run
net.RunOp();
net.RunOp(D);
// Check
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, Simple_SAME) {
// Construct graph
auto &net = test_net();
template <DeviceType D>
void TestSimple3x3SAME() {
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
......@@ -58,17 +58,17 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Input", {1, 2, 3, 3},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Filter", {1, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<DeviceType::CPU, float>("Bias", {1}, {0.1f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
// Run
net.RunOp();
net.RunOp(D);
// Check
auto expected = CreateTensor<float>(
......@@ -78,9 +78,25 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, Combined) {
TEST_F(Conv2dOpTest, CPUSimple) {
TestSimple3x3VALID<DeviceType::CPU>();
TestSimple3x3SAME<DeviceType::CPU>();
}
TEST_F(Conv2dOpTest, NEONSimple) {
TestSimple3x3VALID<DeviceType::NEON>();
TestSimple3x3SAME<DeviceType::NEON>();
}
TEST_F(Conv2dOpTest, OPENCLSimple) {
TestSimple3x3VALID<DeviceType::OPENCL>();
TestSimple3x3SAME<DeviceType::OPENCL>();
}
template <DeviceType D>
static void TestCombined3x3() {
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
......@@ -92,19 +108,19 @@ TEST_F(Conv2dOpTest, Combined) {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Input", {1, 2, 5, 5}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<DeviceType::CPU, float>(
net.AddInputFromArray<D, float>(
"Filter", {2, 2, 3, 3},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f});
net.AddInputFromArray<DeviceType::CPU, float>("Bias", {2}, {0.1f, 0.2f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
// Run
net.RunOp();
net.RunOp(D);
// Check
auto expected = CreateTensor<float>(
......@@ -112,6 +128,19 @@ TEST_F(Conv2dOpTest, Combined) {
4.2f, 6.2f, 4.2f, 6.2f, 9.2f, 6.2f, 4.2f, 6.2f, 4.2f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, CPUCombined) {
TestCombined3x3<DeviceType::CPU>();
}
TEST_F(Conv2dOpTest, NEONCombined) {
TestCombined3x3<DeviceType::NEON>();
}
TEST_F(Conv2dOpTest, OPENCLCombined) {
TestCombined3x3<DeviceType::OPENCL>();
}
template <DeviceType D>
......@@ -159,13 +188,16 @@ void TestConv1x1() {
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, Conv1x1) {
TEST_F(Conv2dOpTest, CPUConv1x1) {
TestConv1x1<DeviceType::CPU>();
}
TEST_F(Conv2dOpTest, OPENCLConv1x1) {
TestConv1x1<DeviceType::OPENCL>();
}
// TODO we need more tests
TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
template <DeviceType D>
static void TestAlignedConvNxNS12() {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
......@@ -178,7 +210,7 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
index_t width = 32;
index_t output_channels = 128;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
......@@ -190,19 +222,19 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, input_channels, height, width});
net.AddRandomInput<DeviceType::CPU, float>(
net.AddRandomInput<D, float>("Input", {batch, input_channels, height, width});
net.AddRandomInput<D, float>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<DeviceType::CPU, float>("Bias", {output_channels});
// run cpu
net.RunOp();
net.AddRandomInput<D, float>("Bias", {output_channels});
// Run on device
net.RunOp(D);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run NEON
net.RunOp(DeviceType::NEON);
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
};
......@@ -214,7 +246,16 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
}
}
TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
TEST_F(Conv2dOpTest, NEONAlignedConvNxNS12) {
TestAlignedConvNxNS12<DeviceType::NEON>();
}
TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
TestAlignedConvNxNS12<DeviceType::OPENCL>();
}
template <DeviceType D>
static void TestUnalignedConvNxNS12() {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
......@@ -227,7 +268,7 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
index_t width = 113;
index_t output_channels = 3 + rand() % 10;
// Construct graph
auto &net = test_net();
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
......@@ -239,19 +280,19 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, input_channels, height, width});
net.AddRandomInput<DeviceType::CPU, float>(
net.AddRandomInput<D, float>("Input", {batch, input_channels, height, width});
net.AddRandomInput<D, float>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w});
net.AddRandomInput<DeviceType::CPU, float>("Bias", {output_channels});
// run cpu
net.RunOp();
net.AddRandomInput<D, float>("Bias", {output_channels});
// Run on device
net.RunOp(D);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run NEON
net.RunOp(DeviceType::NEON);
// run cpu
net.RunOp();
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
};
......@@ -262,3 +303,11 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
}
}
}
TEST_F(Conv2dOpTest, NEONUnalignedConvNxNS12) {
TestUnalignedConvNxNS12<DeviceType::NEON>();
}
TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
TestUnalignedConvNxNS12<DeviceType::OPENCL>();
}
......@@ -57,7 +57,7 @@ static void DepthwiseConv2d(int iters,
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
DEVICE) \
static void \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
......@@ -66,7 +66,7 @@ static void DepthwiseConv2d(int iters,
mace::Padding::P, OC); \
} \
BENCHMARK( \
BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
......