diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
new file mode 100644
index 0000000000000000000000000000000000000000..1c97a581b9527a41ff48c4d40e1c97e74329bba0
--- /dev/null
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -0,0 +1,76 @@
+float4 conv1x3_s1(const float *input_ptr,
+                  const float *filter_ptr);
+float4 conv1x3_s2(const float *input_ptr,
+                  const float *filter_ptr);
+float conv3x3(const float *input_ptr,
+              const float *filter_ptr,
+              const int row_width);
+
+void kernel conv_2d_3x3(global const float *input,
+                        global const float *filter,
+                        global const float *bias,
+                        global float *output,
+                        private const uint in_chan_num,
+                        private const uint out_chan_num,
+                        private const uint in_height,
+                        private const uint in_width,
+                        private const uint out_height,
+                        private const uint out_width,
+                        private const uint stride_h,
+                        private const uint stride_w) {
+  int batch = get_global_id(0);
+  int out_chan_blk = get_global_id(1);
+  int out_pixel_blk = get_global_id(2);
+
+  const uint in_pixel = in_height * in_width;
+  const uint out_pixel = out_height * out_width;
+
+  const uint round_out_width = (out_width + 3) / 4;
+  const uint out_pixel_height = out_pixel_blk / round_out_width;
+  const uint out_pixel_width = out_pixel_blk % round_out_width;
+
+  const uint out_chan_begin = out_chan_blk * 4;
+  const uint out_chan_end = min(out_chan_begin + 4, out_chan_num);
+  const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
+  const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
+  const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
+
+  const uint in_offset = batch * in_chan_num * in_pixel;
+  const uint out_offset = batch * out_chan_num * out_pixel;
+  const float *input_base = input + in_offset + in_pixel_begin;
+  float *output_base = output + out_offset + out_pixel_begin;
+
+  uint pixels = out_pixel_end - out_pixel_begin;
+
+  for (uint i = out_chan_begin; i < out_chan_end; ++i) {
+    float4 res = (float4)bias[i];
+    float *output_ptr = output_base + i * out_pixel;
+    const float *filter_base = filter + i * in_chan_num * 9;
+    if (pixels == 4) {
+      for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
+        const float* input_ptr = input_base + in_chan_idx * in_pixel;
+        const float* filter_ptr = filter_base + in_chan_idx * 9;
+        if (stride_w == 1) {
+          res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+          res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+          res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+        } else {
+          res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+          res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+          res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+        }
+      }
+      vstore4(res, 0, output_ptr);
+    } else {
+      for (uint p = 0; p < pixels; ++p) {
+        float res = bias[i];
+        for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
+          const float* input_ptr = input_base + in_chan_idx * in_pixel + p * stride_w;
+          const float* filter_ptr = filter_base + in_chan_idx * 9;
+          res += conv3x3(input_ptr, filter_ptr, in_width);
+        }
+        output_ptr[p] = res;
+      }
+    }
+  }
+}
diff --git a/mace/kernels/opencl/cl/conv_helper.cl b/mace/kernels/opencl/cl/conv_helper.cl
new file mode 100644
index 0000000000000000000000000000000000000000..f569ef3a51dd6a5069644ecf9e2780f8a5bd3920
--- /dev/null
+++ b/mace/kernels/opencl/cl/conv_helper.cl
@@ -0,0 +1,41 @@
+float4 conv1x3_s1(const float *input_ptr,
+                  const float *filter_ptr) {
+  float4 row0 = vload4(0, input_ptr);
+  float2 input1 = vload2(0, input_ptr+4);
+  float4 row1 = (float4)(row0.s123, input1.s0);
+  float4 row2 = (float4)(row0.s23, input1.s01);
+  float3 filter_values = vload3(0, filter_ptr);
+  return (float4)filter_values.s0 * row0 +
+         (float4)filter_values.s1 * row1 +
+         (float4)filter_values.s2 * row2;
+}
+
+float4 conv1x3_s2(const float *input_ptr,
+                  const float *filter_ptr) {
+  float8 input = vload8(0, input_ptr);
+  float4 row0 = input.even;
+  float4 row1 = input.odd;
+  float4 row2 = (float4)(row0.s123, input_ptr[8]);
+  float3 filter_values = vload3(0, filter_ptr);
+  return (float4)filter_values.s0 * row0 +
+         (float4)filter_values.s1 * row1 +
+         (float4)filter_values.s2 * row2;
+}
+
+float conv3x3(const float *input_ptr,
+              const float *filter_ptr,
+              const int row_width) {
+  float3 input_value = vload3(0, input_ptr);
+  float3 filter_value = vload3(0, filter_ptr);
+  float3 res = input_value * filter_value;
+  input_ptr += row_width;
+  input_value = vload3(0, input_ptr);
+  filter_value = vload3(1, filter_ptr);
+  res += input_value * filter_value;
+  input_ptr += row_width;
+  input_value = vload3(0, input_ptr);
+  filter_value = vload3(2, filter_ptr);
+  res += input_value * filter_value;
+
+  return res.s0 + res.s1 + res.s2;
+}
diff --git a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
index 084156e13788e53769458ca901cbbc19cbc84f10..2cdbe3fa2a7c696678c921d1d39d9ab34751710c 100644
--- a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
@@ -1,37 +1,8 @@
-inline float4 conv1x3(const float *input_ptr,
-                      const float *filter_ptr) {
-  float8 input = vload8(0, input_ptr);
-  float4 row0 = convert_float4(input.s0123);
-  float4 row1 = convert_float4(input.s1234);
-  float4 row2 = convert_float4(input.s2345);
-  return (float4)filter_ptr[0] * row0 + (float4)filter_ptr[1] * row1
-            + (float4)filter_ptr[2] * row2;
-}
-
-inline float4 conv3x3x4(const float *input_ptr,
-                        const float *filter_ptr,
-                        const int row_width) {
-  float4 res;
-  res = conv1x3(input_ptr + 0 * row_width, filter_ptr + 0 * 3);
-  res += conv1x3(input_ptr + 1 * row_width, filter_ptr + 1 * 3);
-  res += conv1x3(input_ptr + 2 * row_width, filter_ptr + 2 * 3);
-
-  return res;
-}
-
-inline float conv3x3(const float *input_ptr,
+float4 conv1x3_s1(const float *input_ptr,
+                  const float *filter_ptr);
+float conv3x3(const float *input_ptr,
                      const float *filter_ptr,
-                     const int row_width) {
-  float res = input_ptr[0] * filter_ptr[0] + input_ptr[1] * filter_ptr[1] + input_ptr[2] * filter_ptr[2];
-  input_ptr += row_width;
-  filter_ptr += 3;
-  res += input_ptr[0] * filter_ptr[0] + input_ptr[1] * filter_ptr[1] + input_ptr[2] * filter_ptr[2];
-  input_ptr += row_width;
-  filter_ptr += 3;
-  res += input_ptr[0] * filter_ptr[0] + input_ptr[1] * filter_ptr[1] + input_ptr[2] * filter_ptr[2];
-
-  return res;
-}
+                     const int row_width);
 
 void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
                                   global const float *filter, /* m, i, kh, kw */
@@ -80,8 +51,10 @@ void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
         input_ptr += 1;
       }
     } else {
-      float4 res = conv3x3x4(input_ptr, filter_ptr, in_width);
-      res += (float4)bias_value;
+      float4 res = (float4)bias_value;
+      res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+      res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+      res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
       vstore4(res, 0, output_ptr);
     }
   }
diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc
index 9eeec07ac57315c1ffa0ced1aee33f11eaeece8a..fcdb3de208fa6da2997dc391d94f514107bc60d7 100644
--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -3,7 +3,6 @@
 //
 
 #include "mace/kernels/conv_2d.h"
-#include "mace/kernels/conv_pool_2d_util.h"
 
 namespace mace {
 namespace kernels {
@@ -11,6 +10,11 @@ namespace kernels {
 extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
                                const Tensor *bias, Tensor *output);
 
+extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
+                               const Tensor *bias, Tensor *output);
+
+extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
+                               const Tensor *bias, Tensor *output);
 template <>
 void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
                                                           const Tensor *filter,
@@ -22,7 +26,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
   static const Conv2dOpenclFunction selector[5][2] = {
       {Conv2dOpenclK1x1S1, nullptr},
       {nullptr, nullptr},
-      {nullptr, nullptr},
+      {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2},
       {nullptr, nullptr},
       {nullptr, nullptr}};
 
@@ -40,11 +44,16 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
         input, filter, bias, output);
     return;
   }
-
-  MACE_CHECK(paddings_[0] == 0 && paddings_[1] == 0, "Padding not supported");
-
   auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
-  conv2d_func(input, filter, bias, output);
+  if (paddings_[0] > 0 || paddings_[1] > 0) {
+    Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum<float>::v());
+    Tensor::MappingGuard input_mapper(input);
+    ConstructInputWithPadding(input->data<float>(), input->shape().data(), paddings_.data(),
+                              &padded_input);
+    conv2d_func(&padded_input, filter, bias, output);
+  }else {
+    conv2d_func(input, filter, bias, output);
+  }
 }
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
new file mode 100644
index 0000000000000000000000000000000000000000..448f31d6f2c940855b8ba308fd49b0df8f71dd41
--- /dev/null
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -0,0 +1,64 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/core/common.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/conv_2d.h"
+
+namespace mace {
+namespace kernels {
+
+static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
+                               const Tensor *bias, const uint32_t stride, Tensor *output) {
+  const index_t channels = output->shape()[1];
+  const index_t height = output->shape()[2];
+  const index_t width = output->shape()[3];
+
+  MACE_CHECK(input->dim(0) == output->dim(0));
+
+  const index_t channel_blocks = (channels + 3) / 4;
+  const index_t pixel_blocks = (width + 3) / 4 * height;
+
+  auto runtime = OpenCLRuntime::Get();
+  auto program = runtime->program();
+  auto bm_kernel = cl::Kernel(program, "conv_2d_3x3");
+
+  uint32_t idx = 0;
+  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
+  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
+  bm_kernel.setArg(idx++, static_cast<uint32_t>(channels));
+  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
+  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
+  bm_kernel.setArg(idx++, static_cast<uint32_t>(height));
+  bm_kernel.setArg(idx++, static_cast<uint32_t>(width));
+  bm_kernel.setArg(idx++, stride);
+  bm_kernel.setArg(idx++, stride);
+  const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
+                           static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(pixel_blocks)};
+  const uint32_t lws[3] = {static_cast<uint32_t>(1),
+                           static_cast<uint32_t>(1),
+                           static_cast<uint32_t>(256)};
+  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+      bm_kernel, cl::NullRange,
+      cl::NDRange(gws[0], gws[1], gws[2]),
+      cl::NDRange(lws[0], lws[1], lws[2]));
+  MACE_CHECK(error == CL_SUCCESS);
+}
+
+void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
+                        const Tensor *bias, Tensor *output) {
+  InnerConv2dK3x3S12(input, filter, bias, 1, output);
+};
+
+void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter,
+                        const Tensor *bias, Tensor *output) {
+  InnerConv2dK3x3S12(input, filter, bias, 2, output);
+};
+
+}  // namespace kernels
+}  // namespace mace
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index ca3ccf72b96e642e34fc1f6c8b45d787e000dfa9..d408b05bfe7eeb1acdddbcf103f1c9bb434dca9a 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -3,7 +3,6 @@
 //
 
 #include <algorithm>
-#include <sstream>
 
 #include "mace/core/operator.h"
 #include "mace/core/testing/test_benchmark.h"
@@ -14,7 +13,6 @@ namespace mace {
 
 template <DeviceType D, typename T>
 static void Conv2d(int iters,
-                   int iters_to_sync,
                    int batch,
                    int channels,
                    int height,
@@ -32,37 +30,32 @@ static void Conv2d(int iters,
       .Input("Filter")
       .Input("Bias")
       .Output("Output")
-      .AddIntsArg("strides", {stride, stride})
-      .AddIntArg("padding", padding)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
+      .Finalize(net.operator_def());
+
+  // Add args
+  net.AddIntsArg("strides", {stride, stride});
+  net.AddIntArg("padding", padding);
+  net.AddIntsArg("dilations", {1, 1});
 
   // Add input data
   net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
   net.AddRandomInput<D, float>("Filter",
-                               {output_channels, channels, kernel_h, kernel_w});
+                            {output_channels, channels, kernel_h, kernel_w});
   net.AddRandomInput<D, float>("Bias", {output_channels});
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
     net.RunOp(D);
-    net.Sync();
   }
+  net.Sync();
 
   mace::testing::StartTiming();
   while (iters--) {
     net.RunOp(D);
-    if (iters % iters_to_sync == 0) {
-      net.Sync();
-    }
   }
+  net.Sync();
 }
 
-// In common network, there are usually more than 1 layers, this is used to
-// approximate the amortized latency. The OpenCL runtime for Mali/Adreno is
-// in-order.
-constexpr int kItersToSync = 10;
-
 #define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, DEVICE)                          \
   static void                                                                                      \
       BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
@@ -70,8 +63,8 @@ constexpr int kItersToSync = 10;
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;                               \
     mace::testing::ItemsProcessed(tot);                                                            \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                                            \
-    Conv2d<DEVICE, TYPE>(iters, kItersToSync, N, C, H, W, KH, KW, STRIDE,                          \
-                         mace::Padding::P, OC);                                                    \
+    Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P,                      \
+                         OC);                                                                      \
   }                                                                                                \
   BENCHMARK(                                                                                       \
       BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index a8e9f189c255e09178b5b6aa02f0e9ab28606786..31d3130a88796bd21776e2a67ea12be1ed9a7fdb 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -3,16 +3,15 @@
 //
 
 #include "mace/ops/conv_2d.h"
-#include "mace/core/operator.h"
 #include "mace/ops/ops_test_util.h"
 
 using namespace mace;
 
 class Conv2dOpTest : public OpsTestBase {};
 
-TEST_F(Conv2dOpTest, Simple_VALID) {
-  // Construct graph
-  auto &net = test_net();
+template <DeviceType D>
+void TestSimple3x3VALID() {
+  OpsTestNet net;
   OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("Input")
       .Input("Filter")
@@ -26,27 +25,28 @@ TEST_F(Conv2dOpTest, Simple_VALID) {
   // Add args
 
   // Add input data
-  net.AddInputFromArray<DeviceType::CPU, float>(
+  net.AddInputFromArray<D, float>(
       "Input", {1, 2, 3, 3},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<DeviceType::CPU, float>(
+  net.AddInputFromArray<D, float>(
       "Filter", {1, 2, 3, 3},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray<DeviceType::CPU, float>("Bias", {1}, {0.1f});
+  net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
 
   // Run
-  net.RunOp();
+  net.RunOp(D);
 
   // Check
   auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+
 }
 
-TEST_F(Conv2dOpTest, Simple_SAME) {
-  // Construct graph
-  auto &net = test_net();
+template <DeviceType D>
+void TestSimple3x3SAME() {
+  OpsTestNet net;
   OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("Input")
       .Input("Filter")
@@ -58,17 +58,17 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
       .Finalize(net.NewOperatorDef());
 
   // Add input data
-  net.AddInputFromArray<DeviceType::CPU, float>(
+  net.AddInputFromArray<D, float>(
       "Input", {1, 2, 3, 3},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<DeviceType::CPU, float>(
+  net.AddInputFromArray<D, float>(
       "Filter", {1, 2, 3, 3},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray<DeviceType::CPU, float>("Bias", {1}, {0.1f});
+  net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
 
   // Run
-  net.RunOp();
+  net.RunOp(D);
 
   // Check
   auto expected = CreateTensor<float>(
@@ -78,9 +78,25 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
 
-TEST_F(Conv2dOpTest, Combined) {
+TEST_F(Conv2dOpTest, CPUSimple) {
+  TestSimple3x3VALID<DeviceType::CPU>();
+  TestSimple3x3SAME<DeviceType::CPU>();
+}
+
+TEST_F(Conv2dOpTest, NEONSimple) {
+  TestSimple3x3VALID<DeviceType::NEON>();
+  TestSimple3x3SAME<DeviceType::NEON>();
+}
+
+TEST_F(Conv2dOpTest, OPENCLSimple) {
+  TestSimple3x3VALID<DeviceType::OPENCL>();
+  TestSimple3x3SAME<DeviceType::OPENCL>();
+}
+
+template <DeviceType D>
+static void TestCombined3x3() {
   // Construct graph
-  auto &net = test_net();
+  OpsTestNet net;
   OpDefBuilder("Conv2D", "Conv2DTest")
       .Input("Input")
       .Input("Filter")
@@ -92,19 +108,19 @@ TEST_F(Conv2dOpTest, Combined) {
       .Finalize(net.NewOperatorDef());
 
   // Add input data
-  net.AddInputFromArray<DeviceType::CPU, float>(
+  net.AddInputFromArray<D, float>(
       "Input", {1, 2, 5, 5}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                               1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray<DeviceType::CPU, float>(
+  net.AddInputFromArray<D, float>(
       "Filter", {2, 2, 3, 3},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
        0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f});
-  net.AddInputFromArray<DeviceType::CPU, float>("Bias", {2}, {0.1f, 0.2f});
+  net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
 
   // Run
-  net.RunOp();
+  net.RunOp(D);
 
   // Check
   auto expected = CreateTensor<float>(
@@ -112,6 +128,19 @@ TEST_F(Conv2dOpTest, Combined) {
                      4.2f, 6.2f, 4.2f, 6.2f, 9.2f, 6.2f, 4.2f, 6.2f, 4.2f});
 
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
+
+}
+
+TEST_F(Conv2dOpTest, CPUCombined) {
+  TestCombined3x3<DeviceType::CPU>();
+}
+
+TEST_F(Conv2dOpTest, NEONCombined) {
+  TestCombined3x3<DeviceType::NEON>();
+}
+
+TEST_F(Conv2dOpTest, OPENCLCombined) {
+  TestCombined3x3<DeviceType::OPENCL>();
 }
 
 template <DeviceType D>
@@ -159,13 +188,16 @@ void TestConv1x1() {
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
 
-TEST_F(Conv2dOpTest, Conv1x1) {
+TEST_F(Conv2dOpTest, CPUConv1x1) {
   TestConv1x1<DeviceType::CPU>();
+}
+
+TEST_F(Conv2dOpTest, OPENCLConv1x1) {
   TestConv1x1<DeviceType::OPENCL>();
 }
 
-// TODO we need more tests
-TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
+template <DeviceType D>
+static void TestAlignedConvNxNS12() {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
@@ -178,7 +210,7 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
     index_t width = 32;
     index_t output_channels = 128;
     // Construct graph
-    auto &net = test_net();
+    OpsTestNet net;
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("Input")
         .Input("Filter")
@@ -190,19 +222,19 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
         .Finalize(net.NewOperatorDef());
 
     // Add input data
-    net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, input_channels, height, width});
-    net.AddRandomInput<DeviceType::CPU, float>(
+    net.AddRandomInput<D, float>("Input", {batch, input_channels, height, width});
+    net.AddRandomInput<D, float>(
         "Filter", {output_channels, input_channels, kernel_h, kernel_w});
-    net.AddRandomInput<DeviceType::CPU, float>("Bias", {output_channels});
-    // run cpu
-    net.RunOp();
+    net.AddRandomInput<D, float>("Bias", {output_channels});
+    // Run on device
+    net.RunOp(D);
 
     // Check
     Tensor expected;
     expected.Copy(*net.GetOutput("Output"));
 
-    // Run NEON
-    net.RunOp(DeviceType::NEON);
+    // run cpu
+    net.RunOp();
     ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
   };
 
@@ -214,7 +246,16 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
   }
 }
 
-TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
+TEST_F(Conv2dOpTest, NEONAlignedConvNxNS12) {
+  TestAlignedConvNxNS12<DeviceType::NEON>();
+}
+
+TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
+  TestAlignedConvNxNS12<DeviceType::OPENCL>();
+}
+
+template <DeviceType D>
+static void TestUnalignedConvNxNS12() {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
@@ -227,7 +268,7 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
     index_t width = 113;
     index_t output_channels = 3 + rand() % 10;
     // Construct graph
-    auto &net = test_net();
+    OpsTestNet net;
     OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("Input")
         .Input("Filter")
@@ -239,19 +280,19 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
         .Finalize(net.NewOperatorDef());
 
     // Add input data
-    net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, input_channels, height, width});
-    net.AddRandomInput<DeviceType::CPU, float>(
+    net.AddRandomInput<D, float>("Input", {batch, input_channels, height, width});
+    net.AddRandomInput<D, float>(
         "Filter", {output_channels, input_channels, kernel_h, kernel_w});
-    net.AddRandomInput<DeviceType::CPU, float>("Bias", {output_channels});
-    // run cpu
-    net.RunOp();
+    net.AddRandomInput<D, float>("Bias", {output_channels});
+    // Run on device
+    net.RunOp(D);
 
     // Check
     Tensor expected;
     expected.Copy(*net.GetOutput("Output"));
 
-    // Run NEON
-    net.RunOp(DeviceType::NEON);
+    // run cpu
+    net.RunOp();
     ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
   };
 
@@ -262,3 +303,11 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
     }
   }
 }
+
+TEST_F(Conv2dOpTest, NEONUnalignedConvNxNS12) {
+  TestUnalignedConvNxNS12<DeviceType::NEON>();
+}
+
+TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
+  TestUnalignedConvNxNS12<DeviceType::OPENCL>();
+}
diff --git a/mace/ops/depthwise_conv_2d_benchmark.cc b/mace/ops/depthwise_conv_2d_benchmark.cc
index 13f64ddfbcdeb62378fd6b9bfc43a647910c0fdb..71dca4cc212428a16de996344a06e5ed3ebfb1c5 100644
--- a/mace/ops/depthwise_conv_2d_benchmark.cc
+++ b/mace/ops/depthwise_conv_2d_benchmark.cc
@@ -57,7 +57,7 @@ static void DepthwiseConv2d(int iters,
 #define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE,                                  \
                                    DEVICE)                                                                   \
   static void                                                                                                \
-      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
+      BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
           int iters) {                                                                                       \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;                                         \
     mace::testing::ItemsProcessed(tot);                                                                      \
@@ -66,7 +66,7 @@ static void DepthwiseConv2d(int iters,
                                   mace::Padding::P, OC);                                                     \
   }                                                                                                          \
   BENCHMARK(                                                                                                 \
-      BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
+      BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
 
 #define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)       \
   BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \