Xiaomi / Mace

Commit ff5d176e
Add depthwise conv2d opencl kernel

Authored Jan 24, 2018 by Liangliang He
Parent: 0279acae
Changes: 26 changed files with 852 additions and 482 deletions (+852, -482)
mace/kernels/conv_2d.h                             +4    -4
mace/kernels/depthwise_conv2d.h                    +113  -54
mace/kernels/opencl/REAEMD.md                      +12   -0
mace/kernels/opencl/buffer_to_image.cc             +4    -1
mace/kernels/opencl/cl/buffer_to_image.cl          +50   -0
mace/kernels/opencl/cl/conv_2d.cl                  +9    -14
mace/kernels/opencl/cl/conv_2d_1x1.cl              +1    -1
mace/kernels/opencl/cl/conv_2d_3x3.cl              +3    -2
mace/kernels/opencl/cl/depthwise_conv2d.cl         +129  -0
mace/kernels/opencl/cl/depthwise_conv_3x3.cl       +0    -126
mace/kernels/opencl/conv_2d_opencl_general.cc      +8    -7
mace/kernels/opencl/depthwise_conv_opencl.cc       +141  -37
mace/kernels/opencl/depthwise_conv_opencl_3x3.cc   +0    -98
mace/kernels/opencl/helper.cc                      +23   -8
mace/kernels/opencl/helper.h                       +4    -3
mace/ops/buffer_to_image.h                         +1    -1
mace/ops/buffer_to_image_test.cc                   +6    -6
mace/ops/conv_2d_benchmark.cc                      +1    -1
mace/ops/conv_2d_test.cc                           +8    -8
mace/ops/core_test.cc                              +2    -2
mace/ops/depthwise_conv2d.cc                       +6    -0
mace/ops/depthwise_conv2d.h                        +11   -24
mace/ops/depthwise_conv2d_test.cc                  +256  -47
mace/ops/depthwise_conv_2d_benchmark.cc            +50   -28
mace/ops/fused_conv_2d_test.cc                     +9    -9
mace/ops/image_to_buffer.h                         +1    -1
mace/kernels/conv_2d.h  (+4, -4)

@@ -21,15 +21,15 @@ struct Conv2dFunctorBase {
                     const float relux_max_limit,
                     const float prelu_alpha)
       : strides_(strides),
-        dilations_(dilations),
         paddings_(paddings),
+        dilations_(dilations),
         activation_(activation),
         relux_max_limit_(relux_max_limit),
         prelu_alpha_(prelu_alpha) {}

   const int *strides_;    // [stride_h, stride_w]
-  const int *dilations_;  // [dilation_h, dilation_w]
   const Padding paddings_;
+  const int *dilations_;  // [dilation_h, dilation_w]
   const ActivationType activation_;
   const float relux_max_limit_;
   const float prelu_alpha_;

@@ -50,8 +50,8 @@ struct Conv2dFunctor : Conv2dFunctorBase {
                   relux_max_limit,
                   prelu_alpha) {}

-  void operator()(const Tensor *input,
-                  const Tensor *filter,
+  void operator()(const Tensor *input,   // NHWC
+                  const Tensor *filter,  // HWIO
                   const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future) {
mace/kernels/depthwise_conv2d.h  (+113, -54)

@@ -2,28 +2,57 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

-#ifndef MACE_KERNELS_DEPTHWISE_CONV_H_
-#define MACE_KERNELS_DEPTHWISE_CONV_H_
+#ifndef MACE_KERNELS_DEPTHWISE_CONV2D_H_
+#define MACE_KERNELS_DEPTHWISE_CONV2D_H_

-#include "mace/core/future.h"
 #include "mace/core/common.h"
-#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/core/future.h"
 #include "mace/core/public/mace.h"
+#include "mace/kernels/conv_pool_2d_util.h"

 namespace mace {
 namespace kernels {

+struct DepthwiseConv2dFunctorBase {
+  DepthwiseConv2dFunctorBase(const int *strides,
+                             const Padding padding,
+                             const int *dilations,
+                             const ActivationType activation,
+                             const float relux_max_limit,
+                             const float prelu_alpha)
+      : strides_(strides),
+        padding_(padding),
+        dilations_(dilations),
+        activation_(activation),
+        relux_max_limit_(relux_max_limit),
+        prelu_alpha_(prelu_alpha) {}
+
+  const int *strides_;    // [stride_h, stride_w]
+  const Padding padding_;
+  const int *dilations_;  // [dilation_h, dilation_w]
+  const ActivationType activation_;
+  const float relux_max_limit_;
+  const float prelu_alpha_;
+};
+
 template <DeviceType D, typename T>
-struct DepthwiseConv2dFunctor {
-  DepthwiseConv2dFunctor() {}
+struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
   DepthwiseConv2dFunctor(const int *strides,
-                         const std::vector<int> &paddings,
-                         const int *dilations)
-      : strides_(strides), paddings_(paddings), dilations_(dilations) {}
+                         const Padding padding,
+                         const int *dilations,
+                         const ActivationType activation,
+                         const float relux_max_limit,
+                         const float prelu_alpha)
+      : DepthwiseConv2dFunctorBase(strides,
+                                   padding,
+                                   dilations,
+                                   activation,
+                                   relux_max_limit,
+                                   prelu_alpha) {}

-  void operator()(const Tensor *input,   // NCHW
-                  const Tensor *filter,  // c_out, c_in, kernel_h, kernel_w
-                  const Tensor *bias,    // c_out
+  void operator()(const Tensor *input,   // NHWC
+                  const Tensor *filter,  // HWIM
+                  const Tensor *bias,    // O
                   Tensor *output,
                   StatsFuture *future) {
     MACE_CHECK_NOTNULL(input);

@@ -31,18 +60,36 @@ struct DepthwiseConv2dFunctor {
     MACE_CHECK_NOTNULL(bias);
     MACE_CHECK_NOTNULL(output);

+    // Create a fake conv_2d filter to calculate the paddings and output size
+    std::vector<index_t> fake_filter_shape(4);
+    fake_filter_shape[0] = filter->shape()[0];
+    fake_filter_shape[1] = filter->shape()[1];
+    fake_filter_shape[3] = filter->shape()[2] * filter->shape()[3];
+    fake_filter_shape[2] = 1;
+
+    std::vector<index_t> output_shape(4);
+    std::vector<int> paddings(2);
+    kernels::CalcNHWCPaddingAndOutputSize(
+        input->shape().data(), fake_filter_shape.data(), dilations_,
+        strides_, padding_, output_shape.data(), paddings.data());
+    auto input_shape = fake_filter_shape;
+    output->Resize(output_shape);
+
     index_t batch = output->dim(0);
-    index_t channels = output->dim(1);
-    index_t height = output->dim(2);
-    index_t width = output->dim(3);
+    index_t height = output->dim(1);
+    index_t width = output->dim(2);
+    index_t channels = output->dim(3);

     index_t input_batch = input->dim(0);
-    index_t input_channels = input->dim(1);
-    index_t input_height = input->dim(2);
-    index_t input_width = input->dim(3);
+    index_t input_height = input->dim(1);
+    index_t input_width = input->dim(2);
+    index_t input_channels = input->dim(3);

-    index_t kernel_h = filter->dim(2);
-    index_t kernel_w = filter->dim(3);
+    index_t kernel_h = filter->dim(0);
+    index_t kernel_w = filter->dim(1);
+    index_t multiplier = filter->dim(3);
+    MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=", input_channels);
+    MACE_CHECK(channels == input_channels * multiplier);

     int stride_h = strides_[0];
     int stride_w = strides_[1];

@@ -53,13 +100,12 @@ struct DepthwiseConv2dFunctor {
     MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");

     // The left-upper most offset of the padded input
-    int padded_h_start = 0 - paddings_[0] / 2;
-    int padded_w_start = 0 - paddings_[1] / 2;
-    index_t padded_h_stop = input_height + paddings_[0] - paddings_[0] / 2;
-    index_t padded_w_stop = input_width + paddings_[1] - paddings_[1] / 2;
+    int padded_h_start = 0 - paddings[0] / 2;
+    int padded_w_start = 0 - paddings[1] / 2;
+    index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2;
+    index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2;

-    index_t kernel_size = kernel_h * kernel_w;
-    index_t multiplier = filter->dim(0);
+    const index_t kernel_size = kernel_h * kernel_w;

     Tensor::MappingGuard input_mapper(input);
     Tensor::MappingGuard filter_mapper(filter);

@@ -67,20 +113,22 @@ struct DepthwiseConv2dFunctor {
     Tensor::MappingGuard output_mapper(output);
     const T *input_ptr = input->data<T>();
     const T *filter_ptr = filter->data<T>();
     const T *bias_ptr = bias->data<T>();
     T *output_ptr = output->mutable_data<T>();

 #pragma omp parallel for collapse(2)
     for (int n = 0; n < batch; ++n) {
-      for (int c = 0; c < channels; ++c) {
-        T bias_channel = bias_ptr ? bias_ptr[c] : 0;
-        for (int h = 0; h < height; ++h) {
-          for (int w = 0; w < width; ++w) {
-            index_t offset = n * channels * height * width +
-                             c * height * width + h * width + w;
+      for (int h = 0; h < height; ++h) {
+        for (int w = 0; w < width; ++w) {
+          for (int c = 0; c < channels; ++c) {
+            const index_t inc = c / multiplier;
+            const index_t m = c % multiplier;
+            T bias_channel = bias_ptr ? bias_ptr[c] : 0;
+            index_t offset = n * height * width * channels +
+                             h * width * channels + w * channels + c;
             output_ptr[offset] = bias_channel;
             T sum = 0;
-            const T *filter_base = filter_ptr + c * kernel_size;
+            const T *filter_base = filter_ptr + inc * multiplier + m;
             for (int kh = 0; kh < kernel_h; ++kh) {
               for (int kw = 0; kw < kernel_w; ++kw) {
                 int inh = padded_h_start + h * stride_h + dilation_h * kh;

@@ -92,12 +140,12 @@ struct DepthwiseConv2dFunctor {
                            "Out of range read from input: ", inh, ", ", inw);
                 } else {
                   index_t input_offset =
-                      n * input_channels * input_height * input_width +
-                      (c / multiplier) * input_height * input_width +
-                      inh * input_width + inw;
-                  sum += input_ptr[input_offset] * *filter_base;
+                      n * input_height * input_width * input_channels +
+                      inh * input_width * input_channels +
+                      inw * input_channels + inc;
+                  sum += input_ptr[input_offset] * filter_base[0];  // HWIM
                 }
-                ++filter_base;
+                filter_base += input_channels * multiplier;
               }
             }
             output_ptr[offset] += sum;

@@ -106,10 +154,6 @@ struct DepthwiseConv2dFunctor {
           }
         }
       }
     }
   }
-
-  const int *strides_;         // [stride_h, stride_w]
-  std::vector<int> paddings_;  // [padding_h, padding_w]
-  const int *dilations_;       // [dilation_h, dilation_w]
 };

 template <>

@@ -120,15 +164,30 @@ void DepthwiseConv2dFunctor<DeviceType::NEON, float>::operator()(
     Tensor *output,
     StatsFuture *future);

-template <>
-void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(
-    const Tensor *input,
-    const Tensor *filter,
-    const Tensor *bias,
-    Tensor *output,
-    StatsFuture *future);
+template <typename T>
+struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
+    : DepthwiseConv2dFunctorBase {
+  DepthwiseConv2dFunctor(const int *strides,
+                         const Padding padding,
+                         const int *dilations,
+                         const ActivationType activation,
+                         const float relux_max_limit,
+                         const float prelu_alpha)
+      : DepthwiseConv2dFunctorBase(strides,
+                                   padding,
+                                   dilations,
+                                   activation,
+                                   relux_max_limit,
+                                   prelu_alpha) {}
+
+  void operator()(const Tensor *input,
+                  const Tensor *filter,
+                  const Tensor *bias,
+                  Tensor *output,
+                  StatsFuture *future);
+};

-}  //  namespace kernels
-}  //  namespace mace
+}  // namespace kernels
+}  // namespace mace

-#endif  // MACE_KERNELS_DEPTHWISE_CONV_H_
+#endif  // MACE_KERNELS_DEPTHWISE_CONV2D_H_
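
For reference, a minimal standalone sketch of the "fake conv_2d filter" trick used in the CPU functor above (not part of the commit; the index_t alias is an assumption): a depthwise HWIM filter is presented to CalcNHWCPaddingAndOutputSize as an HWIO filter of shape (H, W, 1, I * M), since only the spatial extent and the output-channel count matter for the padding and output-size math.

    // Sketch only: mirrors the fake_filter_shape assignments in the hunk above.
    #include <cstdint>
    #include <vector>

    using index_t = int64_t;  // assumption: stands in for mace's index_t

    std::vector<index_t> FakeConv2dFilterShape(const std::vector<index_t> &hwim) {
      std::vector<index_t> fake(4);
      fake[0] = hwim[0];            // kernel height
      fake[1] = hwim[1];            // kernel width
      fake[2] = 1;                  // pretend a single input channel
      fake[3] = hwim[2] * hwim[3];  // output channels = in_channels * multiplier
      return fake;
    }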
mace/kernels/opencl/REAEMD.md  (new file, +12)

OpenCL Image Storage Layout
===

Input/Output
---

Conv2D Filter
---

Depthwise Conv2D Filter
---
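
Only the section titles of the new README are visible in this view. As a rough illustration, and assuming the shapes computed by CalImage2DShape in mace/kernels/opencl/helper.cc from this same commit, the three layouts correspond to these 2D image extents:

    // Sketch only, derived from helper.cc in this commit; not the README's own text.
    #include <cstdint>
    #include <vector>

    using index_t = int64_t;  // assumption: stands in for mace's index_t

    // Input/Output: NHWC tensor -> image of size [(C + 3) / 4 * W, N * H]
    std::vector<size_t> InOutImageShape(index_t n, index_t h, index_t w, index_t c) {
      return {static_cast<size_t>((c + 3) / 4 * w), static_cast<size_t>(n * h)};
    }

    // Conv2D filter: HWIO -> image of size [H * W * RoundUp<4>(I), (O + 3) / 4]
    std::vector<size_t> Conv2dFilterImageShape(index_t h, index_t w, index_t i, index_t o) {
      return {static_cast<size_t>(h * w * ((i + 3) / 4 * 4)),
              static_cast<size_t>((o + 3) / 4)};
    }

    // Depthwise Conv2D filter: HWIM -> image of size [H * W * M, (I + 3) / 4]
    std::vector<size_t> DepthwiseFilterImageShape(index_t h, index_t w, index_t i, index_t m) {
      return {static_cast<size_t>(h * w * m), static_cast<size_t>((i + 3) / 4)};
    }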
mace/kernels/opencl/buffer_to_image.cc  (+4, -1)

@@ -27,9 +27,12 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(Tensor *buffer,
   string kernel_name;
   switch (type) {
-    case FILTER:
+    case CONV2D_FILTER:
       kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image";
       break;
+    case DW_CONV2D_FILTER:
+      kernel_name = i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image";
+      break;
     case IN_OUT:
       kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image";
       break;
mace/kernels/opencl/cl/buffer_to_image.cl  (+50, -0)

@@ -72,6 +72,56 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc */
   }
 }

+__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */
+                                        __private const int filter_w,
+                                        __private const int in_channel,
+                                        __private const int multiplier,
+                                        __write_only image2d_t output) { /* ic%4 * kh * kw * m, ic/4 */
+  const int w = get_global_id(0);
+  const int h = get_global_id(1);
+  DATA_TYPE4 values = 0;
+  if (multiplier == 1) {
+    const int in_channel_idx = h << 2;
+    const int h_idx = w / filter_w;
+    const int w_idx = w % filter_w;
+
+    const int offset = mad24(mad24(h_idx, filter_w, w_idx),
+                             in_channel,
+                             in_channel_idx);
+
+    const int size = in_channel - in_channel_idx;
+    if (in_channel_idx < in_channel) {
+      if (size < 4) {
+        switch (size) {
+          case 3:
+            values.z = *(input + offset + 2);
+          case 2:
+            values.y = *(input + offset + 1);
+          case 1:
+            values.x = *(input + offset);
+        }
+      } else {
+        values = vload4(0, input + offset);
+      }
+    }
+  } else {
+    const int in_channel_idx = h << 2;
+    const int m = w % multiplier;
+    const int hw_idx = w / multiplier;
+    const int h_idx = hw_idx / filter_w;
+    const int w_idx = hw_idx % filter_w;
+    const int offset = mad24(mad24(mad24(h_idx, filter_w, w_idx),
+                                   in_channel,
+                                   in_channel_idx),
+                             multiplier,
+                             m);
+    // TODO support multiplier > 1
+  }
+
+  int2 coord = (int2)(w, h);
+  WRITE_IMAGET(output, coord, values);
+}
+
 __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
                                      __private const int height,
                                      __private const int width,
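
For orientation, a small host-side sketch (mine, not in the commit) of the buffer offset that dw_filter_buffer_to_image reads for image coordinate (w, h) in the multiplier == 1 branch: the filter buffer is laid out as (kh, kw, in_channel, multiplier), so the nested mad24 calls compute ((kh_idx * filter_w + kw_idx) * in_channel) + in_channel_idx.

    // Sketch only: the integer math behind the kernel's mad24() offset, multiplier == 1.
    #include <cassert>

    int DwFilterBufferOffset(int w, int h, int filter_w, int in_channel) {
      const int in_channel_idx = h << 2;  // each image row packs four input channels
      const int kh_idx = w / filter_w;    // kernel row
      const int kw_idx = w % filter_w;    // kernel column
      return (kh_idx * filter_w + kw_idx) * in_channel + in_channel_idx;
    }

    int main() {
      // 3x3 filter, 8 input channels: image coord (4, 1) -> kernel (1, 1), channels 4..7
      assert(DwFilterBufferOffset(4, 1, 3, 8) == (1 * 3 + 1) * 8 + 4);
      return 0;
    }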
mace/kernels/opencl/cl/conv_2d.cl  (+9, -14)

 #include <common.h>

 __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
-                      __read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */
+                      __read_only image2d_t filter, /* cout%4 * cin * kh * kw, cout/4 */
 #ifdef BIAS
                       __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif

@@ -15,6 +15,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                       __private const int out_width,
                       __private const int filter_height,
                       __private const int filter_width,
+                      __private const int stride,
                       __private const int padding_top,
                       __private const int padding_left,
                       __private const int dilation_h,

@@ -38,19 +39,12 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   DATA_TYPE4 out3 = 0;
 #endif

-#if STRIDE == 1
-  int in_width0 = out_w_blk - padding_left;
-  int in_width1 = in_width0 + out_w_blks;
-  int in_width2 = in_width1 + out_w_blks;
-  int in_width3 = in_width2 + out_w_blks;
-  const int height_idx = (out_hb % out_height) - padding_top;
-#else
-  int in_width0 = (out_w_blk << 1) - padding_left;
-  int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
-  int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
-  int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
-  const int height_idx = ((out_hb % out_height) << 1) - padding_top;
-#endif
+  int in_width_stride = mul24(out_w_blks, stride);
+  int in_width0 = mad24(out_w_blk, stride, -padding_left);
+  int in_width1 = in_width0 + in_width_stride;
+  int in_width2 = in_width1 + in_width_stride;
+  int in_width3 = in_width2 + in_width_stride;
+  const int height_idx = mad24((out_hb % out_height), stride, -padding_top);

   const int batch_idx = mul24((out_hb / out_height), in_height);
   const int rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);

@@ -61,6 +55,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
     const int in_idx = mul24(in_ch_blk, in_width);
     int filter_x_part0 = in_ch_blk << 2;
     for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) {
+      // TODO(heliangliang) optimize out these muls
       int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
       in_hb_value = select(in_hb_value + batch_idx,
                            -1,
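
A quick standalone check (not from the commit) that the new stride-generic indexing in conv_2d.cl reproduces the removed stride-2 special case: mad24(out_w_blk, stride, -padding_left) with stride == 2 equals the old (out_w_blk << 1) - padding_left, and stepping by in_width_stride = out_w_blks * stride reproduces the old "(out_w_blk + out_w_blks) << 1" spacing.

    // Sketch only: compares the removed stride-2 path with the new stride-generic math.
    #include <cassert>

    int main() {
      const int stride = 2, padding_left = 1, out_w_blks = 8;
      for (int out_w_blk = 0; out_w_blk < out_w_blks; ++out_w_blk) {
        int old_w0 = (out_w_blk << 1) - padding_left;                 // removed code path
        int old_w1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
        int new_w0 = out_w_blk * stride - padding_left;               // mad24(out_w_blk, stride, -padding_left)
        int new_w1 = new_w0 + out_w_blks * stride;                    // in_width0 + in_width_stride
        assert(old_w0 == new_w0 && old_w1 == new_w1);
      }
      return 0;
    }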
mace/kernels/opencl/cl/conv_2d_1x1.cl  (+1, -1)

@@ -37,7 +37,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   w.z = w.y + out_w_blks;
   w.w = w.z + out_w_blks;
   int out_hb_idx = (out_hb % height);
-#else
+#elif STRIDE == 2
   w.x = out_w_blk << 1;
   w.y = (out_w_blk + out_w_blks) << 1;
   w.z = (out_w_blk + (out_w_blks << 1)) << 1;
mace/kernels/opencl/cl/conv_2d_3x3.cl  (+3, -2)

 #include <common.h>

 __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
-                          __read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */
+                          __read_only image2d_t filter, /* cout%4 * cin * kh * kw, cout/4 */
 #ifdef BIAS
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif

@@ -45,7 +45,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   int in_width3 = in_width2 + out_w_blks;
   int in_width4 = in_width3 + out_w_blks;
   const int height_idx = (out_hb % out_height) - padding_top;
-#else
+#elif STRIDE == 2
   int in_width0 = (out_w_blk << 1) - padding_left;
   int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
   int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;

@@ -63,6 +63,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
     const int in_idx = mul24(in_ch_blk, in_width);
     int filter_x_part0 = in_ch_blk << 2;
     for (short hb_idx = 0; hb_idx < 3; ++hb_idx) {
+      // TODO(heliangliang) optimize out these muls
       int in_hb_value = height_idx + mul24(hb_idx, dilation_h);
       in_hb_value = select(in_hb_value + batch_idx,
                            -1,
mace/kernels/opencl/cl/depthwise_conv2d.cl  (new file, +129)

#include <common.h>

// Only multiplier = 1 is supported
__kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                               __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS
                               __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
                               __write_only image2d_t output,
                               __private const DATA_TYPE relux_max_limit,
                               __private const DATA_TYPE prelu_alpha,
                               __private const short in_height,
                               __private const short in_width,
                               __private const short in_ch_blks,
                               __private const short out_height,
                               __private const short out_width,
                               __private const short filter_height,
                               __private const short filter_width,
                               __private const short padding_top,
                               __private const short padding_left,
                               __private const short dilation_h,
                               __private const short dilation_w) {
  const short out_ch_blk = get_global_id(0);
  const short out_w_blk = get_global_id(1);
  const short out_w_blks = get_global_size(1);
  const short out_hb = get_global_id(2);
  const short rounded_in_ch = in_ch_blks << 2;
  const short in_ch_blk = out_ch_blk; // multiplier = 1

#ifdef BIAS
  DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
  DATA_TYPE4 out1 = out0;
  DATA_TYPE4 out2 = out0;
  DATA_TYPE4 out3 = out0;
#else
  DATA_TYPE4 out0 = 0;
  DATA_TYPE4 out1 = 0;
  DATA_TYPE4 out2 = 0;
  DATA_TYPE4 out3 = 0;
#endif

  const short out_h = out_hb % out_height;
#if STRIDE == 1
  const short in_width0 = out_w_blk - padding_left;
  const short in_width1 = in_width0 + out_w_blks;
  const short in_width2 = in_width1 + out_w_blks;
  const short in_width3 = in_width2 + out_w_blks;
  const short height_idx = out_h - padding_top;
#elif STRIDE == 2
  int in_width0 = (out_w_blk << 1) - padding_left;
  int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left;
  int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left;
  int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left;
  int in_width4 = ((out_w_blk + (out_w_blks << 2)) << 1) - padding_left;
  const int height_idx = (out_h << 1) - padding_top;
#else
  const short in_width_stride = mul24(out_w_blks, STRIDE);
  const short in_width0 = mad24(out_w_blk, STRIDE, -padding_left);
  const short in_width1 = in_width0 + in_width_stride;
  const short in_width2 = in_width1 + in_width_stride;
  const short in_width3 = in_width2 + in_width_stride;
  const short height_idx = mad24(out_h, STRIDE, -padding_top);
#endif

  const short batch_idx = mul24((out_hb / out_height), in_height);
  const short rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width);

  const short in_idx = mul24(in_ch_blk, in_width);
  short filter_idx = 0;
  short in_hb_idx = height_idx;
  for (short filter_h_idx = 0; filter_h_idx < filter_height; ++filter_h_idx) {
    short in_hb = select(in_hb_idx + batch_idx,
                         -1,
                         (in_hb_idx < 0 || in_hb_idx >= in_height));
    short in_w_idx = 0;
    for (short filter_w_idx = 0; filter_w_idx < filter_width; ++filter_w_idx) {
      short in_w;
      DATA_TYPE4 in0, in1, in2, in3;
#define READ_INPUT(i)                                     \
      in_w = in_w_idx + in_width##i;                      \
      in_w = select(in_idx + in_w,                        \
                    -1,                                   \
                    (in_w < 0 || in_w >= in_width));      \
      in##i = READ_IMAGET(input, SAMPLER, (int2)(in_w, in_hb));

      READ_INPUT(0);
      READ_INPUT(1);
      READ_INPUT(2);
      READ_INPUT(3);
#undef READ_INPUT

      DATA_TYPE4 weights = READ_IMAGET(filter, SAMPLER,
                                       (int2)(filter_idx, in_ch_blk));
      out0 = mad(in0, weights, out0);
      out1 = mad(in1, weights, out1);
      out2 = mad(in2, weights, out2);
      out3 = mad(in3, weights, out3);

      ++filter_idx;
      in_w_idx += dilation_w;
    }
    in_hb_idx += dilation_h;
  }

#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
  out0 = do_activation(out0, relux_max_limit, prelu_alpha);
  out1 = do_activation(out1, relux_max_limit, prelu_alpha);
  out2 = do_activation(out2, relux_max_limit, prelu_alpha);
  out3 = do_activation(out3, relux_max_limit, prelu_alpha);
#endif

  const short out_x_base = mul24(out_ch_blk, out_width);
  short w = out_w_blk;
  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);

  w += out_w_blks;
  if (w >= out_width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
}
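
To make the indexing above concrete, here is a small host-side sketch (assumptions mine) of how one work-item maps its global id to output coordinates, given the {channel_blocks, width_blocks, height * batch} global work size used by the caller in depthwise_conv_opencl.cc: dimension 2 folds batch and output row together, and each work-item writes up to four pixels along the width, spaced out_w_blks apart.

    // Sketch only: mirrors the kernel's out_hb/out_h/out_w_blk handling on the host.
    #include <cstdio>

    int main() {
      const int out_height = 32, out_width = 32, width_blocks = (out_width + 3) / 4;
      // One hypothetical work-item: values that would come from get_global_id(0..2).
      const int out_ch_blk = 1, out_w_blk = 3, out_hb = 40;
      const int batch = out_hb / out_height;  // which image in the batch
      const int out_h = out_hb % out_height;  // output row
      // The kernel writes out0..out3 at widths out_w_blk, +width_blocks, +2*width_blocks, ...
      for (int i = 0, w = out_w_blk; i < 4 && w < out_width; ++i, w += width_blocks) {
        std::printf("batch=%d h=%d w=%d channel_block=%d\n", batch, out_h, w, out_ch_blk);
      }
      return 0;
    }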
mace/kernels/opencl/cl/depthwise_conv_3x3.cl  (deleted, 100644 -> 0, -126)

#include <common.h>

VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr,
                                      const DATA_TYPE *filter_ptr) {
  VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr);
  VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4);
  VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0);
  VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01);
  VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
  return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
}

VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr,
                                      const DATA_TYPE *filter_ptr) {
  VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr);
  VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even;
  VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd;
  VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]);
  VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr);
  return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 +
         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 +
         (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2;
}

// Supported data type: half/float
DATA_TYPE conv3x3(const DATA_TYPE *input_ptr,
                  const DATA_TYPE *filter_ptr,
                  const int row_width) {
  VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr);
  VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr);
  VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value;

  input_ptr += row_width;
  input_value = vload3(0, input_ptr);
  filter_value = vload3(1, filter_ptr);
  res += input_value * filter_value;

  input_ptr += row_width;
  input_value = vload3(0, input_ptr);
  filter_value = vload3(2, filter_ptr);
  res += input_value * filter_value;

  return res.s0 + res.s1 + res.s2;
}

// TODO merge the depthwise with conv 3x3 to remove duplicate code.
__kernel void depthwise_conv_3x3(__global const DATA_TYPE *input, /* n, c, h, w */
                                 __global const DATA_TYPE *filter, /* m, i, kh, kw */
#ifdef BIAS
                                 __global const DATA_TYPE *bias, /* o */
#endif
                                 __global DATA_TYPE *output, /* n, c, h, w */
                                 __private const int in_chan_num,
                                 __private const int out_chan_num,
                                 __private const int in_height,
                                 __private const int in_width,
                                 __private const int out_height,
                                 __private const int out_width) {
  int batch = get_global_id(0);
  int out_chan_blk = get_global_id(1);
  int out_pixel_blk = get_global_id(2);

  const int in_pixel = in_height * in_width;
  const int out_pixel = out_height * out_width;
  const int multiplier = out_chan_num / in_chan_num;

  const int round_out_width = (out_width + 3) / 4;
  const int out_pixel_height = out_pixel_blk / round_out_width;
  const int out_pixel_width = out_pixel_blk % round_out_width;

  const int out_chan_begin = out_chan_blk * 4;
  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
  const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
  const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
#ifdef STRIDE_1
  const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4;
#else
  const int in_pixel_begin = out_pixel_height * 2 * in_width + out_pixel_width * 2 * 4;
#endif

  const int in_offset = batch * in_chan_num * in_pixel;
  const int out_offset = batch * out_chan_num * out_pixel;
  const DATA_TYPE *input_base = input + in_offset + in_pixel_begin;
  DATA_TYPE *output_base = output + out_offset + out_pixel_begin;

  const int pixels = out_pixel_end - out_pixel_begin;

  for (int i = out_chan_begin; i < out_chan_end; ++i) {
    const DATA_TYPE *input_ptr = input_base + (i / multiplier) * in_pixel;
    const DATA_TYPE *filter_ptr = filter + i * 9;
    DATA_TYPE *output_ptr = output_base + i * out_pixel;

    if (pixels == 4) {
#ifdef BIAS
      VEC_DATA_TYPE(DATA_TYPE,4) res = (VEC_DATA_TYPE(DATA_TYPE,4))bias[i];
#else
      VEC_DATA_TYPE(DATA_TYPE,4) res = 0;
#endif /* defined(BIAS) */

#ifdef STRIDE_1
      res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
      res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
      res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
#else
      res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
      res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
      res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
#endif
      vstore4(res, 0, output_ptr);
    } else {
      for (int p = 0; p < pixels; ++p) {
#ifdef BIAS
        DATA_TYPE res = bias[i];
#else
        DATA_TYPE res = 0;
#endif
        res += conv3x3(input_ptr, filter_ptr, in_width);
        output_ptr[p] = res;
#ifdef STRIDE_1
        input_ptr += 1;
#else
        input_ptr += 2;
#endif
      }
    }
  }
}
mace/kernels/opencl/conv_2d_opencl_general.cc  (+8, -7)

@@ -81,13 +81,14 @@ void Conv2dOpencl(const Tensor *input,
                         *(static_cast<const cl::Image2D *>(output->buffer())));
   conv_2d_kernel.setArg(idx++, relux_max_limit);
   conv_2d_kernel.setArg(idx++, prelu_alpha);
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(width));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(0)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(filter->dim(1)));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(input_channel_blocks));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(height));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(width));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(filter->dim(0)));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(filter->dim(1)));
+  conv_2d_kernel.setArg(idx++, static_cast<uint32_t>(stride));
   conv_2d_kernel.setArg(idx++, padding[0] / 2);
   conv_2d_kernel.setArg(idx++, padding[1] / 2);
   conv_2d_kernel.setArg(idx++, dilations[0]);
mace/kernels/opencl/depthwise_conv_opencl.cc  (+141, -37)

@@ -2,60 +2,164 @@
Removed: the extern declarations of DepthwiseConvOpenclK3x3S1/DepthwiseConvOpenclK3x3S2, the Conv2dOpenclFunction typedef and kernel-size x stride selector table, and the fallback path that padded the input with ConstructInputWithPadding before dispatching. The file now reads:

//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/depthwise_conv2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/tuner.h"

namespace mace {
namespace kernels {

void DepthwiseConv2d(const Tensor *input,   // NHWC
                     const Tensor *filter,  // HWIM
                     const Tensor *bias,
                     const int stride,
                     const int *paddings,
                     const int *dilations,
                     const ActivationType activation,
                     const float relux_max_limit,
                     const float prelu_alpha,
                     const DataType dt,
                     Tensor *output,
                     StatsFuture *future) {
  const index_t batch = output->dim(0);
  const index_t height = output->dim(1);
  const index_t width = output->dim(2);
  const index_t channels = output->dim(3);

  const index_t input_batch = input->dim(0);
  const index_t input_height = input->dim(1);
  const index_t input_width = input->dim(2);
  const index_t input_channels = input->dim(3);

  const index_t filter_height = filter->dim(0);
  const index_t filter_width = filter->dim(1);
  const index_t multiplier = filter->dim(3);
  MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported");
  MACE_CHECK(multiplier * input_channels == channels);
  MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=", input_channels);

  const index_t channel_blocks = RoundUpDiv4(channels);
  const index_t input_channel_blocks = RoundUpDiv4(input_channels);
  const index_t width_blocks = RoundUpDiv4(width);

  auto runtime = OpenCLRuntime::Global();
  std::set<std::string> built_options;
  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d");
  built_options.emplace("-Ddepthwise_conv2d=" + kernel_name);
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
  built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
  built_options.emplace("-DSTRIDE=" + ToString(stride));
  switch (activation) {
    case NOOP:
      break;
    case RELU:
      built_options.emplace("-DUSE_RELU");
      break;
    case RELUX:
      built_options.emplace("-DUSE_RELUX");
      break;
    case PRELU:
      built_options.emplace("-DUSE_PRELU");
      break;
    case TANH:
      built_options.emplace("-DUSE_TANH");
      break;
    case SIGMOID:
      built_options.emplace("-DUSE_SIGMOID");
      break;
    defeult:
      LOG(FATAL) << "Unknown activation type: " << activation;
  }

  auto dw_conv2d_kernel =
      runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options);

  uint32_t idx = 0;
  dw_conv2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
  dw_conv2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(filter->buffer())));
  if (bias != nullptr) {
    dw_conv2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
  }
  dw_conv2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
  dw_conv2d_kernel.setArg(idx++, relux_max_limit);
  dw_conv2d_kernel.setArg(idx++, prelu_alpha);
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(input_height));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(input_width));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(input_channel_blocks));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(height));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(width));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(filter_height));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(filter_width));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(paddings[0] / 2));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(paddings[1] / 2));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(dilations[0]));
  dw_conv2d_kernel.setArg(idx++, static_cast<short>(dilations[1]));

  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(width_blocks),
                           static_cast<uint32_t>(height * batch)};
  std::vector<uint32_t> lws = {8, 16, 8, 1};
  std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, batch,
                                  height, width, channels, multiplier);
  TuningOrRun3DKernel(dw_conv2d_kernel, tuning_key, gws, lws, future);
}

template <typename T>
void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                                                               const Tensor *filter,
                                                               const Tensor *bias,
                                                               Tensor *output,
                                                               StatsFuture *future) {
  index_t kernel_h = filter->dim(2);
  index_t kernel_w = filter->dim(3);
  if (strides_[0] != strides_[1]) {
    LOG(WARNING) << "OpenCL depthwise conv2d kernel with "
                 << "filter" << kernel_h << "x" << kernel_w << ","
                 << " stride " << strides_[0] << "x" << strides_[1]
                 << " is not implemented yet, using slow version";
    // TODO(heliangliang) The CPU/NEON kernel should map the buffer
    DepthwiseConv2dFunctor<DeviceType::CPU, float>(
        strides_, padding_, dilations_, activation_, relux_max_limit_,
        prelu_alpha_)(input, filter, bias, output, future);
    return;
  }

  // Create a fake conv_2d filter to calculate the paddings and output size
  std::vector<index_t> fake_filter_shape(4);
  fake_filter_shape[0] = filter->shape()[0];
  fake_filter_shape[1] = filter->shape()[1];
  fake_filter_shape[3] = filter->shape()[2] * filter->shape()[3];
  fake_filter_shape[2] = 1;

  std::vector<index_t> output_shape(4);
  std::vector<int> paddings(2);
  kernels::CalcNHWCPaddingAndOutputSize(
      input->shape().data(), fake_filter_shape.data(), dilations_, strides_,
      padding_, output_shape.data(), paddings.data());

  std::vector<size_t> output_image_shape;
  CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
  output->ResizeImage(output_shape, output_image_shape);

  DepthwiseConv2d(input, filter, bias, strides_[0], paddings.data(), dilations_,
                  activation_, relux_max_limit_, prelu_alpha_,
                  DataTypeToEnum<T>::value, output, future);
}

template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, float>;
template struct DepthwiseConv2dFunctor<DeviceType::OPENCL, half>;

}  // namespace kernels
}  // namespace mace
mace/kernels/opencl/depthwise_conv_opencl_3x3.cc  (deleted, 100644 -> 0, -98)

//
// Copyright (c) 2017 XiaoMi All rights reserved.
//

#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"

namespace mace {
namespace kernels {

static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
                                            const Tensor *filter,
                                            const Tensor *bias,
                                            const uint32_t stride,
                                            Tensor *output,
                                            StatsFuture *future) {
  const index_t batch = output->dim(0);
  const index_t channels = output->dim(1);
  const index_t height = output->dim(2);
  const index_t width = output->dim(3);

  const index_t input_batch = input->dim(0);
  const index_t input_channels = input->dim(1);
  const index_t input_height = input->dim(2);
  const index_t input_width = input->dim(3);

  MACE_CHECK(input_batch == batch);

  const index_t pixels = height * width;
  const index_t channel_blocks = (channels + 3) / 4;
  const index_t pixel_blocks = (width + 3) / 4 * height;

  auto runtime = OpenCLRuntime::Global();
  std::set<std::string> built_options;
  std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv_3x3");
  built_options.emplace("-Ddepthwise_conv_3x3=" + kernel_name);
  built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype()));
  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
  auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", kernel_name, built_options);

  uint32_t idx = 0;
  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
  if (bias != nullptr) {
    conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
  }
  conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
  conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
  conv_kernel.setArg(idx++, static_cast<int32_t>(height));
  conv_kernel.setArg(idx++, static_cast<int32_t>(width));

  const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                           static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(pixel_blocks)};
  const uint32_t lws[3] = {static_cast<uint32_t>(1),
                           static_cast<uint32_t>(1),
                           static_cast<uint32_t>(256)};
  cl::Event event;
  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
      conv_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
      cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
  MACE_CHECK(error == CL_SUCCESS);
  if (future != nullptr) {
    future->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
}

extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
                                      const Tensor *filter,
                                      const Tensor *bias,
                                      Tensor *output,
                                      StatsFuture *future) {
  InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output, future);
};

extern void DepthwiseConvOpenclK3x3S2(const Tensor *input,
                                      const Tensor *filter,
                                      const Tensor *bias,
                                      Tensor *output,
                                      StatsFuture *future) {
  InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output, future);
};

}  // namespace kernels
}  // namespace mace
mace/kernels/opencl/helper.cc  (+23, -8)

@@ -9,7 +9,7 @@
 namespace mace {
 namespace kernels {

-// [(c+3)/4*W, N * H]
+// [(C + 3) / 4 * W, N * H]
 void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
                            std::vector<size_t> &image_shape) {
   MACE_CHECK(shape.size() == 4);

@@ -18,13 +18,22 @@ void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
   image_shape[1] = shape[0] * shape[1];
 }

-// [H * W * RoundUp<4>(Ic), (Oc + 3) / 4]
-void CalFilterImageShape(const std::vector<index_t> &shape, /* HWIO */
-                         std::vector<size_t> &image_shape) {
+// [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4]
+void CalConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWIO */
+                               std::vector<size_t> &image_shape) {
   MACE_CHECK(shape.size() == 4);
   image_shape.resize(2);
   image_shape[0] = shape[0] * shape[1] * RoundUp<index_t>(shape[2], 4);
-  image_shape[1] = RoundUpDiv4(shape.back());
+  image_shape[1] = RoundUpDiv4(shape[3]);
+}
+
+// [H * W * M, (Ic + 3) / 4]
+void CalDepthwiseConv2dFilterImageShape(const std::vector<index_t> &shape, /* HWIM */
+                                        std::vector<size_t> &image_shape) {
+  MACE_CHECK(shape.size() == 4);
+  image_shape.resize(2);
+  image_shape[0] = shape[0] * shape[1] * shape[3];
+  image_shape[1] = RoundUpDiv4(shape[2]);
 }

 // [(size + 3) / 4, 1]

@@ -40,11 +49,17 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
                      const BufferType type,
                      std::vector<size_t> &image_shape) {
   switch (type) {
-    case FILTER:
-      CalFilterImageShape(shape, image_shape);
+    case CONV2D_FILTER:
+      CalConv2dFilterImageShape(shape, image_shape);
+      break;
+    case DW_CONV2D_FILTER:
+      CalDepthwiseConv2dFilterImageShape(shape, image_shape);
       break;
     case IN_OUT:
       CalInOutputImageShape(shape, image_shape);
       break;
     case ARGUMENT:
       CalArgImageShape(shape, image_shape);
       break;
     default:
       LOG(FATAL) << "Mace not supported yet.";
   }
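
A worked example (mine, not part of the commit) of the new CalDepthwiseConv2dFilterImageShape formula: a 3x3 depthwise filter over 16 input channels with multiplier 1 (HWIM shape 3x3x16x1) maps to a [3*3*1, (16+3)/4] = [9, 4] image, noticeably smaller than the [H * W * RoundUp<4>(I), (O + 3) / 4] layout a regular conv2d filter of the same spatial size would get.

    // Sketch only: plugs concrete numbers into the formula added above.
    #include <cassert>
    #include <cstddef>

    int main() {
      const int H = 3, W = 3, I = 16, M = 1;    // HWIM depthwise filter
      const std::size_t image_w = H * W * M;    // image_shape[0]
      const std::size_t image_h = (I + 3) / 4;  // image_shape[1]
      assert(image_w == 9 && image_h == 4);
      return 0;
    }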
mace/kernels/opencl/helper.h  (+4, -3)

@@ -17,9 +17,10 @@ namespace kernels {
 const float kMaxKernelExeTime = 1000.0;  // microseconds

 enum BufferType {
-  FILTER = 0,
-  IN_OUT = 1,
-  ARGUMENT = 2
+  CONV2D_FILTER = 0,
+  DW_CONV2D_FILTER = 1,
+  IN_OUT = 2,
+  ARGUMENT = 3
 };

 void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
mace/ops/buffer_to_image.h  (+1, -1)

@@ -20,7 +20,7 @@ class BufferToImageOp: public Operator<D, T> {
     const Tensor *input_tensor = this->Input(INPUT);
     kernels::BufferType type =
         static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
-            "buffer_type", static_cast<int>(kernels::FILTER)));
+            "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
     Tensor *output = this->Output(OUTPUT);

     functor_(const_cast<Tensor *>(input_tensor), type, output, future);
mace/ops/buffer_to_image_test.cc  (+6, -6)

@@ -75,27 +75,27 @@ TEST(BufferToImageTest, InputLarge) {
 }

 TEST(BufferToImageTest, Filter1x1Small) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 3, 5});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 3, 5});
 }

 TEST(BufferToImageTest, Filter1x1Media) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 13, 17});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 13, 17});
 }

 TEST(BufferToImageTest, Filter1x1Large) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {1, 1, 128, 512});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {1, 1, 128, 512});
 }

 TEST(BufferToImageTest, Filter3x3Small) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 3, 5});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 3, 5});
 }

 TEST(BufferToImageTest, Filter3x3Meida) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 13, 17});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 13, 17});
 }

 TEST(BufferToImageTest, Filter3x3Large) {
-  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::FILTER, {3, 3, 128, 256});
+  TestBidirectionTransform<DeviceType::OPENCL, float>(kernels::CONV2D_FILTER, {3, 3, 128, 256});
 }

 template <DeviceType D, typename T>
mace/ops/conv_2d_benchmark.cc  (+1, -1)

@@ -36,7 +36,7 @@ static void Conv2d(int iters,
   BufferToImage<D, T>(net, "Input", "InputImage",
                       kernels::BufferType::IN_OUT);
   BufferToImage<D, T>(net, "Filter", "FilterImage",
-                      kernels::BufferType::FILTER);
+                      kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage",
                       kernels::BufferType::ARGUMENT);

   OpDefBuilder("Conv2D", "Conv2dTest")
mace/ops/conv_2d_test.cc  (view file @ ff5d176e)

@@ -102,7 +102,7 @@ void TestNHWCSimple3x3VALID() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("Conv2D", "Conv2dTest")
 ...
@@ -159,7 +159,7 @@ void TestNHWCSimple3x3SAME() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("Conv2D", "Conv2dTest")
 ...
@@ -264,7 +264,7 @@ void TestNHWCSimple3x3WithoutBias() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("InputImage")
 ...
@@ -371,7 +371,7 @@ static void TestNHWCCombined3x3() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -444,7 +444,7 @@ void TestConv1x1() {
   BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -535,7 +535,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -628,7 +628,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
   BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -759,7 +759,7 @@ static void TestDilationConvNxN(const std::vector<index_t> &shape, const int dil
   // run on gpu
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("Conv2D", "Conv2dTest")
 ...
mace/ops/core_test.cc  (view file @ ff5d176e)

@@ -15,7 +15,7 @@ TEST(CoreTest, INIT_MODE) {
   OpDefBuilder("BufferToImage", "BufferToImageTest")
       .Input("Input")
       .Output("B2IOutput")
-      .AddIntArg("buffer_type", kernels::BufferType::FILTER)
+      .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
       .AddIntArg("mode", static_cast<int>(NetMode::INIT))
       .Finalize(&op_defs[op_defs.size() - 1]);
 ...
@@ -33,7 +33,7 @@ TEST(CoreTest, INIT_MODE) {
   OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
       .Input("B2IOutput")
       .Output("Output")
-      .AddIntArg("buffer_type", kernels::BufferType::FILTER)
+      .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
       .Finalize(&op_defs[op_defs.size() - 1]);

   NetDef net_def;
 ...
mace/ops/depthwise_conv2d.cc  (view file @ ff5d176e)

@@ -26,6 +26,12 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
                         .TypeConstraint<float>("T")
                         .Build(),
                     DepthwiseConv2dOp<DeviceType::OPENCL, float>);
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
+                                     .Device(DeviceType::OPENCL)
+                                     .TypeConstraint<half>("T")
+                                     .Build(),
+                    DepthwiseConv2dOp<DeviceType::OPENCL, half>);
 }

 }  // namespace mace
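With the half registration in place, a graph picks the OpenCL half kernel through the "T" argument, exactly as the tests later in this commit do. A minimal sketch (it only rearranges OpDefBuilder calls that already appear in this diff; nothing else is assumed):

    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
        .Input("InputImage")
        .Input("FilterImage")
        .Input("BiasImage")
        .Output("OutputImage")
        .AddIntsArg("strides", {1, 1})
        .AddIntArg("padding", Padding::VALID)
        .AddIntsArg("dilations", {1, 1})
        // Selects the DeviceType::OPENCL + half registration added above.
        .AddIntArg("T", static_cast<int>(DataTypeToEnum<half>::value))
        .Finalize(net.NewOperatorDef());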
mace/ops/depthwise_conv2d.h  (view file @ ff5d176e)

@@ -2,8 +2,8 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //

-#ifndef MACE_OPS_DEPTHWISE_CONV_H_
-#define MACE_OPS_DEPTHWISE_CONV_H_
+#ifndef MACE_OPS_DEPTHWISE_CONV2D_H_
+#define MACE_OPS_DEPTHWISE_CONV2D_H_

 #include <memory>
 ...
@@ -18,10 +18,13 @@ template <DeviceType D, typename T>
 class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
  public:
   DepthwiseConv2dOp(const OperatorDef &op_def, Workspace *ws)
-      : ConvPool2dOpBase<D, T>(op_def, ws) {
-    functor_.strides_ = this->strides_.data();
-    functor_.dilations_ = this->dilations_.data();
-  }
+      : ConvPool2dOpBase<D, T>(op_def, ws),
+        functor_(this->strides_.data(),
+                 this->padding_,
+                 this->dilations_.data(),
+                 kernels::ActivationType::NOOP,
+                 0.0f,
+                 0.0f) {}

   bool Run(StatsFuture *future) override {
     const Tensor *input = this->Input(INPUT);
 ...
@@ -31,23 +34,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
       bias = this->Input(BIAS);
     }
     Tensor *output = this->Output(OUTPUT);

-    // resize filter shape.
-    std::vector<index_t> filter_shape(filter->shape().begin(), filter->shape().end());
-    filter_shape[0] *= filter_shape[1];
-    filter_shape[1] = 1;
-
-    std::vector<index_t> output_shape(4);
-    std::vector<int> paddings(2);
-    kernels::CalcPaddingAndOutputSize(
-        input->shape().data(), filter_shape.data(), this->dilations_.data(),
-        this->strides_.data(), this->padding_, output_shape.data(), paddings.data());
-    output->Resize(output_shape);
-
-    functor_.paddings_ = paddings;
-
     functor_(input, filter, bias, output, future);
     return true;
   }
 ...
@@ -59,6 +46,6 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
   OP_OUTPUT_TAGS(OUTPUT);
 };

-}  //  namespace mace
+}  // namespace mace

-#endif  // MACE_OPS_DEPTHWISE_CONV_H_
+#endif  // MACE_OPS_DEPTHWISE_CONV2D_H_
mace/ops/depthwise_conv2d_test.cc  (view file @ ff5d176e)

@@ -7,45 +7,200 @@
 using namespace mace;

+namespace {
+
 class DepthwiseConv2dOpTest : public OpsTestBase {};

-template <DeviceType D>
+template <DeviceType D, typename T>
 void SimpleValidTest() {
   testing::internal::LogToStderr();
   // Construct graph
   OpsTestNet net;
-  OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());

   // Add input data
-  net.AddInputFromArray<D, float>(
-      "Input", {1, 2, 2, 3},
-      {1, 3, 5, 7, 9, 11, 2, 4, 6, 8, 10, 12});
-  net.AddInputFromArray<D, float>(
-      "Filter", {2, 2, 2, 2},
-      {1.0f, 5.0f, 9.0f, 13.0f, 2.0f, 6.0f, 10.0f, 14.0f,
-       3.0f, 7.0f, 11.0f, 15.0f, 4.0f, 8.0f, 12.0f, 16.0f});
-  net.AddInputFromArray<D, float>("Bias", {4}, {.1f, .2f, .3f, .4f});
-
-  // Run
-  net.RunOp(D);
+  net.AddInputFromArray<D, float>(
+      "Input", {1, 3, 3, 2},
+      {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18});
+  net.AddInputFromArray<D, float>(
+      "Filter", {2, 2, 2, 1},
+      {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f});
+  net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f});
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage",
+                        kernels::BufferType::DW_CONV2D_FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }

   // Check
-  auto expected = CreateTensor<float>(
-      {1, 4, 1, 2},
-      {196.1f, 252.1f, 216.2f, 280.2f, 272.3f, 344.3f, 296.4f, 376.4f});
-
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+  auto expected = CreateTensor<T>(
+      {1, 2, 2, 2},
+      {37.1f, 148.2f, 47.1f, 188.2f, 67.1f, 268.2f, 77.1f, 308.2f});
+
+  ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 1e-5);
 }

-TEST_F(DepthwiseConv2dOpTest, SimpleCPU) { SimpleValidTest<DeviceType::CPU>(); }
+TEST_F(DepthwiseConv2dOpTest, SimpleCPU) {
+  SimpleValidTest<DeviceType::CPU, float>();
+}
+
+TEST_F(DepthwiseConv2dOpTest, SimpleOpenCL) {
+  SimpleValidTest<DeviceType::OPENCL, float>();
+}
+
+TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) {
+  SimpleValidTest<DeviceType::OPENCL, half>();
+}
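As a sanity check on the new NHWC input / HWIM filter layout, the expected values above can be reproduced by hand from the literals in SimpleValidTest. At output position (0, 0), the 2x2 window of input channel 0 is {1, 2, 4, 5} and the channel-0 filter taps are {1, 2, 3, 4}:

    1*1 + 2*2 + 4*3 + 5*4 + 0.1 = 37.1

and for channel 1 the window {2, 4, 8, 10} with filter taps {2, 4, 6, 8} gives

    2*2 + 4*4 + 8*6 + 10*8 + 0.2 = 148.2

which matches the first two entries of the expected tensor.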
+template <DeviceType D, typename T>
+void ComplexValidTest() {
+  testing::internal::LogToStderr();
+  // Construct graph
+  OpsTestNet net;
+
+  // Add input data
+  net.AddInputFromArray<D, float>(
+      "Input", {1, 10, 10, 3},
+      {0.0, 0.01, 0.02, 0.03, ..., 2.97, 2.98, 2.99});   // 300 values, 0.00 to 2.99 in steps of 0.01
+  net.AddInputFromArray<D, float>(
+      "Filter", {5, 5, 3, 1},
+      {0.0, 0.01, 0.02, 0.03, ..., 0.72, 0.73, 0.74});   // 75 values, 0.00 to 0.74 in steps of 0.01
+  net.AddInputFromArray<D, float>("Bias", {6}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage",
+                        kernels::BufferType::DW_CONV2D_FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {2, 2})
+        .AddIntArg("padding", Padding::SAME)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+    net.RunOp(D);
+
+    // Transfer output
+    ImageToBuffer<D, T>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {2, 2})
+        .AddIntArg("padding", Padding::SAME)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
+
+  // Check
+  auto expected = CreateTensor<T>(
+      {1, 5, 5, 3},
+      {4.48200035,  4.63479996,  4.79079962,  5.85899973,  6.05599976,
+       6.25699997,  6.38100004,  6.59000015,  6.80300045,  6.90299988,
+       7.1239996,   7.34899998,  4.03559971,  4.16820002,  4.30319977,
+       8.90999985,  9.1760006,   9.44599915,  11.20499992, 11.54500103,
+       11.89000034, 11.74499989, 12.09999943, 12.46000004, 12.28499985,
+       12.65500069, 13.03000069, 7.00200033,  7.22399998,  7.44900036,
+       13.4100008,  13.79599953, 14.18599987, 16.60500145, 17.09499741,
+       17.59000015, 17.14500046, 17.65000153, 18.15999794, 17.68499947,
+       18.20499992, 18.72999954, 9.97200012,  10.28399944, 10.59899998,
+       17.90999985, 18.41600037, 18.92599869, 22.00500107, 22.64500046,
+       23.28999901, 22.54500008, 23.19999886, 23.8599987,  23.0850029,
+       23.75500107, 24.43000031, 12.94200039, 13.34400082, 13.7489996,
+       6.97500038,  7.29659986,  7.62060022,  8.32049942,  8.72700024,
+       9.13650036,  8.5095005,   8.92500019,  9.34349918,  8.69849968,
+       9.12300014,  9.55049992,  4.55220032,  4.80690002,  5.06340027});
+
+  ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.2);
+}
+
+TEST_F(DepthwiseConv2dOpTest, ComplexCPU) {
+  ComplexValidTest<DeviceType::CPU, float>();
+}
+
+TEST_F(DepthwiseConv2dOpTest, ComplexOpenCL) {
+  ComplexValidTest<DeviceType::OPENCL, float>();
+}
+
+TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
+  ComplexValidTest<DeviceType::OPENCL, half>();
+}
-template <DeviceType D>
+template <DeviceType D, typename T>
 void TestNxNS12(const index_t height, const index_t width) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
 ...
@@ -53,11 +208,18 @@ void TestNxNS12(const index_t height, const index_t width) {
     srand(time(NULL));

     // generate random input
-    index_t batch = 1;
-    index_t input_channels = 3;
-    index_t multiplier = 2;
+    index_t batch = 1 + rand() % 5;
+    index_t input_channels = 3 + rand() % 16;
+    index_t multiplier = 1;

     // Construct graph
     OpsTestNet net;
+
+    // Add input data
+    net.AddRandomInput<D, float>("Input",
+                                 {batch, height, width, input_channels});
+    net.AddRandomInput<D, float>(
+        "Filter", {kernel_h, kernel_w, input_channels, multiplier});
+    net.AddRandomInput<D, float>("Bias", {multiplier * input_channels});
+
     OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
         .Input("Input")
         .Input("Filter")
 ...
@@ -66,24 +228,56 @@ void TestNxNS12(const index_t height, const index_t width) {
         .AddIntsArg("strides", {stride_h, stride_w})
         .AddIntArg("padding", type)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
         .Finalize(net.NewOperatorDef());

-    // Add input data
-    net.AddRandomInput<D, float>("Input",
-                                 {batch, input_channels, height, width});
-    net.AddRandomInput<D, float>(
-        "Filter", {multiplier, input_channels, kernel_h, kernel_w});
-    net.AddRandomInput<D, float>("Bias", {multiplier * input_channels});
-    // Run on device
-    net.RunOp(D);
+    // Run on cpu
+    net.RunOp();

     // Check
     Tensor expected;
     expected.Copy(*net.GetOutput("Output"));

-    // run cpu
-    net.RunOp();
-    ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-3);
+    if (D == DeviceType::OPENCL) {
+      BufferToImage<D, T>(net, "Input", "InputImage",
+                          kernels::BufferType::IN_OUT);
+      BufferToImage<D, T>(net, "Filter", "FilterImage",
+                          kernels::BufferType::DW_CONV2D_FILTER);
+      BufferToImage<D, T>(net, "Bias", "BiasImage",
+                          kernels::BufferType::ARGUMENT);
+
+      OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+          .Input("InputImage")
+          .Input("FilterImage")
+          .Input("BiasImage")
+          .Output("OutputImage")
+          .AddIntsArg("strides", {stride_h, stride_w})
+          .AddIntArg("padding", type)
+          .AddIntsArg("dilations", {1, 1})
+          .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+          .Finalize(net.NewOperatorDef());
+      net.RunOp(D);
+
+      // Transfer output
+      ImageToBuffer<D, float>(net, "OutputImage", "DeviceOutput",
+                              kernels::BufferType::IN_OUT);
+    } else {
+      OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+          .Input("Input")
+          .Input("Filter")
+          .Input("Bias")
+          .Output("DeviceOutput")
+          .AddIntsArg("strides", {stride_h, stride_w})
+          .AddIntArg("padding", type)
+          .AddIntsArg("dilations", {1, 1})
+          .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+          .Finalize(net.NewOperatorDef());
+      // Run
+      net.RunOp(D);
+    }
+
+    // Check
+    ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.1);
   };

   for (int kernel_size : {3}) {
 ...
@@ -96,32 +290,47 @@ void TestNxNS12(const index_t height, const index_t width) {
 #if __ARM_NEON
 TEST_F(DepthwiseConv2dOpTest, NeonSimpleNxNS12) {
-  TestNxNS12<DeviceType::NEON>(4, 4);
+  TestNxNS12<DeviceType::NEON, float>(4, 4);
 }
 #endif

 TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) {
-  TestNxNS12<DeviceType::OPENCL>(4, 4);
+  TestNxNS12<DeviceType::OPENCL, float>(4, 4);
 }

+TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12Half) {
+  TestNxNS12<DeviceType::OPENCL, half>(4, 4);
+}
+
 #if __ARM_NEON
 TEST_F(DepthwiseConv2dOpTest, NeonAlignedNxNS12) {
-  TestNxNS12<DeviceType::NEON>(64, 64);
-  TestNxNS12<DeviceType::NEON>(128, 128);
+  TestNxNS12<DeviceType::NEON, float>(64, 64);
+  TestNxNS12<DeviceType::NEON, float>(128, 128);
 }
 #endif

 TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12) {
-  TestNxNS12<DeviceType::OPENCL>(64, 64);
-  TestNxNS12<DeviceType::OPENCL>(128, 128);
+  TestNxNS12<DeviceType::OPENCL, float>(64, 64);
+  TestNxNS12<DeviceType::OPENCL, float>(128, 128);
 }

+TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12Half) {
+  TestNxNS12<DeviceType::OPENCL, half>(64, 64);
+  TestNxNS12<DeviceType::OPENCL, half>(128, 128);
+}
+
 #if __ARM_NEON
 TEST_F(DepthwiseConv2dOpTest, NeonUnalignedNxNS12) {
-  TestNxNS12<DeviceType::NEON>(107, 113);
+  TestNxNS12<DeviceType::NEON, float>(107, 113);
 }
 #endif

 TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12) {
-  TestNxNS12<DeviceType::OPENCL>(107, 113);
+  TestNxNS12<DeviceType::OPENCL, float>(107, 113);
 }
+
+TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12Half) {
+  TestNxNS12<DeviceType::OPENCL, half>(107, 113);
+}
+
+}  // namespace
mace/ops/depthwise_conv_2d_benchmark.cc  (view file @ ff5d176e)

@@ -14,44 +14,65 @@ namespace mace {
 template <DeviceType D, typename T>
 static void DepthwiseConv2d(int iters,
                             int batch,
-                            int channels,
+                            int input_channels,
                             int height,
                             int width,
                             int kernel_h,
                             int kernel_w,
                             int stride,
                             Padding padding,
-                            int output_channels) {
+                            int multiplier) {
   mace::testing::StopTiming();

   OpsTestNet net;
-  OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {stride, stride})
-      .AddIntArg("padding", padding)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());

   // Add input data
-  net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
-  net.AddRandomInput<D, float>("Filter",
-                               {output_channels, channels, kernel_h, kernel_w});
-  net.AddRandomInput<D, float>("Bias", {output_channels * channels});
+  net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
+  net.AddRandomInput<D, float>(
+      "Filter", {kernel_h, kernel_w, input_channels, multiplier});
+  net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage",
+                        kernels::BufferType::DW_CONV2D_FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("Output")
+        .AddIntsArg("strides", {stride, stride})
+        .AddIntArg("padding", padding)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  } else {
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride, stride})
+        .AddIntArg("padding", padding)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  }

   // Warm-up
-  for (int i = 0; i < 5; ++i) {
-    net.RunOp(D);
-  }
-  net.Sync();
+  for (int i = 0; i < 2; ++i) {
+    net.RunOp(D);
+    net.Sync();
+  }

   mace::testing::StartTiming();
   while (iters--) {
     net.RunOp(D);
-  }
-  net.Sync();
+    net.Sync();
+  }
 }

 #define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
 ...
@@ -68,20 +89,21 @@ static void DepthwiseConv2d(int iters,
   BENCHMARK( \
     BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)

-#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)              \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU);        \
+#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
   BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);

-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 2, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 2, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1, float);

 }  // namespace mace
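For reference, each invocation above fans out into one CPU and one OpenCL benchmark. A rough sketch of the expansion implied by the two macros (the body of BM_DEPTHWISE_CONV_2D_MACRO is elided in this view, so the exact generated code is an assumption; the benchmark names follow the ## concatenation shown):

    BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1, float);
    // expands, roughly, into two registered benchmarks:
    //   BM_DEPTHWISE_2D_1_64_32_32_K3x3S1_VALID_1_float_CPU
    //   BM_DEPTHWISE_2D_1_64_32_32_K3x3S1_VALID_1_float_OPENCL
    // each driving DepthwiseConv2d<DEVICE, float> with the same shape arguments.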
mace/ops/fused_conv_2d_test.cc  (view file @ ff5d176e)

@@ -26,7 +26,7 @@ void TestNHWCSimple3x3VALID() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("FusedConv2D", "FusedConv2dTest")
 ...
@@ -83,7 +83,7 @@ void TestNHWCSimple3x3SAME() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("FusedConv2D", "FusedConv2dTest")
 ...
@@ -151,7 +151,7 @@ void TestNHWCSimple3x3WithoutBias() {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   OpDefBuilder("FusedConv2D", "FusedConv2dTest")
       .Input("InputImage")
 ...
@@ -220,7 +220,7 @@ void TestConv1x1() {
   BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -311,7 +311,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -397,7 +397,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
   BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -475,7 +475,7 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
 ...
@@ -551,7 +551,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
   // run on gpu
   BufferToImage<D, T>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, T>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("FusedConv2D", "FusedConv2dTest")
 ...
@@ -633,7 +633,7 @@ static void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
   // run on gpu
   BufferToImage<D, half>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-  BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+  BufferToImage<D, half>(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
   BufferToImage<D, half>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
   OpDefBuilder("FusedConv2D", "FusedConv2dTest")
 ...
mace/ops/image_to_buffer.h  (view file @ ff5d176e)

@@ -21,7 +21,7 @@ class ImageToBufferOp: public Operator<D, T> {
     Tensor *output = this->Output(OUTPUT);

     kernels::BufferType type =
         static_cast<kernels::BufferType>(OperatorBase::GetSingleArgument<int>(
-            "buffer_type", static_cast<int>(kernels::FILTER)));
+            "buffer_type", static_cast<int>(kernels::CONV2D_FILTER)));
     functor_(output, type, const_cast<Tensor *>(input_tensor), future);
     return true;
   }
 ...
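The default above only matters when the "buffer_type" argument is omitted; the tests in this commit always set it explicitly. A minimal usage sketch, mirroring the core_test.cc hunk earlier in this diff (the op and argument names are taken from that hunk, not invented here):

    OpDefBuilder("ImageToBuffer", "ImageToBufferTest")
        .Input("B2IOutput")
        .Output("Output")
        .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER)
        .Finalize(&op_defs[op_defs.size() - 1]);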