diff --git a/mace/kernels/opencl/cl/batch_norm.cl b/mace/kernels/opencl/cl/batch_norm.cl
index 3fc449ce499723b4ab78b4e2627be537fe40e978..6cc2a2e08d14165f47bf6821bd36732e07883918 100644
--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -4,7 +4,7 @@ void kernel batch_norm(global const float *input,
                        global const float *mean,
                        global const float *var,
                        global const float *epsilon,
-                       private const uint pixels,
+                       private const int pixels,
                        global float *output,
                        __local float4 *new_scale,
                        __local float4 *new_offset) {
@@ -12,7 +12,7 @@ void kernel batch_norm(global const float *input,
   const int channel = get_global_id(1);
   const int channels = get_global_size(1);
   const int pixel_offset = get_global_id(2);
-  const unsigned int local_channel = get_local_id(1);
+  const int local_channel = get_local_id(1);
   const int local_pixel_idx = get_local_id(2);
 
   if(local_pixel_idx == 0) {
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 1c97a581b9527a41ff48c4d40e1c97e74329bba0..c51cc2b6f03993538cd300b33688bb23fd447104 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -1,53 +1,46 @@
-float4 conv1x3_s1(const float *input_ptr,
-                  const float *filter_ptr);
-float4 conv1x3_s2(const float *input_ptr,
-                  const float *filter_ptr);
-float conv3x3(const float *input_ptr,
-              const float *filter_ptr,
-              const int row_width);
-
+#include <conv_helper.h>
 void kernel conv_2d_3x3(global const float *input,
                         global const float *filter,
                         global const float *bias,
                         global float *output,
-                        private const uint in_chan_num,
-                        private const uint out_chan_num,
-                        private const uint in_height,
-                        private const uint in_width,
-                        private const uint out_height,
-                        private const uint out_width,
-                        private const uint stride_h,
-                        private const uint stride_w) {
+                        private const int in_chan_num,
+                        private const int out_chan_num,
+                        private const int in_height,
+                        private const int in_width,
+                        private const int out_height,
+                        private const int out_width,
+                        private const int stride_h,
+                        private const int stride_w) {
   int batch = get_global_id(0);
   int out_chan_blk = get_global_id(1);
   int out_pixel_blk = get_global_id(2);
 
-  const uint in_pixel = in_height * in_width;
-  const uint out_pixel = out_height * out_width;
+  const int in_pixel = in_height * in_width;
+  const int out_pixel = out_height * out_width;
 
-  const uint round_out_width = (out_width + 3) / 4;
-  const uint out_pixel_height = out_pixel_blk / round_out_width;
-  const uint out_pixel_width = out_pixel_blk % round_out_width;
+  const int round_out_width = (out_width + 3) / 4;
+  const int out_pixel_height = out_pixel_blk / round_out_width;
+  const int out_pixel_width = out_pixel_blk % round_out_width;
 
-  const uint out_chan_begin = out_chan_blk * 4;
-  const uint out_chan_end = min(out_chan_begin + 4, out_chan_num);
-  const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
-  const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
-  const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
+  const int out_chan_begin = out_chan_blk * 4;
+  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
+  const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
+  const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
+  const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
 
-  const uint in_offset = batch * in_chan_num * in_pixel;
-  const uint out_offset = batch * out_chan_num * out_pixel;
+  const int in_offset = batch * in_chan_num * in_pixel;
+  const int out_offset = batch * out_chan_num * out_pixel;
   const float *input_base = input + in_offset + in_pixel_begin;
   float *output_base = output + out_offset + out_pixel_begin;
 
-  uint pixels = out_pixel_end - out_pixel_begin;
+  const int pixels = out_pixel_end - out_pixel_begin;
 
-  for (uint i = out_chan_begin; i < out_chan_end; ++i) {
-    float4 res = (float4)bias[i];
+  for (int i = out_chan_begin; i < out_chan_end; ++i) {
     float *output_ptr = output_base + i * out_pixel;
     const float *filter_base = filter + i * in_chan_num * 9;
     if (pixels == 4) {
-      for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
+      float4 res = (float4)bias[i];
+      for (int in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
         const float* input_ptr = input_base + in_chan_idx * in_pixel;
         const float* filter_ptr = filter_base + in_chan_idx * 9;
         if (stride_w == 1) {
@@ -62,7 +55,7 @@ void kernel conv_2d_3x3(global const float *input,
       }
       vstore4(res, 0, output_ptr);
     } else {
-      for (uint p = 0; p < pixels; ++p) {
+      for (int p = 0; p < pixels; ++p) {
         float res = bias[i];
         for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
           const float* input_ptr = input_base + in_chan_idx * in_pixel + p * stride_w;
diff --git a/mace/kernels/opencl/cl/conv_helper.h b/mace/kernels/opencl/cl/conv_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..553af09d7300dd642b0fdc6f9147ae32b2d0aa91
--- /dev/null
+++ b/mace/kernels/opencl/cl/conv_helper.h
@@ -0,0 +1,15 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
+#define MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
+
+float4 conv1x3_s1(const float *input_ptr,
+                  const float *filter_ptr);
+float4 conv1x3_s2(const float *input_ptr,
+                  const float *filter_ptr);
+float conv3x3(const float *input_ptr,
+              const float *filter_ptr,
+              const int row_width);
+#endif //  MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_
diff --git a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
index 2cdbe3fa2a7c696678c921d1d39d9ab34751710c..9f9a6fc44cdf6f1f6cdbf1ad07734541857fb91b 100644
--- a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl
@@ -1,19 +1,17 @@
-float4 conv1x3_s1(const float *input_ptr,
-                  const float *filter_ptr);
-float conv3x3(const float *input_ptr,
-                     const float *filter_ptr,
-                     const int row_width);
-
-void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
-                                  global const float *filter, /* m, i, kh, kw */
-                                  global const float *bias, /* o */
-                                  global float *output, /* n, c, h, w */
-                                  private const int in_chan_num,
-                                  private const int out_chan_num,
-                                  private const int in_height,
-                                  private const int in_width,
-                                  private const int out_height,
-                                  private const int out_width) {
+#include <conv_helper.h>
+//TODO merge the depthwise with conv 3x3 to remove duplicate code.
+void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */
+                               global const float *filter, /* m, i, kh, kw */
+                               global const float *bias, /* o */
+                               global float *output, /* n, c, h, w */
+                               private const int in_chan_num,
+                               private const int out_chan_num,
+                               private const int in_height,
+                               private const int in_width,
+                               private const int out_height,
+                               private const int out_width,
+                               private const int stride_h,
+                               private const int stride_w) {
   int batch = get_global_id(0);
   int out_chan_blk = get_global_id(1);
   int out_pixel_blk = get_global_id(2);
@@ -30,32 +28,39 @@ void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
   const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
   const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4;
   const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
-  const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4;
+  const int in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;
 
   const int in_offset = batch * in_chan_num * in_pixel;
   const int out_offset = batch * out_chan_num * out_pixel;
   const float *input_base = input + in_offset + in_pixel_begin;
   float *output_base = output + out_offset + out_pixel_begin;
 
-  int pixels = out_pixel_end - out_pixel_begin;
+  const int pixels = out_pixel_end - out_pixel_begin;
 
   for (int i = out_chan_begin; i < out_chan_end; ++i) {
     float bias_value = bias[i];
     const float *input_ptr = input_base + (i / multiplier) * in_pixel;
     const float *filter_ptr = filter + i * 9;
     float *output_ptr = output_base + i * out_pixel;
-    if (pixels < 4) {
-      for (int out_idx = 0; out_idx < pixels; ++out_idx) {
-        output_ptr[out_idx] = bias_value;
-        output_ptr[out_idx] += conv3x3(input_ptr, filter_ptr, in_width);
-        input_ptr += 1;
+    if (pixels == 4) {
+      float4 res = (float4)bias[i];
+      if (stride_w == 1) {
+        res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+        res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+        res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
+      } else {
+        res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
+        res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
+        res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
       }
-    } else {
-      float4 res = (float4)bias_value;
-      res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
-      res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
-      res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
       vstore4(res, 0, output_ptr);
+    } else {
+      for (int p = 0; p < pixels; ++p) {
+        float res = bias[i];
+        res += conv3x3(input_ptr, filter_ptr, in_width);
+        output_ptr[p] = res;
+        input_ptr += stride_w;
+      }
     }
   }
 
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 448f31d6f2c940855b8ba308fd49b0df8f71dd41..41dccf4c4ef9220ae7822df5f817705ed9ffcbd0 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -22,29 +22,29 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
 
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
-  auto bm_kernel = cl::Kernel(program, "conv_2d_3x3");
+  auto conv_kernel = cl::Kernel(program, "conv_2d_3x3");
 
   uint32_t idx = 0;
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
-  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(channels));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(height));
-  bm_kernel.setArg(idx++, static_cast<uint32_t>(width));
-  bm_kernel.setArg(idx++, stride);
-  bm_kernel.setArg(idx++, stride);
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
+  conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
+  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
+  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
+  conv_kernel.setArg(idx++, static_cast<int32_t>(height));
+  conv_kernel.setArg(idx++, static_cast<int32_t>(width));
+  conv_kernel.setArg(idx++, stride);
+  conv_kernel.setArg(idx++, stride);
   const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                            static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(pixel_blocks)};
   const uint32_t lws[3] = {static_cast<uint32_t>(1),
-                           static_cast<uint32_t>(1),
-                           static_cast<uint32_t>(256)};
+                           static_cast<uint32_t>(8),
+                           static_cast<uint32_t>(128)};
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      bm_kernel, cl::NullRange,
+      conv_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
       cl::NDRange(lws[0], lws[1], lws[2]));
   MACE_CHECK(error == CL_SUCCESS);
diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc
index ca6a5b44682b2d0cd0c37ecd3f24a4daaf487dc3..7dcb996f4d94b6ed1c056d089a425b2d5809169e 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -10,6 +10,8 @@ namespace kernels {
 extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, const Tensor *filter,
                                       const Tensor *bias, Tensor *output);
 
+extern void DepthwiseConvOpenclK3x3S2(const Tensor *input, const Tensor *filter,
+                                      const Tensor *bias, Tensor *output);
 template <>
 void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
                                                                    const Tensor *filter,
@@ -21,7 +23,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor
   static const Conv2dOpenclFunction selector[5][2] = {
       {nullptr, nullptr},
       {nullptr, nullptr},
-      {DepthwiseConvOpenclK3x3S1, nullptr},
+      {DepthwiseConvOpenclK3x3S1, DepthwiseConvOpenclK3x3S2},
       {nullptr, nullptr},
       {nullptr, nullptr}};
 
diff --git a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
index e76858a51ef3c7839988a32684f12892cdb46524..c37fe77fbef5483438e2e01c06996580c2589ef0 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
@@ -9,10 +9,11 @@
 namespace mace {
 namespace kernels {
 
-extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
-                                      const Tensor *filter,
-                                      const Tensor *bias,
-                                      Tensor *output) {
+static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
+                                            const Tensor *filter,
+                                            const Tensor *bias,
+                                            const uint32_t stride,
+                                            Tensor *output) {
   const index_t batch = output->dim(0);
   const index_t channels = output->dim(1);
   const index_t height = output->dim(2);
@@ -24,33 +25,53 @@ extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
   const index_t input_width = input->dim(3);
 
   MACE_CHECK(input_batch == batch);
-
-  auto runtime = OpenCLRuntime::Get();
-  auto program = runtime->program();
-  auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
-                                   int, int, int, int, int, int, int>(program, "depthwise_conv_3x3_s1");
   const index_t pixels = height * width;
   const index_t channel_blocks = (channels + 3) / 4;
   const index_t pixel_blocks = (width + 3) / 4 * height;
 
-  cl_int error;
-  conv_2d(cl::EnqueueArgs(runtime->command_queue(),
-                          cl::NDRange(static_cast<int>(batch),
-                                      static_cast<int>(channel_blocks),
-                                      static_cast<int>(pixel_blocks)),
-                          cl::NDRange(1, 1, 256)),
-          *(static_cast<cl::Buffer *>(input->buffer())),
-          *(static_cast<cl::Buffer *>(filter->buffer())),
-          *(static_cast<cl::Buffer *>(bias->buffer())),
-          *(static_cast<cl::Buffer *>(output->buffer())),
-          static_cast<int>(input_channels),
-          static_cast<int>(channels),
-          static_cast<int>(input_height),
-          static_cast<int>(input_width),
-          static_cast<int>(height),
-          static_cast<int>(width),
-          error);
+  auto runtime = OpenCLRuntime::Get();
+  auto program = runtime->program();
+  auto conv_kernel = cl::Kernel(program, "depthwise_conv_3x3");
+
+  uint32_t idx = 0;
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
+  conv_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
+  conv_kernel.setArg(idx++, static_cast<int32_t>(channels));
+  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(2)));
+  conv_kernel.setArg(idx++, static_cast<int32_t>(input->dim(3)));
+  conv_kernel.setArg(idx++, static_cast<int32_t>(height));
+  conv_kernel.setArg(idx++, static_cast<int32_t>(width));
+  conv_kernel.setArg(idx++, stride);
+  conv_kernel.setArg(idx++, stride);
+
+  const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
+                           static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(pixel_blocks)};
+  const uint32_t lws[3] = {static_cast<uint32_t>(1),
+                           static_cast<uint32_t>(1),
+                           static_cast<uint32_t>(256)};
+  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+      conv_kernel, cl::NullRange,
+      cl::NDRange(gws[0], gws[1], gws[2]),
+      cl::NDRange(lws[0], lws[1], lws[2]));
   MACE_CHECK(error == CL_SUCCESS);
+}
+
+extern void DepthwiseConvOpenclK3x3S1(const Tensor *input,
+                                      const Tensor *filter,
+                                      const Tensor *bias,
+                                      Tensor *output) {
+  InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output);
+};
+
+extern void DepthwiseConvOpenclK3x3S2(const Tensor *input,
+                                      const Tensor *filter,
+                                      const Tensor *bias,
+                                      Tensor *output) {
+  InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output);
 };
 
 }  // namespace kernels
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index dd4f4f7d596cb65bcab0ee31f7b90f94dda8b3ea..fb859da80011c7e31e2f88bf4e215940d7ae02ff 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -3,7 +3,6 @@
 //
 
 #include <algorithm>
-#include <sstream>
 
 #include "mace/core/operator.h"
 #include "mace/core/testing/test_benchmark.h"