diff --git a/mace/core/opencl_allocator.cc b/mace/core/opencl_allocator.cc
index 3b393542281266a4564767e732ea703c4371e738..75004f75276a54c47f82626eefa818dcef3941da 100644
--- a/mace/core/opencl_allocator.cc
+++ b/mace/core/opencl_allocator.cc
@@ -13,6 +13,7 @@ namespace {
 static cl_channel_type DataTypeToCLChannelType(const DataType t) {
   switch (t) {
     case DT_HALF:
+      return CL_HALF_FLOAT;
     case DT_FLOAT:
       return CL_FLOAT;
     case DT_INT8:
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 33d7305b6e8ebb77d97071616fa5dfa9eb7c3a5d..ef76aa79c56f310f77eda05a0a97c3091d9faba2 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -8,7 +8,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __write_only image2d_t output,
                           __private const int in_height,
                           __private const int in_width,
-                          __private const int in_channels,
+                          __private const int in_ch_blks,
                           __private const int out_height,
                           __private const int out_width,
                           __private const int padding_top,
@@ -17,25 +17,26 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int out_w_blk = get_global_id(1);
   const int out_w_blks = get_global_size(1);
   const int out_hb = get_global_id(2);
-  const int in_ch_blks = (in_channels + 3) / 4;
   const int rounded_in_ch = in_ch_blks * 4;

   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-  VEC_DATA_TYPE(DATA_TYPE, 4) out[4] = {0};
+  VEC_DATA_TYPE(DATA_TYPE, 4) out[5] = {0};
 #ifdef BIAS
   out[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0));
   out[1] = out[0];
   out[2] = out[0];
   out[3] = out[0];
+  out[4] = out[0];
 #endif

-  int w[4];
+  int w[5];
   w[0] = out_w_blk - padding_left;
   w[1] = w[0] + out_w_blks;
   w[2] = w[1] + out_w_blks;
   w[3] = w[2] + out_w_blks;
+  w[4] = w[3] + out_w_blks;

   const int batch_idx = out_hb / out_height;
   const int height_idx = out_hb % out_height;
@@ -50,64 +51,83 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int input_image_width = in_ch_blks * in_width;

+  VEC_DATA_TYPE(DATA_TYPE, 4) in[5];
+  VEC_DATA_TYPE(DATA_TYPE, 4) weights[4];
+  int in_idx, hb_idx, width_idx, in_width_idx;
   // Unrolling this loop hurt perfmance
-  int idx = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    VEC_DATA_TYPE(DATA_TYPE, 4) in[36];
-    VEC_DATA_TYPE(DATA_TYPE, 4) weights[36];
-
-    int filter_idx = in_ch_blk << 2;
-    int in_idx = in_ch_blk * in_width;
-
-    #pragma unroll
-    for (int i = 0; i < 3; ++i) {
-      for (int j = 0; j < 3; ++j) {
-        idx = i * 12 + j * 4;
-        int in_width_idx = w[0] + j;
-        // Judge the width border for padding input.
-        if (in_width_idx < 0 || in_width_idx >= in_width) {
-          in[idx + 0] = 0;
-        } else {
-          in[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
-        }
-        in_width_idx = w[1] + j;
-        if (in_width_idx < 0 || in_width_idx >= in_width) {
-          in[idx + 1] = 0;
-        } else {
-          in[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
-        }
-        in_width_idx = w[2] + j;
-        if (in_width_idx < 0 || in_width_idx >= in_width) {
-          in[idx + 2] = 0;
-        } else {
-          in[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
-        }
-        in_width_idx = w[3] + j;
-        if (in_width_idx < 0 || in_width_idx >= in_width) {
-          in[idx + 3] = 0;
-        } else {
-          in[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
-        }
-
-        weights[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
-        weights[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
-        weights[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
-        weights[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
-
-        filter_idx += rounded_in_ch;
+    for (int i = 0; i < 9; ++i) {
+
+      in_idx = in_ch_blk * in_width;
+
+      hb_idx = i / 3;
+      width_idx = i % 3;
+      in_width_idx = w[0] + width_idx;
+      // Judge the width border for padding input.
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[0] = 0;
+      } else {
+        in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
       }
-    }
-    // Will prefetch L2 improve performance? How to pretch image data?
-
-    // Interleaving load and mul does not improve performance as expected
-    #pragma unroll
-    for (int c = 0; c < 4; ++c) {
-      for (int i = 0; i < 9; ++i) {
-        out[c] += in[c + i * 4].x * weights[0 + i * 4];
-        out[c] += in[c + i * 4].y * weights[1 + i * 4];
-        out[c] += in[c + i * 4].z * weights[2 + i * 4];
-        out[c] += in[c + i * 4].w * weights[3 + i * 4];
+      in_width_idx = w[1] + width_idx;
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[1] = 0;
+      } else {
+        in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+      }
+      in_width_idx = w[2] + width_idx;
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[2] = 0;
+      } else {
+        in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+      }
+      in_width_idx = w[3] + width_idx;
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[3] = 0;
+      } else {
+        in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+      }
+      in_width_idx = w[4] + width_idx;
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[4] = 0;
+      } else {
+        in[4] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
       }
+
+
+      int filter_idx = (in_ch_blk << 2) + i * rounded_in_ch;
+      weights[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
+      weights[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
+      weights[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
+      weights[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
+
+      // Will prefetching into L2 improve performance? How to prefetch image data?
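+      // Accumulate over the 4 input channels of this block: weights[k] holds the
+      // filter values of input channel k for the 4 output channels, and each of
+      // the 5 width columns out[0..4] is updated with its own input pixel in[0..4].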
+
+      // Interleaving load and mul does not improve performance as expected
+      out[0] += in[0].x * weights[0];
+      out[0] += in[0].y * weights[1];
+      out[0] += in[0].z * weights[2];
+      out[0] += in[0].w * weights[3];
+
+      out[1] += in[1].x * weights[0];
+      out[1] += in[1].y * weights[1];
+      out[1] += in[1].z * weights[2];
+      out[1] += in[1].w * weights[3];
+
+      out[2] += in[2].x * weights[0];
+      out[2] += in[2].y * weights[1];
+      out[2] += in[2].z * weights[2];
+      out[2] += in[2].w * weights[3];
+
+      out[3] += in[3].x * weights[0];
+      out[3] += in[3].y * weights[1];
+      out[3] += in[3].z * weights[2];
+      out[3] += in[3].w * weights[3];
+
+      out[4] += in[4].x * weights[0];
+      out[4] += in[4].y * weights[1];
+      out[4] += in[4].z * weights[2];
+      out[4] += in[4].w * weights[3];
     }
   }
@@ -133,4 +153,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
                                        (int2)(out_x_base + w[3], out_hb),
                                        out[3]);
+
+  w[4] += padding_left;
+  if (w[4] >= out_width) return;
+  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
+                                       (int2)(out_x_base + w[4], out_hb),
+                                       out[4]);
 }
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index b7e11e817922287a9b048ed9299c5d332f3ef0cf..5836caa8842cfc1749b2ea78ca6d55fcf11e2e3b 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -22,7 +22,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
   const index_t channel_blocks = RoundUpDiv4(channels);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
-  const index_t width_blocks = RoundUpDiv4(width);
+  const index_t width_blocks = RoundUpDiv(width);

   std::set<std::string> built_options;
   built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
@@ -44,7 +44,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
   conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer())));
   conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
   conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3)));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
   conv_2d_kernel.setArg(idx++, static_cast<int>(height));
   conv_2d_kernel.setArg(idx++, static_cast<int>(width));
   conv_2d_kernel.setArg(idx++, padding[0] / 2);
@@ -56,7 +56,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
       conv_2d_kernel, cl::NullRange,
       cl::NDRange(static_cast(channel_blocks), static_cast(width_blocks),
                   static_cast(height * batch)),
-      cl::NDRange(4, 15, 8),
+      cl::NDRange(16, 16, 4),
       NULL, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS, error);
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 24211ca1832921c89828b6ec00f45e33a152b77c..2409013ec322966fe2ffc4c682bc1bcbd17da19b 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -27,10 +27,10 @@ static void Conv2d(int iters,
   OpsTestNet net;

   // Add input data
-  net.AddRandomInput("Input", {batch, height, width, channels});
-  net.AddRandomInput("Filter",
+  net.AddRandomInput("Input", {batch, height, width, channels});
+  net.AddRandomInput("Filter",
                      {kernel_h, kernel_w, channels, output_channels});
-  net.AddRandomInput("Bias", {output_channels});
+  net.AddRandomInput("Bias", {output_channels});

   if (D == DeviceType::OPENCL) {
     BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
@@ -88,6 +88,7 @@ static void Conv2d(int iters,
      BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)

 #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);

 // ICNet
@@ -99,7 +100,7 @@ BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
 BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);

 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
-BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, VALID, 32, float);
+BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, float);

 // Test RGB <-> YUV
 BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index 6120f403b31af34c5689fdd2664ede5924edd826..864c882a3eca7b6c1e2a7355830af8744699415a 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -84,18 +84,18 @@ TEST_F(Conv2dOpTest, NEONSimple) {
   TestSimple3x3SAME();
 }

-template
+template
 void TestNHWCSimple3x3VALID() {
   OpsTestNet net;
   // Add input data
-  net.AddInputFromArray(
+  net.AddInputFromArray(
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray(
+  net.AddInputFromArray(
       "Filter", {3, 3, 2, 1},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray("Bias", {1}, {0.1f});
+  net.AddInputFromArray("Bias", {1}, {0.1f});

   if (D == DeviceType::OPENCL) {
     BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
@@ -130,23 +130,23 @@ void TestNHWCSimple3x3VALID() {
     net.RunOp(D);
   }

-  auto expected = CreateTensor({1, 1, 1, 1}, {18.1f});
-  ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001);
+  auto expected = CreateTensor({1, 1, 1, 1}, {18.1f});
+  ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001);
 }

-template
+template
 void TestNHWCSimple3x3SAME() {
   OpsTestNet net;

   // Add input data
-  net.AddInputFromArray(
+  net.AddInputFromArray(
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray(
+  net.AddInputFromArray(
       "Filter", {3, 3, 2, 1},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray("Bias", {1}, {0.1f});
+  net.AddInputFromArray("Bias", {1}, {0.1f});

   if (D == DeviceType::OPENCL) {
     BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
@@ -181,21 +181,21 @@ void TestNHWCSimple3x3SAME() {
     net.RunOp(D);
   }

-  auto expected = CreateTensor(
+  auto expected = CreateTensor(
       {1, 3, 3, 1},
       {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
-  ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001);
 }

 TEST_F(Conv2dOpTest, CPUSimple) {
-  TestNHWCSimple3x3VALID();
-  TestNHWCSimple3x3SAME();
+  TestNHWCSimple3x3VALID();
+  TestNHWCSimple3x3SAME();
 }

 TEST_F(Conv2dOpTest, OPENCLSimple) {
-  TestNHWCSimple3x3VALID();
-  TestNHWCSimple3x3SAME();
+  TestNHWCSimple3x3VALID();
+  TestNHWCSimple3x3SAME();
 }

 template
@@ -457,11 +457,11 @@ static void TestComplexConvNxNS12(const std::vector &shape) {
   srand(time(NULL));

   // generate random input
-  index_t batch = 3 + rand() % 10;
+  index_t batch = 3 + (rand() % 10);
   index_t height = shape[0];
   index_t width = shape[1];
-  index_t input_channels = shape[2] + rand() % 10;
-  index_t output_channels = shape[3] + rand() % 10;
+  index_t input_channels = shape[2] + (rand() % 10);
+  index_t output_channels = shape[3] + (rand() % 10);
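+  // The randomized channel counts also cover sizes that are not exact
+  // multiples of the 4-channel blocks used by the OpenCL kernels.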

   // Construct graph
   OpsTestNet net;
   OpDefBuilder("Conv2D", "Conv2dTest")
@@ -509,6 +509,7 @@ static void TestComplexConvNxNS12(const std::vector &shape) {

   for (int kernel_size : {3}) {
     for (int stride : {1}) {
+      func(kernel_size, kernel_size, stride, stride, VALID);
       func(kernel_size, kernel_size, stride, stride, SAME);
     }
   }