diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
index 4352f2a66cd413819e19bf98e70840cbea404d3f..87f81f7f1f31d51975db77265dbc0a99e67fecdf 100644
--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -7,13 +7,20 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//mace:mace.bzl", "if_android")
-load("//mace:mace.bzl", "if_android_arm64")
+load("//mace:mace.bzl", "if_android", "if_android_arm64")
 
 cc_library(
     name = "kernels",
-    srcs = glob(["*.cc"]) + if_android(glob(["opencl/*.cc"])) + if_android_arm64(glob(["neon/*.cc"])),
-    hdrs = glob(["*.h"]) + if_android(glob(["opencl/*.h"])) + if_android_arm64(glob(["neon/*.h"])),
+    srcs = glob(["*.cc"]) + if_android(glob([
+        "opencl/*.cc",
+    ])) + if_android_arm64(glob([
+        "neon/*.cc",
+    ])),
+    hdrs = glob(["*.h"]) + if_android(glob([
+        "opencl/*.h",
+    ])) + if_android_arm64(glob([
+        "neon/*.h",
+    ])),
     copts = [
         "-std=c++11",
         "-fopenmp",
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index c9d49525723285a25410b2a976d0b95de0319c05..56f2cedc5e1f2427fcea57b91b9150e049f618ba 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -141,7 +141,6 @@ __kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
   }
 }
 
-// TODO : validation
 __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -155,71 +154,151 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
 
   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-  half4 bias_value = read_imageh(bias, sampler, (int2)(out_w_blk, 1));
-  half4 out0 = (half4)(bias_value.x, bias_value.x, bias_value.x, bias_value.x);
-  half4 out1 = (half4)(bias_value.y, bias_value.y, bias_value.y, bias_value.y);
-  half4 out2 = (half4)(bias_value.z, bias_value.z, bias_value.z, bias_value.z);
-  half4 out3 = (half4)(bias_value.w, bias_value.w, bias_value.w, bias_value.w);
+  half4 bias_value = read_imageh(bias, sampler, (int2)(out_ch_blk, 0));
+  half4 out[4];
+  out[0] = (half4)(bias_value.x);
+  out[1] = (half4)(bias_value.y);
+  out[2] = (half4)(bias_value.z);
+  out[3] = (half4)(bias_value.w);
+
+  int w[4];
+  w[0] = out_w_blk;
+  w[1] = w[0] + out_w_blks;
+  w[2] = w[1] + out_w_blks;
+  w[3] = w[2] + out_w_blks;
 
   // Unrolling this loop hurt perfmance
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    const int in_x0 = in_x_base + out_w_blk;;
-    const int in_x1 = in_x0 + out_w_blks;
-    const int in_x2 = in_x1 + out_w_blks;
-    const int in_x3 = in_x2 + out_w_blks;
-    in_x_base += width;
-
-    const half4 in0 = read_imageh(input, sampler, (int2)(in_x0, out_hb));
-    const half4 in1 = read_imageh(input, sampler, (int2)(in_x1, out_hb));
-    const half4 in2 = read_imageh(input, sampler, (int2)(in_x2, out_hb));
-    const half4 in3 = read_imageh(input, sampler, (int2)(in_x3, out_hb));
+    half4 in[4];
+    in[0] = read_imageh(input, sampler, (int2)(in_x_base + w[0], out_hb));
+    if (w[1] < width) {
+      // conditional loads hurt perf; this branching sometimes helps
+      in[1] = read_imageh(input, sampler, (int2)(in_x_base + w[1], out_hb));
+      in[2] = read_imageh(input, sampler, (int2)(in_x_base + w[2], out_hb));
+      in[3] = read_imageh(input, sampler, (int2)(in_x_base + w[3], out_hb));
+    }
 
     // The order matters, load input first then load filter, why?
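+    // Note: each filter texel at x = filter_x0 + c holds one half4 of four
+    // output-channel weights for input channel in_ch_blk * 4 + c, which is
+    // why filter_x0 below advances by 4 (in_ch_blk << 2) per input-channel
+    // block (see the "cout%4 * cin, cout/4" layout in the kernel signature).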
     const int filter_x0 = in_ch_blk << 2;
-    const half4 weights0 = read_imageh(filter, sampler, (int2)(filter_x0, out_ch_blk));
-    const half4 weights1 = read_imageh(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
-    const half4 weights2 = read_imageh(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
-    const half4 weights3 = read_imageh(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
+    half4 weights[4];
+    #pragma unroll
+    for (int c = 0; c < 4; ++c) {
+      weights[c] = read_imageh(filter, sampler, (int2)(filter_x0 + c, out_ch_blk));
+    }
     // Will prefetch L2 improve performance? How to pretch image data?
 
     // Interleaving load and mul does not improve performance as expected
-    out0 += in0.x * weights0;
-    out1 += in1.x * weights0;
-    out2 += in2.x * weights0;
-    out3 += in3.x * weights0;
-
-    out0 += in0.y * weights1;
-    out1 += in1.y * weights1;
-    out2 += in2.y * weights1;
-    out3 += in3.y * weights1;
-
-    out0 += in0.z * weights2;
-    out1 += in1.z * weights2;
-    out2 += in2.z * weights2;
-    out3 += in3.z * weights2;
-
-    out0 += in0.w * weights3;
-    out1 += in1.w * weights3;
-    out2 += in2.w * weights3;
-    out3 += in3.w * weights3;
+    #pragma unroll
+    for (int c = 0; c < 4; ++c) {
+      out[c] += in[c].x * weights[0];
+      out[c] += in[c].y * weights[1];
+      out[c] += in[c].z * weights[2];
+      out[c] += in[c].w * weights[3];
+    }
+
+    in_x_base += width;
   }
 
-  const int out_x_offset = out_ch_blk * width;
-  const int w0 = out_w_blk;
-  write_imageh(output, (int2)(out_x_offset + w0, out_hb), out0);
+  const int out_x_base = out_ch_blk * width;
+  write_imageh(output, (int2)(out_x_base + w[0], out_hb), out[0]);
 
-  const int w1 = w0 + out_w_blks;
-  if (w1 >= width) return;
-  write_imageh(output, (int2)(out_x_offset + w1, out_hb), out1);
+  if (w[1] >= width) return;
+  write_imageh(output, (int2)(out_x_base + w[1], out_hb), out[1]);
 
-  const int w2 = w1 + out_w_blks;
-  if (w2 >= width) return;
-  write_imageh(output, (int2)(out_x_offset + w2, out_hb), out2);
+  if (w[2] >= width) return;
+  write_imageh(output, (int2)(out_x_base + w[2], out_hb), out[2]);
 
-  const int w3 = w2 + out_w_blks;
-  if (w3 >= width) return;
-  write_imageh(output, (int2)(out_x_offset + w3, out_hb), out3);
+  if (w[3] >= width) return;
+  write_imageh(output, (int2)(out_x_base + w[3], out_hb), out[3]);
 }
+__kernel void conv_2d_1x1_h8(__read_only image2d_t input, /* [c%8 * w * c/8, h * b] */
+                             __read_only image2d_t filter, /* cout%8 * cin, cout/8 */
+                             __read_only image2d_t bias, /* cout%8 * cout/8 */
+                             __write_only image2d_t output,
+                             __private const int in_ch_blks,
+                             __private const int width) {
+  const int out_ch_blk = get_global_id(0);
+  const int out_w_blk = get_global_id(1);
+  const int out_w_blks = get_global_size(1);
+  const int out_hb = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  float4 bias_value = read_imagef(bias, sampler, (int2)(out_ch_blk, 0));
+  half4 bias_value03 = as_half4(bias_value.xy);
+  half4 bias_value47 = as_half4(bias_value.zw);
+  half4 out[8];
+  out[0] = (half4)(bias_value03.x);
+  out[1] = (half4)(bias_value03.y);
+  out[2] = (half4)(bias_value03.z);
+  out[3] = (half4)(bias_value03.w);
+  out[4] = (half4)(bias_value47.x);
+  out[5] = (half4)(bias_value47.y);
+  out[6] = (half4)(bias_value47.z);
+  out[7] = (half4)(bias_value47.w);
+
+  int w[4];
+  w[0] = out_w_blk;
+  w[1] = w[0] + out_w_blks;
+  w[2] = w[1] + out_w_blks;
+  w[3] = w[2] + out_w_blks;
+
+  // Unrolling this loop hurts performance
+  int in_x_base = 0;
+  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
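+    // Each float4 texel read below is bit-cast into two half4 values via
+    // as_half4 on .xy and .zw, so a single image read supplies the 8 half
+    // channels per pixel implied by the c%8 packing in the signature.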
+    half4 in[8];
+    #pragma unroll
+    for (int wi = 0; wi < 4; ++wi) {
+      float4 in_value = read_imagef(input, sampler, (int2)(in_x_base + w[wi], out_hb));
+      in[wi << 1] = as_half4(in_value.xy);
+      in[(wi << 1) + 1] = as_half4(in_value.zw);
+    }
+
+    // The order matters, load input first then load filter, why?
+    const int filter_x0 = in_ch_blk << 2;
+    half4 weights[8];
+    #pragma unroll
+    for (int wi = 0; wi < 4; ++wi) {
+      float4 weights_value = read_imagef(filter, sampler, (int2)(filter_x0 + wi, out_ch_blk));
+      weights[wi << 1] = as_half4(weights_value.xy);
+      weights[(wi << 1) + 1] = as_half4(weights_value.zw);
+    }
+    // Will prefetch L2 improve performance? How to prefetch image data?
+
+    // Interleaving load and mul does not improve performance as expected
+    #pragma unroll
+    for (int wi = 0; wi < 4; ++wi) {
+      int idx = wi << 1;
+      out[idx] += in[idx].x * weights[0];
+      out[idx] += in[idx].y * weights[1];
+      out[idx] += in[idx].z * weights[2];
+      out[idx] += in[idx].w * weights[3];
+
+      ++idx;
+      out[idx] += in[idx].x * weights[4];
+      out[idx] += in[idx].y * weights[5];
+      out[idx] += in[idx].z * weights[6];
+      out[idx] += in[idx].w * weights[7];
+    }
+
+    in_x_base += width;
+  }
+
+  const int out_x_base = out_ch_blk * width;
+  float4 out_value = (float4)(as_float2(out[0]), as_float2(out[1]));
+  write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value);
+
+  if (w[1] >= width) return;
+  out_value = (float4)(as_float2(out[2]), as_float2(out[3]));
+  write_imagef(output, (int2)(out_x_base + w[1], out_hb), out_value);
+
+  if (w[2] >= width) return;
+  out_value = (float4)(as_float2(out[4]), as_float2(out[5]));
+  write_imagef(output, (int2)(out_x_base + w[2], out_hb), out_value);
+
+  if (w[3] >= width) return;
+  out_value = (float4)(as_float2(out[6]), as_float2(out[7]));
+  write_imagef(output, (int2)(out_x_base + w[3], out_hb), out_value);
+}
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 1a85911de29fdde88c2cd41b15a2e2282178b5ac..aa8ee24fd642eab1cf03756149b33c27629985a8 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -66,8 +66,8 @@ void Conv1x1V2(const Tensor *input,
 
 void Conv1x1V3(const Tensor *input,
                const Tensor *filter,
                const Tensor *bias,
+               const int stride,
                Tensor *output) {
-  const index_t batch = output->dim(0);
   const index_t channels = output->dim(1);
   const index_t height = output->dim(2);
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 0d201a2f5a7a60089b77b9387932c0c11b9f639d..8bfee960e777c7e90f08d2ae508efe620bc2b230 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -13,7 +13,6 @@ namespace mace {
 
 template <DeviceType D, typename T>
 static void Conv2d(int iters,
-                   int iters_to_sync,
                    int batch,
                    int channels,
                    int height,
@@ -51,10 +50,8 @@ static void Conv2d(int iters,
   mace::testing::StartTiming();
   while (iters--) {
     net.RunOp(D);
-    if (iters % iters_to_sync == 0) {
-      net.Sync();
-    }
   }
+  net.Sync();
 }
 
 // In common network, there are usually more than 1 layers, this is used to
@@ -69,8 +66,8 @@ constexpr int kItersToSync = 10;
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
     mace::testing::ItemsProcessed(tot);                                      \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
-    Conv2d<DEVICE, TYPE>(iters, kItersToSync, N, C, H, W, KH, KW, STRIDE,    \
-                         mace::Padding::P, OC);                              \
+    Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \
+                         OC);                                                 \
   }                                                                          \
   BENCHMARK(                                                                 \
       BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
@@ -80,8 +77,18 @@ constexpr int kItersToSync = 10;
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON);               \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
 
-BM_CONV_2D(1, 3, 4032, 3016, 1, 1, 1, VALID, 3, float);  // Test RGB <-> YUV
-BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);  // Test RGB <-> YUV
+// ICNet
+BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float);
+BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
+// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
+BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
+// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
+BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);
+
+// Test RGB <-> YUV
+BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
+BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
+
 BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
 BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
diff --git a/mace/utils/utils.h b/mace/utils/utils.h
index 49484c39a92a5a8bfe5bb22369e5636650e3b75a..536a7fb8805bc136e3b235151bbfb433b6c96836 100644
--- a/mace/utils/utils.h
+++ b/mace/utils/utils.h
@@ -24,6 +24,11 @@ Integer RoundUpDiv4(Integer i) {
   return (i + 3) >> 2;
 }
 
+template <typename Integer>
+Integer RoundUpDiv8(Integer i) {
+  return (i + 7) >> 3;
+}
+
 template <typename Integer>
 Integer CeilQuotient(Integer a, Integer b) {
   return (a + b - 1) / b;
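Note on dispatch geometry: the image-based kernels above read their position as get_global_id(0) = output-channel block, get_global_id(1) = width block, and get_global_id(2) = height * batch, and each work item covers four width positions (w[0..3]), so the per-dimension work sizes are naturally computed with RoundUpDiv4 (and RoundUpDiv8 for the 8-channel blocking of conv_2d_1x1_h8, which is presumably why it is added to mace/utils/utils.h). The sketch below shows roughly how a host side could size and enqueue conv_2d_1x1; it is illustrative only and uses the stock OpenCL C++ wrapper (CL/cl.hpp) rather than MACE's runtime code in conv_2d_opencl_1x1.cc, and the function name and variable names are assumptions.

// Illustrative sketch only: sizes a conv_2d_1x1 launch the way the kernel's
// get_global_id() usage implies. Assumes the stock OpenCL C++ wrapper
// (CL/cl.hpp); MACE's own runtime plumbing is not part of this diff.
#include <CL/cl.hpp>

void LaunchConv2d1x1(cl::CommandQueue &queue,
                     cl::Kernel &kernel,  // compiled from conv_2d_1x1
                     const cl::Image2D &input, const cl::Image2D &filter,
                     const cl::Image2D &bias, cl::Image2D &output,
                     int batch, int height, int width,
                     int in_channels, int out_channels) {
  // RoundUpDiv4: four channels per texel, four width positions per work item.
  const int in_ch_blks = (in_channels + 3) >> 2;
  const int out_ch_blks = (out_channels + 3) >> 2;
  const int width_blks = (width + 3) >> 2;

  // Argument order follows the kernel signature in conv_2d_1x1.cl.
  kernel.setArg(0, input);
  kernel.setArg(1, filter);
  kernel.setArg(2, bias);
  kernel.setArg(3, output);
  kernel.setArg(4, in_ch_blks);
  kernel.setArg(5, width);

  // Grid matches the kernel: (out_ch_blk, out_w_blk, out_hb = h * b).
  queue.enqueueNDRangeKernel(
      kernel, cl::NullRange,
      cl::NDRange(out_ch_blks, width_blks, height * batch),
      cl::NullRange);
}

For conv_2d_1x1_h8 the same shape would apply, but with the channel dimensions blocked by 8 (RoundUpDiv8), since each float4 texel there packs eight half values.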