diff --git a/mace/kernels/opencl/cl/assign_f32.cl b/mace/kernels/opencl/cl/assign_f32.cl
deleted file mode 100644
index cceaace1aae310d8afe1285530002901c53b3a41..0000000000000000000000000000000000000000
--- a/mace/kernels/opencl/cl/assign_f32.cl
+++ /dev/null
@@ -1,42 +0,0 @@
-void kernel assign_v16_f32(global float *output,
-                           private const float value,
-                           private const int pixels) {
-  int pixel_block = get_global_id(0);
-  int pixel_offset = pixel_block * 16;
-
-  float *output_ptr = output + pixel_offset;
-  int remains = pixels - pixel_offset;
-  if (remains >= 16) {
-    for (int i = 0; i < 4; ++i) {
-      vstore4(value, i, output_ptr);
-    }
-  } else {
-    for (int i = 0; i < remains; ++i) {
-      output_ptr[i] = value;
-    }
-  }
-}
-
-void kernel assign_3d_v16_f32(global float *output,
-                              global const float *values,
-                              private const int pixels) {
-  int batch = get_global_id(0);
-  int channel = get_global_id(1);
-  int channels = get_global_size(1);
-  int pixel_block = get_global_id(2);
-  int pixel_offset = pixel_block * 16;
-
-  float value = values[channel];
-  float *output_ptr = output + (batch * channels + channel) * pixels +
-                      pixel_offset;
-  int remains = pixels - pixel_offset;
-  if (remains >= 16) {
-    for (int i = 0; i < 4; ++i) {
-      vstore4(value, i, output_ptr);
-    }
-  } else {
-    for (int i = 0; i < remains; ++i) {
-      output_ptr[i] = value;
-    }
-  }
-}
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index 7d41efc05df9f5c6b61bb14eee19708a41c145d9..7b856f89d5742c3424c4b3f59eb7dbf3fc2073f3 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -1,26 +1,59 @@
-/*
- * Split work item along output channels and pixels
- */
-void kernel conv_2d_1x1_nchw(global const float *input, /* n, c, h, w */
-                             global const float *filter, /* o, i, kh, kw */
-                             global float *output, /* n, c, h, w */
-                             private const int in_offset,
-                             private const int out_offset,
-                             private const int pixel_num,
-                             private const int in_chan_num,
-                             private const int out_chan_num) {
-  int out_chan_blk = get_global_id(0);
-  int out_pixel_blk = get_global_id(1);
+void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */
+                              global const float *filter, /* o, i, kh, kw */
+                              global const float *bias, /* o */
+                              global float *output, /* n, c, h, w */
+                              private const int input_channels) {
+  const int batch = get_global_id(0);
+  const int channel = get_global_id(1);
+  const int channels = get_global_size(1);
+  const int pixel = get_global_id(2);
+  const int pixels = get_global_size(2);
+
+
+  float *output_ptr = output + (batch * channels + channel) * pixels;
+  output_ptr[pixel] = bias[channel];
+
+  for (int inc = 0; inc < input_channels; ++inc) {
+    const float *input_ptr = input + (batch * input_channels + inc) * pixels + pixel;
+    const float weights = filter[channel * input_channels + inc];
+    float in = input_ptr[0];
+    float out = output_ptr[0];
+    out += in * weights;
+    output_ptr[0] = out;
+  }
+}
+
+void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
+                           global const float *filter, /* o, i, kh, kw */
+                           global const float *bias, /* o */
+                           global float *output, /* n, c, h, w */
+                           private const int in_chan_num,
+                           private const int out_chan_num,
+                           private const int pixel_num) {
+  int batch = get_global_id(0);
+  int out_chan_blk = get_global_id(1);
+  int out_pixel_blk = get_global_id(2);
 
   const int out_chan_begin = out_chan_blk * 4;
   const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
   const int out_pixel_begin = out_pixel_blk * 4;
   const int out_pixel_end = min(out_pixel_begin + 4, pixel_num);
 
+  const int in_offset = batch * in_chan_num * pixel_num;
+  const int out_offset = batch * out_chan_num * pixel_num;
   const float *input_base = input + in_offset + out_pixel_begin;
   float *output_base = output + out_offset + out_pixel_begin;
 
   int pixels = out_pixel_end - out_pixel_begin;
+
+  for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
+    float bias_value = bias[out_chan];
+    float *output_ptr = output_base + out_chan * pixel_num;
+    for (int p = 0; p < pixels; ++p) {
+      output_ptr[p] = bias_value;
+    }
+  }
+
   int in_chan = 0;
   if (pixels == 4) {
     for (; in_chan + 3 < in_chan_num; in_chan += 4) {
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 7f6e1d19fafa9e17c47abd9a11ef1d8bb006da7e..636bdd79525abb01f3a84cd21f492e156004feed 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -10,49 +10,41 @@
 namespace mace {
 namespace kernels {
 
-static constexpr index_t kInputChannelBlockSize = 2;
-static constexpr index_t kOutputChannelBlockSize = 4;
+void Conv1x1Naive(const Tensor *input,
+                  const Tensor *filter,
+                  const Tensor *bias,
+                  Tensor *output) {
+  const index_t batch = output->shape()[0];
+  const index_t channels = output->shape()[1];
+  const index_t height = output->shape()[2];
+  const index_t width = output->shape()[3];
+  const index_t input_channels = input->shape()[1];
 
-void AssignBias(Tensor *output, const Tensor *bias) {
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
-  if (bias == nullptr) {
-    auto assign_bias =
-      cl::KernelFunctor<cl::Buffer, float, int>(program, "assign_v16_f32");
-    index_t pixels = output->NumElements();
-    index_t blocks = (pixels + 15) / 16;
-    cl_int error;
-    assign_bias(cl::EnqueueArgs(runtime->command_queue(),
-                                cl::NDRange(blocks),
-                                cl::NullRange),
-                *(static_cast<cl::Buffer *>(output->buffer())),
-                0.0f, static_cast<int>(pixels), error);
-    MACE_CHECK(error == CL_SUCCESS);
-  } else {
-    auto output_shape = output->shape();
-    index_t batch = output_shape[0];
-    index_t channels = output_shape[1];
-    index_t pixels = output_shape[2] * output_shape[3];
-    index_t blocks = (pixels + 15) / 16;
-    MACE_CHECK(channels == bias->shape()[0], "Channels mismatch");
+  auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
+                                   int, int>(program, "conv_2d_1x1_naive");
+  const index_t pixels = height * width;
 
-    auto assign_bias =
-      cl::KernelFunctor<cl::Buffer, cl::Buffer, int>(program, "assign_3d_v16_f32");
-    cl_int error;
-    assign_bias(cl::EnqueueArgs(runtime->command_queue(),
-                                cl::NDRange(batch, channels, blocks),
-                                cl::NDRange(1, 8, 128)),
-                *(static_cast<cl::Buffer *>(output->buffer())),
-                *(static_cast<cl::Buffer *>(bias->buffer())),
-                static_cast<int>(pixels),
-                error);
-    MACE_CHECK(error == CL_SUCCESS);
-  }
-} 
+  cl_int error;
+  conv_2d(cl::EnqueueArgs(runtime->command_queue(),
+                          cl::NDRange(static_cast<int>(batch),
+                                      static_cast<int>(channels),
+                                      static_cast<int>(pixels)),
+                          cl::NDRange(1, 1, 128)),
+          *(static_cast<cl::Buffer *>(input->buffer())),
+          *(static_cast<cl::Buffer *>(filter->buffer())),
+          *(static_cast<cl::Buffer *>(bias->buffer())),
+          *(static_cast<cl::Buffer *>(output->buffer())),
+          static_cast<int>(input_channels),
+          error);
+  MACE_CHECK(error == CL_SUCCESS);
+}
 
-void Conv1x1NCHW(const Tensor *input,
-                  const Tensor *filter,
-                  Tensor *output) {
+void Conv1x1V2(const Tensor *input,
+               const Tensor *filter,
+               const Tensor *bias,
+               Tensor *output) {
   const index_t batch = output->shape()[0];
   const index_t channels = output->shape()[1];
   const index_t height = output->shape()[2];
@@ -61,25 +53,27 @@ void Conv1x1NCHW(const Tensor *input,
 
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
-  auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer,
-                                   int, int, int, int, int>(program, "conv_2d_1x1_nchw");
-  const index_t total_pixels = height * width;
+  auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
+                                   int, int, int, int>(program, "conv_2d_1x1_v2");
+  const index_t pixels = height * width;
+  const index_t channel_blocks = (channels + 3) / 4;
+  const index_t pixel_blocks = (pixels + 3) / 4;
 
-  for (int b = 0; b < batch; ++b) {
-    int input_offset = b * input_channels * total_pixels;
-    int output_offset = b * channels * total_pixels;
-    int chan_blk_num = (channels + 3) >> 2; // each 4 output channels
-    int pixel_blk_num = (total_pixels + 3) >> 2; // each 4 pixels
-    cl_int error;
-    conv_2d(cl::EnqueueArgs(runtime->command_queue(),
-                            cl::NDRange(chan_blk_num, pixel_blk_num),
-                            cl::NDRange(1, 256)),
-            *(static_cast<cl::Buffer *>(input->buffer())),
-            *(static_cast<cl::Buffer *>(filter->buffer())),
-            *(static_cast<cl::Buffer *>(output->buffer())),
-            input_offset, output_offset, total_pixels, input_channels, channels, error);
-    MACE_CHECK(error == CL_SUCCESS);
-  }
+  cl_int error;
+  conv_2d(cl::EnqueueArgs(runtime->command_queue(),
+                          cl::NDRange(static_cast<int>(batch),
+                                      static_cast<int>(channel_blocks),
+                                      static_cast<int>(pixel_blocks)),
+                          cl::NDRange(1, 1, 256)),
+          *(static_cast<cl::Buffer *>(input->buffer())),
+          *(static_cast<cl::Buffer *>(filter->buffer())),
+          *(static_cast<cl::Buffer *>(bias->buffer())),
+          *(static_cast<cl::Buffer *>(output->buffer())),
+          static_cast<int>(input_channels),
+          static_cast<int>(channels),
+          static_cast<int>(pixels),
+          error);
+  MACE_CHECK(error == CL_SUCCESS);
 }
 
 extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
@@ -95,8 +89,8 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
   MACE_CHECK(input_batch == batch && input_height == height &&
              input_width == width);
 
-  AssignBias(output, bias);
-  Conv1x1NCHW(input, filter, output);
+  // Conv1x1Naive(input, filter, bias, output);
+  Conv1x1V2(input, filter, bias, output);
 };
 
 }  // namespace kernels
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 5ee5f1ce221780783c9baf89f63a965988221ddd..caeac58a6052af80cf89e4dec4a087fc3f804f75 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -46,11 +46,13 @@ static void Conv2d(int iters,
   // Warm-up
   for (int i = 0; i < 5; ++i) {
     net.RunOp(D);
+    net.Sync();
   }
 
   mace::testing::StartTiming();
   while (iters--) {
     net.RunOp(D);
+    net.Sync();
   }
 }