From 28d075221a5b106b764de26b1274ec777caa714d Mon Sep 17 00:00:00 2001
From: Liangliang He <lliang.he@gmail.com>
Date: Mon, 23 Oct 2017 20:38:31 +0800
Subject: [PATCH] Improve opencl assign operation

---
 mace/kernels/opencl/cl/assign_f32.cl      | 43 ++++++++++++++++++-----
 mace/kernels/opencl/conv_2d_opencl_1x1.cc | 18 +++++-----
 2 files changed, 43 insertions(+), 18 deletions(-)
diff --git a/mace/kernels/opencl/cl/assign_f32.cl b/mace/kernels/opencl/cl/assign_f32.cl
index c73036ee..cceaace1 100644
--- a/mace/kernels/opencl/cl/assign_f32.cl
+++ b/mace/kernels/opencl/cl/assign_f32.cl
@@ -1,17 +1,42 @@
-void kernel assign_f32(global float *vec, private const float value) {
-  int idx = get_global_id(0);
-  vec[idx] = value;
+void kernel assign_v16_f32(global float *output,
+                           private const float value,
+                           private const int pixels) {
+  int pixel_block = get_global_id(0);
+  int pixel_offset = pixel_block * 16;
+
+  float *output_ptr = output + pixel_offset;
+  int remains = pixels - pixel_offset;
+  if (remains >= 16) {
+    for (int i = 0; i < 4; ++i) {
+      vstore4(value, i, output_ptr);
+    }
+  } else {
+    for (int i = 0; i < remains; ++i) {
+      output_ptr[i] = value;
+    }
+  }
 }
 
-void kernel assign_vec_f32(global float *vec,
-                           global float *values,
-                           private int pixels) {
+void kernel assign_3d_v16_f32(global float *output,
+                              global const float *values,
+                              private const int pixels) {
   int batch = get_global_id(0);
   int channel = get_global_id(1);
   int channels = get_global_size(1);
+  int pixel_block = get_global_id(2);
+  int pixel_offset = pixel_block * 16;
+
   float value = values[channel];
-  float *ptr = vec + (batch * channels + channel) * pixels;
-  for (int i = 0; i < pixels; ++i) {
-    ptr[i] = value;
+  float *output_ptr = output + (batch * channels + channel) * pixels +
+                      pixel_offset;
+  int remains = pixels - pixel_offset;
+  if (remains >= 16) {
+    for (int i = 0; i < 4; ++i) {
+      vstore4(value, i, output_ptr);
+    }
+  } else {
+    for (int i = 0; i < remains; ++i) {
+      output_ptr[i] = value;
+    }
   }
 }
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 3d2247cf..d37fb293 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -13,35 +13,35 @@ namespace kernels {
 static constexpr index_t kInputChannelBlockSize = 2;
 static constexpr index_t kOutputChannelBlockSize = 4;
 
-// TODO(heliangliang) fix bad performance
 void AssignBias(Tensor *output, const Tensor *bias) {
-
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
   if (bias == nullptr) {
     auto assign_bias =
-      cl::KernelFunctor<cl::Buffer, float>(program, "assign_f32");
-    int global_size = output->NumElements();
+      cl::KernelFunctor<cl::Buffer, float, int>(program, "assign_v16_f32");
+    index_t pixels = output->NumElements();
+    index_t blocks = (pixels + 15) / 16;
     cl_int error;
     assign_bias(cl::EnqueueArgs(runtime->command_queue(),
-                                cl::NDRange(global_size),
+                                cl::NDRange(blocks),
                                 cl::NullRange),
                 *(static_cast<cl::Buffer *>(output->buffer())),
-                0.0f, error);
+                0.0f, static_cast<int>(pixels), error);
     MACE_CHECK(error == CL_SUCCESS);
   } else {
     auto output_shape = output->shape();
     index_t batch = output_shape[0];
     index_t channels = output_shape[1];
     index_t pixels = output_shape[2] * output_shape[3];
+    index_t blocks = (pixels + 15) / 16;
     MACE_CHECK(channels == bias->shape()[0], "Channels mismatch");
 
     auto assign_bias =
-      cl::KernelFunctor<cl::Buffer, cl::Buffer, int>(program, "assign_vec_f32");
+      cl::KernelFunctor<cl::Buffer, cl::Buffer, int>(program, "assign_3d_v16_f32");
     cl_int error;
     assign_bias(cl::EnqueueArgs(runtime->command_queue(),
-                                cl::NDRange(batch, channels),
-                                cl::NullRange),
+                                cl::NDRange(batch, channels, blocks),
+                                cl::NDRange(1, 8, 128)),
                 *(static_cast<cl::Buffer *>(output->buffer())),
                 *(static_cast<cl::Buffer *>(bias->buffer())),
                 static_cast<int>(pixels),
-- 
GitLab