Add opencl kernel with image example

e4455322 · Liangliang He · 546926a1 · e4455322 · e4455322 · e4455322
5 changed file
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -26,6 +26,7 @@ bool ReadSourceFile(const std::string &filename, std::string *content) {
  std::string line;
  while (std::getline(ifs, line)) {
    *content += line;
+    *content += "\n";
  }
  ifs.close();
  return true;
@@ -66,14 +67,15 @@ bool BuildProgram(OpenCLRuntime *runtime,
  *program = cl::Program(runtime->context(), sources);
  std::string build_options = "-Werror -cl-mad-enable -cl-fast-relaxed-math -I" + path;
  // TODO(heliangliang) -cl-unsafe-math-optimizations -cl-fast-relaxed-math
-  if (program->build({runtime->device()}, build_options.c_str()) != CL_SUCCESS) {
+  cl_int ret = program->build({runtime->device()}, build_options.c_str());
+  if (ret != CL_SUCCESS) {
    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(runtime->device()) ==
        CL_BUILD_ERROR) {
      std::string build_log =
          program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(runtime->device());
      LOG(INFO) << "Program build log: " << build_log;
    }
-    LOG(FATAL) << "Build program failed";
+    LOG(FATAL) << "Build program failed: " << ret;
  }
  return true;

--- a/mace/core/runtime/opencl/opencl_wrapper.cc
+++ b/mace/core/runtime/opencl/opencl_wrapper.cc
@@ -126,6 +126,12 @@ class OpenCLLibraryImpl final {
  using clRetainKernelFunc = cl_int (*)(cl_kernel kernel);
  using clCreateBufferFunc =
      cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
+  using clCreateImageFunc = cl_mem (*)(cl_context,
+                                       cl_mem_flags,
+                                       const cl_image_format *,
+                                       const cl_image_desc *,
+                                       void *,
+                                       cl_int *);
  using clCreateProgramWithSourceFunc = cl_program (*)(
      cl_context, cl_uint, const char **, const size_t *, cl_int *);
  using clReleaseKernelFunc = cl_int (*)(cl_kernel kernel);
@@ -136,8 +142,12 @@ class OpenCLLibraryImpl final {
  using clRetainDeviceFunc = cl_int (*)(cl_device_id);
  using clReleaseDeviceFunc = cl_int (*)(cl_device_id);
  using clRetainEventFunc = cl_int (*)(cl_event);
-  using clGetKernelWorkGroupInfoFunc =
+  using clGetKernelWorkGroupInfoFunc = cl_int (*)(cl_kernel,
-  cl_int (*)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *);
+                                                  cl_device_id,
+                                                  cl_kernel_work_group_info,
+                                                  size_t,
+                                                  void *,
+                                                  size_t *);
 #define DEFINE_FUNC_PTR(func) func##Func func = nullptr
@@ -149,6 +159,7 @@ class OpenCLLibraryImpl final {
  DEFINE_FUNC_PTR(clReleaseKernel);
  DEFINE_FUNC_PTR(clCreateProgramWithSource);
  DEFINE_FUNC_PTR(clCreateBuffer);
+  DEFINE_FUNC_PTR(clCreateImage);
  DEFINE_FUNC_PTR(clRetainKernel);
  DEFINE_FUNC_PTR(clCreateKernel);
  DEFINE_FUNC_PTR(clGetProgramInfo);
@@ -269,6 +280,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
  ASSIGN_FROM_DLSYM(clReleaseKernel);
  ASSIGN_FROM_DLSYM(clCreateProgramWithSource);
  ASSIGN_FROM_DLSYM(clCreateBuffer);
+  ASSIGN_FROM_DLSYM(clCreateImage);
  ASSIGN_FROM_DLSYM(clRetainKernel);
  ASSIGN_FROM_DLSYM(clCreateKernel);
  ASSIGN_FROM_DLSYM(clGetProgramInfo);
@@ -708,6 +720,24 @@ cl_mem clCreateBuffer(cl_context context,
  }
 }
+cl_mem clCreateImage(cl_context context,
+                     cl_mem_flags flags,
+                     const cl_image_format *image_format,
+                     const cl_image_desc *image_desc,
+                     void *host_ptr,
+                     cl_int *errcode_ret) {
+  auto func = mace::OpenCLLibraryImpl::Get().clCreateImage;
+  if (func != nullptr) {
+    return func(context, flags, image_format, image_desc, host_ptr,
+                errcode_ret);
+  } else {
+    if (errcode_ret != nullptr) {
+      *errcode_ret = CL_OUT_OF_RESOURCES;
+    }
+    return nullptr;
+  }
+}
 cl_program clCreateProgramWithSource(cl_context context,
                                     cl_uint count,
                                     const char **strings,
@@ -795,8 +825,8 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
                                size_t *param_value_size_ret) {
  auto func = mace::OpenCLLibraryImpl::Get().clGetKernelWorkGroupInfo;
  if (func != nullptr) {
-    return func(kernel, device, param_name, param_value_size,
+    return func(kernel, device, param_name, param_value_size, param_value,
-                param_value, param_value_size_ret);
+                param_value_size_ret);
  } else {
    return CL_OUT_OF_RESOURCES;
  }

--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
-void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */
+#include <common.h>
-                              global const float *filter, /* o, i, kh, kw */
-                              global const float *bias, /* o */
+__kernel void conv_2d_1x1_naive(__global const float *input, /* n, c, h, w */
-                              global float *output, /* n, c, h, w */
+                                __global const float *filter, /* o, i, kh, kw */
-                              private const int input_channels) {
+                                __global const float *bias, /* o */
+                                __global float *output, /* n, c, h, w */
+                                __private const int in_chan_num) {
  const int batch = get_global_id(0);
  const int channel = get_global_id(1);
  const int channels = get_global_size(1);
  const int pixel = get_global_id(2);
  const int pixels = get_global_size(2);
  float *output_ptr = output + (batch * channels + channel) * pixels;
  output_ptr[pixel] = bias[channel];
-  for (int inc = 0; inc < input_channels; ++inc) {
+  for (int inc = 0; inc < in_chan_num; ++inc) {
-    const float *input_ptr = input + (batch * input_channels + inc) * pixels + pixel;
+    const float *input_ptr = input + (batch * in_chan_num + inc) * pixels + pixel;
-    const float weights = filter[channel * input_channels + inc];
+    const float weights = filter[channel * in_chan_num + inc];
    float in = input_ptr[0];
    float out = output_ptr[0];
    out += in * weights;
@@ -23,17 +24,19 @@ void kernel conv_2d_1x1_naive(global const float *input, /* n, c, h, w */
  }
 }
-void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
+__kernel void conv_2d_1x1_v2(__global const float *input, /* n, c, h, w */
-                           global const float *filter, /* o, i, kh, kw */
+                             __global const float *filter, /* o, i, kh, kw */
-                           global const float *bias, /* o */
+                             __global const float *bias, /* o */
-                           global float *output, /* n, c, h, w */
+                             __global float *output, /* n, c, h, w */
-                           private const int in_chan_num,
+                             __private const int in_chan_num,
-                           private const int out_chan_num,
+                             __private const int out_chan_num,
-                           private const int pixel_num) {
+                             __private const int pixel_num) {
  int batch = get_global_id(0);
  int out_chan_blk = get_global_id(1);
  int out_pixel_blk = get_global_id(2);
+  __private float output_slice[4 * 4];
  const int out_chan_begin = out_chan_blk * 4;
  const int out_chan_end = min(out_chan_begin + 4, out_chan_num);
  const int out_pixel_begin = out_pixel_blk * 4;
@@ -41,21 +44,23 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
  const int in_offset = batch * in_chan_num * pixel_num;
  const int out_offset = batch * out_chan_num * pixel_num;
  const float *input_base = input + in_offset + out_pixel_begin;
  float *output_base = output + out_offset + out_pixel_begin;
-  int pixels = out_pixel_end - out_pixel_begin;
+  int out_chan_len = out_chan_end - out_chan_begin;
+  int pixel_len = out_pixel_end - out_pixel_begin;
  for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
    float bias_value = bias[out_chan];
-    float *output_ptr = output_base + out_chan * pixel_num;
+    int out_chan_offset = out_chan - out_chan_begin;
-    for (int p = 0; p < pixels; ++p) {
+    for (int p = 0; p < pixel_len; ++p) {
-      output_ptr[p] = bias_value;
+      output_slice[out_chan_offset * 4 + p] = bias_value;
    }
  }
  int in_chan = 0;
-  if (pixels == 4) {
+  if (pixel_len == 4) {
    for (; in_chan + 3 < in_chan_num; in_chan += 4) {
      const float *input_ptr = input_base + in_chan * pixel_num;
      int out_chan = out_chan_begin;
@@ -66,30 +71,32 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
        float4 in1 = vload4(0, input_ptr + pixel_num);
        float4 in2 = vload4(0, input_ptr + 2 * pixel_num);
        float4 in3 = vload4(0, input_ptr + 3 * pixel_num);
+        #pragma unroll
        for (int oc = 0; oc < 4; ++oc) {
          float4 weights = vload4(0, filter_ptr + oc * in_chan_num);
-          float4 out = vload4(0, output_ptr + oc * pixel_num);
+          float4 out = vload4(oc, output_slice);
          out += in0 * weights.x;
          out += in1 * weights.y;
          out += in2 * weights.z;
          out += in3 * weights.w;
-          vstore4(out, 0, output_ptr + oc * pixel_num);
+          vstore4(out, oc, output_slice);
        }
      }
      for (; out_chan < out_chan_end; ++out_chan) {
        const float* filter_ptr = filter + out_chan * in_chan_num + in_chan;
        float *output_ptr = output_base + out_chan * pixel_num;
+        int out_chan_offset = out_chan - out_chan_begin;
        float4 weights = vload4(0, filter_ptr);
        float4 in0 = vload4(0, input_ptr);
        float4 in1 = vload4(0, input_ptr + pixel_num);
        float4 in2 = vload4(0, input_ptr + 2 * pixel_num);
        float4 in3 = vload4(0, input_ptr + 3 * pixel_num);
-        float4 out = vload4(0, output_ptr);
+        float4 out = vload4(out_chan_offset, output_slice);
        out += in0 * weights.x;
        out += in1 * weights.y;
        out += in2 * weights.z;
        out += in3 * weights.w;
-        vstore4(out, 0, output_ptr);
+        vstore4(out, out_chan_offset, output_slice);
      }
    }
  }
@@ -99,13 +106,71 @@ void kernel conv_2d_1x1_v2(global const float *input, /* n, c, h, w */
    for (int out_chan = out_chan_begin; out_chan < out_chan_end; ++out_chan) {
      float weights = filter[out_chan * in_chan_num + in_chan];
      float *output_ptr = output_base + out_chan * pixel_num;
+      int out_chan_offset = out_chan - out_chan_begin;
-      for (int p = 0; p < pixels; ++p) {
+      for (int p = 0; p < pixel_len; ++p) {
        float in = input_ptr[p];
-        float out = output_ptr[p];
+        output_slice[out_chan_offset * 4 + p] += in * weights;
-        out += in * weights;
+      }
-        output_ptr[p] = out;
+    }
+  }
+  for (int out_chan_offset = 0; out_chan_offset < out_chan_len; ++out_chan_offset) {
+    int out_chan = out_chan_begin + out_chan_offset;
+    float *output_ptr = output_base + out_chan * pixel_num;
+    if (pixel_len == 4) {
+      float4 out = vload4(out_chan_offset, output_slice);
+      vstore4(out, 0, output_ptr);
+    } else {
+      int offset = out_chan_offset << 2;
+      for (int p = 0; p < pixel_len; ++p) {
+        output_ptr[p] = output_slice[offset + p];
+      }
+    }
  }
+}
+/* FIXME this is incomplete */
+__kernel void conv_2d_1x1_v3(__read_only image3d_t input, /* n, c/4, h, w, 4 */
+                             __global const float *filter, /* o, i, kh, kw */
+                             __global const float *bias, /* o */
+                             __write_only image3d_t output, /* n, c/4, h, w, 4 */
+                             __private const int batch_num,
+                             __private const int in_chan_num,
+                             __private const int out_chan_num,
+                             __private const int height,
+                             __private const int width) {
+  int out_chan_blk = get_global_id(0);
+  int h = get_global_id(1);
+  int w = get_global_id(2);
+  int in_chan_blk_num = (in_chan_num + 3) / 4;
+  int out_chan_blk_num = (out_chan_num + 3) / 4;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  for (int batch = 0; batch < batch_num; ++batch) {
+    float4 bias_value = vload4(out_chan_blk, bias);
+    __private float4 out = bias_value;
+    for (int in_chan_blk = 0; in_chan_blk < in_chan_blk_num; ++in_chan_blk) {
+      int in_d = batch * in_chan_blk_num + in_chan_blk;
+      float4 in = read_imagef(input, sampler, (int4)(in_d, h, w, 0));
+      const float *filter_base = filter + (out_chan_blk << 2) * in_chan_num;
+      float4 weights = vload4(in_chan_blk, filter_base);
+      out.x += dot(in, weights);
+      weights = vload4(in_chan_blk, filter_base + in_chan_num);
+      out.y += dot(in, weights);
+      weights = vload4(in_chan_blk, filter_base + in_chan_num * 2);
+      out.z += dot(in, weights);
+      weights = vload4(in_chan_blk, filter_base + in_chan_num * 3);
+      out.w += dot(in, weights);
    }
+    int out_d = batch * out_chan_blk_num + out_chan_blk;
+    int4 out_coord = (int4)(out_d, h, w, 0);
+    write_imagef(output, out_coord, out);
  }
 }
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -2,9 +2,10 @@
 // Copyright (c) 2017 XiaoMi All rights reserved.
 //
+#include "mace/kernels/conv_2d.h"
 #include "mace/core/common.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
-#include "mace/kernels/conv_2d.h"
 #include "mace/utils/utils.h"
 namespace mace {
@@ -22,22 +23,22 @@ void Conv1x1Naive(const Tensor *input,
  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();
-  auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
+  auto conv_2d =
-                                   int, int>(program, "conv_2d_1x1_naive");
+      cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, int,
+                        int>(program, "conv_2d_1x1_naive");
  const index_t pixels = height * width;
  cl_int error;
-  conv_2d(cl::EnqueueArgs(runtime->command_queue(),
+  conv_2d(cl::EnqueueArgs(
-                          cl::NDRange(static_cast<int>(batch),
+              runtime->command_queue(),
-                                      static_cast<int>(channels),
+              cl::NDRange(static_cast<int>(batch), static_cast<int>(channels),
                          static_cast<int>(pixels)),
              cl::NDRange(1, 1, 128)),
          *(static_cast<cl::Buffer *>(input->buffer())),
          *(static_cast<cl::Buffer *>(filter->buffer())),
          *(static_cast<cl::Buffer *>(bias->buffer())),
          *(static_cast<cl::Buffer *>(output->buffer())),
-          static_cast<int>(input_channels),
+          static_cast<int>(input_channels), error);
-          error);
  MACE_CHECK(error == CL_SUCCESS);
 }
@@ -53,31 +54,126 @@ void Conv1x1V2(const Tensor *input,
  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();
-  auto conv_2d = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer,
-                                   int, int, int, int>(program, "conv_2d_1x1_v2");
  const index_t pixels = height * width;
  const index_t channel_blocks = (channels + 3) / 4;
  const index_t pixel_blocks = (pixels + 3) / 4;
-  cl_int error;
+  // TODO KernelFunctor has an extra clReleaseCommandQueue due to a copy
-  conv_2d(cl::EnqueueArgs(runtime->command_queue(),
+  // TODO check wired clReleaseCommandQueue latency
-                          cl::NDRange(static_cast<int>(batch),
+  // The KernelFunctor can cause segment faults in cb_retain_event
-                                      static_cast<int>(channel_blocks),
+  auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v2");
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
+  uint32_t idx = 0;
+  conv_2d_kernel.setArg(idx++,
+                        *(static_cast<const cl::Buffer *>(input->buffer())));
+  conv_2d_kernel.setArg(idx++,
+                        *(static_cast<const cl::Buffer *>(filter->buffer())));
+  conv_2d_kernel.setArg(idx++,
+                        *(static_cast<const cl::Buffer *>(bias->buffer())));
+  conv_2d_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channels));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(channels));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(pixels));
+  auto command_queue = runtime->command_queue();
+  cl_int error = command_queue.enqueueNDRangeKernel(
+      conv_2d_kernel, cl::NullRange,
+      cl::NDRange(static_cast<int>(batch), static_cast<int>(channel_blocks),
                  static_cast<int>(pixel_blocks)),
-                          cl::NDRange(1, 1, 256)),
+      cl::NDRange(1, 2, kwg_size / 2));
-          *(static_cast<cl::Buffer *>(input->buffer())),
+  MACE_CHECK(error == CL_SUCCESS, error);
-          *(static_cast<cl::Buffer *>(filter->buffer())),
+}
-          *(static_cast<cl::Buffer *>(bias->buffer())),
-          *(static_cast<cl::Buffer *>(output->buffer())),
+void Conv1x1V3(const Tensor *input,
-          static_cast<int>(input_channels),
+               const Tensor *filter,
-          static_cast<int>(channels),
+               const Tensor *bias,
-          static_cast<int>(pixels),
+               Tensor *output) {
-          error);
+  const index_t batch = output->shape()[0];
+  const index_t channels = output->shape()[1];
+  const index_t height = output->shape()[2];
+  const index_t width = output->shape()[3];
+  const index_t input_channels = input->shape()[1];
+  auto runtime = OpenCLRuntime::Get();
+  auto program = runtime->program();
+  const index_t pixels = height * width;
+  const index_t pixel_blocks = (pixels + 3) / 4;
+  const index_t channel_blocks = (channels + 3) / 4;
+  const index_t input_channel_blocks = (input_channels + 3) / 4;
+  // FIXME temp hacking
+  static std::map<std::ptrdiff_t, cl::Image3D> input_image_map;
+  static std::map<std::ptrdiff_t, cl::Image3D> output_image_map;
+  cl::Image3D input_image;
+  cl::Image3D output_image;
+  auto input_iter =
+      input_image_map.find(reinterpret_cast<std::ptrdiff_t>(input->buffer()));
+  if (input_iter != input_image_map.end()) {
+    input_image = input_iter->second;
+  } else {
+    // The batch dimension is collapsed with channel
+    cl_int error;
+    cl::Image3D image =
+        cl::Image3D(OpenCLRuntime::Get()->context(),
+                    CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                    cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width,
+                    batch * input_channel_blocks, 0, 0, nullptr, &error);
    MACE_CHECK(error == CL_SUCCESS);
+    input_image = image;
+    input_image_map.clear();
+    input_image_map.emplace(reinterpret_cast<std::ptrdiff_t>(input->buffer()),
+                            image);
+  }
+  auto output_iter =
+      output_image_map.find(reinterpret_cast<std::ptrdiff_t>(output->buffer()));
+  if (output_iter != output_image_map.end()) {
+    output_image = output_iter->second;
+  } else {
+    cl_int error;
+    cl::Image3D image =
+        cl::Image3D(OpenCLRuntime::Get()->context(),
+                    CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                    cl::ImageFormat(CL_RGBA, CL_FLOAT), height, width,
+                    batch * channel_blocks, 0, 0, nullptr, &error);
+    MACE_CHECK(error == CL_SUCCESS);
+    output_image = image;
+    output_image_map.clear();
+    output_image_map.emplace(reinterpret_cast<std::ptrdiff_t>(output->buffer()),
+                             image);
+  }
+  auto conv_2d_kernel = cl::Kernel(program, "conv_2d_1x1_v3");
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
+  uint32_t idx = 0;
+  conv_2d_kernel.setArg(idx++, input_image);
+  conv_2d_kernel.setArg(idx++,
+                        *(static_cast<const cl::Buffer *>(filter->buffer())));
+  conv_2d_kernel.setArg(idx++,
+                        *(static_cast<const cl::Buffer *>(bias->buffer())));
+  conv_2d_kernel.setArg(idx++, output_image);
+  conv_2d_kernel.setArg(idx++, static_cast<int>(batch));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channels));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(channels));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(width));
+  auto command_queue = runtime->command_queue();
+  cl_int error;
+  error = command_queue.enqueueNDRangeKernel(
+      conv_2d_kernel, cl::NullRange,
+      cl::NDRange(static_cast<int>(channel_blocks), static_cast<int>(height),
+                  static_cast<int>(width)),
+      cl::NDRange(1, 2, kwg_size / 2));
+  MACE_CHECK(error == CL_SUCCESS, error);
 }
-extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter,
+extern void Conv2dOpenclK1x1S1(const Tensor *input,
-                               const Tensor *bias, Tensor *output) {
+                               const Tensor *filter,
+                               const Tensor *bias,
+                               Tensor *output) {
  const index_t batch = output->shape()[0];
  const index_t height = output->shape()[2];
  const index_t width = output->shape()[3];