Commit 84f34a23 authored by Jiaying Zhao, committed by GitHub

Merge pull request #1064 from smilejames/opencl

add opencl kernel of batchnorm, pool, fetch
......@@ -21,12 +21,67 @@ namespace operators {
template <>
bool BatchNormKernel<GPU_CL, float>::Init(BatchNormParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("batchnorm", "batchnorm_kernel.cl");
const framework::CLImage *mean = param->InputMean();
const framework::CLImage *variance = param->InputVariance();
const framework::CLImage *scale = param->InputScale();
const framework::CLImage *bias = param->InputBias();
const float epsilon = param->Epsilon();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
const int C = mean->numel();
float inv_std_ptr[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
float *new_scale_ptr = new float[C];
float *new_bias_ptr = new float[C];
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
}
framework::CLImage *new_scale = new framework::CLImage();
framework::CLImage *new_bias = new framework::CLImage();
// NOTE: the folded values in new_scale_ptr / new_bias_ptr still have to be
// uploaded into the two CLImages (e.g. through the CLImage tensor-data
// setter) before the host buffers are released; that wiring is not part of
// this diff yet.
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
delete[](new_scale_ptr);
delete[](new_bias_ptr);
return true;
}
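Init() folds the batch-norm statistics into a single per-channel scale and bias, so the kernel only has to compute out = in * new_scale + new_bias. A minimal host-side sanity check of that algebra (standalone C++, illustrative only, not part of the diff):

#include <cassert>
#include <cmath>

// Verifies new_scale * x + new_bias == (x - mean) / sqrt(var + eps) * scale + bias
int main() {
  const float x = 1.5f, mean = 0.2f, variance = 0.9f, scale = 1.3f, bias = -0.4f;
  const float epsilon = 1e-5f;
  const float inv_std = 1.0f / std::sqrt(variance + epsilon);
  const float new_scale = inv_std * scale;
  const float new_bias = bias - mean * inv_std * scale;
  const float folded = new_scale * x + new_bias;
  const float reference = (x - mean) * inv_std * scale + bias;
  assert(std::fabs(folded - reference) < 1e-5f);
  return 0;
}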
template <>
void BatchNormKernel<GPU_CL, float>::Compute(
const BatchNormParam<GPU_CL> &param) {}
const BatchNormParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.OutputY());
auto input = param.InputX()->GetCLImage();
auto out = param.OutputY()->GetCLImage();
auto new_scale = param.NewScale()->GetCLImage();
auto new_bias = param.NewBias()->GetCLImage();
const int out_height = param.OutputY()->HeightOfOneBlock();
const int out_width = param.OutputY()->WidthOfOneBlock();
clSetKernelArg(kernel, 0, sizeof(int), &out_height);
clSetKernelArg(kernel, 1, sizeof(int), &out_width);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale);
clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias);
clSetKernelArg(kernel, 5, sizeof(cl_mem), &out);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
}
template class BatchNormKernel<GPU_CL, float>;
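For reference, the kernels below index the half-precision RGBA image with get_global_id(0) as the channel block (C / 4), get_global_id(1) as the width, and get_global_id(2) as N * H, so DefaultWorkSize() presumably resolves to {(C + 3) / 4, W, N * H}. A hypothetical helper computing the same mapping from NCHW dims (the helper name and the assumption about DefaultWorkSize are mine, not from the diff):

#include <array>
#include <cstddef>

// Work-size layout the kernels expect: {channel blocks of 4, width, batch * height}.
std::array<size_t, 3> WorkSizeForNCHW(int n, int c, int h, int w) {
  return {static_cast<size_t>((c + 3) / 4), static_cast<size_t>(w),
          static_cast<size_t>(n * h)};
}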
......
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void batchnorm(__private const int out_height,
__private const int out_width,
__read_only image2d_t input,
__read_only image2d_t new_scale,
__read_only image2d_t new_bias,
__write_only image2d_t output) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
half4 scale = read_imageh(new_scale, sampler, (int2)(out_c, 0));
half4 bias = read_imageh(new_bias, sampler, (int2)(out_c, 0));
int pos_x = mad24(out_c, out_width, out_w);
half4 in = read_imageh(input, sampler, (int2)(pos_x, out_nh));
half4 out = mad(in, scale, bias);
write_imageh(output, (int2)(pos_x, out_nh), out);
}
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void fetch(__private const int in_height,
__private const int in_width,
__private const int size_ch,
__private const int size_block,
__private const int size_batch,
__read_only image2d_t input,
__global float* out) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
const int in_n = in_nh / in_height;
const int in_h = in_nh % in_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w);
half4 in = read_imageh(input, sampler, (int2)(pos_x, in_nh));
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_float(in.x);
out[index + size_ch] = convert_float(in.y);
out[index + size_ch * 2] = convert_float(in.z);
out[index + size_ch * 3] = convert_float(in.w);
}
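The fetch kernel unpacks the channel-blocked RGBA image (texel x = c_block * W + w, y = n * H + h, channel c in component c % 4) into a contiguous NCHW float buffer. A CPU reference of the same mapping, useful for unit-testing the OpenCL path (a sketch under the kernel's assumption that C is handled in blocks of four; the function name is illustrative):

// CPU reference of the fetch kernel's layout conversion.
void FetchReference(const float *image_rgba, float *out_nchw,
                    int N, int C, int H, int W) {
  const int c_blocks = (C + 3) / 4;    // texel columns per width block
  const int row_pitch = c_blocks * W;  // texels per image row
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
          const int x = (c / 4) * W + w;  // pos_x = mad24(in_c, in_width, in_w)
          const int y = n * H + h;        // in_nh
          const int texel = (y * row_pitch + x) * 4 + (c % 4);
          out_nchw[((n * C + c) * H + h) * W + w] = image_rgba[texel];
        }
      }
    }
  }
}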
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN_VALUE -FLT_MAX
__kernel void pool_max(
__private const int in_height, __private const int in_width,
__private const int out_height, __private const int out_width,
__private const int pad_top, __private const int pad_left,
__private const int stride_h, __private const int stride_w,
__private const int ksize_h, __private const int ksize_w,
__read_only image2d_t input, __write_only image2d_t output) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_height;
const int out_h = out_nh % out_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int start_h = out_h * stride_h - pad_top;
int end_h = min(start_h + ksize_h, in_height);
start_h = max(start_h, 0);
int start_w = out_w * stride_w - pad_left;
int end_w = min(start_w + ksize_w, in_width);
start_w = max(start_w, 0);
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
half4 max_value = (half4)(MIN_VALUE);
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
max_value = max(max_value, tmp);
}
}
const int pos_out_x = mad24(out_c, out_width, out_w);
write_imageh(output, (int2)(pos_out_x, out_nh), max_value);
}
__kernel void pool_avg(
__private const int in_height, __private const int in_width,
__private const int out_height, __private const int out_width,
__private const int pad_top, __private const int pad_left,
__private const int stride_h, __private const int stride_w,
__private const int ksize_h, __private const int ksize_w,
__read_only image2d_t input, __write_only image2d_t output) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_height;
const int out_h = out_nh % out_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int start_h = out_h * stride_h - pad_top;
int end_h = min(start_h + ksize_h, in_height);
start_h = max(start_h, 0);
int start_w = out_w * stride_w - pad_left;
int end_w = min(start_w + ksize_w, in_width);
start_w = max(start_w, 0);
const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height;
half4 sum = (half4)(0.0f);
int num = 0;
for (int y = start_h; y < end_h; ++y) {
for (int x = start_w; x < end_w; ++x) {
sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
num++;
}
}
half4 avg = sum / (half)num;
const int pos_out_x = mad24(out_c, out_width, out_w);
write_imageh(output, (int2)(pos_out_x, out_nh), avg);
}
\ No newline at end of file
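Both pooling kernels clip the window against the padded borders before reducing: the window end is derived from the unclipped start so padding shrinks the window instead of shifting it. A CPU reference for one output element of pool_max, matching that clipping order (illustrative sketch, not part of the diff; PoolMaxReference is a made-up name):

#include <algorithm>
#include <cfloat>

// Max over the pooling window of one (out_h, out_w) position in one channel plane.
float PoolMaxReference(const float *plane, int in_height, int in_width,
                       int out_h, int out_w, int pad_top, int pad_left,
                       int stride_h, int stride_w, int ksize_h, int ksize_w) {
  int start_h = out_h * stride_h - pad_top;
  int end_h = std::min(start_h + ksize_h, in_height);
  start_h = std::max(start_h, 0);
  int start_w = out_w * stride_w - pad_left;
  int end_w = std::min(start_w + ksize_w, in_width);
  start_w = std::max(start_w, 0);
  float max_value = -FLT_MAX;
  for (int y = start_h; y < end_h; ++y) {
    for (int x = start_w; x < end_w; ++x) {
      max_value = std::max(max_value, plane[y * in_width + x]);
    }
  }
  return max_value;
}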
......@@ -19,11 +19,45 @@ namespace operators {
template <>
bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
return true;
}
template <>
void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {}
void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX());
auto input = param.InputX()->GetCLImage();
auto *out = param.Out();
const auto &dims = param.InputX()->dims();
const int N = dims[0];
const int C = dims[1];
const int in_height = dims[2];
const int in_width = dims[3];
int size_ch = in_height * in_width;
int size_block = size_ch * 4;
int size_batch = size_ch * C;
// The kernel writes into a plain device buffer, not an image, so create the
// buffer here (no image format is needed). The size assumes one float per
// element of the N x C x H x W output with C a multiple of 4, matching the
// kernel's indexing; cl_helper_ is assumed to expose the CL context via
// CLContext(), mirroring CLCommandQueue() below.
cl_mem outputBuffer =
clCreateBuffer(this->cl_helper_.CLContext(), CL_MEM_WRITE_ONLY,
sizeof(float) * N * C * in_height * in_width, NULL, NULL);
clSetKernelArg(kernel, 0, sizeof(int), &in_height);
clSetKernelArg(kernel, 1, sizeof(int), &in_width);
clSetKernelArg(kernel, 2, sizeof(int), &size_ch);
clSetKernelArg(kernel, 3, sizeof(int), &size_block);
clSetKernelArg(kernel, 4, sizeof(int), &size_batch);
clSetKernelArg(kernel, 5, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
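// Sketch of the still-missing read-back: copy the kernel result into the
// host tensor and release the temporary buffer. Assumes framework::Tensor
// provides Resize() and mutable_data<float>(); adjust to the actual API.
out->Resize(dims);
float *out_ptr = out->mutable_data<float>();
clEnqueueReadBuffer(this->cl_helper_.CLCommandQueue(), outputBuffer, CL_TRUE,
0, sizeof(float) * N * C * in_height * in_width, out_ptr,
0, NULL, NULL);
clReleaseMemObject(outputBuffer);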
}
template class FetchKernel<GPU_CL, float>;
......
......@@ -21,11 +21,51 @@ namespace operators {
template <>
bool PoolKernel<GPU_CL, float>::Init(PoolParam<GPU_CL> *param) {
std::string pooling_type = param->PoolingType();
this->cl_helper_.AddKernel("pool_" + pooling_type, "pool_kernel.cl");
return true;
}
template <>
void PoolKernel<GPU_CL, float>::Compute(const PoolParam<GPU_CL> &param) {}
void PoolKernel<GPU_CL, float>::Compute(const PoolParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
auto input = param.Input()->GetCLImage();
auto out = param.Output()->GetCLImage();
const int in_height = param.Input()->HeightOfOneBlock();
const int in_width = param.Input()->WidthOfOneBlock();
const int out_height = param.Output()->HeightOfOneBlock();
const int out_width = param.Output()->WidthOfOneBlock();
std::string pooling_type = param.PoolingType();
std::vector<int> ksize = param.Ksize();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
const int pad_top = paddings[0];
const int pad_left = paddings[1];
const int stride_h = strides[0];
const int stride_w = strides[1];
const int ksize_h = ksize[0];
const int ksize_w = ksize[1];
clSetKernelArg(kernel, 0, sizeof(cl_int), &in_height);
clSetKernelArg(kernel, 1, sizeof(cl_int), &in_width);
clSetKernelArg(kernel, 2, sizeof(cl_int), &out_height);
clSetKernelArg(kernel, 3, sizeof(cl_int), &out_width);
clSetKernelArg(kernel, 4, sizeof(cl_int), &pad_top);
clSetKernelArg(kernel, 5, sizeof(cl_int), &pad_left);
clSetKernelArg(kernel, 6, sizeof(cl_int), &stride_h);
clSetKernelArg(kernel, 7, sizeof(cl_int), &stride_w);
clSetKernelArg(kernel, 8, sizeof(cl_int), &ksize_h);
clSetKernelArg(kernel, 9, sizeof(cl_int), &ksize_w);
clSetKernelArg(kernel, 10, sizeof(cl_mem), &input);
clSetKernelArg(kernel, 11, sizeof(cl_mem), &out);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
}
template class PoolKernel<GPU_CL, float>;
......
......@@ -614,6 +614,14 @@ class BatchNormParam : OpParam {
const string &DataFormat() const { return data_format_; }
void SetNewScale(RType *new_scale) { new_scale_ = new_scale; }
void SetNewBias(RType *new_bias) { new_bias_ = new_bias; }
const RType *NewScale() const { return new_scale_; }
const RType *NewBias() const { return new_bias_; }
private:
RType *input_x_;
RType *output_y_;
......@@ -625,6 +633,8 @@ class BatchNormParam : OpParam {
float momentum_;
bool is_test_;
string data_format_;
RType *new_bias_;
RType *new_scale_;
};
#endif
......@@ -936,14 +946,18 @@ class FetchParam : public OpParam {
FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
out_ = OutFrom(outputs, scope);
}
const RType *InputX() const { return input_x_; }
RType *Out() const { return out_; }
Tensor *Out() const { return out_; }
static Tensor *OutFrom(const VariableNameMap &outputs, const Scope &scope) {
return GetVarValue<Tensor>("Out", outputs, scope);
}
private:
RType *input_x_;
RType *out_;
Tensor *out_;
};
#ifdef TRANSPOSE_OP
......