pool_kernel.cl 2.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define MIN_VALUE -FLT_MAX

__kernel void pool_max(
    __private const int in_height, __private const int in_width,
    __private const int out_height, __private const int out_width,
    __private const int pad_top, __private const int pad_left,
    __private const int stride_h, __private const int stride_w,
    __private const int ksize_h, __private const int ksize_w,
    __read_only image2d_t input, __write_only image2d_t output) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
  const int out_n = out_nh / out_height;
  const int out_h = out_nh % out_height;

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  int start_h = max(out_h * stride_h - pad_top, 0);
  int end_h = min(start_h + ksize_h, in_height);

  int start_w = max(out_w * stride_w - pad_left, 0);
  int end_w = min(start_w + ksize_w, in_width);

  const int pos_in_x = out_c * in_width;
  const int pos_in_y = out_n * in_height;
  half4 max_value = (half4)(MIN_VALUE);
  for (int y = start_h; y < end_h; ++y) {
    for (int x = start_w; x < end_w; ++x) {
      half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
      max_value = max(max_value, tmp);
    }
  }

  const int pos_out_x = mad24(out_c, out_width, out_w);
  write_imageh(output, (int2)(pos_out_x, out_nh), max_value);
}

__kernel void pool_avg(
    __private const int in_height, __private const int in_width,
    __private const int out_height, __private const int out_width,
    __private const int pad_top, __private const int pad_left,
    __private const int stride_h, __private const int stride_w,
    __private const int ksize_h, __private const int ksize_w,
    __read_only image2d_t input, __write_only image2d_t output) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
  const int out_n = out_nh / out_height;
  const int out_h = out_nh % out_height;

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  int start_h = max(out_h * stride_h - pad_top, 0);
  int end_h = min(start_h + ksize_h, in_height);

  int start_w = max(out_w * stride_w - pad_left, 0);
  int end_w = min(start_w + ksize_w, in_width);

  const int pos_in_x = out_c * in_width;
  const int pos_in_y = out_n * in_height;
  half4 sum = (half4)(0.0f);
  int num = 0;
  for (int y = start_h; y < end_h; ++y) {
    for (int x = start_w; x < end_w; ++x) {
      sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
      num++;
    }
  }
  half4 avg = sum / num;
  const int pos_out_x = mad24(out_c, out_width, out_w);
  write_imageh(output, (int2)(pos_out_x, out_nh), avg);
}