[MS][LITE] opencl bug fix for pooling

4ad86f42 · chenzomi · 2cab58fd · 4ad86f42 · 4ad86f42 · 4ad86f42
3 changed file
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
 __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
-                           const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
+                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
  int Y = get_global_id(1);
@@ -32,8 +32,7 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

 __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
-                                  const int4 output_shape, const int2 stride, const int2 kernel_size,
-                                  const int2 padding) {
+                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
  int Y = get_global_id(1);
@@ -49,17 +48,16 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
  int xs = X * stride.x + padding.x;
  int ys = Y * stride.y + padding.y;

-  for (int kx = 0; kx < kernel_size.x; ++kx) {
-    int x_c = xs + kx;
-    bool outside_x = x_c < 0 || x_c >= input_shape.x;
-    for (int ky = 0; ky < kernel_size.y; ++ky) {
-      int y_c = ys + ky;
-      bool outside = outside_x || y_c < 0 || y_c >= input_shape.y;
-
-      r += read_imagef(input, smp_zero, (int2)(x_c, y_c * input_shape.w + Z));
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = ys + ky;
+    bool outside_y = y_c < 0 || y_c >= input_shape.y;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = xs + kx;
+      bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
+      r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
      window_size += !outside ? 1.0f : 0.0f;
    }
  }
  float4 result = convert_float4(r / window_size);
-  write_imagef(output, (int2)(X, Y * output_shape.w + Z), result);
+  write_imagef(output, (int2)(Y * output_shape.w + Z, X), result);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl
@@ -31,7 +31,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
  output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum;
 }

-__constant sampler_t sample_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

 __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
@@ -48,20 +48,15 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d
  float4 maximum = (float4)(-10000.0f);
  int xs = X * stride.x + padding.x;
  int ys = Y * stride.y + padding.y;
-
-  for (int kx = 0; kx < kernel_size.x; ++kx) {
-    int x_c = xs + kx;
-    if (x_c < 0 || x_c >= input_shape.x) {
-      continue;
-    }
-    for (int ky = 0; ky < kernel_size.y; ++ky) {
-      int y_c = ys + ky;
-      if (y_c < 0 || y_c >= input_shape.y) {
-        continue;
-      }
-      float4 src = read_imagef(input, sample_none, (int2)(x_c, y_c * input_shape.w + Z));
+  for (int ky = 0; ky < kernel_size.y; ++ky) {
+    int y_c = ys + ky;
+    if (y_c < 0 || y_c >= input_shape.y) continue;
+    for (int kx = 0; kx < kernel_size.x; ++kx) {
+      int x_c = xs + kx;
+      if (x_c < 0 || x_c >= input_shape.x) continue;
+      float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
      maximum = max(src, maximum);
    }
  }
-  write_imagef(output, (int2)(X, Y * output_shape.w + Z), maximum);
+  write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@@ -82,7 +82,7 @@ int PoolingOpenCLKernel::Init() {
 std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const {
  const size_t global_x = outputs_[0]->Height();
  const size_t global_y = outputs_[0]->Width();
-  const size_t global_z = UP_ROUND_DIV(outputs_[0]->Channel(), 4);
+  const size_t global_z = UP_DIV(outputs_[0]->Channel(), C4NUM);
  std::vector<size_t> global = {global_x, global_y, global_z};
  return global;
 }
@@ -90,13 +90,8 @@ std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const {
 int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
  size_t im_dst_x, im_dst_y;
-  if (inputs_[0]->GetFormat() == schema::Format_NHWC4) {
-    im_dst_x = outputs_[0]->Height();
-    im_dst_y = outputs_[0]->Width() * CO4;
-  } else {
-    im_dst_y = outputs_[0]->Width();
-    im_dst_x = outputs_[0]->Height() * CO4;
-  }
+  im_dst_x = outputs_[0]->Width() * CO4;
+  im_dst_y = outputs_[0]->Height();
 #ifdef ENABLE_FP16
  size_t img_dtype = CL_HALF_FLOAT;
 #else
@@ -117,7 +112,7 @@ int PoolingOpenCLKernel::Run() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();

  // attribute
-  int slices = UP_ROUND_DIV(outputs_[0]->Channel(), 4);
+  int slices = UP_DIV(outputs_[0]->Channel(), C4NUM);
  cl_int4 input_shape = {inputs_[0]->Height(), inputs_[0]->Width(), inputs_[0]->Channel(), slices};
  cl_int4 output_shape = {outputs_[0]->Height(), outputs_[0]->Width(), outputs_[0]->Channel(), slices};
  cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_};