!5635 [MS][LITE][GPU]fix bug in matmul and pooling

Merge pull request !5635 from chenzupeng/master-lite

!5635 [MS][LITE][GPU]fix bug in matmul and pooling
Merge pull request !5635 from chenzupeng/master-lite
8e442ce7 · mindspore-ci-bot · Gitee · 03093778 · 96744911 · 8e442ce7
6 changed files
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
-__kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+__kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
@@ -10,10 +13,10 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
    return;
  }

-  float4 r = (float4)(0.0f);
-  float window_size = 0.0f;
-  int xs = X * stride.x + padding.x;
-  int ys = Y * stride.y + padding.y;
+  FLT4 r = (FLT4)(0.0f);
+  FLT window_size = 0.0f;
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;

  for (int kx = 0; kx < kernel_size.x; ++kx) {
    int x_c = xs + kx;
@@ -21,11 +24,11 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
    for (int ky = 0; ky < kernel_size.y; ++ky) {
      int y_c = ys + ky;
      bool outside = outside_x || y_c < 0 || y_c >= input_shape.y;
-      r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (float4)(0.0f);
+      r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (FLT4)(0.0f);
      window_size += !outside ? 1.0f : 0.0f;
    }
  }
-  float4 result = convert_float4(r / window_size);
+  FLT4 result = TO_FLT4(r / window_size);
  output[(output_shape.y * X + Y) * output_shape.w + Z] = result;
 }

@@ -43,10 +46,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
    return;
  }

-  float4 r = (float4)(0.0f);
-  float window_size = 0.0f;
-  int xs = X * stride.x + padding.x;
-  int ys = Y * stride.y + padding.y;
+  FLT4 r = (FLT4)(0.0f);
+  FLT window_size = 0.0f;
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;

  for (int ky = 0; ky < kernel_size.y; ++ky) {
    int y_c = ys + ky;
@@ -54,10 +57,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
    for (int kx = 0; kx < kernel_size.x; ++kx) {
      int x_c = xs + kx;
      bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
-      r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
+      r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (float4)(0.0f);
      window_size += !outside ? 1.0f : 0.0f;
    }
  }
-  float4 result = convert_float4(r / window_size);
-  write_imagef(output, (int2)(Y * output_shape.w + Z, X), result);
+  FLT4 result = TO_FLT4(r / window_size);
+  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
-__kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+__kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
@@ -10,9 +13,9 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
    return;
  }

-  float4 maximum = (float4)(-10000.0f);
-  int xs = X * stride.x + padding.x;
-  int ys = Y * stride.y + padding.y;
+  FLT4 maximum = (FLT4)(-10000.0f);
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;

  for (int kx = 0; kx < kernel_size.x; ++kx) {
    int x_c = xs + kx;
@@ -24,7 +27,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
      if (y_c < 0 || y_c >= input_shape.y) {
        continue;
      }
-      float4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z];
+      FLT4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z];
      maximum = max(src, maximum);
    }
  }
@@ -45,18 +48,18 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d
    return;
  }

-  float4 maximum = (float4)(-10000.0f);
-  int xs = X * stride.x + padding.x;
-  int ys = Y * stride.y + padding.y;
+  FLT4 maximum = (FLT4)(-10000.0f);
+  int xs = X * stride.x - padding.x;
+  int ys = Y * stride.y - padding.y;
  for (int ky = 0; ky < kernel_size.y; ++ky) {
    int y_c = ys + ky;
    if (y_c < 0 || y_c >= input_shape.y) continue;
    for (int kx = 0; kx < kernel_size.x; ++kx) {
      int x_c = xs + kx;
      if (x_c < 0 || x_c >= input_shape.x) continue;
-      float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
+      FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
      maximum = max(src, maximum);
    }
  }
-  write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum);
+  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -58,14 +58,13 @@ int MatMulOpenCLKernel::Init() {
  sizeCO = {co, UP_DIV(co, C4NUM)};
  PadWeight();
  in_ori_format_ = in_tensors_[0]->GetFormat();
-  in_tensors_[0]->SetFormat(schema::Format_NHWC4);
  out_ori_format_ = out_tensors_[0]->GetFormat();
-  out_tensors_[0]->SetFormat(schema::Format_NHWC4);
  if (out_tensors_[0]->shape().size() == 2) {
-    out_ori_format_ = schema::Format_NC;
    out_tensors_[0]->SetFormat(schema::Format_NC4);
-    in_ori_format_ = schema::Format_NC;
    in_tensors_[0]->SetFormat(schema::Format_NC4);
+  } else {
+    in_tensors_[0]->SetFormat(schema::Format_NHWC4);
+    out_tensors_[0]->SetFormat(schema::Format_NHWC4);
  }
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return RET_OK;

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@@ -60,7 +60,7 @@ int PoolingOpenCLKernel::Init() {
    return RET_INVALID_OP_NAME;
  }
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-
+  enable_fp16_ = ocl_runtime->GetFp16Enable();
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
@@ -96,11 +96,10 @@ int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
  size_t im_dst_x, im_dst_y;
  im_dst_x = out_tensors_[0]->Width() * CO4;
  im_dst_y = out_tensors_[0]->Height();
-#ifdef ENABLE_FP16
-  size_t img_dtype = CL_HALF_FLOAT;
-#else
  size_t img_dtype = CL_FLOAT;
-#endif
+  if (enable_fp16_) {
+    img_dtype = CL_HALF_FLOAT;
+  }
  img_size->clear();
  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
  *img_size = vec;
@@ -161,5 +160,6 @@ kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector<lite::tensor:
 }

 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator)
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
@@ -44,6 +44,7 @@ class PoolingOpenCLKernel : public OpenCLKernel {
  std::vector<size_t> InitGlobalSize() const;
  PoolingParameter *parameter_;
  cl::Kernel kernel_;
+  bool enable_fp16_{false};
 };

 }  // namespace mindspore::kernel

--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -50,6 +50,7 @@ class OpenCLKernel : public LiteKernel {
  }
  OpenCLMemType GetMemType() { return out_mem_type_; }
  void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; }
+  void SetFormatType(schema::Format format_type) { op_format_ = format_type; }
  schema::Format GetInOriFormat() { return in_ori_format_; }
  schema::Format GetOutOriFormat() { return out_ori_format_; }

@@ -57,6 +58,7 @@ class OpenCLKernel : public LiteKernel {
  OpenCLMemType out_mem_type_{OpenCLMemType::IMG};
  schema::Format in_ori_format_{schema::Format_NHWC};
  schema::Format out_ori_format_{schema::Format_NHWC4};
+  schema::Format op_format_{schema::Format_NC4HW4};
 };
 }  // namespace mindspore::kernel