!4727 [MS][LITE][GPU]softmax support Buffer output

Merge pull request !4727 from chenzupeng/master-lite

!4727 [MS][LITE][GPU]softmax support Buffer output
Merge pull request !4727 from chenzupeng/master-lite
b4b76b61 · mindspore-ci-bot · Gitee · 7e47cdc4 · 3bef0357 · b4b76b61
4 changed files
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl
@@ -48,3 +48,78 @@ __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t
    slices_count++;
  } while (slices_count < slices_x32);
 }
+
+// SoftMax1x1_BUF: softmax over the channel slices of a single image row, written to a plain buffer.
+//
+// input      - image whose texel (i, 0) holds channel slice i (one float4 per texel).
+// output     - destination buffer; full slice i is stored at output[i].
+// mask       - per-component weights applied to the last slice when summing; for y/z/w a component
+//              that is not > 0 is also skipped when storing (x is always stored).
+// slices     - number of float4 slices to process.
+// slices_x32 - not used by this kernel.
+//
+// NOTE(review): the reduction below uses 32 local slots and a stride of 32, so it presumably expects a
+// work-group of exactly 32 work-items in dimension 0 -- confirm against the host-side launch code.
+// NOTE(review): exp() is applied to the raw inputs without subtracting the maximum, so large inputs can
+// overflow to inf.
+__kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *output, const float4 mask, const int slices,
+                             const int slices_x32) {
+  int tid = get_local_id(0);
+  // Index of the final slice; kept signed so that slices <= 0 gives empty loops instead of wrapping around.
+  const int last = slices - 1;
+  float sum = 0.0f;
+  // Pass 1: each work-item accumulates exp() over every 32nd full slice, starting at its own id.
+  for (int i = tid; i < last; i += 32) {
+    float4 src = read_imagef(input, smp_none, (int2)(i, 0));
+    sum += dot((float4)(1.0f), exp(src));
+  }
+  // The final slice is added by the work-item whose id equals last % 32, weighted by mask.
+  if (last % 32 == tid) {
+    float4 src = read_imagef(input, smp_none, (int2)(last, 0));
+    sum += dot(mask, exp(src));
+  }
+
+  // 8 float4 == 32 floats: one partial-sum slot per work-item id.
+  __local float4 tmp[8];
+  __local float *tmpx1 = (__local float *)tmp;
+  tmpx1[tid] = sum;
+  barrier(CLK_LOCAL_MEM_FENCE);
+  if (tid == 0) {
+    // Work-item 0 folds the 32 partial sums and publishes the reciprocal of the total in slot 0.
+    sum = dot((float4)(1.0f), tmp[0]);
+    sum += dot((float4)(1.0f), tmp[1]);
+    sum += dot((float4)(1.0f), tmp[2]);
+    sum += dot((float4)(1.0f), tmp[3]);
+    sum += dot((float4)(1.0f), tmp[4]);
+    sum += dot((float4)(1.0f), tmp[5]);
+    sum += dot((float4)(1.0f), tmp[6]);
+    sum += dot((float4)(1.0f), tmp[7]);
+    tmpx1[0] = 1.0f / sum;
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+  sum = tmpx1[0];  // from here on, sum holds 1 / total
+  // Pass 2: recompute exp() per slice, scale it, and store full slices as float4.
+  for (int i = tid; i < last; i += 32) {
+    float4 result = read_imagef(input, smp_none, (int2)(i, 0));
+    result = exp(result) * sum;
+    output[i] = result;
+  }
+  if (last % 32 == tid) {
+    float4 result = read_imagef(input, smp_none, (int2)(last, 0));
+    result = exp(result) * sum;
+    // Store the final slice one float at a time so that masked-out y/z/w components are not written.
+    __global float4 *remain_ptr4 = output;
+    remain_ptr4 += last;
+    __global float *remain_ptr = (__global float *)remain_ptr4;
+    remain_ptr[0] = result.x;
+    if (mask.y > 0.f) {
+      remain_ptr[1] = result.y;
+    }
+    if (mask.z > 0.f) {
+      remain_ptr[2] = result.z;
+    }
+    if (mask.w > 0.f) {
+      remain_ptr[3] = result.w;
+    }
+  }
+}
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -62,7 +62,7 @@ int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
  int h = shapex[1];
  int w = shapex[2];
  int c = shapex[3];
-  im_dst_x = UP_DIV(w * c, C4NUM);
+  im_dst_x = w * UP_DIV(c, C4NUM);
  im_dst_y = h;
 #ifdef ENABLE_FP16
  size_t img_dtype = CL_HALF_FLOAT;

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
@@ -90,7 +90,8 @@ int SoftmaxOpenCLKernel::Init() {
  std::string program_name = "SoftMax";
  std::string source = softmax_source_fp32;
  runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
-
+  // The framework does not set this parameter yet, so just use the default.
+  parameter_->axis_ = 1;
  if (in_tensors_[0]->shape().size() == 4 && parameter_->axis_ == 3) {
    // support 4d tensor
    onexone_flag_ = false;
@@ -106,7 +107,10 @@ int SoftmaxOpenCLKernel::Init() {
 #ifdef PROGRAM_WITH_IL
  runtime_->CreateKernelFromIL(kernel_(), kernel_name);
 #else
-  if (mem_type_ == MEM_TYPE::BUF) {
+  if (!is_image_out_) {
+    out_mem_type_ = OpenCLMemType::BUF;
+  }
+  if (out_mem_type_ == OpenCLMemType::BUF) {
    kernel_name += "_BUF";
    program_name += "_BUF";
  } else {
@@ -119,6 +123,10 @@ int SoftmaxOpenCLKernel::Init() {
 #endif
  ori_format_ = out_tensors_[0]->GetFormat();
  out_tensors_[0]->SetFormat(schema::Format_NHWC4);
+  if (!is_image_out_) {
+    ori_format_ = schema::Format_NC;
+    out_tensors_[0]->SetFormat(schema::Format_NC);
+  }
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return lite::RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h
@@ -41,17 +41,17 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
  int InitGlobalSize();
  int SetWorkGroupSize1x1();
  int SetWorkGroupSize();
-    std::vector<float> GetMaskForLastChannel(int channels);
+  std::vector<float> GetMaskForLastChannel(int channels);

 private:
  cl::Kernel kernel_;
  SoftmaxParameter *parameter_;
  lite::opencl::OpenCLRuntime *runtime_;
-  enum class MEM_TYPE { BUF, IMG } mem_type_{MEM_TYPE::IMG};

  bool onexone_flag_{false};
  std::vector<size_t> local_size_;
  std::vector<size_t> global_size_;
+  bool is_image_out_{false};
 };

 }  // namespace mindspore::kernel