magicwindyyd / mindspore (fork of MindSpore / mindspore)
Commit a6dc9490
Authored Aug 08, 2020 by mindspore-ci-bot; committed via Gitee on Aug 08, 2020
!4138 change buffer to image2d for arithmetic
Merge pull request !4138 from liuchao/arith_image
Parents: 729d847d 71dbdddb
Showing 5 changed files with 152 additions and 86 deletions (+152 −86)
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl    +3   −20
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl   +59  −9
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc            +75  −40
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h             +5   −2
mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc     +10  −15
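For orientation: this change moves the element-wise arithmetic kernels from __global float* buffers to image2d_t objects. An NHWC4 tensor is packed into a 2D image whose texels are float4 (four channels per texel); as the GetImageSize added below computes, the image width is W * UP_DIV(C, 4) and the height is N * H. A minimal standalone sketch of that sizing rule (plain C++, not the MindSpore helper itself; the TensorDesc struct is hypothetical, for illustration only):

#include <cstddef>

// Hypothetical NHWC descriptor, only for this illustration.
struct TensorDesc {
  int n, h, w, c;
};

// Sizing rule assumed by the image2d kernels in this commit:
//   width  = W * ceil(C / 4)   (one float4 texel per group of 4 channels)
//   height = N * H
inline void Image2dSizeForNHWC4(const TensorDesc &t, size_t *width, size_t *height) {
  const int c4 = (t.c + 3) / 4;  // UP_DIV(C, C4NUM) with C4NUM == 4
  *width = static_cast<size_t>(t.w) * c4;
  *height = static_cast<size_t>(t.n) * t.h;
}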
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
@@ -26,26 +26,9 @@ __kernel void ElementDiv(__global float *input_a, __global float *input_b, __global float *output,
  output[idx] = input_a[idx] * input_b[idx];
}

-__kernel void BoardcastAdd(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
+__kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output,
+                             const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
-  output[idx] = input_a[idx] + input_b;
-}
-
-__kernel void BoardcastSub(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] - input_b;
-}
-
-__kernel void BoardcastMul(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] * input_b;
-}
-
-__kernel void BoardcastDiv(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] * input_b;
+  output[idx] = weight * input_a[idx] + bias;
}
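The four scalar-broadcast kernels above are folded into the single BoardcastArith kernel, which computes output = weight * input + bias, so each scalar op reduces to a choice of weight and bias on the host (see the switch added to ArithmeticOpenCLKernel::Run() further down). A hedged sketch of how that folding works algebraically, in plain C++ (illustrative only, not the MindSpore code):

#include <utility>

// Algebraic mapping of "x <op> v" (v a scalar) onto weight * x + bias.
// Sketch of the math only; the commit's Run() sets its own weight_/bias_ members.
inline std::pair<float, float> FoldScalarOp(char op, float v) {
  switch (op) {
    case '*': return {v, 0.0f};          // x * v  ==  v * x + 0
    case '+': return {1.0f, v};          // x + v  ==  1 * x + v
    case '-': return {1.0f, -v};         // x - v  ==  1 * x - v
    case '/': return {1.0f / v, 0.0f};   // x / v  ==  (1/v) * x + 0
    default:  return {1.0f, 0.0f};       // identity
  }
}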
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

-__kernel void ElementAdd(__read_only image2d_t *input_a, __read_only image2d_t *input_b, __write_only image2d_t *output,
-                         const int4 output_shape) {
+__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
+                         const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) return;
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
-  if (idx >= n) return;
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y * output_shape.w + Z));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y * output_shape.w + Z));
-  src = a + b;
-  write_imagef(output, (int2)(0, 0), src);
+  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
+  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
+  write_imagef(output, (int2)(X, Y), a + b);
}
__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), a - b);
}
__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), a * b);
}
__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), a / b);
}
__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
  write_imagef(output, (int2)(X, Y), weight * a + bias);
}
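Because these kernels now read and write image2d_t objects, the host side has to allocate 2D images rather than flat buffers. A hedged sketch using the raw OpenCL C API (the MindSpore allocator wraps this; names are illustrative), with CL_RGBA/CL_FLOAT matching the float4 texels and the CL_FLOAT dtype reported by GetImageSize in arithmetic.cc:

#include <CL/cl.h>

// Allocate a width x height RGBA float image (one float4 texel per pixel).
// Assumes a valid cl_context `ctx`; error handling is omitted for brevity.
cl_mem MakeImage2D(cl_context ctx, size_t width, size_t height) {
  cl_image_format fmt = {CL_RGBA, CL_FLOAT};
  cl_image_desc desc = {};
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = width;
  desc.image_height = height;
  cl_int err = CL_SUCCESS;
  return clCreateImage(ctx, CL_MEM_READ_WRITE, &fmt, &desc, nullptr, &err);
}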
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -40,10 +40,10 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
}

void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
-  global_size_ = InitGlobalSize();
-  int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())());
-  local_size_ = GetCommonLocalSize(global_size_, max_work_group_size);
-  global_size_ = GetCommonGlobalSize(local_size_, global_size_);
+  size_t H = outputs_[0]->Batch() * outputs_[0]->Height();
+  size_t W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
+  local_size_ = {16, 16};
+  global_size_ = {H, W};
}

void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
@@ -51,63 +51,75 @@ void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
  global_size_ = {element_num};
}

int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
  int H = outputs_[0]->Batch() * outputs_[0]->Height();
  int W = outputs_[0]->Width() * CO4;
  size_t im_dst_x, im_dst_y;
  if (inputs_[0]->GetFormat() == schema::Format_NHWC4) {
    im_dst_x = W;
    im_dst_y = H;
  } else {
    im_dst_y = outputs_[0]->Batch() * outputs_[0]->Height() * CO4;
    im_dst_x = outputs_[0]->Width();
  }
#ifdef ENABLE_FP16
  size_t img_dtype = CL_HALF_FLOAT;
#else
  size_t img_dtype = CL_FLOAT;
#endif
  img_size->clear();
  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
  *img_size = vec;
  return 0;
}

int ArithmeticOpenCLKernel::Init() {
  runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
-  std::string element_name;
-  std::string boardcast_name;
  std::string kernel_name;
+  if (inputs_[1]->TensorType() == schema::NodeType_ValueNode && inputs_[1]->Data() != nullptr) {
+    element_flag_ = false;
+    kernel_name = "BoardcastArith";
+  } else {
+    element_flag_ = true;
+    switch (opParameter->type_) {
+      case PrimitiveType_Mul:
+        kernel_name = "ElementMul";
+        break;
+      case PrimitiveType_Add:
+        kernel_name = "ElementAdd";
+        break;
+      case PrimitiveType_Sub:
+        kernel_name = "ElementSub";
+        break;
+      case PrimitiveType_Div:
+        kernel_name = "ElementDiv";
+        break;
+      default:
+        MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
+        break;
+    }
+  }
-  switch (opParameter->type_) {
-    case PrimitiveType_Mul:
-      element_name = "ElementMul";
-      boardcast_name = "BoardcastMul";
-      break;
-    case PrimitiveType_Add:
-      element_name = "ElementAdd";
-      boardcast_name = "BoardcastAdd";
-      break;
-    case PrimitiveType_Sub:
-      element_name = "ElementSub";
-      boardcast_name = "BoardcastSub";
-      break;
-    case PrimitiveType_Div:
-      element_name = "ElementDiv";
-      boardcast_name = "BoardcastDiv";
-      break;
-    default:
-      MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
-      break;
-  }
#ifdef PROGRAM_WITH_IL
  runtime_->CreateKernelFromIL(kernel_(), kernel_name);
#else
  std::string program_name = "Arithmetic";
  std::set<std::string> build_options;
-  std::string source = arithmetic_buffer_source_fp32;
+  std::string source = arithmetic_image2d_source_fp32;
  runtime_->LoadSource(program_name, source);
-  if (element_flag_) {
-    runtime_->BuildKernel(kernel_, program_name, element_name, build_options);
-    MS_LOG(DEBUG) << element_name << " Init Done!";
-  } else {
-    runtime_->BuildKernel(kernel_, program_name, boardcast_name, build_options);
-    MS_LOG(DEBUG) << boardcast_name << " Init Done!";
-  }
+  runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
  outputs_[0]->SetFormat(schema::Format_NHWC4);
  Image2dGetWorkGroupSize();
  return 0;
}

int ArithmeticOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->Name() << " Running!";
  auto runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  BufferGetWorkGroupSize();
  int arg_idx = 0;
  uint32_t element_num = outputs_[0]->ElementsC4Num();
@@ -116,11 +128,34 @@ int ArithmeticOpenCLKernel::Run() {
  if (element_flag_) {
    runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[1]->Data());
  } else {
-    runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float *>(inputs_[1]->Data())[0]);
+    float value = static_cast<float *>(inputs_[1]->Data())[0];
+    switch (opParameter->type_) {
+      case PrimitiveType_Mul:
+        weight_ = value;
+        break;
+      case PrimitiveType_Add:
+        bias_ = value;
+        break;
+      case PrimitiveType_Sub:
+        bias_ = -1 * value;
+        break;
+      case PrimitiveType_Div:
+        bias_ = 1 / value;
+        break;
+      default:
+        MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
+        break;
+    }
+    runtime_->SetKernelArg(kernel_, arg_idx++, weight_);
+    runtime_->SetKernelArg(kernel_, arg_idx++, bias_);
+    MS_LOG(DEBUG) << arg_idx - 2 << " " << weight_;
+    MS_LOG(DEBUG) << arg_idx - 1 << " " << bias_;
  }
  runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data());
-  runtime_->SetKernelArg(kernel_, arg_idx++, element_num);
+  int H = outputs_[0]->Batch() * outputs_[0]->Height();
+  int W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
+  cl_int2 output_shape{H, W};
+  runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
  runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
  return 0;
}
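For reference, the image2d ElementAdd kernel takes (input_a, input_b, output, output_shape), and Run() above packs output_shape as {H, W} with H = N*H and W = W*C4 while launching an {H, W} global range. A hedged sketch of an equivalent dispatch through the raw OpenCL C API (not the OpenCLRuntime wrapper; variable names are illustrative):

#include <CL/cl.h>

// Assumes a valid cl_kernel built from the image2d "ElementAdd", a
// cl_command_queue `queue`, cl_mem images `img_a`, `img_b`, `img_out`,
// and the shape values H = N*H, W = W*C4 as computed in this commit.
void DispatchElementAdd(cl_kernel kernel, cl_command_queue queue,
                        cl_mem img_a, cl_mem img_b, cl_mem img_out,
                        cl_int H, cl_int W) {
  cl_int2 output_shape = {{H, W}};  // mirrors cl_int2 output_shape{H, W} in Run()
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &img_a);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &img_b);
  clSetKernelArg(kernel, 2, sizeof(cl_mem), &img_out);
  clSetKernelArg(kernel, 3, sizeof(cl_int2), &output_shape);
  size_t global[2] = {static_cast<size_t>(H), static_cast<size_t>(W)};
  clEnqueueNDRangeKernel(queue, kernel, 2, nullptr, global, nullptr, 0, nullptr, nullptr);
}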
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
@@ -24,15 +24,16 @@
namespace mindspore::kernel {

-class ArithmeticOpenCLKernel : public ArithmeticCPUKernel {
+class ArithmeticOpenCLKernel : public OpenCLKernel {
 public:
  explicit ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                  const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx)
-      : ArithmeticCPUKernel(parameter, inputs, outputs, ctx) {}
+      : OpenCLKernel(parameter, inputs, outputs) {}
  ~ArithmeticOpenCLKernel() override {};

  int Init() override;
  int Run() override;
+  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;

 private:
  std::vector<size_t> InitGlobalSize() const;
@@ -42,6 +43,8 @@ class ArithmeticOpenCLKernel : public ArithmeticCPUKernel {
  cl::Kernel kernel_;
  lite::opencl::OpenCLRuntime *runtime_;
  bool element_flag_{true};
  float weight_{1.f};
  float bias_{.0f};
  std::vector<size_t> local_size_;
  std::vector<size_t> global_size_;
mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
@@ -61,13 +61,12 @@ void LogData(void *data, const int size, const std::string prefix) {
}

void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) {
  std::cout << "TestCase" << std::endl;
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto allocator = ocl_runtime->GetAllocator();
  bool is_bias_add = shape_b.empty();
  auto tensorType = schema::NodeType_ValueNode;
  std::cout << "TestCase tensor" << std::endl;
  lite::tensor::Tensor *tensor_a =
    new lite::tensor::Tensor(kNumberTypeFloat32, shape_a, schema::Format_NHWC4, tensorType);
  lite::tensor::Tensor *tensor_b =
@@ -77,7 +76,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
  int64_t element_num = tensor_a->ElementsC4Num();
  int64_t element_num_b = is_bias_add ? 1 : tensor_b->ElementsC4Num();
  std::cout << "TestCase new data" << std::endl;
  float *data_a = new float[element_num];
  float *data_b = new float[element_num_b];
  float *data_c_cpu = new float[element_num];
@@ -87,14 +85,12 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
  InitData(data_b, element_num_b);
  memset(data_c_ocl, 0, sizeof(float) * element_num);
  std::cout << "TestCase run cpu" << std::endl;
  if (is_bias_add) {
    BoardcaseAdd(data_a, static_cast<float *>(data_b)[0], data_c_cpu, element_num);
  } else {
    ElementAdd(data_a, data_b, data_c_cpu, element_num);
  }
  std::cout << "TestCase set data" << std::endl;
  std::vector<lite::tensor::Tensor *> inputs = {tensor_a};
  if (!is_bias_add) {
    inputs.push_back(tensor_b);
@@ -114,9 +110,10 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
    new kernel::ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(param), arithmetic_inputs, outputs, &ctx);
  arith_kernel->Init();
  tensor_a->MallocData(allocator);
  tensor_b->MallocData(allocator);
  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
  auto *kernel = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
  std::cout << "TestCase Init" << std::endl;
  kernel->Init();
  memcpy(inputs[0]->Data(), data_a, sizeof(float) * element_num);
@@ -124,7 +121,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
    memcpy(inputs[1]->Data(), data_b, sizeof(float) * element_num_b);
  }
  std::cout << "TestCase Run" << std::endl;
  kernel->Run();
  memcpy(data_c_ocl, outputs[0]->Data(), sizeof(float) * element_num);
@@ -136,7 +132,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
  LogData(outputs[0]->Data(), 10, "OpenCL compute : ");
  bool cmp = DataCompare(data_c_cpu, data_c_ocl, element_num);
  MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!");
  std::cout << "TestCase End" << std::endl;

  // free
  delete[] data_a;
@@ -162,15 +157,15 @@ class TestArithmeticOpenCL : public mindspore::Common {
};

TEST_F(TestArithmeticOpenCL, AddElementwiseTest) {
-  const std::vector<int> &shape_a = {1, 32, 32, 4};
-  const std::vector<int> &shape_b = {1, 32, 32, 4};
+  const std::vector<int> &shape_a = {1, 1024, 1024, 4};
+  const std::vector<int> &shape_b = {1, 1024, 1024, 4};
  TestCase(shape_a, shape_b);
}

-// TEST_F(TestOpenCLKernel, AddBoardcaseTest) {
-//   const std::vector<int> &shape_a = {1, 4, 128, 128};
-//   const std::vector<int> &shape_b = {};
-//   TestCase(shape_a, shape_b);
-// }
+TEST_F(TestArithmeticOpenCL, AddBoardcaseTest) {
+  const std::vector<int> &shape_a = {1, 128, 128, 4};
+  const std::vector<int> &shape_b = {};
+  TestCase(shape_a, shape_b);
+}
}  // namespace mindspore
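The CPU reference path in TestCase calls ElementAdd and BoardcaseAdd helpers whose definitions sit in the elided part of this file; a hedged sketch of their presumed form (an assumption, not the actual test code):

#include <cstdint>

// Presumed CPU references used by TestCase above (illustrative only).
void ElementAdd(const float *a, const float *b, float *out, int64_t n) {
  for (int64_t i = 0; i < n; ++i) out[i] = a[i] + b[i];
}

void BoardcaseAdd(const float *a, float b, float *out, int64_t n) {
  for (int64_t i = 0; i < n; ++i) out[i] = a[i] + b;
}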