update fetch kernel

57ad25ef · zhaojiaying01 · 4f07b27a · 57ad25ef · 57ad25ef
Showing with 43 addition and 16 deletion

src/operators/kernel/cl/cl_kernel/fetch_kernel.cl src/operators/kernel/cl/cl_kernel/fetch_kernel.cl +22 -3

src/operators/kernel/cl/fetch_kernel.cpp src/operators/kernel/cl/fetch_kernel.cpp +21 -13

未找到文件。
--- a/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
@@ -2,11 +2,11 @@
 __kernel void fetch(__private const int in_height,
                    __private const int in_width,
+                    __read_only image2d_t input,
+                    __global float* out,
                    __private const int size_ch,
                    __private const int size_block,
-                    __private const int size_batch,
+                    __private const int size_batch) {
-                    __read_only image2d_t input,
-                    __global float* out) {
  const int in_c = get_global_id(0);
  const int in_w = get_global_id(1);
  const int in_nh = get_global_id(2);
@@ -25,3 +25,22 @@ __kernel void fetch(__private const int in_height,
  out[index + size_ch * 2] = convert_float(in.z);
  out[index + size_ch * 3] = convert_float(in.w);
 }
+__kernel void fetch_2d(__private const int in_height,
+                       __private const int in_width,
+                       __read_only image2d_t input,
+                       __global float* out) {
+  const int in_w = get_global_id(1);
+  const int in_h = get_global_id(2);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  half4 in = read_imageh(input, sampler, (int2)(in_w, in_h));
+  const int index = (in_h * in_width + in_w) * 4;
+  out[index] = convert_float(in.x);
+  out[index + 1] = convert_float(in.y);
+  out[index + 2] = convert_float(in.z);
+  out[index + 3] = convert_float(in.w);
+}
--- a/src/operators/kernel/cl/fetch_kernel.cpp
+++ b/src/operators/kernel/cl/fetch_kernel.cpp
@@ -20,7 +20,11 @@ namespace operators {
 template <>
 bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
-  this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
+  if (param->InputX()->dims().size() <= 2) {
+    this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl");
+  } else {
+    this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
+  }
  auto *out = param->Out();
  out->mutable_data<float>();
  return true;
@@ -41,16 +45,15 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
    new_dims[4 - dim.size() + j] = dim[j];
  }
-  size_t N, C, in_height, in_width;
+  size_t C, in_height, in_width;
-  N = new_dims[0];
  C = new_dims[1];
  in_height = new_dims[2];
-  in_width = new_dims[3];
+  if (dim.size() <= 2) {
+    in_width = param.InputX()->ImageWidth();
-  int size_ch = in_height * in_width;
+  } else {
-  int size_block = size_ch * 4;
+    in_width = new_dims[3];
-  int size_batch = size_ch * C;
+  }
  CLTensor out_cl_tensor(this->cl_helper_.CLContext(),
                         this->cl_helper_.CLCommandQueue());
@@ -59,11 +62,16 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
  clSetKernelArg(kernel, 0, sizeof(int), &in_height);
  clSetKernelArg(kernel, 1, sizeof(int), &in_width);
-  clSetKernelArg(kernel, 2, sizeof(int), &size_ch);
+  clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
-  clSetKernelArg(kernel, 3, sizeof(int), &size_block);
+  clSetKernelArg(kernel, 3, sizeof(cl_mem), &outBuffer);
-  clSetKernelArg(kernel, 4, sizeof(int), &size_batch);
+  if (dim.size() > 2) {
-  clSetKernelArg(kernel, 5, sizeof(cl_mem), &input);
+    int size_ch = in_height * in_width;
-  clSetKernelArg(kernel, 6, sizeof(cl_mem), &outBuffer);
+    int size_block = size_ch * 4;
+    int size_batch = size_ch * C;
+    clSetKernelArg(kernel, 4, sizeof(int), &size_ch);
+    clSetKernelArg(kernel, 5, sizeof(int), &size_block);
+    clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
+  }
  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                         default_work_size.data(), NULL, 0, NULL, NULL);