add some function

2747503b · yangfei · 640a1ab5 · 408407f4 · 2747503b · 2747503b
9 changed file
--- a/src/framework/cl/cl_engine.h
+++ b/src/framework/cl/cl_engine.h
@@ -40,8 +40,8 @@ class CLEngine {
    return std::move(context_ptr);
  }

-  std::unique_ptr<_cl_command_queue, CLCommQueueDeleter>
-  CreateClCommandQueue(cl_context context) {
+  std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue(
+      cl_context context) {
    cl_int status;
    cl_command_queue queue =
        clCreateCommandQueue(context, devices_[0], 0, &status);

--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -182,28 +182,25 @@ class CLImage {
    DLOG << " image width: " << width;
    DLOG << " image height: " << height;

-    cl_image_format cf = {
-      .image_channel_order = CL_RGBA,
-      .image_channel_data_type = CL_HALF_FLOAT
-    };
+    cl_image_format cf = {.image_channel_order = CL_RGBA,
+                          .image_channel_data_type = CL_HALF_FLOAT};
    cl_image_desc cid = {
-      .image_type = CL_MEM_OBJECT_IMAGE2D,
-      .image_width = width,
-      .image_height = height,
-      .image_depth = 1,
-      .image_array_size = 1,
-      .image_row_pitch = 0,
-      .image_slice_pitch = 0,
-      .num_mip_levels = 0,
-      .num_samples = 0,
-      // .buffer = nullptr
+        .image_type = CL_MEM_OBJECT_IMAGE2D,
+        .image_width = width,
+        .image_height = height,
+        .image_depth = 1,
+        .image_array_size = 1,
+        .image_row_pitch = 0,
+        .image_slice_pitch = 0,
+        .num_mip_levels = 0,
+        .num_samples = 0,
+        // .buffer = nullptr
    };
    cid.buffer = nullptr;
    cl_image_ = clCreateImage(
-        context,
-        CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0),
-        &cf,     // const cl_image_format *image_format
-        &cid,    // const cl_image_desc *image_desc
+        context, CL_MEM_READ_WRITE | (imageData ? CL_MEM_COPY_HOST_PTR : 0),
+        &cf,   // const cl_image_format *image_format
+        &cid,  // const cl_image_desc *image_desc
        reinterpret_cast<void *>(imageData.get()),  // void *host_ptr
        &err);


--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -37,7 +37,7 @@ limitations under the License. */
 #include "framework/cl/cl_image.h"
 #endif

-int debug_to = 2;
+int debug_to = 4;

 namespace paddle_mobile {
 namespace framework {

--- a/src/operators/kernel/cl/batchnorm_kernel.cpp
+++ b/src/operators/kernel/cl/batchnorm_kernel.cpp
@@ -47,15 +47,20 @@ bool BatchNormKernel<GPU_CL, float>::Init(BatchNormParam<GPU_CL> *param) {
    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
  }

-  delete[](new_scale_ptr);
-  delete[](new_bias_ptr);
-
  framework::CLImage *new_scale = new framework::CLImage();
+  new_scale->SetTensorData(new_scale_ptr, variance->dims());
+  new_scale->InitCLImage(this->cl_helper_.CLContext());
+
  framework::CLImage *new_bias = new framework::CLImage();
+  new_bias->SetTensorData(new_bias_ptr, variance->dims());
+  new_bias->InitCLImage(this->cl_helper_.CLContext());

  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);

+  delete[](new_scale_ptr);
+  delete[](new_bias_ptr);
+
  return true;
 }


--- a/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/batchnorm_kernel.cl
@@ -3,8 +3,8 @@
 __kernel void batchnorm(__private const int out_height,
                        __private const int out_width,
                        __read_only image2d_t input,
-                        __read_only image2d_t new_scale,
-                        __read_only image2d_t new_bias,
+                        __read_only image2d_t new_scale_image,
+                        __read_only image2d_t new_bias_image,
                        __write_only image2d_t output) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
@@ -13,12 +13,12 @@ __kernel void batchnorm(__private const int out_height,
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-  half4 new_scale = read_imageh(bn_scale, sampler, (int2)(out_c, 0));
-  half4 new_bias = read_imageh(bn_bias, sampler, (int2)(out_c, 0));
+  half4 new_scale = read_imageh(new_scale_image, sampler, (int2)(out_c, 0));
+  half4 new_bias = read_imageh(new_bias_image, sampler, (int2)(out_c, 0));

  int pos_x = mad24(out_c, out_width, out_w);
  half4 in = read_imageh(input, sampler, (int2)(pos_x, out_nh));
  half4 out = mad(in, new_scale, new_bias);

-  write_imageh(output, (int2)(pos_x, nh), out);
+  write_imageh(output, (int2)(pos_x, out_nh), out);
 }
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
@@ -12,10 +12,139 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#pragma OPENCL EXTENSION cl_khr_fp16 : enable

-__kernel void conv_3x3() {
+__kernel void conv_3x3(__private const int global_size_dim0,
+                                              __private const int global_size_dim1,
+                                              __private const int global_size_dim2,
+                                              __read_only image2d_t input_image,
+                                              __read_only image2d_t filter,
+#ifdef BIASE
+                                              __read_only image2d_t bias,
+#endif

+#ifdef BATCH_NORM
+                                              __read_only image2d_t new_scale,
+                                              __read_only image2d_t new_biase,
+#endif

+                                              __write_only image2d_t output_image,
+                                              __private const int stride,
+                                              __private const int offset,
+                                              __private const int input_c,
+                                              __private const int dilation,
+                                              __private const int input_width,/* of one block */
+                                              __private const int input_height,/* of one block */
+                                              __private const int output_width,
+                                              __private const int output_height) {
+
+    const int out_c = get_global_id(0);
+    const int out_w = get_global_id(1);
+    const int out_nh = get_global_id(2);
+
+    int2 stride_xy;
+    stride_xy.x = stride;
+    stride_xy.y = stride;
+
+    int2 ouput_pos_in_one_block;
+    ouput_pos_in_one_block.x = out_w;
+    ouput_pos_in_one_block.y = out_nh;
+
+    int2 in_pos_in_one_block;
+    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
+    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
+
+ #ifdef BIASE
+    half4 output = read_imageh(bias, sampler, int2(out_c, 0));
+#else
+    half4 output = 0.0;
+#endif
+
+   half4 input[9];
+
+   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                              CLK_ADDRESS_CLAMP         |
+                              CLK_FILTER_NEAREST;
+
+   for (int i = 0; i < input_c; ++i) {
+        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
+        input[0] = select(read_imageh(input_image, sampler,
+                            (int2)(pos_in.x - dilation, pos_in.y - dilation)),
+                            (half4)(0.0),
+                            (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
+
+        input[1] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y - dilation)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height));
+
+        input[2] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y - dilation)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
+
+        input[3] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x - dilation, pos_in.y)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height));
+
+        input[4] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height));
+
+        input[5] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height));
+
+        input[6] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x - dilation, pos_in.y + dilation)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+
+        input[7] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x, pos_in.y + dilation)),
+                          (half4)(0.0),
+                          (ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+
+        input[8] = select(read_imageh(input_image, sampler,
+                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
+                          (half4)(0.0),
+                          (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+
+        for (int j = 0; j < 9; ++j) {
+            int2 fuck;
+            fuck.x = i * 3 + j % 3;
+            fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
+            half4 weight_x = read_imageh(filter, sampler, fuck);
+            output.x += dot(input[j], weight_x);
+
+            fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
+            half4 weight_y = read_imageh(filter, sampler, fuck);
+            output.y += dot(input[j], weight_y);
+
+            fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
+            half4 weight_z = read_imageh(filter, sampler, fuck);
+            output.z += dot(input[j], weight_z);
+
+            fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
+            half4 weight_w = read_imageh(filter, sampler, fuck);
+            output.w += dot(input[j], weight_w);
+        }
+    }
+
+#ifdef BATCH_NORM
+    output = output * read_imageh(new_scale, sampler, int2(out_c, 0)) + read_imageh(new_biase, sampler, int2(out_c, 0))
+#endif
+
+#ifdef RELU
+    output = activation(output);
+#endif
+
+    write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
 }


+
+
--- a/src/operators/kernel/cl/conv_kernel.cpp
+++ b/src/operators/kernel/cl/conv_kernel.cpp
@@ -78,7 +78,7 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {

  DLOG << " get Filter ";

-  auto output = param.Output();
+  auto output = param.Output()->GetCLImage();

  DLOG << " get Output ";

@@ -89,45 +89,54 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
  int input_width = param.Input()->WidthOfOneBlock();
  int input_height = param.Input()->HeightOfOneBlock();

+  int output_width = param.Output()->WidthOfOneBlock();
+  int output_height = param.Output()->HeightOfOneBlock();
+
  cl_int status;

  DLOG << " begin set kernel arg ";

-//  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
-//  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
-//  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
-//  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
-//  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
-//  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
-//  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
-//  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
-//  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
-//  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
-//  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
-//  CL_CHECK_ERRORS(status);
-//
-//  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
-//  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+
+  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);

  DLOG << " end set kernel arg ";

@@ -138,7 +147,6 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
                             default_work_size.data(), NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
  DLOG << " end enqueue ";
-
 }

 template class ConvKernel<GPU_CL, float>;

--- a/src/operators/kernel/cl/feed_kernel.cpp
+++ b/src/operators/kernel/cl/feed_kernel.cpp
@@ -61,7 +61,11 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
  size_t region[3] = {height, width, 1};
  clEnqueueReadImage(commandQueue, cl_image, CL_TRUE, origin, region, 0, 0, out,
                     0, NULL, NULL);
+<<<<<<< HEAD
  for (int i = 0; i < numel; i++) DLOG << Half2Float(out[i])<<","<<i;
+=======
+//  for (int i = 0; i < numel; i++) DLOG << Half2Float(out[i]);
+>>>>>>> 289b739de8517c21872107c16790b9cb2e7042d7
 }

 template class FeedKernel<GPU_CL, float>;

--- a/tools/android-debug-script/push2android.sh
+++ b/tools/android-debug-script/push2android.sh
 #!/usr/bin/env sh

 push_fn () {
+
+
+cp ../../src/operators/kernel/cl/cl_kernel/*  ../../build/release/arm-v7a/build/cl_kernel/
+
+
 MODELS_PATH="../../test/models/*"
 MODELS_SRC="../../test/models"
 IMAGE_PATH="../../test/images/*"