Merge pull request #1138 from codeWorm2015/opencl

update conv kernel

Merge pull request #1138 from codeWorm2015/opencl
update conv kernel
661f49a3 · Ray Liu · GitHub · c7c428b5 · 9bcb44d6 · 661f49a3
9 changed file
--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -257,16 +257,21 @@ class CLImage {
      float *p = tensor_data;
      size_t i0 = 0;
      for (int n = 0; n < N; n++) {
-        for (int c = 0; c < C; c++) {
+        for (int c = 0; c < c_block_ * 4; c++) {
          size_t i1 = i0 + (c / 4) * W;
          for (int h = 0; h < H; h++) {
            size_t i2 = (i1 << 2) + c % 4;
            for (int w = 0; w < W; w++) {
-              // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + (c
+              if (c < C) {
-              // % 4);
+                // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
-              imageData[i2] = Float2Half(*p);
+                // (c % 4);
-              i2 += 4;
+                imageData[i2] = Float2Half(*p);
-              p++;
+                i2 += 4;
+                p++;
+              } else {
+                imageData[i2] = 0.0;
+                i2 += 4;
+              }
            }
            i1 += width;
          }

--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -37,7 +37,7 @@ limitations under the License. */
 #include "framework/cl/cl_image.h"
 #endif
-int debug_to = 3;
+int debug_to = 2;
 namespace paddle_mobile {
 namespace framework {

--- a/src/operators/feed_op.cpp
+++ b/src/operators/feed_op.cpp
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "feed_op.h"
+#include "operators/feed_op.h"
 namespace paddle_mobile {
 namespace operators {
@@ -22,6 +23,7 @@ void FeedOp<DeviceType, T>::InferShape() const {
  out_dims[0] = this->param_.BatchSize();
  this->param_.Out()->Resize(out_dims);
 }
 }  // namespace operators
 }  // namespace paddle_mobile

--- a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
@@ -65,6 +65,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
    const int out_w = get_global_id(1);
    const int out_nh = get_global_id(2);
+    if (out_c >= global_size_dim0 ||
+        out_w >= global_size_dim1 ||
+        out_nh >= global_size_dim2) {
+        printf(" out of range ");
+        return;
+    }
    int2 stride_xy;
    stride_xy.x = stride;
    stride_xy.y = stride;
@@ -135,24 +143,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
        input[8] = select(read_imageh(input_image, sampler,
                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
                          (half4)(0.0f),
-                          (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
        for (int j = 0; j < 9; ++j) {
            int2 fuck;
            fuck.x = i * 3 + j % 3;
-            fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
            half4 weight_x = read_imageh(filter, sampler, fuck);
            output.x += dot(input[j], weight_x);
-            fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
            half4 weight_y = read_imageh(filter, sampler, fuck);
            output.y += dot(input[j], weight_y);
-            fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
            half4 weight_z = read_imageh(filter, sampler, fuck);
            output.z += dot(input[j], weight_z);
-            fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
            half4 weight_w = read_imageh(filter, sampler, fuck);
            output.w += dot(input[j], weight_w);
        }

--- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
@@ -63,6 +63,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
    const int out_w = get_global_id(1);
    const int out_nh = get_global_id(2);
+    if (out_c >= global_size_dim0 ||
+        out_w >= global_size_dim1 ||
+        out_nh >= global_size_dim2) {
+        printf(" out of range ");
+        return;
+    }
    int2 stride_xy;
    stride_xy.x = stride;
    stride_xy.y = stride;
@@ -133,24 +141,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
        input[8] = select(read_imageh(input_image, sampler,
                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
                          (half4)(0.0f),
-                          (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
        for (int j = 0; j < 9; ++j) {
            int2 fuck;
            fuck.x = i * 3 + j % 3;
-            fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
            half4 weight_x = read_imageh(filter, sampler, fuck);
            output.x += dot(input[j], weight_x);
-            fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
            half4 weight_y = read_imageh(filter, sampler, fuck);
            output.y += dot(input[j], weight_y);
-            fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
            half4 weight_z = read_imageh(filter, sampler, fuck);
            output.z += dot(input[j], weight_z);
-            fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
            half4 weight_w = read_imageh(filter, sampler, fuck);
            output.w += dot(input[j], weight_w);
        }
@@ -169,7 +177,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
 __kernel void depth_conv_3x3(__private const int global_size_dim0,
                                              __private const int global_size_dim1,
                                              __private const int global_size_dim2,

--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
@@ -44,6 +44,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
    const int out_w = get_global_id(1);
    const int out_nh = get_global_id(2);
+    if (out_c >= global_size_dim0 ||
+        out_w >= global_size_dim1 ||
+        out_nh >= global_size_dim2) {
+        printf(" out of range ");
+        return;
+    }
    int2 stride_xy;
    stride_xy.x = stride;
    stride_xy.y = stride;
@@ -114,24 +122,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
        input[8] = select(read_imageh(input_image, sampler,
                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
                          (half4)(0.0f),
-                          (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
        for (int j = 0; j < 9; ++j) {
            int2 fuck;
            fuck.x = i * 3 + j % 3;
-            fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
            half4 weight_x = read_imageh(filter, sampler, fuck);
            output.x += dot(input[j], weight_x);
-            fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
            half4 weight_y = read_imageh(filter, sampler, fuck);
            output.y += dot(input[j], weight_y);
-            fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
            half4 weight_z = read_imageh(filter, sampler, fuck);
            output.z += dot(input[j], weight_z);
-            fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
            half4 weight_w = read_imageh(filter, sampler, fuck);
            output.w += dot(input[j], weight_w);
        }
@@ -150,7 +158,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
 __kernel void depth_conv_3x3(__private const int global_size_dim0,
                                              __private const int global_size_dim1,
                                              __private const int global_size_dim2,

--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -54,6 +54,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
    const int out_w = get_global_id(1);
    const int out_nh = get_global_id(2);
+    if (out_c >= global_size_dim0 ||
+        out_w >= global_size_dim1 ||
+        out_nh >= global_size_dim2) {
+        printf(" out of range ");
+        return;
+    }
    int2 stride_xy;
    stride_xy.x = stride;
    stride_xy.y = stride;
@@ -124,24 +132,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
        input[8] = select(read_imageh(input_image, sampler,
                          (int2)(pos_in.x + dilation, pos_in.y + dilation)),
                          (half4)(0.0f),
-                          (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
        for (int j = 0; j < 9; ++j) {
            int2 fuck;
            fuck.x = i * 3 + j % 3;
-            fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
            half4 weight_x = read_imageh(filter, sampler, fuck);
            output.x += dot(input[j], weight_x);
-            fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
            half4 weight_y = read_imageh(filter, sampler, fuck);
            output.y += dot(input[j], weight_y);
-            fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
            half4 weight_z = read_imageh(filter, sampler, fuck);
            output.z += dot(input[j], weight_z);
-            fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
            half4 weight_w = read_imageh(filter, sampler, fuck);
            output.w += dot(input[j], weight_w);
        }
@@ -158,9 +166,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
    write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
 }
 __kernel void depth_conv_3x3(__private const int global_size_dim0,
                                              __private const int global_size_dim1,
                                              __private const int global_size_dim2,

--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -165,6 +165,18 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();
+  DLOG << " c block " << c_block;
+  DLOG << " w " << w;
+  DLOG << " nh " << nh;
+  DLOG << " stride " << stride;
+  DLOG << " offset " << offset;
+  DLOG << " input_c " << input_c;
+  DLOG << " dilation " << dilation;
+  DLOG << " input width " << input_width;
+  DLOG << " input height " << input_height;
+  DLOG << " output width " << output_width;
+  DLOG << " output height " << output_height;
  cl_int status;
  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);

--- a/src/operators/kernel/fpga/feed-kernel.cpp
+++ b/src/operators/kernel/fpga/feed-kernel.cpp
@@ -15,41 +15,41 @@ limitations under the License. */
 #include "operators/kernel/feed_kernel.h"
 namespace paddle_mobile {
-    namespace operators {
+namespace operators {
-        template <>
+template <>
-        bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
+bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
-            Tensor *output = param->Out();
+  Tensor *output = param->Out();
-            fpga::format_fp16_ofm(output);
+  fpga::format_fp16_ofm(output);
-            return true;
+  return true;
-        }
+}
-        template <>
+template <>
-        void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
+void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
-            auto input = reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
+  auto input =
-            auto input_ptr = input->data<float>();
+      reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
-            fpga::format_image(input);
+  auto input_ptr = input->data<float>();
-            Tensor *output = param.Out();
+  fpga::format_image(input);
-            auto output_ptr = output->data<float>();
+  Tensor *output = param.Out();
+  auto output_ptr = output->data<float>();
-            fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
-            args.input_data_type = fpga::DATA_TYPE_FP32;
-            args.output_data_type = fpga::DATA_TYPE_FP16;
+  args.input_data_type = fpga::DATA_TYPE_FP32;
-            args.input_layout_type = fpga::LAYOUT_CHW;
+  args.output_data_type = fpga::DATA_TYPE_FP16;
-            args.output_layout_type = fpga::LAYOUT_HWC;
+  args.input_layout_type = fpga::LAYOUT_CHW;
-            args.image.address = reinterpret_cast<void *>(input_ptr);
+  args.output_layout_type = fpga::LAYOUT_HWC;
-            args.image.channels = (uint32_t)input->dims()[1];
+  args.image.address = reinterpret_cast<void *>(input_ptr);
-            args.image.height = (uint32_t)input->dims()[2];
+  args.image.channels = (uint32_t)input->dims()[1];
-            args.image.width = (uint32_t)input->dims()[3];
+  args.image.height = (uint32_t)input->dims()[2];
-            args.image.pad_height = 0;
+  args.image.width = (uint32_t)input->dims()[3];
-            args.image.pad_width = 0;
+  args.image.pad_height = 0;
-            args.output.address = output_ptr;
+  args.image.pad_width = 0;
-            args.output.scale_address = output->scale;
+  args.output.address = output_ptr;
-            fpga::PerformBypass(args);
+  args.output.scale_address = output->scale;
-        }
+  fpga::PerformBypass(args);
-        template class FeedKernel<FPGA, float>;
+}
+template class FeedKernel<FPGA, float>;
-    }  // namespace operators
+}  // namespace operators
 }  // namespace paddle_mobile