From e8d74b3b23f6f4de0646491248da8144a932d786 Mon Sep 17 00:00:00 2001
From: liuruilong <liuruilong@baidu.com>
Date: Thu, 18 Oct 2018 16:55:22 +0800
Subject: [PATCH] update conv kernel

---
 src/framework/cl/cl_image.h                   | 17 +++--
 src/framework/executor.cpp                    |  2 +-
 src/operators/feed_op.cpp                     |  4 +-
 src/operators/kernel/arm/feed_kernel.cpp      | 26 +++----
 .../cl/cl_kernel/conv_add_bn_relu_kernel.cl   | 18 +++--
 .../kernel/cl/cl_kernel/conv_add_kernel.cl    | 19 +++--
 .../kernel/cl/cl_kernel/conv_kernel.cl        | 19 +++--
 .../kernel/cl/cl_kernel/conv_kernel.inc.cl    | 21 ++++--
 .../kernel/cl/conv_add_bn_relu_kernel.cpp     | 12 +++
 src/operators/kernel/fpga/feed-kernel.cpp     | 74 +++++++++----------
 src/operators/kernel/mali/feed_kernel.cpp     | 22 +++---
 11 files changed, 137 insertions(+), 97 deletions(-)
diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h
index b7625ca65d..8fc9f49ffa 100644
--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -257,16 +257,21 @@ class CLImage {
       float *p = tensor_data;
       size_t i0 = 0;
       for (int n = 0; n < N; n++) {
-        for (int c = 0; c < C; c++) {
+        for (int c = 0; c < c_block_ * 4; c++) {
           size_t i1 = i0 + (c / 4) * W;
           for (int h = 0; h < H; h++) {
             size_t i2 = (i1 << 2) + c % 4;
             for (int w = 0; w < W; w++) {
-              // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 + (c
-              // % 4);
-              imageData[i2] = Float2Half(*p);
-              i2 += 4;
-              p++;
+              if (c < C) {
+                // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
+                // (c % 4);
+                imageData[i2] = Float2Half(*p);
+                i2 += 4;
+                p++;
+              } else {
+                imageData[i2] = 0.0;
+                i2 += 4;
+              }
             }
             i1 += width;
           }
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 7980a2d1f6..22aa7cc98a 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -37,7 +37,7 @@ limitations under the License. */
 #include "framework/cl/cl_image.h"
 #endif
 
-int debug_to = 3;
+int debug_to = 2;
 
 namespace paddle_mobile {
 namespace framework {
diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp
index dc6d0e9227..9299d7cc19 100644
--- a/src/operators/feed_op.cpp
+++ b/src/operators/feed_op.cpp
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "feed_op.h"
+#include "operators/feed_op.h"
+
 namespace paddle_mobile {
 namespace operators {
 
@@ -22,6 +23,7 @@ void FeedOp<DeviceType, T>::InferShape() const {
   out_dims[0] = this->param_.BatchSize();
   this->param_.Out()->Resize(out_dims);
 }
+
 }  // namespace operators
 }  // namespace paddle_mobile
 
diff --git a/src/operators/kernel/arm/feed_kernel.cpp b/src/operators/kernel/arm/feed_kernel.cpp
index 03b0ec311e..598f6df01b 100644
--- a/src/operators/kernel/arm/feed_kernel.cpp
+++ b/src/operators/kernel/arm/feed_kernel.cpp
@@ -12,25 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "operators/kernel/feed_kernel.h"
 
 namespace paddle_mobile {
-    namespace operators {
+namespace operators {
 
-        template <>
-        bool FeedKernel<CPU, float>::Init(FeedParam<CPU> *param) {
-            return true;
-        }
+template <>
+bool FeedKernel<CPU, float>::Init(FeedParam<CPU> *param) {
+  return true;
+}
 
-        template <>
-        void FeedKernel<CPU, float>::Compute(const FeedParam<CPU> &param) {
-            param.Out()->ShareDataWith(*(param.InputX()));
-            param.Out()->set_lod(param.InputX()->lod());
-        }
+template <>
+void FeedKernel<CPU, float>::Compute(const FeedParam<CPU> &param) {
+  param.Out()->ShareDataWith(*(param.InputX()));
+  param.Out()->set_lod(param.InputX()->lod());
+}
 
-        template class FeedKernel<CPU, float>;
+template class FeedKernel<CPU, float>;
 
-    }  // namespace operators
+}  // namespace operators
 }  // namespace paddle_mobile
-
diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
index eb73248d74..ae04c64aa9 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
@@ -65,6 +65,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
     const int out_w = get_global_id(1);
     const int out_nh = get_global_id(2);
 
+    if (out_c >= global_size_dim0 ||
+        out_w >= global_size_dim1 ||
+        out_nh >= global_size_dim2) {
+        printf(" out of range ");
+        return;
+    }
+
+
     int2 stride_xy;
     stride_xy.x = stride;
     stride_xy.y = stride;
@@ -135,24 +143,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
         input[8] = select(read_imageh(input_image, sampler,
                           (int2)(pos_in.x + dilation, pos_in.y + dilation)),
                           (half4)(0.0f),
-                          (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
 
         for (int j = 0; j < 9; ++j) {
             int2 fuck;
             fuck.x = i * 3 + j % 3;
-            fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
             half4 weight_x = read_imageh(filter, sampler, fuck);
             output.x += dot(input[j], weight_x);
 
-            fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
             half4 weight_y = read_imageh(filter, sampler, fuck);
             output.y += dot(input[j], weight_y);
 
-            fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
             half4 weight_z = read_imageh(filter, sampler, fuck);
             output.z += dot(input[j], weight_z);
 
-            fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
             half4 weight_w = read_imageh(filter, sampler, fuck);
             output.w += dot(input[j], weight_w);
         }
diff --git a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
index 69fc177e9e..708321faf8 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
@@ -63,6 +63,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
     const int out_w = get_global_id(1);
     const int out_nh = get_global_id(2);
 
+    if (out_c >= global_size_dim0 ||
+        out_w >= global_size_dim1 ||
+        out_nh >= global_size_dim2) {
+        printf(" out of range ");
+        return;
+    }
+
+
     int2 stride_xy;
     stride_xy.x = stride;
     stride_xy.y = stride;
@@ -133,24 +141,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
         input[8] = select(read_imageh(input_image, sampler,
                           (int2)(pos_in.x + dilation, pos_in.y + dilation)),
                           (half4)(0.0f),
-                          (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
 
         for (int j = 0; j < 9; ++j) {
             int2 fuck;
             fuck.x = i * 3 + j % 3;
-            fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
             half4 weight_x = read_imageh(filter, sampler, fuck);
             output.x += dot(input[j], weight_x);
 
-            fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
             half4 weight_y = read_imageh(filter, sampler, fuck);
             output.y += dot(input[j], weight_y);
 
-            fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
             half4 weight_z = read_imageh(filter, sampler, fuck);
             output.z += dot(input[j], weight_z);
 
-            fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
             half4 weight_w = read_imageh(filter, sampler, fuck);
             output.w += dot(input[j], weight_w);
         }
@@ -169,7 +177,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
 
 
 
-
 __kernel void depth_conv_3x3(__private const int global_size_dim0,
                                               __private const int global_size_dim1,
                                               __private const int global_size_dim2,
diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
index 02717a0853..5e27f49556 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.cl
@@ -44,6 +44,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
     const int out_w = get_global_id(1);
     const int out_nh = get_global_id(2);
 
+    if (out_c >= global_size_dim0 ||
+        out_w >= global_size_dim1 ||
+        out_nh >= global_size_dim2) {
+        printf(" out of range ");
+        return;
+    }
+
+
     int2 stride_xy;
     stride_xy.x = stride;
     stride_xy.y = stride;
@@ -114,24 +122,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
         input[8] = select(read_imageh(input_image, sampler,
                           (int2)(pos_in.x + dilation, pos_in.y + dilation)),
                           (half4)(0.0f),
-                          (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
 
         for (int j = 0; j < 9; ++j) {
             int2 fuck;
             fuck.x = i * 3 + j % 3;
-            fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
             half4 weight_x = read_imageh(filter, sampler, fuck);
             output.x += dot(input[j], weight_x);
 
-            fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
             half4 weight_y = read_imageh(filter, sampler, fuck);
             output.y += dot(input[j], weight_y);
 
-            fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
             half4 weight_z = read_imageh(filter, sampler, fuck);
             output.z += dot(input[j], weight_z);
 
-            fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
             half4 weight_w = read_imageh(filter, sampler, fuck);
             output.w += dot(input[j], weight_w);
         }
@@ -150,7 +158,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
 
 
 
-
 __kernel void depth_conv_3x3(__private const int global_size_dim0,
                                               __private const int global_size_dim1,
                                               __private const int global_size_dim2,
diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
index 0ff05d4968..333aa79043 100644
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
@@ -54,6 +54,14 @@ __kernel void conv_3x3(__private const int global_size_dim0,
     const int out_w = get_global_id(1);
     const int out_nh = get_global_id(2);
 
+    if (out_c >= global_size_dim0 ||
+        out_w >= global_size_dim1 ||
+        out_nh >= global_size_dim2) {
+        printf(" out of range ");
+        return;
+    }
+
+
     int2 stride_xy;
     stride_xy.x = stride;
     stride_xy.y = stride;
@@ -124,24 +132,24 @@ __kernel void conv_3x3(__private const int global_size_dim0,
         input[8] = select(read_imageh(input_image, sampler,
                           (int2)(pos_in.x + dilation, pos_in.y + dilation)),
                           (half4)(0.0f),
-                          (ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
+                          (ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
 
         for (int j = 0; j < 9; ++j) {
             int2 fuck;
             fuck.x = i * 3 + j % 3;
-            fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 0 * 3 + j / 3;
             half4 weight_x = read_imageh(filter, sampler, fuck);
             output.x += dot(input[j], weight_x);
 
-            fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 1 * 3 + j / 3;
             half4 weight_y = read_imageh(filter, sampler, fuck);
             output.y += dot(input[j], weight_y);
 
-            fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 2 * 3 + j / 3;
             half4 weight_z = read_imageh(filter, sampler, fuck);
             output.z += dot(input[j], weight_z);
 
-            fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
+            fuck.y = out_c * 4 * 3 + 3 * 3 + j / 3;
             half4 weight_w = read_imageh(filter, sampler, fuck);
             output.w += dot(input[j], weight_w);
         }
@@ -158,9 +166,6 @@ __kernel void conv_3x3(__private const int global_size_dim0,
     write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
 }
 
-
-
-
 __kernel void depth_conv_3x3(__private const int global_size_dim0,
                                               __private const int global_size_dim1,
                                               __private const int global_size_dim2,
diff --git a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
index 1c75b8be9f..0d5dc25157 100644
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
@@ -165,6 +165,18 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
   int output_width = param.Output()->WidthOfOneBlock();
   int output_height = param.Output()->HeightOfOneBlock();
 
+  DLOG << " c block " << c_block;
+  DLOG << " w " << w;
+  DLOG << " nh " << nh;
+  DLOG << " stride " << stride;
+  DLOG << " offset " << offset;
+  DLOG << " input_c " << input_c;
+  DLOG << " dilation " << dilation;
+  DLOG << " input width " << input_width;
+  DLOG << " input height " << input_height;
+  DLOG << " output width " << output_width;
+  DLOG << " output height " << output_height;
+
   cl_int status;
 
   status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
diff --git a/src/operators/kernel/fpga/feed-kernel.cpp b/src/operators/kernel/fpga/feed-kernel.cpp
index c42f3345ea..161d8c9f0c 100644
--- a/src/operators/kernel/fpga/feed-kernel.cpp
+++ b/src/operators/kernel/fpga/feed-kernel.cpp
@@ -15,41 +15,41 @@ limitations under the License. */
 #include "operators/kernel/feed_kernel.h"
 
 namespace paddle_mobile {
-    namespace operators {
-
-        template <>
-        bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
-            Tensor *output = param->Out();
-            fpga::format_fp16_ofm(output);
-            return true;
-        }
-
-        template <>
-        void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
-            auto input = reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
-            auto input_ptr = input->data<float>();
-            fpga::format_image(input);
-            Tensor *output = param.Out();
-            auto output_ptr = output->data<float>();
-
-            fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
-
-            args.input_data_type = fpga::DATA_TYPE_FP32;
-            args.output_data_type = fpga::DATA_TYPE_FP16;
-            args.input_layout_type = fpga::LAYOUT_CHW;
-            args.output_layout_type = fpga::LAYOUT_HWC;
-            args.image.address = reinterpret_cast<void *>(input_ptr);
-            args.image.channels = (uint32_t)input->dims()[1];
-            args.image.height = (uint32_t)input->dims()[2];
-            args.image.width = (uint32_t)input->dims()[3];
-            args.image.pad_height = 0;
-            args.image.pad_width = 0;
-            args.output.address = output_ptr;
-            args.output.scale_address = output->scale;
-            fpga::PerformBypass(args);
-        }
-        template class FeedKernel<FPGA, float>;
-
-    }  // namespace operators
+namespace operators {
+
+template <>
+bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
+  Tensor *output = param->Out();
+  fpga::format_fp16_ofm(output);
+  return true;
+}
+
+template <>
+void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
+  auto input =
+      reinterpret_cast<Tensor *>(const_cast<LoDTensor *>(param.InputX()));
+  auto input_ptr = input->data<float>();
+  fpga::format_image(input);
+  Tensor *output = param.Out();
+  auto output_ptr = output->data<float>();
+
+  fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
+
+  args.input_data_type = fpga::DATA_TYPE_FP32;
+  args.output_data_type = fpga::DATA_TYPE_FP16;
+  args.input_layout_type = fpga::LAYOUT_CHW;
+  args.output_layout_type = fpga::LAYOUT_HWC;
+  args.image.address = reinterpret_cast<void *>(input_ptr);
+  args.image.channels = (uint32_t)input->dims()[1];
+  args.image.height = (uint32_t)input->dims()[2];
+  args.image.width = (uint32_t)input->dims()[3];
+  args.image.pad_height = 0;
+  args.image.pad_width = 0;
+  args.output.address = output_ptr;
+  args.output.scale_address = output->scale;
+  fpga::PerformBypass(args);
+}
+template class FeedKernel<FPGA, float>;
+
+}  // namespace operators
 }  // namespace paddle_mobile
-
diff --git a/src/operators/kernel/mali/feed_kernel.cpp b/src/operators/kernel/mali/feed_kernel.cpp
index 4ca76e91aa..34be184d40 100644
--- a/src/operators/kernel/mali/feed_kernel.cpp
+++ b/src/operators/kernel/mali/feed_kernel.cpp
@@ -15,23 +15,19 @@ limitations under the License. */
 #include "operators/kernel/feed_kernel.h"
 
 namespace paddle_mobile {
-    namespace operators {
+namespace operators {
 
+template <>
+bool FeedKernel<GPU_MALI, float>::Init(FeedParam<GPU_MALI> *param) {
+  return true;
+}
 
-        template <>
-        bool FeedKernel<GPU_MALI, float>::Init(
-                FeedParam<GPU_MALI> *param) {
-            return true;
-        }
+template <>
+void FeedKernel<GPU_MALI, float>::Compute(const FeedParam<GPU_MALI> &param) {}
 
-        template <>
-        void FeedKernel<GPU_MALI, float>::Compute(
-                const FeedParam<GPU_MALI> &param) {
-        }
+template class FeedKernel<GPU_MALI, float>;
 
-        template class FeedKernel<GPU_MALI, float>;
-
-    }  // namespace operators
+}  // namespace operators
 }  // namespace paddle_mobile
 
 #endif
-- 
GitLab