Conv3x3 opencl : remove array.

e4ec380d · liuqi · bc7bd0a2 · e4ec380d
隐藏空白更改
内联并排

Showing with 118 addition and 116 deletion

mace/kernels/opencl/cl/conv_2d_3x3.cl mace/kernels/opencl/cl/conv_2d_3x3.cl +118 -116

未找到文件。
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -3,7 +3,7 @@
 __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                          __read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */
 #ifdef BIAS
-                          __read_only image2d_t bias, /* cout%4 * cout/4 */
+    __read_only image2d_t bias, /* cout%4 * cout/4 */
 #endif
                          __write_only image2d_t output,
                          __private const int in_height,
@@ -19,24 +19,27 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  const int out_hb = get_global_id(2);
  const int rounded_in_ch = in_ch_blks * 4;

-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  DATA_TYPE4 out0 = 0;
+  DATA_TYPE4 out1 = 0;
+  DATA_TYPE4 out2 = 0;
+  DATA_TYPE4 out3 = 0;
+  DATA_TYPE4 out4 = 0;

-  VEC_DATA_TYPE(DATA_TYPE, 4) out[5] = {0};
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 #ifdef BIAS
-  out[0] =
-      CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0));
-  out[1] = out[0];
-  out[2] = out[0];
-  out[3] = out[0];
-  out[4] = out[0];
+  out0 =
+     READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+  out1 = out0;
+  out2 = out0;
+  out3 = out0;
+  out4 = out0;
 #endif

-  int w[5];
-  w[0] = out_w_blk - padding_left;
-  w[1] = w[0] + out_w_blks;
-  w[2] = w[1] + out_w_blks;
-  w[3] = w[2] + out_w_blks;
-  w[4] = w[3] + out_w_blks;
+  int w0 = out_w_blk - padding_left;
+  int w1 = w0 + out_w_blks;
+  int w2 = w1 + out_w_blks;
+  int w3 = w2 + out_w_blks;
+  int w4 = w3 + out_w_blks;

  const int batch_idx = out_hb / out_height;
  const int height_idx = out_hb % out_height;
@@ -51,112 +54,111 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]

  const int input_image_width = in_ch_blks * in_width;

-  VEC_DATA_TYPE(DATA_TYPE, 4) in[5];
-  VEC_DATA_TYPE(DATA_TYPE, 4) weights[4];
+  DATA_TYPE4 in0, in1, in2, in3, in4;
+  DATA_TYPE4 weights0, weights1, weights2, weights3;
  int in_idx, hb_idx, width_idx, in_width_idx;
  // Unrolling this loop hurt perfmance
-  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    for (int i = 0; i < 9; ++i) {
-
-      in_idx = in_ch_blk * in_width;
-
-      hb_idx = i / 3;
-      width_idx = i % 3;
-      in_width_idx = w[0] + width_idx;
-      // Judge the width border for padding input.
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[0] = 0;
-      } else {
-        in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-      in_width_idx = w[1] + width_idx;
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[1] = 0;
-      } else {
-        in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-      in_width_idx = w[2] + width_idx;
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[2] = 0;
-      } else {
-        in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+  for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
+    for (short hb_idx = 0; hb_idx < 3; ++ hb_idx) {
+      for (short width_idx = 0; width_idx < 3; ++width_idx) {
+
+        in_idx = in_ch_blk * in_width;
+
+        in_width_idx = w0 + width_idx;
+        // Judge the width border for padding input.
+        if (in_width_idx < 0 || in_width_idx >= in_width) {
+          in0 = 0;
+        } else {
+          in0 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+        }
+        in_width_idx = w1 + width_idx;
+        if (in_width_idx < 0 || in_width_idx >= in_width) {
+          in1 = 0;
+        } else {
+          in1 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+        }
+        in_width_idx = w2 + width_idx;
+        if (in_width_idx < 0 || in_width_idx >= in_width) {
+          in2 = 0;
+        } else {
+          in2 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+        }
+        in_width_idx = w3 + width_idx;
+        if (in_width_idx < 0 || in_width_idx >= in_width) {
+          in3 = 0;
+        } else {
+          in3 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+        }
+        in_width_idx = w4 + width_idx;
+        if (in_width_idx < 0 || in_width_idx >= in_width) {
+          in4 = 0;
+        } else {
+          in4 = READ_IMAGET(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+        }
+
+        int filter_idx = (in_ch_blk << 2) + (hb_idx *  3 + width_idx) * rounded_in_ch;
+        weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
+        weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
+        weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
+        weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
+
+        // Will prefetch L2 improve performance? How to pretch image data?
+
+        // Interleaving load and mul does not improve performance as expected
+        out0 += in0.x * weights0;
+        out0 += in0.y * weights1;
+        out0 += in0.z * weights2;
+        out0 += in0.w * weights3;
+
+        out1 += in1.x * weights0;
+        out1 += in1.y * weights1;
+        out1 += in1.z * weights2;
+        out1 += in1.w * weights3;
+
+        out2 += in2.x * weights0;
+        out2 += in2.y * weights1;
+        out2 += in2.z * weights2;
+        out2 += in2.w * weights3;
+
+        out3 += in3.x * weights0;
+        out3 += in3.y * weights1;
+        out3 += in3.z * weights2;
+        out3 += in3.w * weights3;
+
+        out4 += in4.x * weights0;
+        out4 += in4.y * weights1;
+        out4 += in4.z * weights2;
+        out4 += in4.w * weights3;
      }
-      in_width_idx = w[3] + width_idx;
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[3] = 0;
-      } else {
-        in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-      in_width_idx = w[4] + width_idx;
-      if (in_width_idx < 0 || in_width_idx >= in_width) {
-        in[4] = 0;
-      } else {
-        in[4] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
-      }
-
-
-      int filter_idx = (in_ch_blk << 2) + i * rounded_in_ch;
-      weights[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
-      weights[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
-      weights[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
-      weights[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
-
-      // Will prefetch L2 improve performance? How to pretch image data?
-
-      // Interleaving load and mul does not improve performance as expected
-      out[0] += in[0].x * weights[0];
-      out[0] += in[0].y * weights[1];
-      out[0] += in[0].z * weights[2];
-      out[0] += in[0].w * weights[3];
-
-      out[1] += in[1].x * weights[0];
-      out[1] += in[1].y * weights[1];
-      out[1] += in[1].z * weights[2];
-      out[1] += in[1].w * weights[3];
-
-      out[2] += in[2].x * weights[0];
-      out[2] += in[2].y * weights[1];
-      out[2] += in[2].z * weights[2];
-      out[2] += in[2].w * weights[3];
-
-      out[3] += in[3].x * weights[0];
-      out[3] += in[3].y * weights[1];
-      out[3] += in[3].z * weights[2];
-      out[3] += in[3].w * weights[3];
-
-      out[4] += in[4].x * weights[0];
-      out[4] += in[4].y * weights[1];
-      out[4] += in[4].z * weights[2];
-      out[4] += in[4].w * weights[3];
    }
  }

  const int out_x_base = out_ch_blk * out_width;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
-                                       (int2)(out_x_base + w[0] + padding_left, out_hb),
-                                       out[0]);
-
-  w[1] += padding_left;
-  if (w[1] >= out_width) return;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
-                                       (int2)(out_x_base + w[1], out_hb),
-                                       out[1]);
-
-  w[2] += padding_left;
-  if (w[2] >= out_width) return;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
-                                       (int2)(out_x_base + w[2], out_hb),
-                                       out[2]);
-
-  w[3] += padding_left;
-  if (w[3] >= out_width) return;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
-                                       (int2)(out_x_base + w[3], out_hb),
-                                       out[3]);
-
-  w[4] += padding_left;
-  if (w[4] >= out_width) return;
-  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
-                                       (int2)(out_x_base + w[4], out_hb),
-                                       out[4]);
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w0 + padding_left, out_hb),
+               out0);
+
+  w1 += padding_left;
+  if (w1 >= out_width) return;
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w1, out_hb),
+               out1);
+
+  w2 += padding_left;
+  if (w2 >= out_width) return;
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w2, out_hb),
+               out2);
+
+  w3 += padding_left;
+  if (w3 >= out_width) return;
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w3, out_hb),
+               out3);
+
+  w4 += padding_left;
+  if (w4 >= out_width) return;
+  WRITE_IMAGET(output,
+               (int2)(out_x_base + w4, out_hb),
+               out4);
 }