Add conv 1x1 benchmark code.

92e3e526 · liuqi · e52c49b3 · 92e3e526 · 92e3e526 · 92e3e526
3 changed file
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -25,15 +25,15 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 #ifdef BIAS
-  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
+  float4 out0 = convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)));
-  DATA_TYPE4 out1 = out0;
+  float4 out1 = out0;
-  DATA_TYPE4 out2 = out0;
+  float4 out2 = out0;
-  DATA_TYPE4 out3 = out0;
+  float4 out3 = out0;
 #else
-  DATA_TYPE4 out0 = 0;
+  float4 out0 = 0;
-  DATA_TYPE4 out1 = 0;
+  float4 out1 = 0;
-  DATA_TYPE4 out2 = 0;
+  float4 out2 = 0;
-  DATA_TYPE4 out3 = 0;
+  float4 out3 = 0;
 #endif
  int4 w;
@@ -64,16 +64,16 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  int in_x_base = 0;
  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
+    float4 in0 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx)));
-    DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
+    float4 in1 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx)));
-    DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
+    float4 in2 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx)));
-    DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));
+    float4 in3 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx)));
    const int filter_x0 = in_ch_blk << 2;
-    DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
+    float4 weights0 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk)));
-    DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
+    float4 weights1 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk)));
-    DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
+    float4 weights2 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk)));
-    DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
+    float4 weights3 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk)));
    // Will prefetch L2 improve performance? How to pretch image data?
    out0 += in0.x * weights0;
@@ -101,18 +101,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #ifdef FUSED_BATCH_NORM
  // batch norm
-  DATA_TYPE4 bn_scale_value =
+  float4 bn_scale_value =
-      READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0));
+      convert_float4(READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0)));
-  DATA_TYPE4 scale0 = (DATA_TYPE4)(bn_scale_value.x);
+  float4 scale0 = (float4)(bn_scale_value.x);
-  DATA_TYPE4 scale1 = (DATA_TYPE4)(bn_scale_value.y);
+  float4 scale1 = (float4)(bn_scale_value.y);
-  DATA_TYPE4 scale2 = (DATA_TYPE4)(bn_scale_value.z);
+  float4 scale2 = (float4)(bn_scale_value.z);
-  DATA_TYPE4 scale3 = (DATA_TYPE4)(bn_scale_value.w);
+  float4 scale3 = (float4)(bn_scale_value.w);
-  DATA_TYPE4 bn_offset_value =
+  float4 bn_offset_value =
      READ_IMAGET(bn_offset, sampler, (int2)(out_ch_blk, 0));
-  DATA_TYPE4 offset0 = (DATA_TYPE4)(bn_offset_value.x);
+  float4 offset0 = (float4)(bn_offset_value.x);
-  DATA_TYPE4 offset1 = (DATA_TYPE4)(bn_offset_value.y);
+  float4 offset1 = (float4)(bn_offset_value.y);
-  DATA_TYPE4 offset2 = (DATA_TYPE4)(bn_offset_value.z);
+  float4 offset2 = (float4)(bn_offset_value.z);
-  DATA_TYPE4 offset3 = (DATA_TYPE4)(bn_offset_value.w);
+  float4 offset3 = (float4)(bn_offset_value.w);
  out0 = out0 * scale0 + offset0;
  out1 = out1 * scale1 + offset1;
@@ -128,6 +128,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  out3 = fmax(out3, 0);
 #endif
+#ifdef TYPE_FLOAT
  const int out_x_base = out_ch_blk * width;
  int out_x_idx = out_w_blk;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);
@@ -143,4 +144,21 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
  out_x_idx += out_w_blks;
  if (out_x_idx >= width) return;
  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
+#else
+  const int out_x_base = out_ch_blk * width;
+  int out_x_idx = out_w_blk;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out0));
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out1));
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out2));
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out3));
+#endif
 }
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -33,7 +33,7 @@ void Conv1x1(const Tensor *input,
  MACE_CHECK(input_batch == batch);
  std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(input->dtype() == DT_FLOAT ? "-DTYPE_FLOAT" : "");
  built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
  built_options.emplace("-DSTRIDE=" + ToString(stride));
  if (bias != nullptr) {

--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -91,7 +91,40 @@ static void Conv2d(int iters,
 #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)        \
  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
+// ICNet
+BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
+// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
+BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
+// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
+BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
+BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
 BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
+// Test RGB <-> YUV
+//BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
+//BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
+//
+//BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
+//BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
+//BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
+//BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
+//BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
+//BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
+//BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
+//BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
+//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
 }  //  namespace mace