From 92e3e526988efb29cea8e9bd226face973ef265d Mon Sep 17 00:00:00 2001
From: liuqi <liuqi10@xiaomi.com>
Date: Thu, 30 Nov 2017 17:30:57 +0800
Subject: [PATCH] Add conv 1x1 benchmark code.

---
 mace/kernels/opencl/cl/conv_2d_1x1.cl     | 72 ++++++++++++++---------
 mace/kernels/opencl/conv_2d_opencl_1x1.cc |  2 +-
 mace/ops/conv_2d_benchmark.cc             | 33 +++++++++++
 3 files changed, 79 insertions(+), 28 deletions(-)

diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index 9c7a37fd..a598a69f 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -25,15 +25,15 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
 #ifdef BIAS
-  DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
-  DATA_TYPE4 out1 = out0;
-  DATA_TYPE4 out2 = out0;
-  DATA_TYPE4 out3 = out0;
+  float4 out0 = convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)));
+  float4 out1 = out0;
+  float4 out2 = out0;
+  float4 out3 = out0;
 #else
-  DATA_TYPE4 out0 = 0;
-  DATA_TYPE4 out1 = 0;
-  DATA_TYPE4 out2 = 0;
-  DATA_TYPE4 out3 = 0;
+  float4 out0 = 0;
+  float4 out1 = 0;
+  float4 out2 = 0;
+  float4 out3 = 0;
 #endif
 
   int4 w;
@@ -64,16 +64,16 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
 
-    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
-    DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
-    DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
-    DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));
+    float4 in0 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx)));
+    float4 in1 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx)));
+    float4 in2 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx)));
+    float4 in3 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx)));
 
     const int filter_x0 = in_ch_blk << 2;
-    DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
-    DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
-    DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
-    DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
+    float4 weights0 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk)));
+    float4 weights1 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk)));
+    float4 weights2 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk)));
+    float4 weights3 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk)));
     // Will prefetch L2 improve performance? How to pretch image data?
 
     out0 += in0.x * weights0;
@@ -101,18 +101,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 
 #ifdef FUSED_BATCH_NORM
   // batch norm
-  DATA_TYPE4 bn_scale_value =
-      READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0));
-  DATA_TYPE4 scale0 = (DATA_TYPE4)(bn_scale_value.x);
-  DATA_TYPE4 scale1 = (DATA_TYPE4)(bn_scale_value.y);
-  DATA_TYPE4 scale2 = (DATA_TYPE4)(bn_scale_value.z);
-  DATA_TYPE4 scale3 = (DATA_TYPE4)(bn_scale_value.w);
-  DATA_TYPE4 bn_offset_value =
+  float4 bn_scale_value =
+      convert_float4(READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0)));
+  float4 scale0 = (float4)(bn_scale_value.x);
+  float4 scale1 = (float4)(bn_scale_value.y);
+  float4 scale2 = (float4)(bn_scale_value.z);
+  float4 scale3 = (float4)(bn_scale_value.w);
+  float4 bn_offset_value =
       READ_IMAGET(bn_offset, sampler, (int2)(out_ch_blk, 0));
-  DATA_TYPE4 offset0 = (DATA_TYPE4)(bn_offset_value.x);
-  DATA_TYPE4 offset1 = (DATA_TYPE4)(bn_offset_value.y);
-  DATA_TYPE4 offset2 = (DATA_TYPE4)(bn_offset_value.z);
-  DATA_TYPE4 offset3 = (DATA_TYPE4)(bn_offset_value.w);
+  float4 offset0 = (float4)(bn_offset_value.x);
+  float4 offset1 = (float4)(bn_offset_value.y);
+  float4 offset2 = (float4)(bn_offset_value.z);
+  float4 offset3 = (float4)(bn_offset_value.w);
 
   out0 = out0 * scale0 + offset0;
   out1 = out1 * scale1 + offset1;
@@ -128,6 +128,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   out3 = fmax(out3, 0);
 #endif
 
+#ifdef TYPE_FLOAT
   const int out_x_base = out_ch_blk * width;
   int out_x_idx = out_w_blk;
   WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);
@@ -143,4 +144,21 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   out_x_idx += out_w_blks;
   if (out_x_idx >= width) return;
   WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
+#else
+  const int out_x_base = out_ch_blk * width;
+  int out_x_idx = out_w_blk;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out0));
+
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out1));
+
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out2));
+
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out3));
+#endif
 }
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 477dbc98..614d2427 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -33,7 +33,7 @@ void Conv1x1(const Tensor *input,
   MACE_CHECK(input_batch == batch);
 
   std::set<std::string> built_options;
-  built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
+  built_options.emplace(input->dtype() == DT_FLOAT ? "-DTYPE_FLOAT" : "");
   built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
   built_options.emplace("-DSTRIDE=" + ToString(stride));
   if (bias != nullptr) {
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index e3d426c0..98575c81 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -91,7 +91,40 @@ static void Conv2d(int iters,
 #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)        \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
 
+// ICNet
+BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
+// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
+BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
+// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
+BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
+
+BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
 BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
 
+// Test RGB <-> YUV
+//BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
+//BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
+//
+//BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
+//BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
+//BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float);
+//BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float);
+//BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float);
+//BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float);
+//BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float);
+//BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float);
+//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float);
+//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float);
+//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float);
 }  //  namespace mace
-- 
GitLab