From 92e3e526988efb29cea8e9bd226face973ef265d Mon Sep 17 00:00:00 2001 From: liuqi Date: Thu, 30 Nov 2017 17:30:57 +0800 Subject: [PATCH] Add conv 1x1 benchmark code. --- mace/kernels/opencl/cl/conv_2d_1x1.cl | 72 ++++++++++++++--------- mace/kernels/opencl/conv_2d_opencl_1x1.cc | 2 +- mace/ops/conv_2d_benchmark.cc | 33 +++++++++++ 3 files changed, 79 insertions(+), 28 deletions(-) diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index 9c7a37fd..a598a69f 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -25,15 +25,15 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; #ifdef BIAS - DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); - DATA_TYPE4 out1 = out0; - DATA_TYPE4 out2 = out0; - DATA_TYPE4 out3 = out0; + float4 out0 = convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0))); + float4 out1 = out0; + float4 out2 = out0; + float4 out3 = out0; #else - DATA_TYPE4 out0 = 0; - DATA_TYPE4 out1 = 0; - DATA_TYPE4 out2 = 0; - DATA_TYPE4 out3 = 0; + float4 out0 = 0; + float4 out1 = 0; + float4 out2 = 0; + float4 out3 = 0; #endif int4 w; @@ -64,16 +64,16 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] int in_x_base = 0; for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx)); - DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx)); - DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx)); - DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx)); + float4 in0 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx))); + float4 in1 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx))); + float4 in2 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx))); + float4 in3 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx))); const int filter_x0 = in_ch_blk << 2; - DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk)); - DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk)); - DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk)); - DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk)); + float4 weights0 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk))); + float4 weights1 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk))); + float4 weights2 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk))); + float4 weights3 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk))); // Will prefetch L2 improve performance? How to pretch image data? out0 += in0.x * weights0; @@ -101,18 +101,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] #ifdef FUSED_BATCH_NORM // batch norm - DATA_TYPE4 bn_scale_value = - READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0)); - DATA_TYPE4 scale0 = (DATA_TYPE4)(bn_scale_value.x); - DATA_TYPE4 scale1 = (DATA_TYPE4)(bn_scale_value.y); - DATA_TYPE4 scale2 = (DATA_TYPE4)(bn_scale_value.z); - DATA_TYPE4 scale3 = (DATA_TYPE4)(bn_scale_value.w); - DATA_TYPE4 bn_offset_value = + float4 bn_scale_value = + convert_float4(READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0))); + float4 scale0 = (float4)(bn_scale_value.x); + float4 scale1 = (float4)(bn_scale_value.y); + float4 scale2 = (float4)(bn_scale_value.z); + float4 scale3 = (float4)(bn_scale_value.w); + float4 bn_offset_value = READ_IMAGET(bn_offset, sampler, (int2)(out_ch_blk, 0)); - DATA_TYPE4 offset0 = (DATA_TYPE4)(bn_offset_value.x); - DATA_TYPE4 offset1 = (DATA_TYPE4)(bn_offset_value.y); - DATA_TYPE4 offset2 = (DATA_TYPE4)(bn_offset_value.z); - DATA_TYPE4 offset3 = (DATA_TYPE4)(bn_offset_value.w); + float4 offset0 = (float4)(bn_offset_value.x); + float4 offset1 = (float4)(bn_offset_value.y); + float4 offset2 = (float4)(bn_offset_value.z); + float4 offset3 = (float4)(bn_offset_value.w); out0 = out0 * scale0 + offset0; out1 = out1 * scale1 + offset1; @@ -128,6 +128,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] out3 = fmax(out3, 0); #endif +#ifdef TYPE_FLOAT const int out_x_base = out_ch_blk * width; int out_x_idx = out_w_blk; WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0); @@ -143,4 +144,21 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] out_x_idx += out_w_blks; if (out_x_idx >= width) return; WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3); +#else + const int out_x_base = out_ch_blk * width; + int out_x_idx = out_w_blk; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out0)); + + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out1)); + + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out2)); + + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out3)); +#endif } diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 477dbc98..614d2427 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -33,7 +33,7 @@ void Conv1x1(const Tensor *input, MACE_CHECK(input_batch == batch); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); + built_options.emplace(input->dtype() == DT_FLOAT ? "-DTYPE_FLOAT" : ""); built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype())); built_options.emplace("-DSTRIDE=" + ToString(stride)); if (bias != nullptr) { diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index e3d426c0..98575c81 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -91,7 +91,40 @@ static void Conv2d(int iters, #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \ BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL); +// ICNet +BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half); +// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105 +BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half); +// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108 +BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half); + +BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half); // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half); +// Test RGB <-> YUV +//BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); +//BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); +// +//BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float); +//BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments +//BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float); +//BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float); +//BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float); +//BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float); +//BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float); +//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float); +//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float); +//BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float); +//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float); +//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float); +//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float); +//BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float); +//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float); +//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float); +//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float); +//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float); +//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float); +//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float); +//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float); } // namespace mace -- GitLab