diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
index 4352f2a66cd413819e19bf98e70840cbea404d3f..87f81f7f1f31d51975db77265dbc0a99e67fecdf 100644
--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -7,13 +7,20 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//mace:mace.bzl", "if_android")
-load("//mace:mace.bzl", "if_android_arm64")
+load("//mace:mace.bzl", "if_android", "if_android_arm64")
 
 cc_library(
     name = "kernels",
-    srcs = glob(["*.cc"]) + if_android(glob(["opencl/*.cc"])) + if_android_arm64(glob(["neon/*.cc"])),
-    hdrs = glob(["*.h"]) + if_android(glob(["opencl/*.h"])) + if_android_arm64(glob(["neon/*.h"])),
+    srcs = glob(["*.cc"]) + if_android(glob([
+        "opencl/*.cc",
+    ])) + if_android_arm64(glob([
+        "neon/*.cc",
+    ])),
+    hdrs = glob(["*.h"]) + if_android(glob([
+        "opencl/*.h",
+    ])) + if_android_arm64(glob([
+        "neon/*.h",
+    ])),
     copts = [
         "-std=c++11",
         "-fopenmp",
diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index c9d49525723285a25410b2a976d0b95de0319c05..56f2cedc5e1f2427fcea57b91b9150e049f618ba 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -141,7 +141,6 @@ __kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
   }
 }
 
-// TODO : validation
 __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -155,71 +154,151 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
 
   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-  half4 bias_value = read_imageh(bias, sampler, (int2)(out_w_blk, 1));
-  half4 out0 = (half4)(bias_value.x, bias_value.x, bias_value.x, bias_value.x);
-  half4 out1 = (half4)(bias_value.y, bias_value.y, bias_value.y, bias_value.y);
-  half4 out2 = (half4)(bias_value.z, bias_value.z, bias_value.z, bias_value.z);
-  half4 out3 = (half4)(bias_value.w, bias_value.w, bias_value.w, bias_value.w);
+  half4 bias_value = read_imageh(bias, sampler, (int2)(out_ch_blk, 0));
+  half4 out[4];
+  out[0] = (half4)(bias_value.x);
+  out[1] = (half4)(bias_value.y);
+  out[2] = (half4)(bias_value.z);
+  out[3] = (half4)(bias_value.w);
+
+  int w[4];
+  w[0] = out_w_blk;
+  w[1] = w[0] + out_w_blks;
+  w[2] = w[1] + out_w_blks;
+  w[3] = w[2] + out_w_blks;
 
   // Unrolling this loop hurt perfmance
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    const int in_x0 = in_x_base + out_w_blk;;
-    const int in_x1 = in_x0 + out_w_blks;
-    const int in_x2 = in_x1 + out_w_blks;
-    const int in_x3 = in_x2 + out_w_blks;
-    in_x_base += width;
-
-    const half4 in0 = read_imageh(input, sampler, (int2)(in_x0, out_hb));
-    const half4 in1 = read_imageh(input, sampler, (int2)(in_x1, out_hb));
-    const half4 in2 = read_imageh(input, sampler, (int2)(in_x2, out_hb));
-    const half4 in3 = read_imageh(input, sampler, (int2)(in_x3, out_hb));
+    half4 in[4];
+    in[0] = read_imageh(input, sampler, (int2)(in_x_base + w[0], out_hb));
+    if (w[1] < width) {
+      // conditional loads hurt perf; this branching sometimes helps
+      in[1] = read_imageh(input, sampler, (int2)(in_x_base + w[1], out_hb));
+      in[2] = read_imageh(input, sampler, (int2)(in_x_base + w[2], out_hb));
+      in[3] = read_imageh(input, sampler, (int2)(in_x_base + w[3], out_hb));
+    }
 
     // The order matters, load input first then load filter, why?
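+    // Note: each filter texel at x = filter_x0 + c holds one half4 of four
+    // output-channel weights for input channel in_ch_blk * 4 + c, which is
+    // why filter_x0 below advances by 4 (in_ch_blk << 2) per input-channel
+    // block (see the "cout%4 * cin, cout/4" layout in the kernel signature).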
     const int filter_x0 = in_ch_blk << 2;
-    const half4 weights0 = read_imageh(filter, sampler, (int2)(filter_x0, out_ch_blk));
-    const half4 weights1 = read_imageh(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
-    const half4 weights2 = read_imageh(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
-    const half4 weights3 = read_imageh(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
+    half4 weights[4];
+    #pragma unroll
+    for (int c = 0; c < 4; ++c) {
+      weights[c] = read_imageh(filter, sampler, (int2)(filter_x0 + c, out_ch_blk));
+    }
     // Will prefetch L2 improve performance? How to pretch image data?
 
     // Interleaving load and mul does not improve performance as expected
-    out0 += in0.x * weights0;
-    out1 += in1.x * weights0;
-    out2 += in2.x * weights0;
-    out3 += in3.x * weights0;
-
-    out0 += in0.y * weights1;
-    out1 += in1.y * weights1;
-    out2 += in2.y * weights1;
-    out3 += in3.y * weights1;
-
-    out0 += in0.z * weights2;
-    out1 += in1.z * weights2;
-    out2 += in2.z * weights2;
-    out3 += in3.z * weights2;
-
-    out0 += in0.w * weights3;
-    out1 += in1.w * weights3;
-    out2 += in2.w * weights3;
-    out3 += in3.w * weights3;
+    #pragma unroll
+    for (int c = 0; c < 4; ++c) {
+      out[c] += in[c].x * weights[0];
+      out[c] += in[c].y * weights[1];
+      out[c] += in[c].z * weights[2];
+      out[c] += in[c].w * weights[3];
+    }
+
+    in_x_base += width;
   }
 
-  const int out_x_offset = out_ch_blk * width;
-  const int w0 = out_w_blk;
-  write_imageh(output, (int2)(out_x_offset + w0, out_hb), out0);
+  const int out_x_base = out_ch_blk * width;
+  write_imageh(output, (int2)(out_x_base + w[0], out_hb), out[0]);
 
-  const int w1 = w0 + out_w_blks;
-  if (w1 >= width) return;
-  write_imageh(output, (int2)(out_x_offset + w1, out_hb), out1);
+  if (w[1] >= width) return;
+  write_imageh(output, (int2)(out_x_base + w[1], out_hb), out[1]);
 
-  const int w2 = w1 + out_w_blks;
-  if (w2 >= width) return;
-  write_imageh(output, (int2)(out_x_offset + w2, out_hb), out2);
+  if (w[2] >= width) return;
+  write_imageh(output, (int2)(out_x_base + w[2], out_hb), out[2]);
 
-  const int w3 = w2 + out_w_blks;
-  if (w3 >= width) return;
-  write_imageh(output, (int2)(out_x_offset + w3, out_hb), out3);
+  if (w[3] >= width) return;
+  write_imageh(output, (int2)(out_x_base + w[3], out_hb), out[3]);
 }
+__kernel void conv_2d_1x1_h8(__read_only image2d_t input, /* [c%8 * w * c/8, h * b] */
+                             __read_only image2d_t filter, /* cout%8 * cin, cout/8 */
+                             __read_only image2d_t bias, /* cout%8 * cout/8 */
+                             __write_only image2d_t output,
+                             __private const int in_ch_blks,
+                             __private const int width) {
+  const int out_ch_blk = get_global_id(0);
+  const int out_w_blk = get_global_id(1);
+  const int out_w_blks = get_global_size(1);
+  const int out_hb = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  float4 bias_value = read_imagef(bias, sampler, (int2)(out_ch_blk, 0));
+  half4 bias_value03 = as_half4(bias_value.xy);
+  half4 bias_value47 = as_half4(bias_value.zw);
+  half4 out[8];
+  out[0] = (half4)(bias_value03.x);
+  out[1] = (half4)(bias_value03.y);
+  out[2] = (half4)(bias_value03.z);
+  out[3] = (half4)(bias_value03.w);
+  out[4] = (half4)(bias_value47.x);
+  out[5] = (half4)(bias_value47.y);
+  out[6] = (half4)(bias_value47.z);
+  out[7] = (half4)(bias_value47.w);
+
+  int w[4];
+  w[0] = out_w_blk;
+  w[1] = w[0] + out_w_blks;
+  w[2] = w[1] + out_w_blks;
+  w[3] = w[2] + out_w_blks;
+
+  // Unrolling this loop hurts performance
+  int in_x_base = 0;
+  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
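+    // Each float4 texel read below is bit-cast into two half4 values via
+    // as_half4 on .xy and .zw, so a single image read supplies the 8 half
+    // channels per pixel implied by the c%8 packing in the signature.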
+    half4 in[8];
+    #pragma unroll
+    for (int wi = 0; wi < 4; ++wi) {
+      float4 in_value = read_imagef(input, sampler, (int2)(in_x_base + w[wi], out_hb));
+      in[wi << 1] = as_half4(in_value.xy);
+      in[(wi << 1) + 1] = as_half4(in_value.zw);
+    }
+
+    // The order matters, load input first then load filter, why?
+    const int filter_x0 = in_ch_blk << 2;
+    half4 weights[8];
+    #pragma unroll
+    for (int wi = 0; wi < 4; ++wi) {
+      float4 weights_value = read_imagef(filter, sampler, (int2)(filter_x0 + wi, out_ch_blk));
+      weights[wi << 1] = as_half4(weights_value.xy);
+      weights[(wi << 1) + 1] = as_half4(weights_value.zw);
+    }
+    // Will prefetch L2 improve performance? How to prefetch image data?
+
+    // Interleaving load and mul does not improve performance as expected
+    #pragma unroll
+    for (int wi = 0; wi < 4; ++wi) {
+      int idx = wi << 1;
+      out[idx] += in[idx].x * weights[0];
+      out[idx] += in[idx].y * weights[1];
+      out[idx] += in[idx].z * weights[2];
+      out[idx] += in[idx].w * weights[3];
+
+      ++idx;
+      out[idx] += in[idx].x * weights[4];
+      out[idx] += in[idx].y * weights[5];
+      out[idx] += in[idx].z * weights[6];
+      out[idx] += in[idx].w * weights[7];
+    }
+
+    in_x_base += width;
+  }
+
+  const int out_x_base = out_ch_blk * width;
+  float4 out_value = (float4)(as_float2(out[0]), as_float2(out[1]));
+  write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value);
+
+  if (w[1] >= width) return;
+  out_value = (float4)(as_float2(out[2]), as_float2(out[3]));
+  write_imagef(output, (int2)(out_x_base + w[1], out_hb), out_value);
+
+  if (w[2] >= width) return;
+  out_value = (float4)(as_float2(out[4]), as_float2(out[5]));
+  write_imagef(output, (int2)(out_x_base + w[2], out_hb), out_value);
+
+  if (w[3] >= width) return;
+  out_value = (float4)(as_float2(out[6]), as_float2(out[7]));
+  write_imagef(output, (int2)(out_x_base + w[3], out_hb), out_value);
+}
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 1a85911de29fdde88c2cd41b15a2e2282178b5ac..aa8ee24fd642eab1cf03756149b33c27629985a8 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -66,8 +66,8 @@ void Conv1x1V2(const Tensor *input,
 
 void Conv1x1V3(const Tensor *input,
                const Tensor *filter,
                const Tensor *bias,
+               const int stride,
                Tensor *output) {
-  const index_t batch = output->dim(0);
   const index_t channels = output->dim(1);
   const index_t height = output->dim(2);
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 0d201a2f5a7a60089b77b9387932c0c11b9f639d..8bfee960e777c7e90f08d2ae508efe620bc2b230 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -13,7 +13,6 @@ namespace mace {
 
 template <DeviceType D, typename T>
 static void Conv2d(int iters,
-                   int iters_to_sync,
                    int batch,
                    int channels,
                    int height,
@@ -51,10 +50,8 @@ static void Conv2d(int iters,
   mace::testing::StartTiming();
   while (iters--) {
     net.RunOp(D);
-    if (iters % iters_to_sync == 0) {
-      net.Sync();
-    }
   }
+  net.Sync();
 }
 
 // In common network, there are usually more than 1 layers, this is used to
@@ -69,8 +66,8 @@ constexpr int kItersToSync = 10;
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;         \
     mace::testing::ItemsProcessed(tot);                                      \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                      \
-    Conv2d<DEVICE, TYPE>(iters, kItersToSync, N, C, H, W, KH, KW, STRIDE,    \
-                         mace::Padding::P, OC);                              \
+    Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \
+                         OC);                                                 \
   }                                                                          \
   BENCHMARK(                                                                 \
       BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
@@ -80,8 +77,18 @@ constexpr int kItersToSync = 10;
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON);               \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
 
-BM_CONV_2D(1, 3, 4032, 3016, 1, 1, 1, VALID, 3, float);  // Test RGB <-> YUV
-BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);  // Test RGB <-> YUV
+// ICNet
+BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float);
+BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
+// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
+BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
+// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
+BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);
+
+// Test RGB <-> YUV
+BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
+BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
+
 BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
 BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
diff --git a/mace/utils/utils.h b/mace/utils/utils.h
index 49484c39a92a5a8bfe5bb22369e5636650e3b75a..536a7fb8805bc136e3b235151bbfb433b6c96836 100644
--- a/mace/utils/utils.h
+++ b/mace/utils/utils.h
@@ -24,6 +24,11 @@ Integer RoundUpDiv4(Integer i) {
   return (i + 3) >> 2;
 }
 
+template <typename Integer>
+Integer RoundUpDiv8(Integer i) {
+  return (i + 7) >> 3;
+}
+
 template <typename Integer>
 Integer CeilQuotient(Integer a, Integer b) {
   return (a + b - 1) / b;
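Note on dispatch geometry: the image-based kernels above read their position as get_global_id(0) = output-channel block, get_global_id(1) = width block, and get_global_id(2) = height * batch, and each work item covers four width positions (w[0..3]), so the per-dimension work sizes are naturally computed with RoundUpDiv4 (and RoundUpDiv8 for the 8-channel blocking of conv_2d_1x1_h8, which is presumably why it is added to mace/utils/utils.h). The sketch below shows roughly how a host side could size and enqueue conv_2d_1x1; it is illustrative only and uses the stock OpenCL C++ wrapper (CL/cl.hpp) rather than MACE's runtime code in conv_2d_opencl_1x1.cc, and the function name and variable names are assumptions.

// Illustrative sketch only: sizes a conv_2d_1x1 launch the way the kernel's
// get_global_id() usage implies. Assumes the stock OpenCL C++ wrapper
// (CL/cl.hpp); MACE's own runtime plumbing is not part of this diff.
#include <CL/cl.hpp>

void LaunchConv2d1x1(cl::CommandQueue &queue,
                     cl::Kernel &kernel,  // compiled from conv_2d_1x1
                     const cl::Image2D &input, const cl::Image2D &filter,
                     const cl::Image2D &bias, cl::Image2D &output,
                     int batch, int height, int width,
                     int in_channels, int out_channels) {
  // RoundUpDiv4: four channels per texel, four width positions per work item.
  const int in_ch_blks = (in_channels + 3) >> 2;
  const int out_ch_blks = (out_channels + 3) >> 2;
  const int width_blks = (width + 3) >> 2;

  // Argument order follows the kernel signature in conv_2d_1x1.cl.
  kernel.setArg(0, input);
  kernel.setArg(1, filter);
  kernel.setArg(2, bias);
  kernel.setArg(3, output);
  kernel.setArg(4, in_ch_blks);
  kernel.setArg(5, width);

  // Grid matches the kernel: (out_ch_blk, out_w_blk, out_hb = h * b).
  queue.enqueueNDRangeKernel(
      kernel, cl::NullRange,
      cl::NDRange(out_ch_blks, width_blks, height * batch),
      cl::NullRange);
}

For conv_2d_1x1_h8 the same shape would apply, but with the channel dimensions blocked by 8 (RoundUpDiv8), since each float4 texel there packs eight half values.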