Commit ef590c19 authored by 刘琦

Merge branch 'conv1x1' into 'master'

Update conv 1x1 opencl kernel

See merge request !116
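For orientation before the diffs: a 1x1 convolution is a per-pixel matrix multiply across channels. The following scalar reference is an illustrative C++ sketch, not project code (single batch, NCHW layout as in the conv_2d_1x1_v2 comment); it is the computation the OpenCL kernels below vectorize and tile.

#include <vector>

// Scalar reference: output[oc][p] = bias[oc] + sum_ic input[ic][p] * filter[oc][ic]
void Conv1x1Reference(const std::vector<float> &input,   // in_ch * height * width
                      const std::vector<float> &filter,  // out_ch * in_ch
                      const std::vector<float> &bias,    // out_ch
                      std::vector<float> *output,        // out_ch * height * width
                      int in_ch, int out_ch, int height, int width) {
  const int pixels = height * width;
  for (int oc = 0; oc < out_ch; ++oc) {
    for (int p = 0; p < pixels; ++p) {
      float acc = bias[oc];
      for (int ic = 0; ic < in_ch; ++ic) {
        acc += input[ic * pixels + p] * filter[oc * in_ch + ic];
      }
      (*output)[oc * pixels + p] = acc;
    }
  }
}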
@@ -7,13 +7,20 @@ package(
 licenses(["notice"])  # Apache 2.0
 
-load("//mace:mace.bzl", "if_android")
-load("//mace:mace.bzl", "if_android_arm64")
+load("//mace:mace.bzl", "if_android", "if_android_arm64")
 
 cc_library(
     name = "kernels",
-    srcs = glob(["*.cc"]) + if_android(glob(["opencl/*.cc"])) + if_android_arm64(glob(["neon/*.cc"])),
-    hdrs = glob(["*.h"]) + if_android(glob(["opencl/*.h"])) + if_android_arm64(glob(["neon/*.h"])),
+    srcs = glob(["*.cc"]) + if_android(glob([
+        "opencl/*.cc",
+    ])) + if_android_arm64(glob([
+        "neon/*.cc",
+    ])),
+    hdrs = glob(["*.h"]) + if_android(glob([
+        "opencl/*.h",
+    ])) + if_android_arm64(glob([
+        "neon/*.h",
+    ])),
     copts = [
         "-std=c++11",
         "-fopenmp",
@@ -141,7 +141,6 @@ __kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
   }
 }
 
-// TODO : validation
 __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
                           __read_only image2d_t filter, /* cout%4 * cin, cout/4 */
                           __read_only image2d_t bias, /* cout%4 * cout/4 */
@@ -155,71 +154,151 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 
-  half4 bias_value = read_imageh(bias, sampler, (int2)(out_w_blk, 1));
-  half4 out0 = (half4)(bias_value.x, bias_value.x, bias_value.x, bias_value.x);
-  half4 out1 = (half4)(bias_value.y, bias_value.y, bias_value.y, bias_value.y);
-  half4 out2 = (half4)(bias_value.z, bias_value.z, bias_value.z, bias_value.z);
-  half4 out3 = (half4)(bias_value.w, bias_value.w, bias_value.w, bias_value.w);
+  half4 bias_value = read_imageh(bias, sampler, (int2)(out_ch_blk, 0));
+  half4 out[4];
+  // Each accumulator covers one width position; all four start from the same
+  // four per-output-channel biases.
+  out[0] = bias_value;
+  out[1] = bias_value;
+  out[2] = bias_value;
+  out[3] = bias_value;
+
+  int w[4];
+  w[0] = out_w_blk;
+  w[1] = w[0] + out_w_blks;
+  w[2] = w[1] + out_w_blks;
+  w[3] = w[2] + out_w_blks;
 
   // Unrolling this loop hurts performance
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    const int in_x0 = in_x_base + out_w_blk;;
-    const int in_x1 = in_x0 + out_w_blks;
-    const int in_x2 = in_x1 + out_w_blks;
-    const int in_x3 = in_x2 + out_w_blks;
-    in_x_base += width;
-
-    const half4 in0 = read_imageh(input, sampler, (int2)(in_x0, out_hb));
-    const half4 in1 = read_imageh(input, sampler, (int2)(in_x1, out_hb));
-    const half4 in2 = read_imageh(input, sampler, (int2)(in_x2, out_hb));
-    const half4 in3 = read_imageh(input, sampler, (int2)(in_x3, out_hb));
+    half4 in[4];
+    in[0] = read_imageh(input, sampler, (int2)(in_x_base + w[0], out_hb));
+    if (w[1] < width) {
+      // A per-element conditional load hurts performance; this single branch helps sometimes
+      in[1] = read_imageh(input, sampler, (int2)(in_x_base + w[1], out_hb));
+      in[2] = read_imageh(input, sampler, (int2)(in_x_base + w[2], out_hb));
+      in[3] = read_imageh(input, sampler, (int2)(in_x_base + w[3], out_hb));
+    }
 
     // The order matters: load input first, then load filter. Why?
     const int filter_x0 = in_ch_blk << 2;
-    const half4 weights0 = read_imageh(filter, sampler, (int2)(filter_x0, out_ch_blk));
-    const half4 weights1 = read_imageh(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
-    const half4 weights2 = read_imageh(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
-    const half4 weights3 = read_imageh(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
+    half4 weights[4];
+#pragma unroll
+    for (int c = 0; c < 4; ++c) {
+      weights[c] = read_imageh(filter, sampler, (int2)(filter_x0 + c, out_ch_blk));
+    }
 
     // Will prefetching into L2 improve performance? How to prefetch image data?
     // Interleaving load and mul does not improve performance as expected
-    out0 += in0.x * weights0;
-    out1 += in1.x * weights0;
-    out2 += in2.x * weights0;
-    out3 += in3.x * weights0;
-
-    out0 += in0.y * weights1;
-    out1 += in1.y * weights1;
-    out2 += in2.y * weights1;
-    out3 += in3.y * weights1;
-
-    out0 += in0.z * weights2;
-    out1 += in1.z * weights2;
-    out2 += in2.z * weights2;
-    out3 += in3.z * weights2;
-
-    out0 += in0.w * weights3;
-    out1 += in1.w * weights3;
-    out2 += in2.w * weights3;
-    out3 += in3.w * weights3;
+#pragma unroll
+    for (int c = 0; c < 4; ++c) {
+      out[c] += in[c].x * weights[0];
+      out[c] += in[c].y * weights[1];
+      out[c] += in[c].z * weights[2];
+      out[c] += in[c].w * weights[3];
+    }
+
+    in_x_base += width;
   }
 
-  const int out_x_offset = out_ch_blk * width;
-  const int w0 = out_w_blk;
-  write_imageh(output, (int2)(out_x_offset + w0, out_hb), out0);
-
-  const int w1 = w0 + out_w_blks;
-  if (w1 >= width) return;
-  write_imageh(output, (int2)(out_x_offset + w1, out_hb), out1);
-
-  const int w2 = w1 + out_w_blks;
-  if (w2 >= width) return;
-  write_imageh(output, (int2)(out_x_offset + w2, out_hb), out2);
-
-  const int w3 = w2 + out_w_blks;
-  if (w3 >= width) return;
-  write_imageh(output, (int2)(out_x_offset + w3, out_hb), out3);
+  const int out_x_base = out_ch_blk * width;
+  write_imageh(output, (int2)(out_x_base + w[0], out_hb), out[0]);
+
+  if (w[1] >= width) return;
+  write_imageh(output, (int2)(out_x_base + w[1], out_hb), out[1]);
+
+  if (w[2] >= width) return;
+  write_imageh(output, (int2)(out_x_base + w[2], out_hb), out[2]);
+
+  if (w[3] >= width) return;
+  write_imageh(output, (int2)(out_x_base + w[3], out_hb), out[3]);
 }
+
+__kernel void conv_2d_1x1_h8(__read_only image2d_t input, /* [c%8 * w * c/8, h * b] */
+                             __read_only image2d_t filter, /* cout%8 * cin, cout/8 */
+                             __read_only image2d_t bias, /* cout%8 * cout/8 */
+                             __write_only image2d_t output,
+                             __private const int in_ch_blks,
+                             __private const int width) {
+  const int out_ch_blk = get_global_id(0);
+  const int out_w_blk = get_global_id(1);
+  const int out_w_blks = get_global_size(1);
+  const int out_hb = get_global_id(2);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  // Eight half-precision channels travel in one float4 pixel; as_half4
+  // reinterprets each 64-bit half of it as four halves.
+  float4 bias_value = read_imagef(bias, sampler, (int2)(out_ch_blk, 0));
+  half4 bias_value03 = as_half4(bias_value.xy);
+  half4 bias_value47 = as_half4(bias_value.zw);
+  half4 out[8];
+  out[0] = (half4)(bias_value03.x);
+  out[1] = (half4)(bias_value03.y);
+  out[2] = (half4)(bias_value03.z);
+  out[3] = (half4)(bias_value03.w);
+  out[4] = (half4)(bias_value47.x);
+  out[5] = (half4)(bias_value47.y);
+  out[6] = (half4)(bias_value47.z);
+  out[7] = (half4)(bias_value47.w);
+
+  int w[4];
+  w[0] = out_w_blk;
+  w[1] = w[0] + out_w_blks;
+  w[2] = w[1] + out_w_blks;
+  w[3] = w[2] + out_w_blks;
+
+  // Unrolling this loop hurts performance
+  int in_x_base = 0;
+  for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
+    half4 in[8];
+#pragma unroll
+    for (int wi = 0; wi < 4; ++wi) {
+      // w[wi], not w[0]: each iteration loads its own width position
+      float4 in_value = read_imagef(input, sampler, (int2)(in_x_base + w[wi], out_hb));
+      in[wi << 1] = as_half4(in_value.xy);
+      in[(wi << 1) + 1] = as_half4(in_value.zw);  // parentheses needed: '+' binds tighter than '<<'
+    }
+
+    // The order matters: load input first, then load filter. Why?
+    const int filter_x0 = in_ch_blk << 2;
+    half4 weights[8];
+#pragma unroll
+    for (int wi = 0; wi < 4; ++wi) {
+      float4 weights_value = read_imagef(filter, sampler, (int2)(filter_x0 + wi, out_ch_blk));
+      weights[wi << 1] = as_half4(weights_value.xy);
+      weights[(wi << 1) + 1] = as_half4(weights_value.zw);
+    }
+
+    // Will prefetching into L2 improve performance? How to prefetch image data?
+    // Interleaving load and mul does not improve performance as expected
+#pragma unroll
+    for (int wi = 0; wi < 4; ++wi) {
+      int idx = wi << 1;
+      out[idx] += in[idx].x * weights[0];
+      out[idx] += in[idx].y * weights[1];
+      out[idx] += in[idx].z * weights[2];
+      out[idx] += in[idx].w * weights[3];
+      ++idx;
+      out[idx] += in[idx].x * weights[4];
+      out[idx] += in[idx].y * weights[5];
+      out[idx] += in[idx].z * weights[6];
+      out[idx] += in[idx].w * weights[7];
+    }
+
+    in_x_base += width;
+  }
+
+  const int out_x_base = out_ch_blk * width;
+  float4 out_value = (float4)(as_float2(out[0]), as_float2(out[1]));
+  write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value);
+
+  if (w[1] >= width) return;
+  out_value = (float4)(as_float2(out[2]), as_float2(out[3]));
+  write_imagef(output, (int2)(out_x_base + w[1], out_hb), out_value);  // w[1], not w[0]
+
+  if (w[2] >= width) return;
+  out_value = (float4)(as_float2(out[4]), as_float2(out[5]));
+  write_imagef(output, (int2)(out_x_base + w[2], out_hb), out_value);
+
+  if (w[3] >= width) return;
+  out_value = (float4)(as_float2(out[6]), as_float2(out[7]));
+  write_imagef(output, (int2)(out_x_base + w[3], out_hb), out_value);
+}
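To make the image layouts in the kernel comments concrete, here is a hedged host-side sketch (assumed code; the project's actual buffer-to-image conversion is not part of this diff) of how a tensor element (n, c, h, w) lands in the [c%4 * w * c/4, h * b] image that conv_2d_1x1 reads. It matches the kernel's in_x_base += width stepping and its out_hb coordinate.

struct ImagePos {
  int x;     // image column
  int y;     // image row
  int lane;  // component (.x/.y/.z/.w) within the half4 pixel
};

// Four consecutive channels share one pixel; channel blocks tile along x,
// and (batch, height) rows tile along y. The h8 variant packs eight
// channels per pixel, so it would divide by 8 instead.
ImagePos MapNCHWToImage(int n, int c, int h, int w, int width, int height) {
  return ImagePos{(c / 4) * width + w, n * height + h, c % 4};
}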
@@ -66,8 +66,8 @@ void Conv1x1V2(const Tensor *input,
 void Conv1x1V3(const Tensor *input,
                const Tensor *filter,
                const Tensor *bias,
-               const int stride,
                Tensor *output) {
   const index_t batch = output->dim(0);
   const index_t channels = output->dim(1);
   const index_t height = output->dim(2);
@@ -13,7 +13,6 @@ namespace mace {
 template <DeviceType D, typename T>
 static void Conv2d(int iters,
-                   int iters_to_sync,
                    int batch,
                    int channels,
                    int height,
@@ -51,10 +50,8 @@ static void Conv2d(int iters,
   mace::testing::StartTiming();
   while (iters--) {
     net.RunOp(D);
-    if (iters % iters_to_sync == 0) {
-      net.Sync();
-    }
   }
+  net.Sync();
 }
 
 // In a common network there are usually multiple layers; this is used to
@@ -69,8 +66,8 @@ constexpr int kItersToSync = 10;
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
     mace::testing::ItemsProcessed(tot);                                       \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
-    Conv2d<DEVICE, TYPE>(iters, kItersToSync, N, C, H, W, KH, KW, STRIDE,     \
-                         mace::Padding::P, OC);                               \
+    Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \
+                         OC);                                                 \
   }                                                                           \
   BENCHMARK(                                                                  \
       BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
@@ -80,8 +77,18 @@ constexpr int kItersToSync = 10;
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON);                 \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
 
-BM_CONV_2D(1, 3, 4032, 3016, 1, 1, 1, VALID, 3, float);  // Test RGB <-> YUV
-BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);  // Test RGB <-> YUV
+// ICNet
+BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float);
+BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
+// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
+BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
+// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
+BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);
+// Test RGB <-> YUV
+BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
+BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
 BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
 BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float);  // Test bad alignments
 BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
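Moving net.Sync() out of the timing loop means the benchmark now enqueues all iterations asynchronously and blocks once at the end, which presumably measures sustained throughput rather than per-iteration stall time. For reference, one registration expands roughly as follows; this is a sketch based on the BENCHMARK name pattern above, and the DeviceType spelling and surrounding plumbing are assumptions.

// Illustrative expansion of BM_CONV_2D_MACRO(1, 64, 32, 32, 1, 1, 1, VALID, 128, float, OPENCL):
static void BM_CONV_2D_1_64_32_32_K1x1S1_VALID_128_float_OPENCL(int iters) {
  const int64_t tot = static_cast<int64_t>(iters) * 1 * 64 * 32 * 32;
  mace::testing::ItemsProcessed(tot);
  mace::testing::BytesProcessed(tot * (sizeof(float)));
  // iters_to_sync is gone from the argument list after this change.
  Conv2d<DeviceType::OPENCL, float>(iters, 1, 64, 32, 32, 1, 1, 1,
                                    mace::Padding::VALID, 128);
}
BENCHMARK(BM_CONV_2D_1_64_32_32_K1x1S1_VALID_128_float_OPENCL);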
@@ -24,6 +24,11 @@ Integer RoundUpDiv4(Integer i) {
   return (i + 3) >> 2;
 }
 
+template <typename Integer>
+Integer RoundUpDiv8(Integer i) {
+  return (i + 7) >> 3;
+}
+
 template <typename Integer>
 Integer CeilQuotient(Integer a, Integer b) {
   return (a + b - 1) / b;
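RoundUpDiv8 mirrors RoundUpDiv4 for the h8 kernel, which packs eight output channels per pixel. A hedged sketch (assumed host code, names illustrative) of how these helpers would size the 3-D NDRange to match the kernels' get_global_id usage:

#include <cstdint>

// gws[0]: one work item per block of 4 output channels (out_ch_blk);
// gws[1]: one work item per block of 4 width positions (the kernel strides
//         by get_global_size(1) to cover w[0..3]);
// gws[2]: one row per (batch, height) pair (out_hb).
inline void FillConv1x1GlobalWorkSize(int channels, int width, int height,
                                      int batch, uint32_t gws[3]) {
  gws[0] = static_cast<uint32_t>(RoundUpDiv4(channels));
  gws[1] = static_cast<uint32_t>(RoundUpDiv4(width));
  gws[2] = static_cast<uint32_t>(height * batch);
}
// conv_2d_1x1_h8 would use RoundUpDiv8(channels) for gws[0] instead.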