Commit ef590c19 authored by 刘琦

Merge branch 'conv1x1' into 'master'

Update conv 1x1 OpenCL kernel

See merge request !116
......@@ -7,13 +7,20 @@ package(
licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android")
load("//mace:mace.bzl", "if_android_arm64")
load("//mace:mace.bzl", "if_android", "if_android_arm64")
cc_library(
name = "kernels",
srcs = glob(["*.cc"]) + if_android(glob(["opencl/*.cc"])) + if_android_arm64(glob(["neon/*.cc"])),
hdrs = glob(["*.h"]) + if_android(glob(["opencl/*.h"])) + if_android_arm64(glob(["neon/*.h"])),
srcs = glob(["*.cc"]) + if_android(glob([
"opencl/*.cc",
])) + if_android_arm64(glob([
"neon/*.cc",
])),
hdrs = glob(["*.h"]) + if_android(glob([
"opencl/*.h",
])) + if_android_arm64(glob([
"neon/*.h",
])),
copts = [
"-std=c++11",
"-fopenmp",
......
......@@ -141,7 +141,6 @@ __kernel void conv_2d_1x1_v2(__global const DATA_TYPE *input, /* n, c, h, w */
}
}
// TODO : validation
__kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * cin, cout/4 */
__read_only image2d_t bias, /* cout%4 * cout/4 */
......@@ -155,71 +154,151 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
half4 bias_value = read_imageh(bias, sampler, (int2)(out_w_blk, 1));
half4 out0 = (half4)(bias_value.x, bias_value.x, bias_value.x, bias_value.x);
half4 out1 = (half4)(bias_value.y, bias_value.y, bias_value.y, bias_value.y);
half4 out2 = (half4)(bias_value.z, bias_value.z, bias_value.z, bias_value.z);
half4 out3 = (half4)(bias_value.w, bias_value.w, bias_value.w, bias_value.w);
half4 bias_value = read_imageh(bias, sampler, (int2)(out_ch_blk, 0));
half4 out[4];
out[0] = (half4)(bias_value.x);
out[1] = (half4)(bias_value.y);
out[2] = (half4)(bias_value.z);
out[3] = (half4)(bias_value.w);
int w[4];
w[0] = out_w_blk;
w[1] = w[0] + out_w_blks;
w[2] = w[1] + out_w_blks;
w[3] = w[2] + out_w_blks;
// Unrolling this loop hurt performance
int in_x_base = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
const int in_x0 = in_x_base + out_w_blk;;
const int in_x1 = in_x0 + out_w_blks;
const int in_x2 = in_x1 + out_w_blks;
const int in_x3 = in_x2 + out_w_blks;
in_x_base += width;
const half4 in0 = read_imageh(input, sampler, (int2)(in_x0, out_hb));
const half4 in1 = read_imageh(input, sampler, (int2)(in_x1, out_hb));
const half4 in2 = read_imageh(input, sampler, (int2)(in_x2, out_hb));
const half4 in3 = read_imageh(input, sampler, (int2)(in_x3, out_hb));
half4 in[4];
in[0] = read_imageh(input, sampler, (int2)(in_x_base + w[0], out_hb));
if (w[1] < width) {
// Conditional loads hurt performance; this branching helps sometimes
in[1] = read_imageh(input, sampler, (int2)(in_x_base + w[1], out_hb));
in[2] = read_imageh(input, sampler, (int2)(in_x_base + w[2], out_hb));
in[3] = read_imageh(input, sampler, (int2)(in_x_base + w[3], out_hb));
}
// The order matters: load the input first, then the filter. Why?
const int filter_x0 = in_ch_blk << 2;
const half4 weights0 = read_imageh(filter, sampler, (int2)(filter_x0, out_ch_blk));
const half4 weights1 = read_imageh(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
const half4 weights2 = read_imageh(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
const half4 weights3 = read_imageh(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
half4 weights[4];
#pragma unroll
for (int c = 0; c < 4; ++c) {
weights[c] = read_imageh(filter, sampler, (int2)(filter_x0 + c, out_ch_blk));
}
// Will an L2 prefetch improve performance? How can we prefetch image data?
// Interleaving loads and multiplies does not improve performance as expected
out0 += in0.x * weights0;
out1 += in1.x * weights0;
out2 += in2.x * weights0;
out3 += in3.x * weights0;
out0 += in0.y * weights1;
out1 += in1.y * weights1;
out2 += in2.y * weights1;
out3 += in3.y * weights1;
out0 += in0.z * weights2;
out1 += in1.z * weights2;
out2 += in2.z * weights2;
out3 += in3.z * weights2;
out0 += in0.w * weights3;
out1 += in1.w * weights3;
out2 += in2.w * weights3;
out3 += in3.w * weights3;
#pragma unroll
for (int c = 0; c < 4; ++c) {
out[c] += in[c].x * weights[0];
out[c] += in[c].y * weights[1];
out[c] += in[c].z * weights[2];
out[c] += in[c].w * weights[3];
}
in_x_base += width;
}
const int out_x_offset = out_ch_blk * width;
const int w0 = out_w_blk;
write_imageh(output, (int2)(out_x_offset + w0, out_hb), out0);
const int out_x_base = out_ch_blk * width;
write_imageh(output, (int2)(out_x_base + w[0], out_hb), out[0]);
const int w1 = w0 + out_w_blks;
if (w1 >= width) return;
write_imageh(output, (int2)(out_x_offset + w1, out_hb), out1);
if (w[1] >= width) return;
write_imageh(output, (int2)(out_x_base + w[1], out_hb), out[1]);
const int w2 = w1 + out_w_blks;
if (w2 >= width) return;
write_imageh(output, (int2)(out_x_offset + w2, out_hb), out2);
if (w[2] >= width) return;
write_imageh(output, (int2)(out_x_base + w[2], out_hb), out[2]);
const int w3 = w2 + out_w_blks;
if (w3 >= width) return;
write_imageh(output, (int2)(out_x_offset + w3, out_hb), out3);
if (w[3] >= width) return;
write_imageh(output, (int2)(out_x_base + w[3], out_hb), out[3]);
}
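The kernel above gives each work-item along dimension 1 four output columns spaced out_w_blks apart, and the trailing w[i] >= width checks drop the columns that fall past the edge when width is not a multiple of four. The host-side C++ sketch below simply replays that index arithmetic to show which columns each work-item owns; RoundUpDiv4 mirrors the helper added later in this change, but the loop itself is only an illustration, not the actual enqueue code from this merge request.

// Hypothetical host-side replay of the width blocking used by conv_2d_1x1:
// each work-item owns up to four output columns spaced out_w_blks apart.
#include <cstdio>

template <typename Integer>
Integer RoundUpDiv4(Integer i) {
  return (i + 3) >> 2;
}

int main() {
  const int width = 15;                       // output width in pixels
  const int out_w_blks = RoundUpDiv4(width);  // global size along dim 1, here 4
  for (int out_w_blk = 0; out_w_blk < out_w_blks; ++out_w_blk) {
    // The same four coordinates the kernel stores in w[0..3].
    const int w[4] = {out_w_blk,
                      out_w_blk + out_w_blks,
                      out_w_blk + 2 * out_w_blks,
                      out_w_blk + 3 * out_w_blks};
    for (int i = 0; i < 4; ++i) {
      if (w[i] >= width) break;  // matches the kernel's early-return guards
      std::printf("work-item %d -> output column %d\n", out_w_blk, w[i]);
    }
  }
  return 0;
}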
__kernel void conv_2d_1x1_h8(__read_only image2d_t input, /* [c%8 * w * c/8, h * b] */
__read_only image2d_t filter, /* cout%8 * cin, cout/8 */
__read_only image2d_t bias, /* cout%8 * cout/8 */
__write_only image2d_t output,
__private const int in_ch_blks,
__private const int width) {
const int out_ch_blk = get_global_id(0);
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
float4 bias_value = read_imagef(bias, sampler, (int2)(out_ch_blk, 0));
half4 bias_value03 = as_half4(bias_value.xy);
half4 bias_value47 = as_half4(bias_value.zw);
half4 out[8];
out[0] = (half4)(bias_value03.x);
out[1] = (half4)(bias_value03.y);
out[2] = (half4)(bias_value03.z);
out[3] = (half4)(bias_value03.w);
out[4] = (half4)(bias_value47.x);
out[5] = (half4)(bias_value47.y);
out[6] = (half4)(bias_value47.z);
out[7] = (half4)(bias_value47.w);
int w[4];
w[0] = out_w_blk;
w[1] = w[0] + out_w_blks;
w[2] = w[1] + out_w_blks;
w[3] = w[2] + out_w_blks;
// Unrolling this loop hurt performance
int in_x_base = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
half4 in[8];
#pragma unroll
for (int wi = 0; wi < 4; ++wi) {
float4 in_value = read_imagef(input, sampler, (int2)(in_x_base + w[wi], out_hb));
in[wi << 1] = as_half4(in_value.xy);
in[(wi << 1) + 1] = as_half4(in_value.zw);
}
// The order matters: load the input first, then the filter. Why?
const int filter_x0 = in_ch_blk << 2;
half4 weights[8];
#pragma unroll
for (int wi = 0; wi < 4; ++wi) {
float4 weights_value = read_imagef(filter, sampler, (int2)(filter_x0 + wi, out_ch_blk));
weights[wi << 1] = as_half4(weights_value.xy);
weights[(wi << 1) + 1] = as_half4(weights_value.zw);
}
// Will an L2 prefetch improve performance? How can we prefetch image data?
// Interleaving loads and multiplies does not improve performance as expected
#pragma unroll
for (int wi = 0; wi < 4; ++wi) {
int idx = wi << 1;
out[idx] += in[idx].x * weights[0];
out[idx] += in[idx].y * weights[1];
out[idx] += in[idx].z * weights[2];
out[idx] += in[idx].w * weights[3];
++idx;
out[idx] += in[idx].x * weights[4];
out[idx] += in[idx].y * weights[5];
out[idx] += in[idx].z * weights[6];
out[idx] += in[idx].w * weights[7];
}
in_x_base += width;
}
const int out_x_base = out_ch_blk * width;
float4 out_value = (float4)(as_float2(out[0]), as_float2(out[1]));
write_imagef(output, (int2)(out_x_base + w[0], out_hb), out_value);
if (w[1] >= width) return;
out_value = (float4)(as_float2(out[2]), as_float2(out[3]));
write_imagef(output, (int2)(out_x_base + w[1], out_hb), out_value);
if (w[2] >= width) return;
out_value = (float4)(as_float2(out[4]), as_float2(out[5]));
write_imagef(output, (int2)(out_x_base + w[2], out_hb), out_value);
if (w[3] >= width) return;
out_value = (float4)(as_float2(out[6]), as_float2(out[7]));
write_imagef(output, (int2)(out_x_base + w[3], out_hb), out_value);
}
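The h8 variant relies on bit reinterpretation: one float4 texel read with read_imagef carries sixteen bytes, i.e. eight fp16 values, and as_half4(value.xy) / as_half4(value.zw) split them into two half4 vectors (as_float2 packs them back for the store). The C++ sketch below demonstrates the same reinterpretation on the host, with uint16_t standing in for the raw fp16 bits; it illustrates the packing only and is not code from this merge request.

// Eight 16-bit values share one float4 texel (16 bytes). memcpy plays the
// role of OpenCL's as_half4/as_float2 reinterpret casts on the host side.
#include <cstdint>
#include <cstring>

int main() {
  std::uint16_t halves[8] = {1, 2, 3, 4, 5, 6, 7, 8};  // raw fp16 bit patterns
  float texel[4];                                      // one float4 texel
  std::memcpy(texel, halves, sizeof(texel));

  std::uint16_t lo[4], hi[4];
  std::memcpy(lo, &texel[0], sizeof(lo));  // like as_half4(value.xy)
  std::memcpy(hi, &texel[2], sizeof(hi));  // like as_half4(value.zw)
  return (lo[0] == halves[0] && hi[3] == halves[7]) ? 0 : 1;
}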
......@@ -66,8 +66,8 @@ void Conv1x1V2(const Tensor *input,
void Conv1x1V3(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
const int stride,
Tensor *output) {
const index_t batch = output->dim(0);
const index_t channels = output->dim(1);
const index_t height = output->dim(2);
......
......@@ -13,7 +13,6 @@ namespace mace {
template <DeviceType D, typename T>
static void Conv2d(int iters,
int iters_to_sync,
int batch,
int channels,
int height,
......@@ -51,10 +50,8 @@ static void Conv2d(int iters,
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
if (iters % iters_to_sync == 0) {
net.Sync();
}
}
net.Sync();
}
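With this change the benchmark no longer calls net.Sync() every kItersToSync iterations; it enqueues all iterations and blocks once after the loop, so the whole queued run is covered by a single synchronization. Below is a minimal C++ sketch of that enqueue-then-sync-once timing pattern; run_op and device_sync are hypothetical stand-ins for net.RunOp(D) and net.Sync(), not the mace::testing API.

// Minimal sketch of the "enqueue everything, then sync once" timing pattern.
#include <chrono>
#include <cstdio>
#include <functional>

double TimeIterations(int iters,
                      const std::function<void()> &run_op,
                      const std::function<void()> &device_sync) {
  const auto start = std::chrono::steady_clock::now();
  while (iters--) {
    run_op();      // enqueue work; may return before the device finishes
  }
  device_sync();   // block once so all queued work is included in the timing
  const auto end = std::chrono::steady_clock::now();
  return std::chrono::duration<double>(end - start).count();
}

int main() {
  const double seconds = TimeIterations(10, [] {}, [] {});
  std::printf("elapsed: %f s\n", seconds);
  return 0;
}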
// In a common network there is usually more than one layer; this is used to
......@@ -69,8 +66,8 @@ constexpr int kItersToSync = 10;
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<DEVICE, TYPE>(iters, kItersToSync, N, C, H, W, KH, KW, STRIDE, \
mace::Padding::P, OC); \
Conv2d<DEVICE, TYPE>(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \
OC); \
} \
BENCHMARK( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
......@@ -80,8 +77,18 @@ constexpr int kItersToSync = 10;
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
BM_CONV_2D(1, 3, 4032, 3016, 1, 1, 1, VALID, 3, float); // Test RGB <-> YUV
BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); // Test RGB <-> YUV
// ICNet
BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float);
// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);
// Test RGB <-> YUV
BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);
BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments
BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float);
......
......@@ -24,6 +24,11 @@ Integer RoundUpDiv4(Integer i) {
return (i + 3) >> 2;
}
template <typename Integer>
Integer RoundUpDiv8(Integer i) {
return (i + 7) >> 3;
}
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
return (a + b - 1) / b;
......
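RoundUpDiv4 and RoundUpDiv8 round an integer division by 4 or 8 up to the next whole block, which matches the 4-wide and 8-wide channel packing of the two kernels above. A tiny self-contained check follows; the helper bodies are copied from the diff, while the sample values are illustrative only.

#include <cassert>

template <typename Integer>
Integer RoundUpDiv4(Integer i) {
  return (i + 3) >> 2;
}

template <typename Integer>
Integer RoundUpDiv8(Integer i) {
  return (i + 7) >> 3;
}

int main() {
  assert(RoundUpDiv4(15) == 4);  // 15 pixels need 4 blocks of 4
  assert(RoundUpDiv8(15) == 2);  // 15 channels need 2 blocks of 8
  assert(RoundUpDiv8(16) == 2);  // exact multiples are not rounded up
  return 0;
}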