提交 1374a365 编写于 作者: L liuqi

Optimize opencl conv 3x3 kernel.

上级 fd284f6a
......@@ -13,6 +13,7 @@ namespace {
static cl_channel_type DataTypeToCLChannelType(const DataType t) {
switch (t) {
case DT_HALF:
return CL_HALF_FLOAT;
case DT_FLOAT:
return CL_FLOAT;
case DT_INT8:
......
......@@ -8,7 +8,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__write_only image2d_t output,
__private const int in_height,
__private const int in_width,
__private const int in_channels,
__private const int in_ch_blks,
__private const int out_height,
__private const int out_width,
__private const int padding_top,
......@@ -17,25 +17,26 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int out_w_blk = get_global_id(1);
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);
const int in_ch_blks = (in_channels + 3) / 4;
const int rounded_in_ch = in_ch_blks * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
VEC_DATA_TYPE(DATA_TYPE, 4) out[4] = {0};
VEC_DATA_TYPE(DATA_TYPE, 4) out[5] = {0};
#ifdef BIAS
out[0] =
CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0));
out[1] = out[0];
out[2] = out[0];
out[3] = out[0];
out[4] = out[0];
#endif
int w[4];
int w[5];
w[0] = out_w_blk - padding_left;
w[1] = w[0] + out_w_blks;
w[2] = w[1] + out_w_blks;
w[3] = w[2] + out_w_blks;
w[4] = w[3] + out_w_blks;
const int batch_idx = out_hb / out_height;
const int height_idx = out_hb % out_height;
......@@ -50,64 +51,83 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int input_image_width = in_ch_blks * in_width;
VEC_DATA_TYPE(DATA_TYPE, 4) in[5];
VEC_DATA_TYPE(DATA_TYPE, 4) weights[4];
int in_idx, hb_idx, width_idx, in_width_idx;
// Unrolling this loop hurt perfmance
int idx = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
VEC_DATA_TYPE(DATA_TYPE, 4) in[36];
VEC_DATA_TYPE(DATA_TYPE, 4) weights[36];
int filter_idx = in_ch_blk << 2;
int in_idx = in_ch_blk * in_width;
#pragma unroll
for (int i = 0; i < 3; ++i) {
for (int j = 0; j < 3; ++j) {
idx = i * 12 + j * 4;
int in_width_idx = w[0] + j;
// Judge the width border for padding input.
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 0] = 0;
} else {
in[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
}
in_width_idx = w[1] + j;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 1] = 0;
} else {
in[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
}
in_width_idx = w[2] + j;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 2] = 0;
} else {
in[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
}
in_width_idx = w[3] + j;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[idx + 3] = 0;
} else {
in[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
}
weights[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
filter_idx += rounded_in_ch;
for (int i = 0; i < 9; ++i) {
in_idx = in_ch_blk * in_width;
hb_idx = i / 3;
width_idx = i % 3;
in_width_idx = w[0] + width_idx;
// Judge the width border for padding input.
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[0] = 0;
} else {
in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
}
// Will prefetch L2 improve performance? How to pretch image data?
// Interleaving load and mul does not improve performance as expected
#pragma unroll
for (int c = 0; c < 4; ++c) {
for (int i = 0; i < 9; ++i) {
out[c] += in[c + i * 4].x * weights[0 + i * 4];
out[c] += in[c + i * 4].y * weights[1 + i * 4];
out[c] += in[c + i * 4].z * weights[2 + i * 4];
out[c] += in[c + i * 4].w * weights[3 + i * 4];
in_width_idx = w[1] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[1] = 0;
} else {
in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w[2] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[2] = 0;
} else {
in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w[3] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[3] = 0;
} else {
in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
in_width_idx = w[4] + width_idx;
if (in_width_idx < 0 || in_width_idx >= in_width) {
in[4] = 0;
} else {
in[4] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
}
int filter_idx = (in_ch_blk << 2) + i * rounded_in_ch;
weights[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
// Will prefetch L2 improve performance? How to pretch image data?
// Interleaving load and mul does not improve performance as expected
out[0] += in[0].x * weights[0];
out[0] += in[0].y * weights[1];
out[0] += in[0].z * weights[2];
out[0] += in[0].w * weights[3];
out[1] += in[1].x * weights[0];
out[1] += in[1].y * weights[1];
out[1] += in[1].z * weights[2];
out[1] += in[1].w * weights[3];
out[2] += in[2].x * weights[0];
out[2] += in[2].y * weights[1];
out[2] += in[2].z * weights[2];
out[2] += in[2].w * weights[3];
out[3] += in[3].x * weights[0];
out[3] += in[3].y * weights[1];
out[3] += in[3].z * weights[2];
out[3] += in[3].w * weights[3];
out[4] += in[4].x * weights[0];
out[4] += in[4].y * weights[1];
out[4] += in[4].z * weights[2];
out[4] += in[4].w * weights[3];
}
}
......@@ -133,4 +153,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
(int2)(out_x_base + w[3], out_hb),
out[3]);
w[4] += padding_left;
if (w[4] >= out_width) return;
CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
(int2)(out_x_base + w[4], out_hb),
out[4]);
}
......@@ -22,7 +22,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
const index_t channel_blocks = RoundUpDiv4(channels);
const index_t input_channel_blocks = RoundUpDiv4(input_channels);
const index_t width_blocks = RoundUpDiv4(width);
const index_t width_blocks = RoundUpDiv<index_t, 5>(width);
std::set<std::string> built_options;
built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
......@@ -44,7 +44,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3)));
conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
conv_2d_kernel.setArg(idx++, padding[0] / 2);
......@@ -56,7 +56,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)),
cl::NDRange(4, 15, 8),
cl::NDRange(16, 16, 4),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
......
......@@ -27,10 +27,10 @@ static void Conv2d(int iters,
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, float>("Filter",
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
net.AddRandomInput<D, T>("Filter",
{kernel_h, kernel_w, channels, output_channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
......@@ -88,6 +88,7 @@ static void Conv2d(int iters,
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
// ICNet
......@@ -99,7 +100,7 @@ BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, VALID, 32, float);
BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, float);
// Test RGB <-> YUV
BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
......
......@@ -84,18 +84,18 @@ TEST_F(Conv2dOpTest, NEONSimple) {
TestSimple3x3SAME<DeviceType::NEON>();
}
template<DeviceType D>
template<DeviceType D, typename T>
void TestNHWCSimple3x3VALID() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
......@@ -130,23 +130,23 @@ void TestNHWCSimple3x3VALID() {
net.RunOp(D);
}
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
auto expected = CreateTensor<T>({1, 1, 1, 1}, {18.1f});
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.001);
}
template<DeviceType D>
template<DeviceType D, typename T>
void TestNHWCSimple3x3SAME() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
net.AddInputFromArray<D, T>(
"Input", {1, 3, 3, 2},
{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
net.AddInputFromArray<D, float>(
net.AddInputFromArray<D, T>(
"Filter", {3, 3, 2, 1},
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {0.1f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) {
BufferToImage<D>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
......@@ -181,21 +181,21 @@ void TestNHWCSimple3x3SAME() {
net.RunOp(D);
}
auto expected = CreateTensor<float>(
auto expected = CreateTensor<T>(
{1, 3, 3, 1},
{8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
ExpectTensorNear<T>(*expected, *net.GetOutput("Output"), 0.001);
}
TEST_F(Conv2dOpTest, CPUSimple) {
TestNHWCSimple3x3VALID<DeviceType::CPU>();
TestNHWCSimple3x3SAME<DeviceType::CPU>();
TestNHWCSimple3x3VALID<DeviceType::CPU, float>();
TestNHWCSimple3x3SAME<DeviceType::CPU, float>();
}
TEST_F(Conv2dOpTest, OPENCLSimple) {
TestNHWCSimple3x3VALID<DeviceType::OPENCL>();
TestNHWCSimple3x3SAME<DeviceType::OPENCL>();
TestNHWCSimple3x3VALID<DeviceType::OPENCL, float>();
TestNHWCSimple3x3SAME<DeviceType::OPENCL, float>();
}
template<DeviceType D>
......@@ -457,11 +457,11 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
srand(time(NULL));
// generate random input
index_t batch = 3 + rand() % 10;
index_t batch = 1;
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2] + rand() % 10;
index_t output_channels = shape[3] + rand() % 10;
index_t input_channels = shape[2];
index_t output_channels = shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
......@@ -509,6 +509,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
for (int kernel_size : {3}) {
for (int stride : {1}) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册