diff --git a/mace/core/opencl_allocator.cc b/mace/core/opencl_allocator.cc
index 3b393542281266a4564767e732ea703c4371e738..75004f75276a54c47f82626eefa818dcef3941da 100644
--- a/mace/core/opencl_allocator.cc
+++ b/mace/core/opencl_allocator.cc
@@ -13,6 +13,7 @@ namespace {
 static cl_channel_type DataTypeToCLChannelType(const DataType t) {
   switch (t) {
     case DT_HALF:
+      return CL_HALF_FLOAT;
     case DT_FLOAT:
       return CL_FLOAT;
     case DT_INT8:
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 33d7305b6e8ebb77d97071616fa5dfa9eb7c3a5d..ef76aa79c56f310f77eda05a0a97c3091d9faba2 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -8,7 +8,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __write_only image2d_t output,
                           __private const int in_height,
                           __private const int in_width,
-                          __private const int in_channels,
+                          __private const int in_ch_blks,
                           __private const int out_height,
                           __private const int out_width,
                           __private const int padding_top,
@@ -17,25 +17,26 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int out_w_blk = get_global_id(1);
   const int out_w_blks = get_global_size(1);
   const int out_hb = get_global_id(2);
-  const int in_ch_blks = (in_channels + 3) / 4;
   const int rounded_in_ch = in_ch_blks * 4;

   const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

-  VEC_DATA_TYPE(DATA_TYPE, 4) out[4] = {0};
+  VEC_DATA_TYPE(DATA_TYPE, 4) out[5] = {0};
 #ifdef BIAS
   out[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0));
   out[1] = out[0];
   out[2] = out[0];
   out[3] = out[0];
+  out[4] = out[0];
 #endif

-  int w[4];
+  int w[5];
   w[0] = out_w_blk - padding_left;
   w[1] = w[0] + out_w_blks;
   w[2] = w[1] + out_w_blks;
   w[3] = w[2] + out_w_blks;
+  w[4] = w[3] + out_w_blks;

   const int batch_idx = out_hb / out_height;
   const int height_idx = out_hb % out_height;
@@ -50,64 +51,83 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int input_image_width = in_ch_blks * in_width;

+  VEC_DATA_TYPE(DATA_TYPE, 4) in[5];
+  VEC_DATA_TYPE(DATA_TYPE, 4) weights[4];
+  int in_idx, hb_idx, width_idx, in_width_idx;
   // Unrolling this loop hurt perfmance
-  int idx = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    VEC_DATA_TYPE(DATA_TYPE, 4) in[36];
-    VEC_DATA_TYPE(DATA_TYPE, 4) weights[36];
-
-    int filter_idx = in_ch_blk << 2;
-    int in_idx = in_ch_blk * in_width;
-
-    #pragma unroll
-    for (int i = 0; i < 3; ++i) {
-      for (int j = 0; j < 3; ++j) {
-        idx = i * 12 + j * 4;
-        int in_width_idx = w[0] + j;
-        // Judge the width border for padding input.
-        if (in_width_idx < 0 || in_width_idx >= in_width) {
-          in[idx + 0] = 0;
-        } else {
-          in[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
-        }
-        in_width_idx = w[1] + j;
-        if (in_width_idx < 0 || in_width_idx >= in_width) {
-          in[idx + 1] = 0;
-        } else {
-          in[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
-        }
-        in_width_idx = w[2] + j;
-        if (in_width_idx < 0 || in_width_idx >= in_width) {
-          in[idx + 2] = 0;
-        } else {
-          in[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
-        }
-        in_width_idx = w[3] + j;
-        if (in_width_idx < 0 || in_width_idx >= in_width) {
-          in[idx + 3] = 0;
-        } else {
-          in[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[i]));
-        }
-
-        weights[idx + 0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
-        weights[idx + 1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
-        weights[idx + 2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
-        weights[idx + 3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
-
-        filter_idx += rounded_in_ch;
+    for (int i = 0; i < 9; ++i) {
+
+      in_idx = in_ch_blk * in_width;
+
+      hb_idx = i / 3;
+      width_idx = i % 3;
+      in_width_idx = w[0] + width_idx;
+      // Judge the width border for padding input.
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[0] = 0;
+      } else {
+        in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
       }
-    }
-    // Will prefetch L2 improve performance? How to pretch image data?
-
-    // Interleaving load and mul does not improve performance as expected
-    #pragma unroll
-    for (int c = 0; c < 4; ++c) {
-      for (int i = 0; i < 9; ++i) {
-        out[c] += in[c + i * 4].x * weights[0 + i * 4];
-        out[c] += in[c + i * 4].y * weights[1 + i * 4];
-        out[c] += in[c + i * 4].z * weights[2 + i * 4];
-        out[c] += in[c + i * 4].w * weights[3 + i * 4];
+      in_width_idx = w[1] + width_idx;
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[1] = 0;
+      } else {
+        in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+      }
+      in_width_idx = w[2] + width_idx;
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[2] = 0;
+      } else {
+        in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+      }
+      in_width_idx = w[3] + width_idx;
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[3] = 0;
+      } else {
+        in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
+      }
+      in_width_idx = w[4] + width_idx;
+      if (in_width_idx < 0 || in_width_idx >= in_width) {
+        in[4] = 0;
+      } else {
+        in[4] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx]));
       }
+
+
+      int filter_idx = (in_ch_blk << 2) + i * rounded_in_ch;
+      weights[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
+      weights[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
+      weights[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
+      weights[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
+
+      // Will prefetching into L2 improve performance? How to prefetch image data?
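+      // Accumulate over the 4 input channels of this block: weights[k] holds the
+      // filter values of input channel k for the 4 output channels, and each of
+      // the 5 width columns out[0..4] is updated with its own input pixel in[0..4].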
+
+      // Interleaving load and mul does not improve performance as expected
+      out[0] += in[0].x * weights[0];
+      out[0] += in[0].y * weights[1];
+      out[0] += in[0].z * weights[2];
+      out[0] += in[0].w * weights[3];
+
+      out[1] += in[1].x * weights[0];
+      out[1] += in[1].y * weights[1];
+      out[1] += in[1].z * weights[2];
+      out[1] += in[1].w * weights[3];
+
+      out[2] += in[2].x * weights[0];
+      out[2] += in[2].y * weights[1];
+      out[2] += in[2].z * weights[2];
+      out[2] += in[2].w * weights[3];
+
+      out[3] += in[3].x * weights[0];
+      out[3] += in[3].y * weights[1];
+      out[3] += in[3].z * weights[2];
+      out[3] += in[3].w * weights[3];
+
+      out[4] += in[4].x * weights[0];
+      out[4] += in[4].y * weights[1];
+      out[4] += in[4].z * weights[2];
+      out[4] += in[4].w * weights[3];
     }
   }
@@ -133,4 +153,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
                                        (int2)(out_x_base + w[3], out_hb),
                                        out[3]);
+
+  w[4] += padding_left;
+  if (w[4] >= out_width) return;
+  CMD_TYPE(write_image, CMD_DATA_TYPE)(output,
+                                       (int2)(out_x_base + w[4], out_hb),
+                                       out[4]);
 }
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index b7e11e817922287a9b048ed9299c5d332f3ef0cf..5836caa8842cfc1749b2ea78ca6d55fcf11e2e3b 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -22,7 +22,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
   const index_t channel_blocks = RoundUpDiv4(channels);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
-  const index_t width_blocks = RoundUpDiv4(width);
+  const index_t width_blocks = RoundUpDiv(width);

   std::set<std::string> built_options;
   built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
@@ -44,7 +44,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
   conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer())));
   conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(1)));
   conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(2)));
-  conv_2d_kernel.setArg(idx++, static_cast<int>(input->dim(3)));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
   conv_2d_kernel.setArg(idx++, static_cast<int>(height));
   conv_2d_kernel.setArg(idx++, static_cast<int>(width));
   conv_2d_kernel.setArg(idx++, padding[0] / 2);
@@ -56,7 +56,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
       conv_2d_kernel, cl::NullRange,
       cl::NDRange(static_cast(channel_blocks), static_cast(width_blocks),
                   static_cast(height * batch)),
-      cl::NDRange(4, 15, 8),
+      cl::NDRange(16, 16, 4),
       NULL, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS, error);
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 24211ca1832921c89828b6ec00f45e33a152b77c..2409013ec322966fe2ffc4c682bc1bcbd17da19b 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -27,10 +27,10 @@ static void Conv2d(int iters,
   OpsTestNet net;

   // Add input data
-  net.AddRandomInput("Input", {batch, height, width, channels});
-  net.AddRandomInput("Filter",
+  net.AddRandomInput("Input", {batch, height, width, channels});
+  net.AddRandomInput("Filter",
                      {kernel_h, kernel_w, channels, output_channels});
-  net.AddRandomInput("Bias", {output_channels});
+  net.AddRandomInput("Bias", {output_channels});

   if (D == DeviceType::OPENCL) {
     BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
@@ -88,6 +88,7 @@ static void Conv2d(int iters,
      BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)

 #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
+  BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);

 // ICNet
@@ -99,7 +100,7 @@ BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float);
 BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float);

 // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
-BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, VALID, 32, float);
+BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, float);

 // Test RGB <-> YUV
 BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index 6120f403b31af34c5689fdd2664ede5924edd826..864c882a3eca7b6c1e2a7355830af8744699415a 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -84,18 +84,18 @@ TEST_F(Conv2dOpTest, NEONSimple) {
   TestSimple3x3SAME();
 }

-template
+template
 void TestNHWCSimple3x3VALID() {
   OpsTestNet net;
   // Add input data
-  net.AddInputFromArray(
+  net.AddInputFromArray(
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray(
+  net.AddInputFromArray(
       "Filter", {3, 3, 2, 1},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray("Bias", {1}, {0.1f});
+  net.AddInputFromArray("Bias", {1}, {0.1f});

   if (D == DeviceType::OPENCL) {
     BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
@@ -130,23 +130,23 @@ void TestNHWCSimple3x3VALID() {
     net.RunOp(D);
   }

-  auto expected = CreateTensor({1, 1, 1, 1}, {18.1f});
-  ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001);
+  auto expected = CreateTensor({1, 1, 1, 1}, {18.1f});
+  ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001);
 }

-template
+template
 void TestNHWCSimple3x3SAME() {
   OpsTestNet net;

   // Add input data
-  net.AddInputFromArray(
+  net.AddInputFromArray(
       "Input", {1, 3, 3, 2},
       {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
-  net.AddInputFromArray(
+  net.AddInputFromArray(
       "Filter", {3, 3, 2, 1},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
        1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
-  net.AddInputFromArray("Bias", {1}, {0.1f});
+  net.AddInputFromArray("Bias", {1}, {0.1f});

   if (D == DeviceType::OPENCL) {
     BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
@@ -181,21 +181,21 @@ void TestNHWCSimple3x3SAME() {
     net.RunOp(D);
   }

-  auto expected = CreateTensor(
+  auto expected = CreateTensor(
       {1, 3, 3, 1},
       {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f});
-  ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001);
 }

 TEST_F(Conv2dOpTest, CPUSimple) {
-  TestNHWCSimple3x3VALID();
-  TestNHWCSimple3x3SAME();
+  TestNHWCSimple3x3VALID();
+  TestNHWCSimple3x3SAME();
 }

 TEST_F(Conv2dOpTest, OPENCLSimple) {
-  TestNHWCSimple3x3VALID();
-  TestNHWCSimple3x3SAME();
+  TestNHWCSimple3x3VALID();
+  TestNHWCSimple3x3SAME();
 }

 template
@@ -457,11 +457,11 @@ static void TestComplexConvNxNS12(const std::vector &shape) {
   srand(time(NULL));

   // generate random input
-  index_t batch = 3 + rand() % 10;
+  index_t batch = 3 + (rand() % 10);
   index_t height = shape[0];
   index_t width = shape[1];
-  index_t input_channels = shape[2] + rand() % 10;
-  index_t output_channels = shape[3] + rand() % 10;
+  index_t input_channels = shape[2] + (rand() % 10);
+  index_t output_channels = shape[3] + (rand() % 10);
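+  // The randomized channel counts also cover sizes that are not exact
+  // multiples of the 4-channel blocks used by the OpenCL kernels.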

   // Construct graph
   OpsTestNet net;
   OpDefBuilder("Conv2D", "Conv2dTest")
@@ -509,6 +509,7 @@ static void TestComplexConvNxNS12(const std::vector &shape) {

   for (int kernel_size : {3}) {
     for (int stride : {1}) {
+      func(kernel_size, kernel_size, stride, stride, VALID);
       func(kernel_size, kernel_size, stride, stride, SAME);
     }
   }