From e52c49b3bc46271f79c50e3883800c30ba2eb650 Mon Sep 17 00:00:00 2001
From: liuqi
Date: Thu, 30 Nov 2017 17:19:46 +0800
Subject: [PATCH] Support conv 1x1 with stride == 2 and padding == SAME.

---
 mace/kernels/opencl/cl/conv_2d_1x1.cl     | 62 +++++++++++++++--------
 mace/kernels/opencl/cl/conv_2d_3x3.cl     | 21 ++++----
 mace/kernels/opencl/conv_2d_opencl_1x1.cc | 12 +++--
 mace/kernels/opencl/conv_2d_opencl_3x3.cc |  2 +-
 mace/ops/conv_2d_test.cc                  | 49 ++++++++++++------
 mace/ops/ops_test_util.h                  |  7 ---
 6 files changed, 96 insertions(+), 57 deletions(-)

diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl
index 689a7df2..9c7a37fd 100644
--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -10,8 +10,13 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
                           __read_only image2d_t bn_offset, /* cout%4 * cout/4 */
 #endif
                           __write_only image2d_t output,
+                          __private const int in_height,
+                          __private const int in_width,
                           __private const int in_ch_blks,
-                          __private const int width) {
+                          __private const int height,
+                          __private const int width,
+                          __private const int padding_top,
+                          __private const int padding_left) {
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_w_blks = get_global_size(1);
@@ -32,24 +37,37 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #endif
 
   int4 w;
-  w.x = out_w_blk;
+#if STRIDE == 1
+  w.x = out_w_blk - padding_left;
   w.y = w.x + out_w_blks;
   w.z = w.y + out_w_blks;
   w.w = w.z + out_w_blks;
+  int out_hb_idx = (out_hb % height) - padding_top;
+#else
+  w.x = out_w_blk * 2 - padding_left;
+  w.y = (out_w_blk + out_w_blks) * 2 - padding_left;
+  w.z = (out_w_blk + 2 * out_w_blks) * 2 - padding_left;
+  w.w = (out_w_blk + 3 * out_w_blks) * 2 - padding_left;
+  int out_hb_idx = (out_hb % height) * 2 - padding_top;
+#endif
+
+  w.x = select(w.x, INT_MIN, (w.x < 0 || w.x >= in_width));
+  w.y = select(w.y, INT_MIN, (w.y < 0 || w.y >= in_width));
+  w.z = select(w.z, INT_MIN, (w.z < 0 || w.z >= in_width));
+  w.w = select(w.w, INT_MIN, (w.w < 0 || w.w >= in_width));
+
+  out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height,
+                      -1,
+                      out_hb_idx >= in_height);
 
   // Unrolling this loop hurt perfmance
   int in_x_base = 0;
   for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
-    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb));
-    DATA_TYPE4 in1 = 0;
-    DATA_TYPE4 in2 = 0;
-    DATA_TYPE4 in3 = 0;
-    if (w.y < width) {
-      // conditional load hurt perf, this branching helps sometimes
-      in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb));
-      in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb));
-      in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb));
-    }
+
+    DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
+    DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
+    DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
+    DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));
 
     const int filter_x0 = in_ch_blk << 2;
     DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
@@ -78,7 +96,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
     out3 += in3.z * weights2;
     out3 += in3.w * weights3;
 
-    in_x_base += width;
+    in_x_base += in_width;
   }
 
 #ifdef FUSED_BATCH_NORM
@@ -111,14 +129,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #endif
 
   const int out_x_base = out_ch_blk * width;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0);
+  int out_x_idx = out_w_blk;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0);
 
-  if (w.y >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1);
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1);
 
-  if (w.z >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2);
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2);
 
-  if (w.w >= width) return;
-  WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3);
+  out_x_idx += out_w_blks;
+  if (out_x_idx >= width) return;
+  WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3);
 }
diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl
index 12849ee3..e2f2827c 100644
--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -19,23 +19,24 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
   const int out_hb = get_global_id(2);
   const int rounded_in_ch = in_ch_blks * 4;
 
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+#ifdef BIAS
+  float4 out0 =
+      convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)));
+  float4 out1 = out0;
+  float4 out2 = out0;
+  float4 out3 = out0;
+  float4 out4 = out0;
+#else
   float4 out0 = 0;
   float4 out1 = 0;
   float4 out2 = 0;
   float4 out3 = 0;
   float4 out4 = 0;
-
-  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-#ifdef BIAS
-  out0 =
-      convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)));
-  out1 = out0;
-  out2 = out0;
-  out3 = out0;
-  out4 = out0;
 #endif
 
-#ifdef STRIDE_1
+#if STRIDE == 1
   int in_width0 = out_w_blk - padding_left;
   int in_width1 = in_width0 + out_w_blks;
   int in_width2 = in_width1 + out_w_blks;
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index e8a88db5..477dbc98 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -15,6 +15,7 @@ void Conv1x1(const Tensor *input,
              const Tensor *filter,
              const Tensor *bias,
              const int stride,
+             const int *padding,
              Tensor *output) {
   const index_t batch = output->dim(0);
   const index_t height = output->dim(1);
@@ -29,9 +30,7 @@ void Conv1x1(const Tensor *input,
   const index_t width_blocks = RoundUpDiv4(width);
   const index_t input_channel_blocks = RoundUpDiv4(input_channels);
 
-  MACE_CHECK(stride == 1);
   MACE_CHECK(input_batch == batch);
-  MACE_CHECK(stride != 1 || (input_height == height && input_width == width));
 
   std::set<std::string> built_options;
   built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype()));
@@ -54,8 +53,13 @@ void Conv1x1(const Tensor *input,
     conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
   }
   conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_height));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(input_width));
   conv_2d_kernel.setArg(idx++, static_cast<int>(input_channel_blocks));
+  conv_2d_kernel.setArg(idx++, static_cast<int>(height));
   conv_2d_kernel.setArg(idx++, static_cast<int>(width));
+  conv_2d_kernel.setArg(idx++, padding[0] / 2);
+  conv_2d_kernel.setArg(idx++, padding[1] / 2);
 
   auto command_queue = runtime->command_queue();
   cl_int error;
@@ -74,7 +78,7 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input,
                                const Tensor *bias,
                                const int *padding,
                                Tensor *output) {
-  Conv1x1(input, filter, bias, 1, output);
+  Conv1x1(input, filter, bias, 1, padding, output);
 };
 
 extern void Conv2dOpenclK1x1S2(const Tensor *input,
@@ -82,7 +86,7 @@ extern void Conv2dOpenclK1x1S2(const Tensor *input,
                                const Tensor *bias,
                                const int *padding,
                                Tensor *output) {
-  Conv1x1(input, filter, bias, 2, output);
+  Conv1x1(input, filter, bias, 2, padding, output);
 };
 
 } // namespace kernels
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 9a39b499..fc824708 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -28,7 +28,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
   built_options.emplace(input->dtype() == DT_FLOAT ? "-DTYPE_FLOAT" : "");
   built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype()));
   built_options.emplace(bias != nullptr ? "-DBIAS" : "");
-  built_options.emplace(stride == 1 ? "-DSTRIDE_1" : "");
+  built_options.emplace("-DSTRIDE=" + ToString(stride));
 
   auto runtime = OpenCLRuntime::Get();
   auto program = runtime->program();
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index b5f2df23..220056b6 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -420,15 +420,6 @@ template <DeviceType D>
 void TestConv1x1() {
   // Construct graph
   OpsTestNet net;
-  OpDefBuilder("Conv2D", "Conv2DTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {1, 1})
-      .AddIntArg("padding", Padding::VALID)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
 
   // Add input data
   net.AddInputFromArray<D, float>(
@@ -445,8 +436,37 @@ void TestConv1x1() {
       {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f});
   net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
 
-  // Run
-  net.RunOp(D);
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, float>(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
+    BufferToImage<D, float>(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage<D, float>(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT);
+
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("OutputImage")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+
+    ImageToBuffer<D, float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+  } else {
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {1, 1})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  }
 
   // Check
   auto expected = CreateTensor<float>(
@@ -465,9 +485,9 @@ TEST_F(Conv2dOpTest, CPUConv1x1) {
   TestConv1x1<DeviceType::CPU>();
 }
 
-//TEST_F(Conv2dOpTest, OPENCLConv1x1) {
-//  TestConv1x1<DeviceType::OPENCL>();
-//}
+TEST_F(Conv2dOpTest, OPENCLConv1x1) {
+  TestConv1x1<DeviceType::OPENCL>();
+}
 
 template <DeviceType D>
 static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
@@ -631,4 +651,3 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
 //TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) {
 //  TestHalfComplexConvNxNS12({32, 32, 64, 128});
 //}
-
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index fefcfe5f..8d593940 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -351,13 +351,6 @@ void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
   Expector<T>::Near(x, y, abs_err);
 }
 
-template <typename T>
-std::string ToString(const T &input) {
-  std::stringstream ss;
-  ss << input;
-  return ss.str();
-}
-
 template <DeviceType D, typename T>
 void BufferToImage(OpsTestNet &net,
                    const std::string &input_name,
--
GitLab
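
For reference, the index arithmetic the updated conv_2d_1x1.cl kernel performs can be sketched as a small standalone C function. This is an illustrative sketch only, not part of the patch: the helper name is made up, the parameter names mirror the kernel arguments (stride, padding_left, in_width), and it assumes the kernel's sampler uses CLK_ADDRESS_CLAMP so the INT_MIN sentinel pushes the read far out of range and the load contributes zero to the accumulation.

#include <limits.h>

/* Maps one output column to the input column the 1x1 kernel reads.
 * With SAME padding, output column out_x samples input column
 * out_x * stride - padding_left; anything outside [0, in_width) is
 * replaced by the INT_MIN sentinel, matching the select(...) calls
 * added to conv_2d_1x1.cl. */
static int Conv1x1InputX(int out_x, int stride, int padding_left, int in_width) {
  int in_x = out_x * stride - padding_left;
  return (in_x < 0 || in_x >= in_width) ? INT_MIN : in_x;
}

The row index follows the same pattern, with the batch offset (out_hb / height) * in_height added back for in-range rows, as in the out_hb_idx computation in the kernel.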