From 5262c860d22fa11639af6cf7462024363359a352 Mon Sep 17 00:00:00 2001 From: liuqi Date: Thu, 30 Nov 2017 20:38:41 +0800 Subject: [PATCH] Refactor opencl conv kernel and op. --- mace/kernels/opencl/buffer_to_image.cc | 4 +- mace/kernels/opencl/cl/conv_2d_1x1.cl | 71 +++++++++-------------- mace/kernels/opencl/cl/conv_2d_3x3.cl | 69 ++++++---------------- mace/kernels/opencl/conv_2d_opencl.cc | 12 ++-- mace/kernels/opencl/conv_2d_opencl_1x1.cc | 11 ++-- mace/kernels/opencl/conv_2d_opencl_3x3.cc | 17 +++--- mace/ops/conv_2d_test.cc | 36 ++++++------ 7 files changed, 89 insertions(+), 131 deletions(-) diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 511e4598..61faa995 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -24,8 +24,8 @@ void BufferToImageFunctor::operator()(Tensor *buffer, } std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype())); - built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype())); + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(DataTypeToEnum::value)); auto runtime = OpenCLRuntime::Get(); string kernel_name; switch (type) { diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index 7484ea51..7aaf367c 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -23,15 +23,15 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; #ifdef BIAS - float4 out0 = convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0))); - float4 out1 = out0; - float4 out2 = out0; - float4 out3 = out0; + DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; #else - float4 out0 = 0; - float4 out1 = 0; - float4 out2 = 0; - float4 out3 = 0; + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; #endif int4 w; @@ -62,16 +62,16 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] int in_x_base = 0; for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - float4 in0 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx))); - float4 in1 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx))); - float4 in2 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx))); - float4 in3 = convert_float4(READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx))); + DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx)); + DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx)); + DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx)); + DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx)); const int filter_x0 = in_ch_blk << 2; - float4 weights0 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk))); - float4 weights1 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk))); - float4 weights2 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk))); - float4 weights3 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk))); + DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk)); + DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk)); + DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk)); + DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk)); // Will prefetch L2 improve performance? How to pretch image data? out0 += in0.x * weights0; @@ -99,18 +99,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] #ifdef FUSED_BATCH_NORM // batch norm - float4 bn_scale_value = - convert_float4(READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0))); - float4 scale0 = (float4)(bn_scale_value.x); - float4 scale1 = (float4)(bn_scale_value.y); - float4 scale2 = (float4)(bn_scale_value.z); - float4 scale3 = (float4)(bn_scale_value.w); - float4 bn_offset_value = + DATA_TYPE4 bn_scale_value = + READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 scale0 = (DATA_TYPE4)(bn_scale_value.x); + DATA_TYPE4 scale1 = (DATA_TYPE4)(bn_scale_value.y); + DATA_TYPE4 scale2 = (DATA_TYPE4)(bn_scale_value.z); + DATA_TYPE4 scale3 = (DATA_TYPE4)(bn_scale_value.w); + DATA_TYPE4 bn_offset_value = READ_IMAGET(bn_offset, sampler, (int2)(out_ch_blk, 0)); - float4 offset0 = (float4)(bn_offset_value.x); - float4 offset1 = (float4)(bn_offset_value.y); - float4 offset2 = (float4)(bn_offset_value.z); - float4 offset3 = (float4)(bn_offset_value.w); + DATA_TYPE4 offset0 = (DATA_TYPE4)(bn_offset_value.x); + DATA_TYPE4 offset1 = (DATA_TYPE4)(bn_offset_value.y); + DATA_TYPE4 offset2 = (DATA_TYPE4)(bn_offset_value.z); + DATA_TYPE4 offset3 = (DATA_TYPE4)(bn_offset_value.w); out0 = out0 * scale0 + offset0; out1 = out1 * scale1 + offset1; @@ -126,7 +126,6 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] out3 = fmax(out3, 0); #endif -#ifdef TYPE_FLOAT const int out_x_base = out_ch_blk * width; int out_x_idx = out_w_blk; WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0); @@ -142,21 +141,5 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] out_x_idx += out_w_blks; if (out_x_idx >= width) return; WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3); -#else - const int out_x_base = out_ch_blk * width; - int out_x_idx = out_w_blk; - WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out0)); - - out_x_idx += out_w_blks; - if (out_x_idx >= width) return; - WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out1)); - out_x_idx += out_w_blks; - if (out_x_idx >= width) return; - WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out2)); - - out_x_idx += out_w_blks; - if (out_x_idx >= width) return; - WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), convert_half4(out3)); -#endif } diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index e2f2827c..81d39c1e 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -19,21 +19,20 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const int out_hb = get_global_id(2); const int rounded_in_ch = in_ch_blks * 4; - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; #ifdef BIAS - float4 out0 = - convert_float4(READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0))); - float4 out1 = out0; - float4 out2 = out0; - float4 out3 = out0; - float4 out4 = out0; + DATA_TYPE4 out0 = + READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; + DATA_TYPE4 out4 = out0; #else - float4 out0 = 0; - float4 out1 = 0; - float4 out2 = 0; - float4 out3 = 0; - float4 out4 = 0; + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; + DATA_TYPE4 out4 = 0; #endif #if STRIDE == 1 @@ -54,8 +53,8 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const int batch_idx = (out_hb / out_height) * in_height; - float4 in0, in1, in2, in3, in4; - float4 weights0, weights1, weights2, weights3; + DATA_TYPE4 in0, in1, in2, in3, in4; + DATA_TYPE4 weights0, weights1, weights2, weights3; int in_idx, hb_idx, width_idx, in_width_idx; // Unrolling this loop hurt perfmance for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { @@ -75,7 +74,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] in_width_value = select(in_idx + in_width_value, \ -1, \ (in_width_value < 0 || in_width_value >= in_width)); \ - in##i = convert_float4(READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value))); + in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value)); READ_INPUT(0); READ_INPUT(1); @@ -86,10 +85,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] #undef READ_INPUT int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch; - weights0 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk))); - weights1 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk))); - weights2 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk))); - weights3 = convert_float4(READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk))); + weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); + weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); + weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk)); + weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); // Will prefetch L2 improve performance? How to pretch image data? @@ -122,7 +121,6 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] } } -#ifdef TYPE_FLOAT const int out_x_base = out_ch_blk * out_width; int w = out_w_blk; WRITE_IMAGET(output, @@ -152,36 +150,5 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out4); -#else - const int out_x_base = out_ch_blk * out_width; - int w = out_w_blk; - WRITE_IMAGET(output, - (int2)(out_x_base + w, out_hb), - convert_half4(out0)); - - w += out_w_blks; - if (w >= out_width) return; - WRITE_IMAGET(output, - (int2)(out_x_base + w, out_hb), - convert_half4(out1)); - - w += out_w_blks; - if (w >= out_width) return; - WRITE_IMAGET(output, - (int2)(out_x_base + w, out_hb), - convert_half4(out2)); - - w += out_w_blks; - if (w >= out_width) return; - WRITE_IMAGET(output, - (int2)(out_x_base + w, out_hb), - convert_half4(out3)); - - w += out_w_blks; - if (w >= out_width) return; - WRITE_IMAGET(output, - (int2)(out_x_base + w, out_hb), - convert_half4(out4)); -#endif } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 2a09fd9c..46066b01 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -10,19 +10,19 @@ namespace kernels { extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + const DataType dt, Tensor *output); extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + const DataType dt, Tensor *output); extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + const DataType dt, Tensor *output); extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + const DataType dt, Tensor *output); template void Conv2dFunctor::operator()(const Tensor *input, @@ -31,7 +31,7 @@ void Conv2dFunctor::operator()(const Tensor *input, Tensor *output) { typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + DataType dt, Tensor *output); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[5][2] = { {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, @@ -70,7 +70,7 @@ void Conv2dFunctor::operator()(const Tensor *input, } auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_func(input, filter, bias, paddings.data(), output); + conv2d_func(input, filter, bias, paddings.data(), DataTypeToEnum::value, output); } template struct Conv2dFunctor; diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index b24b4567..e089e912 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -15,6 +15,7 @@ void Conv1x1(const Tensor *input, const Tensor *filter, const Tensor *bias, const int stride, + const DataType dt, Tensor *output) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -32,8 +33,8 @@ void Conv1x1(const Tensor *input, MACE_CHECK(input_batch == batch); std::set built_options; - built_options.emplace(input->dtype() == DT_FLOAT ? "-DTYPE_FLOAT" : ""); - built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt)); built_options.emplace("-DSTRIDE=" + ToString(stride)); if (bias != nullptr) { built_options.emplace("-DBIAS"); @@ -74,16 +75,18 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, + const DataType dt, Tensor *output) { - Conv1x1(input, filter, bias, 1, output); + Conv1x1(input, filter, bias, 1, dt, output); }; extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, + const DataType dt, Tensor *output) { - Conv1x1(input, filter, bias, 2, output); + Conv1x1(input, filter, bias, 2, dt, output); }; } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index fc824708..b280b042 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -13,7 +13,8 @@ namespace kernels { static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, const Tensor *bias, const uint32_t stride, - const int *padding, Tensor *output) { + const int *padding, const DataType dt, + Tensor *output) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -25,8 +26,8 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, const index_t width_blocks = RoundUpDiv(width); std::set built_options; - built_options.emplace(input->dtype() == DT_FLOAT ? "-DTYPE_FLOAT" : ""); - built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace("-DSTRIDE=" + ToString(stride)); @@ -63,13 +64,15 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, } void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, Tensor *output) { - Conv2d3x3S12(input, filter, bias, 1, padding, output); + const Tensor *bias, const int *padding, + const DataType dt, Tensor *output) { + Conv2d3x3S12(input, filter, bias, 1, padding, dt, output); }; void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, Tensor *output) { - Conv2d3x3S12(input, filter, bias, 2, padding, output); + const Tensor *bias, const int *padding, + const DataType dt, Tensor *output) { + Conv2d3x3S12(input, filter, bias, 2, padding, dt, output); }; } // namespace kernels diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 220056b6..1cda8017 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -558,14 +558,14 @@ static void TestComplexConvNxNS12(const std::vector &shape) { } TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { - TestComplexConvNxNS12({32, 32, 64, 128}); + TestComplexConvNxNS12({32, 32, 32, 64}); } TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { TestComplexConvNxNS12({107, 113, 5, 7}); } -template +template static void TestHalfComplexConvNxNS12(const std::vector &shape) { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, @@ -612,15 +612,15 @@ static void TestHalfComplexConvNxNS12(const std::vector &shape) { std::vector input_data(float_input_data.begin(), float_input_data.end()); std::vector filter_data(float_filter_data.begin(), float_filter_data.end()); std::vector bias_data(float_bias_data.begin(), float_bias_data.end()); - net.AddInputFromArray("InputHalf", {batch, height, width, input_channels}, input_data); - net.AddInputFromArray( + net.AddInputFromArray("InputHalf", {batch, height, width, input_channels}, input_data); + net.AddInputFromArray( "FilterHalf", {kernel_h, kernel_w, input_channels, output_channels}, filter_data); - net.AddInputFromArray("BiasHalf", {output_channels}, bias_data); + net.AddInputFromArray("BiasHalf", {output_channels}, bias_data); // run on gpu - BufferToImage(net, "InputHalf", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "FilterHalf", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "BiasHalf", "BiasImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "InputHalf", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "FilterHalf", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "BiasHalf", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputImage") @@ -630,24 +630,26 @@ static void TestHalfComplexConvNxNS12(const std::vector &shape) { .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", {1, 1}) - .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); - ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 1.0); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.2); }; - for (int kernel_size : {3}) { - for (int stride : {1}) { + for (int kernel_size : {1, 3}) { + for (int stride : {1, 2}) { func(kernel_size, kernel_size, stride, stride, VALID); } } } -// TODO: support half input & float computation -//TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) { -// TestHalfComplexConvNxNS12({32, 32, 64, 128}); -//} +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) { + TestHalfComplexConvNxNS12({32, 32, 32, 64}); +} + +TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConvNxNS12) { + TestHalfComplexConvNxNS12({107, 113, 5, 7}); +} -- GitLab