From 2d79d5c35a6a6d1b517cd78f00c2ffe95b6fe7f2 Mon Sep 17 00:00:00 2001 From: wuchenghui Date: Fri, 2 Mar 2018 16:39:30 +0800 Subject: [PATCH] multi-input concat opencl kernel --- mace/kernels/opencl/cl/concat.cl | 16 ++++++++ mace/kernels/opencl/concat.cc | 64 ++++++++++++++++++++++++++++++-- mace/ops/concat_test.cc | 7 +++- 3 files changed, 83 insertions(+), 4 deletions(-) diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl index 3a3efea4..af13422d 100644 --- a/mace/kernels/opencl/cl/concat.cl +++ b/mace/kernels/opencl/cl/concat.cl @@ -71,6 +71,22 @@ __kernel void concat_channel(__read_only image2d_t input0, WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data); } +// Required: All input channels are divisible by 4 +__kernel void concat_channel_multi(__read_only image2d_t input, + __private const int chan_blk_offset, + __write_only image2d_t output) { + const int chan_blk_idx = get_global_id(0); + const int width_idx = get_global_id(1); + const int width = get_global_size(1); + const int hb_idx = get_global_id(2); + DATA_TYPE4 data = 0; + data = READ_IMAGET(input, + SAMPLER, + (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx)); + + WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx + chan_blk_offset, width, width_idx), hb_idx), data); +} + //__kernel void concat_width(__read_only image2d_t input0, // __read_only image2d_t input1, // __private const int input0_width, diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index beb6fefc..686e3a7a 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -63,21 +63,71 @@ static void Concat2(cl::Kernel *kernel, TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); } +static void ConcatN(cl::Kernel *kernel, + const std::vector &input_list, + const DataType dt, + Tensor *output, + StatsFuture *future) { + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channel = output->dim(3); + + const int channel_blk = RoundUpDiv4(channel); + + if (kernel->get() == nullptr) { + auto runtime = OpenCLRuntime::Global(); + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); + built_options.emplace("-Dconcat_channel_multi=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); + *kernel = runtime->BuildKernel("concat", kernel_name, built_options); + } + + const int inputs_count = input_list.size(); + index_t chan_blk_offset = 0; + for (int i = 0; i < inputs_count; ++i) { + const Tensor *input = input_list[i]; + uint32_t idx = 0; + kernel->setArg(idx++, *(static_cast(input->buffer()))); + kernel->setArg(idx++, static_cast(chan_blk_offset)); + kernel->setArg(idx++, *(static_cast(output->buffer()))); + + index_t input_channel_blk = input->dim(3) / 4; + chan_blk_offset += input_channel_blk; + const uint32_t gws[3] = { + static_cast(input_channel_blk), + static_cast(width), + static_cast(batch * height), + }; + const std::vector lws = {8, 16, 8, 1}; + std::stringstream ss; + ss << "concat_n_opencl_kernel_" + << input_channel_blk << "_" + << width << "_" + << batch * height; + TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); + } +} + template void ConcatFunctor::operator()(const std::vector &input_list, Tensor *output, StatsFuture *future) { const int inputs_count = input_list.size(); - MACE_CHECK(inputs_count == 2 && axis_ == 3) - << "Concat opencl kernel only support two elements with axis == 3"; + MACE_CHECK(inputs_count >= 2 && axis_ == 3) + << "Concat opencl kernel only support >=2 elements with axis == 3"; const Tensor *input0 = input_list[0]; + bool divisible_four = input0->dim(axis_) % 4 == 0; std::vector output_shape(input0->shape()); for (int i = 1; i < inputs_count; ++i) { const Tensor *input = input_list[i]; MACE_CHECK(input->dim_size() == input0->dim_size(), "Ranks of all input tensors must be same."); + divisible_four &= input->dim(axis_) % 4 == 0; for (int j = 0; j < input->dim_size(); ++j) { if (j == axis_) { continue; @@ -87,6 +137,8 @@ void ConcatFunctor::operator()(const std::vectordim(axis_); } + MACE_CHECK(inputs_count == 2 || divisible_four, + "Dimensions of inputs should be divisible by 4 when inputs_count > 2."); std::vector image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape); output->ResizeImage(output_shape, image_shape); @@ -96,7 +148,13 @@ void ConcatFunctor::operator()(const std::vector::value, output, future); break; - default:MACE_NOT_IMPLEMENTED; + default: + if (divisible_four) { + ConcatN(&kernel_, input_list, DataTypeToEnum::value, output, future); + } + else { + MACE_NOT_IMPLEMENTED; + } } }; diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc index 25dba6c9..a49e593c 100644 --- a/mace/ops/concat_test.cc +++ b/mace/ops/concat_test.cc @@ -143,7 +143,7 @@ template void OpenclRandomTest(const std::vector> &shapes, const int axis) { srand(time(nullptr)); - int num_inputs = 2; + int num_inputs = shapes.size(); int concat_axis_size = 0; // Construct graph OpsTestNet net; @@ -212,3 +212,8 @@ TEST_F(ConcatOpTest, OPENCLHalfAligned) { TEST_F(ConcatOpTest, OPENCLUnAligned) { OpenclRandomTest({{3, 32, 32, 13}, {3, 32, 32, 17}}, 3); } + +TEST_F(ConcatOpTest, OPENCLAlignedMultiInput) { + OpenclRandomTest({{3, 32, 32, 32}, {3, 32, 32, 32}, + {3, 32, 32, 32}, {3, 32, 32, 32}}, 3); +} \ No newline at end of file -- GitLab