From b469a9451441ece4d2055e25d7267b99330f235a Mon Sep 17 00:00:00 2001 From: liuqi Date: Fri, 3 Nov 2017 09:55:42 +0800 Subject: [PATCH] Finish depthwise 3x3 conv with stride 2. --- mace/kernels/opencl/cl/conv_2d_3x3.cl | 17 +--- mace/kernels/opencl/cl/conv_helper.h | 15 +++ mace/kernels/opencl/cl/depthwise_conv_3x3.cl | 93 ++++++++++--------- mace/kernels/opencl/conv_2d_opencl_3x3.cc | 28 +++--- mace/kernels/opencl/depthwise_conv_opencl.cc | 4 +- .../opencl/depthwise_conv_opencl_3x3.cc | 73 +++++++++------ 6 files changed, 133 insertions(+), 97 deletions(-) create mode 100644 mace/kernels/opencl/cl/conv_helper.h diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 1c97a581..cd4e5eed 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -1,11 +1,4 @@ -float4 conv1x3_s1(const float *input_ptr, - const float *filter_ptr); -float4 conv1x3_s2(const float *input_ptr, - const float *filter_ptr); -float conv3x3(const float *input_ptr, - const float *filter_ptr, - const int row_width); - +#include void kernel conv_2d_3x3(global const float *input, global const float *filter, global const float *bias, @@ -18,9 +11,9 @@ void kernel conv_2d_3x3(global const float *input, private const uint out_width, private const uint stride_h, private const uint stride_w) { - int batch = get_global_id(0); - int out_chan_blk = get_global_id(1); - int out_pixel_blk = get_global_id(2); + const int batch = get_global_id(0); + const int out_chan_blk = get_global_id(1); + const int out_pixel_blk = get_global_id(2); const uint in_pixel = in_height * in_width; const uint out_pixel = out_height * out_width; @@ -43,10 +36,10 @@ void kernel conv_2d_3x3(global const float *input, uint pixels = out_pixel_end - out_pixel_begin; for (uint i = out_chan_begin; i < out_chan_end; ++i) { - float4 res = (float4)bias[i]; float *output_ptr = output_base + i * out_pixel; const float *filter_base = filter + i * in_chan_num * 9; if (pixels == 4) { + float4 res = (float4)bias[i]; for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) { const float* input_ptr = input_base + in_chan_idx * in_pixel; const float* filter_ptr = filter_base + in_chan_idx * 9; diff --git a/mace/kernels/opencl/cl/conv_helper.h b/mace/kernels/opencl/cl/conv_helper.h new file mode 100644 index 00000000..553af09d --- /dev/null +++ b/mace/kernels/opencl/cl/conv_helper.h @@ -0,0 +1,15 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_ +#define MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_ + +float4 conv1x3_s1(const float *input_ptr, + const float *filter_ptr); +float4 conv1x3_s2(const float *input_ptr, + const float *filter_ptr); +float conv3x3(const float *input_ptr, + const float *filter_ptr, + const int row_width); +#endif // MACE_KERNELS_OPENCL_CL_CONV_HELPER_H_ diff --git a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl index 2cdbe3fa..b88e339a 100644 --- a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl +++ b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl @@ -1,61 +1,66 @@ -float4 conv1x3_s1(const float *input_ptr, - const float *filter_ptr); -float conv3x3(const float *input_ptr, - const float *filter_ptr, - const int row_width); +#include +//TODO merge the depthwise with conv 3x3 to remove duplicate code. +void kernel depthwise_conv_3x3(global const float *input, /* n, c, h, w */ + global const float *filter, /* m, i, kh, kw */ + global const float *bias, /* o */ + global float *output, /* n, c, h, w */ + private const uint in_chan_num, + private const uint out_chan_num, + private const uint in_height, + private const uint in_width, + private const uint out_height, + private const uint out_width, + private const uint stride_h, + private const uint stride_w) { + const int batch = get_global_id(0); + const int out_chan_blk = get_global_id(1); + const int out_pixel_blk = get_global_id(2); -void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */ - global const float *filter, /* m, i, kh, kw */ - global const float *bias, /* o */ - global float *output, /* n, c, h, w */ - private const int in_chan_num, - private const int out_chan_num, - private const int in_height, - private const int in_width, - private const int out_height, - private const int out_width) { - int batch = get_global_id(0); - int out_chan_blk = get_global_id(1); - int out_pixel_blk = get_global_id(2); + const uint in_pixel = in_height * in_width; + const uint out_pixel = out_height * out_width; + const uint multiplier = out_chan_num / in_chan_num; - const int in_pixel = in_height * in_width; - const int out_pixel = out_height * out_width; - const int multiplier = out_chan_num / in_chan_num; + const uint round_out_width = (out_width + 3) / 4; + const uint out_pixel_height = out_pixel_blk / round_out_width; + const uint out_pixel_width = out_pixel_blk % round_out_width; - const int round_out_width = (out_width + 3) / 4; - const int out_pixel_height = out_pixel_blk / round_out_width; - const int out_pixel_width = out_pixel_blk % round_out_width; + const uint out_chan_begin = out_chan_blk * 4; + const uint out_chan_end = min(out_chan_begin + 4, out_chan_num); + const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; + const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); + const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4; - const int out_chan_begin = out_chan_blk * 4; - const int out_chan_end = min(out_chan_begin + 4, out_chan_num); - const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; - const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); - const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4; - - const int in_offset = batch * in_chan_num * in_pixel; - const int out_offset = batch * out_chan_num * out_pixel; + const uint in_offset = batch * in_chan_num * in_pixel; + const uint out_offset = batch * out_chan_num * out_pixel; const float *input_base = input + in_offset + in_pixel_begin; float *output_base = output + out_offset + out_pixel_begin; - int pixels = out_pixel_end - out_pixel_begin; + uint pixels = out_pixel_end - out_pixel_begin; - for (int i = out_chan_begin; i < out_chan_end; ++i) { + for (uint i = out_chan_begin; i < out_chan_end; ++i) { float bias_value = bias[i]; const float *input_ptr = input_base + (i / multiplier) * in_pixel; const float *filter_ptr = filter + i * 9; float *output_ptr = output_base + i * out_pixel; - if (pixels < 4) { - for (int out_idx = 0; out_idx < pixels; ++out_idx) { - output_ptr[out_idx] = bias_value; - output_ptr[out_idx] += conv3x3(input_ptr, filter_ptr, in_width); - input_ptr += 1; + if (pixels == 4) { + float4 res = (float4)bias[i]; + if (stride_w == 1) { + res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3); + res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3); + res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3); + } else { + res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3); + res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3); + res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3); } - } else { - float4 res = (float4)bias_value; - res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3); - res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3); - res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3); vstore4(res, 0, output_ptr); + } else { + for (uint p = 0; p < pixels; ++p) { + float res = bias[i]; + res += conv3x3(input_ptr, filter_ptr, in_width); + output_ptr[p] = res; + input_ptr += stride_w; + } } } diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 448f31d6..90011b8f 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -22,21 +22,21 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter, auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); - auto bm_kernel = cl::Kernel(program, "conv_2d_3x3"); + auto conv_kernel = cl::Kernel(program, "conv_2d_3x3"); uint32_t idx = 0; - bm_kernel.setArg(idx++, *(static_cast(input->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(filter->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(bias->buffer()))); - bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); - bm_kernel.setArg(idx++, static_cast(input->dim(1))); - bm_kernel.setArg(idx++, static_cast(channels)); - bm_kernel.setArg(idx++, static_cast(input->dim(2))); - bm_kernel.setArg(idx++, static_cast(input->dim(3))); - bm_kernel.setArg(idx++, static_cast(height)); - bm_kernel.setArg(idx++, static_cast(width)); - bm_kernel.setArg(idx++, stride); - bm_kernel.setArg(idx++, stride); + conv_kernel.setArg(idx++, *(static_cast(input->buffer()))); + conv_kernel.setArg(idx++, *(static_cast(filter->buffer()))); + conv_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + conv_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_kernel.setArg(idx++, static_cast(input->dim(1))); + conv_kernel.setArg(idx++, static_cast(channels)); + conv_kernel.setArg(idx++, static_cast(input->dim(2))); + conv_kernel.setArg(idx++, static_cast(input->dim(3))); + conv_kernel.setArg(idx++, static_cast(height)); + conv_kernel.setArg(idx++, static_cast(width)); + conv_kernel.setArg(idx++, stride); + conv_kernel.setArg(idx++, stride); const uint32_t gws[3] = {static_cast(output->dim(0)), static_cast(channel_blocks), static_cast(pixel_blocks)}; @@ -44,7 +44,7 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter, static_cast(1), static_cast(256)}; cl_int error = runtime->command_queue().enqueueNDRangeKernel( - bm_kernel, cl::NullRange, + conv_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(lws[0], lws[1], lws[2])); MACE_CHECK(error == CL_SUCCESS); diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index ca6a5b44..7dcb996f 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -10,6 +10,8 @@ namespace kernels { extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output); +extern void DepthwiseConvOpenclK3x3S2(const Tensor *input, const Tensor *filter, + const Tensor *bias, Tensor *output); template <> void DepthwiseConv2dFunctor::operator()(const Tensor *input, const Tensor *filter, @@ -21,7 +23,7 @@ void DepthwiseConv2dFunctor::operator()(const Tensor static const Conv2dOpenclFunction selector[5][2] = { {nullptr, nullptr}, {nullptr, nullptr}, - {DepthwiseConvOpenclK3x3S1, nullptr}, + {DepthwiseConvOpenclK3x3S1, DepthwiseConvOpenclK3x3S2}, {nullptr, nullptr}, {nullptr, nullptr}}; diff --git a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc index e76858a5..0c393e76 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc @@ -9,10 +9,11 @@ namespace mace { namespace kernels { -extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output) { +static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const uint32_t stride, + Tensor *output) { const index_t batch = output->dim(0); const index_t channels = output->dim(1); const index_t height = output->dim(2); @@ -24,33 +25,53 @@ extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, const index_t input_width = input->dim(3); MACE_CHECK(input_batch == batch); - - auto runtime = OpenCLRuntime::Get(); - auto program = runtime->program(); - auto conv_2d = cl::KernelFunctor(program, "depthwise_conv_3x3_s1"); const index_t pixels = height * width; const index_t channel_blocks = (channels + 3) / 4; const index_t pixel_blocks = (width + 3) / 4 * height; - cl_int error; - conv_2d(cl::EnqueueArgs(runtime->command_queue(), - cl::NDRange(static_cast(batch), - static_cast(channel_blocks), - static_cast(pixel_blocks)), - cl::NDRange(1, 1, 256)), - *(static_cast(input->buffer())), - *(static_cast(filter->buffer())), - *(static_cast(bias->buffer())), - *(static_cast(output->buffer())), - static_cast(input_channels), - static_cast(channels), - static_cast(input_height), - static_cast(input_width), - static_cast(height), - static_cast(width), - error); + auto runtime = OpenCLRuntime::Get(); + auto program = runtime->program(); + auto conv_kernel = cl::Kernel(program, "depthwise_conv_3x3"); + + uint32_t idx = 0; + conv_kernel.setArg(idx++, *(static_cast(input->buffer()))); + conv_kernel.setArg(idx++, *(static_cast(filter->buffer()))); + conv_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + conv_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_kernel.setArg(idx++, static_cast(input->dim(1))); + conv_kernel.setArg(idx++, static_cast(channels)); + conv_kernel.setArg(idx++, static_cast(input->dim(2))); + conv_kernel.setArg(idx++, static_cast(input->dim(3))); + conv_kernel.setArg(idx++, static_cast(height)); + conv_kernel.setArg(idx++, static_cast(width)); + conv_kernel.setArg(idx++, stride); + conv_kernel.setArg(idx++, stride); + + const uint32_t gws[3] = {static_cast(output->dim(0)), + static_cast(channel_blocks), + static_cast(pixel_blocks)}; + const uint32_t lws[3] = {static_cast(1), + static_cast(1), + static_cast(256)}; + cl_int error = runtime->command_queue().enqueueNDRangeKernel( + conv_kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2])); MACE_CHECK(error == CL_SUCCESS); +} + +extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { + InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output); +}; + +extern void DepthwiseConvOpenclK3x3S2(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { + InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output); }; } // namespace kernels -- GitLab