From ff5d176ee8ad6697c4218c752b3edc1bb53f56ab Mon Sep 17 00:00:00 2001 From: Liangliang He Date: Wed, 24 Jan 2018 18:08:39 +0800 Subject: [PATCH] Add depthwise conv2d opencl kernel --- mace/kernels/conv_2d.h | 8 +- mace/kernels/depthwise_conv2d.h | 167 ++++++---- mace/kernels/opencl/REAEMD.md | 12 + mace/kernels/opencl/buffer_to_image.cc | 5 +- mace/kernels/opencl/cl/buffer_to_image.cl | 50 +++ mace/kernels/opencl/cl/conv_2d.cl | 23 +- mace/kernels/opencl/cl/conv_2d_1x1.cl | 2 +- mace/kernels/opencl/cl/conv_2d_3x3.cl | 5 +- mace/kernels/opencl/cl/depthwise_conv2d.cl | 129 ++++++++ mace/kernels/opencl/cl/depthwise_conv_3x3.cl | 126 -------- mace/kernels/opencl/conv_2d_opencl_general.cc | 15 +- mace/kernels/opencl/depthwise_conv_opencl.cc | 178 +++++++--- .../opencl/depthwise_conv_opencl_3x3.cc | 98 ------ mace/kernels/opencl/helper.cc | 31 +- mace/kernels/opencl/helper.h | 7 +- mace/ops/buffer_to_image.h | 2 +- mace/ops/buffer_to_image_test.cc | 12 +- mace/ops/conv_2d_benchmark.cc | 2 +- mace/ops/conv_2d_test.cc | 16 +- mace/ops/core_test.cc | 4 +- mace/ops/depthwise_conv2d.cc | 6 + mace/ops/depthwise_conv2d.h | 35 +- mace/ops/depthwise_conv2d_test.cc | 303 +++++++++++++++--- mace/ops/depthwise_conv_2d_benchmark.cc | 78 +++-- mace/ops/fused_conv_2d_test.cc | 18 +- mace/ops/image_to_buffer.h | 2 +- 26 files changed, 852 insertions(+), 482 deletions(-) create mode 100644 mace/kernels/opencl/REAEMD.md create mode 100644 mace/kernels/opencl/cl/depthwise_conv2d.cl delete mode 100644 mace/kernels/opencl/cl/depthwise_conv_3x3.cl delete mode 100644 mace/kernels/opencl/depthwise_conv_opencl_3x3.cc diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 08b04477..628acceb 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -21,15 +21,15 @@ struct Conv2dFunctorBase { const float relux_max_limit, const float prelu_alpha) : strides_(strides), - dilations_(dilations), paddings_(paddings), + dilations_(dilations), activation_(activation), relux_max_limit_(relux_max_limit), prelu_alpha_(prelu_alpha) {} const int *strides_; // [stride_h, stride_w] - const int *dilations_; // [dilation_h, dilation_w] const Padding paddings_; + const int *dilations_; // [dilation_h, dilation_w] const ActivationType activation_; const float relux_max_limit_; const float prelu_alpha_; @@ -50,8 +50,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { relux_max_limit, prelu_alpha) {} - void operator()(const Tensor *input, - const Tensor *filter, + void operator()(const Tensor *input, // NHWC + const Tensor *filter, // HWIO const Tensor *bias, Tensor *output, StatsFuture *future) { diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 09d87b94..92bc4e33 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -2,28 +2,57 @@ // Copyright (c) 2017 XiaoMi All rights reserved. 
// -#ifndef MACE_KERNELS_DEPTHWISE_CONV_H_ -#define MACE_KERNELS_DEPTHWISE_CONV_H_ +#ifndef MACE_KERNELS_DEPTHWISE_CONV2D_H_ +#define MACE_KERNELS_DEPTHWISE_CONV2D_H_ -#include "mace/core/future.h" #include "mace/core/common.h" -#include "mace/kernels/conv_pool_2d_util.h" +#include "mace/core/future.h" #include "mace/core/public/mace.h" +#include "mace/kernels/conv_pool_2d_util.h" namespace mace { namespace kernels { +struct DepthwiseConv2dFunctorBase { + DepthwiseConv2dFunctorBase(const int *strides, + const Padding padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha) + : strides_(strides), + padding_(padding), + dilations_(dilations), + activation_(activation), + relux_max_limit_(relux_max_limit), + prelu_alpha_(prelu_alpha) {} + + const int *strides_; // [stride_h, stride_w] + const Padding padding_; + const int *dilations_; // [dilation_h, dilation_w] + const ActivationType activation_; + const float relux_max_limit_; + const float prelu_alpha_; +}; + template -struct DepthwiseConv2dFunctor { - DepthwiseConv2dFunctor() {} +struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase { DepthwiseConv2dFunctor(const int *strides, - const std::vector &paddings, - const int *dilations) - : strides_(strides), paddings_(paddings), dilations_(dilations) {} - - void operator()(const Tensor *input, // NCHW - const Tensor *filter, // c_out, c_in, kernel_h, kernel_w - const Tensor *bias, // c_out + const Padding padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha) + : DepthwiseConv2dFunctorBase(strides, + padding, + dilations, + activation, + relux_max_limit, + prelu_alpha) {} + + void operator()(const Tensor *input, // NHWC + const Tensor *filter, // HWIM + const Tensor *bias, // O Tensor *output, StatsFuture *future) { MACE_CHECK_NOTNULL(input); @@ -31,18 +60,36 @@ struct DepthwiseConv2dFunctor { MACE_CHECK_NOTNULL(bias); MACE_CHECK_NOTNULL(output); + // Create a fake conv_2d filter to calculate the paddings and output size + std::vector fake_filter_shape(4); + fake_filter_shape[0] = filter->shape()[0]; + fake_filter_shape[1] = filter->shape()[1]; + fake_filter_shape[3] = filter->shape()[2] * filter->shape()[3]; + fake_filter_shape[2] = 1; + + std::vector output_shape(4); + std::vector paddings(2); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), fake_filter_shape.data(), dilations_, strides_, + padding_, output_shape.data(), paddings.data()); + auto input_shape = fake_filter_shape; + output->Resize(output_shape); + index_t batch = output->dim(0); - index_t channels = output->dim(1); - index_t height = output->dim(2); - index_t width = output->dim(3); + index_t height = output->dim(1); + index_t width = output->dim(2); + index_t channels = output->dim(3); index_t input_batch = input->dim(0); - index_t input_channels = input->dim(1); - index_t input_height = input->dim(2); - index_t input_width = input->dim(3); + index_t input_height = input->dim(1); + index_t input_width = input->dim(2); + index_t input_channels = input->dim(3); - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); + index_t kernel_h = filter->dim(0); + index_t kernel_w = filter->dim(1); + index_t multiplier = filter->dim(3); + MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=", input_channels); + MACE_CHECK(channels == input_channels * multiplier); int stride_h = strides_[0]; int stride_w = strides_[1]; @@ -53,13 +100,12 @@ 
struct DepthwiseConv2dFunctor { MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch"); // The left-upper most offset of the padded input - int padded_h_start = 0 - paddings_[0] / 2; - int padded_w_start = 0 - paddings_[1] / 2; - index_t padded_h_stop = input_height + paddings_[0] - paddings_[0] / 2; - index_t padded_w_stop = input_width + paddings_[1] - paddings_[1] / 2; + int padded_h_start = 0 - paddings[0] / 2; + int padded_w_start = 0 - paddings[1] / 2; + index_t padded_h_stop = input_height + paddings[0] - paddings[0] / 2; + index_t padded_w_stop = input_width + paddings[1] - paddings[1] / 2; - index_t kernel_size = kernel_h * kernel_w; - index_t multiplier = filter->dim(0); + const index_t kernel_size = kernel_h * kernel_w; Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard filter_mapper(filter); @@ -67,20 +113,22 @@ struct DepthwiseConv2dFunctor { Tensor::MappingGuard output_mapper(output); const T *input_ptr = input->data(); const T *filter_ptr = filter->data(); - const T *bias_ptr = bias->data(); + const T *bias_ptr = bias->data(); T *output_ptr = output->mutable_data(); #pragma omp parallel for collapse(2) for (int n = 0; n < batch; ++n) { - for (int c = 0; c < channels; ++c) { - T bias_channel = bias_ptr ? bias_ptr[c] : 0; - for (int h = 0; h < height; ++h) { - for (int w = 0; w < width; ++w) { - index_t offset = n * channels * height * width + - c * height * width + h * width + w; + for (int h = 0; h < height; ++h) { + for (int w = 0; w < width; ++w) { + for (int c = 0; c < channels; ++c) { + const index_t inc = c / multiplier; + const index_t m = c % multiplier; + T bias_channel = bias_ptr ? bias_ptr[c] : 0; + index_t offset = n * height * width * channels + + h * width * channels + w * channels + c; output_ptr[offset] = bias_channel; T sum = 0; - const T *filter_base = filter_ptr + c * kernel_size; + const T *filter_base = filter_ptr + inc * multiplier + m; for (int kh = 0; kh < kernel_h; ++kh) { for (int kw = 0; kw < kernel_w; ++kw) { int inh = padded_h_start + h * stride_h + dilation_h * kh; @@ -92,12 +140,12 @@ struct DepthwiseConv2dFunctor { "Out of range read from input: ", inh, ", ", inw); } else { index_t input_offset = - n * input_channels * input_height * input_width + - (c / multiplier) * input_height * input_width + - inh * input_width + inw; - sum += input_ptr[input_offset] * *filter_base; + n * input_height * input_width * input_channels + + inh * input_width * input_channels + + inw * input_channels + inc; + sum += input_ptr[input_offset] * filter_base[0]; // HWIM } - ++filter_base; + filter_base += input_channels * multiplier; } } output_ptr[offset] += sum; @@ -106,10 +154,6 @@ struct DepthwiseConv2dFunctor { } } } - - const int *strides_; // [stride_h, stride_w] - std::vector paddings_; // [padding_h, padding_w] - const int *dilations_; // [dilation_h, dilation_w] }; template <> @@ -120,15 +164,30 @@ void DepthwiseConv2dFunctor::operator()( Tensor *output, StatsFuture *future); -template <> -void DepthwiseConv2dFunctor::operator()( - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future); +template +struct DepthwiseConv2dFunctor + : DepthwiseConv2dFunctorBase { + DepthwiseConv2dFunctor(const int *strides, + const Padding padding, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha) + : DepthwiseConv2dFunctorBase(strides, + padding, + dilations, + activation, + relux_max_limit, + prelu_alpha) {} + + void 
operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output, + StatsFuture *future); +}; -} // namespace kernels -} // namespace mace +} // namespace kernels +} // namespace mace -#endif // MACE_KERNELS_DEPTHWISE_CONV_H_ +#endif // MACE_KERNELS_DEPTHWISE_CONV2D_H_ diff --git a/mace/kernels/opencl/REAEMD.md b/mace/kernels/opencl/REAEMD.md new file mode 100644 index 00000000..9546e21e --- /dev/null +++ b/mace/kernels/opencl/REAEMD.md @@ -0,0 +1,12 @@ +OpenCL Image Storage Layout +=== + +Input/Output +--- + +Conv2D Filter +--- + +Depthwise Conv2D Filter +--- + diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index d5260870..ae81d32f 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -27,9 +27,12 @@ void BufferToImageFunctor::operator()(Tensor *buffer, string kernel_name; switch (type) { - case FILTER: + case CONV2D_FILTER: kernel_name = i2b_ ? "filter_image_to_buffer" : "filter_buffer_to_image"; break; + case DW_CONV2D_FILTER: + kernel_name = i2b_ ? "dw_filter_image_to_buffer" : "dw_filter_buffer_to_image"; + break; case IN_OUT: kernel_name = i2b_ ? "in_out_image_to_buffer" : "in_out_buffer_to_image"; break; diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index 78e009a6..2ac05209 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -72,6 +72,56 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc } } + +__kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, ic, m */ + __private const int filter_w, + __private const int in_channel, + __private const int multiplier, + __write_only image2d_t output) { /* ic%4 * kh * kw * m, ic/4 */ + const int w = get_global_id(0); + const int h = get_global_id(1); + + DATA_TYPE4 values = 0; + if (multiplier == 1) { + const int in_channel_idx = h << 2; + const int h_idx = w / filter_w; + const int w_idx = w % filter_w; + + const int offset = mad24(mad24(h_idx, filter_w, w_idx), + in_channel, in_channel_idx); + + const int size = in_channel - in_channel_idx; + if (in_channel_idx < in_channel) { + if (size < 4) { + switch(size) { + case 3: + values.z = *(input + offset + 2); + case 2: + values.y = *(input + offset + 1); + case 1: + values.x = *(input + offset); + } + } else { + values = vload4(0, input + offset); + } + } + } else { + const int in_channel_idx = h << 2; + const int m = w % multiplier; + const int hw_idx = w / multiplier; + const int h_idx = hw_idx / filter_w; + const int w_idx = hw_idx % filter_w; + + const int offset = mad24(mad24(mad24(h_idx, filter_w, w_idx), + in_channel, in_channel_idx), + multiplier, m); + // TODO support multiplier > 1 + } + + int2 coord = (int2)(w, h); + WRITE_IMAGET(output, coord, values); +} + __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */ __private const int height, __private const int width, diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index ce33c093..522f28c7 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -1,7 +1,7 @@ #include __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ - __read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */ + __read_only image2d_t filter, /* cout%4 * cin * kh * kw, cout/4 */ #ifdef BIAS __read_only image2d_t bias, /* cout%4 * cout/4 */ #endif @@ -15,6 +15,7 
@@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ __private const int out_width, __private const int filter_height, __private const int filter_width, + __private const int stride, __private const int padding_top, __private const int padding_left, __private const int dilation_h, @@ -38,19 +39,12 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ DATA_TYPE4 out3 = 0; #endif -#if STRIDE == 1 - int in_width0 = out_w_blk - padding_left; - int in_width1 = in_width0 + out_w_blks; - int in_width2 = in_width1 + out_w_blks; - int in_width3 = in_width2 + out_w_blks; - const int height_idx = (out_hb % out_height) - padding_top; -#else - int in_width0 = (out_w_blk << 1) - padding_left; - int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left; - int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left; - int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left; - const int height_idx = ((out_hb % out_height) << 1) - padding_top; -#endif + int in_width_stride = mul24(out_w_blks, stride); + int in_width0 = mad24(out_w_blk, stride, -padding_left); + int in_width1 = in_width0 + in_width_stride; + int in_width2 = in_width1 + in_width_stride; + int in_width3 = in_width2 + in_width_stride; + const int height_idx = mad24((out_hb % out_height), stride, -padding_top); const int batch_idx = mul24((out_hb / out_height), in_height); const int rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width); @@ -61,6 +55,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ const int in_idx = mul24(in_ch_blk, in_width); int filter_x_part0 = in_ch_blk << 2; for (short hb_idx = 0; hb_idx < filter_height; ++hb_idx) { + // TODO (heliangliang) optimize out these muls int in_hb_value = height_idx + mul24(hb_idx, dilation_h); in_hb_value = select(in_hb_value + batch_idx, -1, diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index a34c69ce..de19cd77 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -37,7 +37,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] w.z = w.y + out_w_blks; w.w = w.z + out_w_blks; int out_hb_idx = (out_hb % height); -#else +#elif STRIDE == 2 w.x = out_w_blk << 1; w.y = (out_w_blk + out_w_blks) << 1; w.z = (out_w_blk + (out_w_blks << 1)) << 1; diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 448c2c9e..8d0b4d1a 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -1,7 +1,7 @@ #include __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ - __read_only image2d_t filter, /* cout%4 * cin * kw * kh, cout/4 */ + __read_only image2d_t filter, /* cout%4 * cin * kh * kw, cout/4 */ #ifdef BIAS __read_only image2d_t bias, /* cout%4 * cout/4 */ #endif @@ -45,7 +45,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] int in_width3 = in_width2 + out_w_blks; int in_width4 = in_width3 + out_w_blks; const int height_idx = (out_hb % out_height) - padding_top; -#else +#elif STRIDE == 2 int in_width0 = (out_w_blk << 1) - padding_left; int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left; int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left; @@ -63,6 +63,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const int in_idx = mul24(in_ch_blk, 
in_width); int filter_x_part0 = in_ch_blk << 2; for (short hb_idx = 0; hb_idx < 3; ++hb_idx) { + // TODO (heliangliang) optimize out these muls int in_hb_value = height_idx + mul24(hb_idx, dilation_h); in_hb_value = select(in_hb_value + batch_idx, -1, diff --git a/mace/kernels/opencl/cl/depthwise_conv2d.cl b/mace/kernels/opencl/cl/depthwise_conv2d.cl new file mode 100644 index 00000000..280ba54c --- /dev/null +++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl @@ -0,0 +1,129 @@ +#include + +// Only multiplier = 1 is supported +__kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ + __read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */ +#ifdef BIAS + __read_only image2d_t bias, /* cout%4 * cout/4 */ +#endif + __write_only image2d_t output, + __private const DATA_TYPE relux_max_limit, + __private const DATA_TYPE prelu_alpha, + __private const short in_height, + __private const short in_width, + __private const short in_ch_blks, + __private const short out_height, + __private const short out_width, + __private const short filter_height, + __private const short filter_width, + __private const short padding_top, + __private const short padding_left, + __private const short dilation_h, + __private const short dilation_w) { + const short out_ch_blk = get_global_id(0); + const short out_w_blk = get_global_id(1); + const short out_w_blks = get_global_size(1); + const short out_hb = get_global_id(2); + const short rounded_in_ch = in_ch_blks << 2; + const short in_ch_blk = out_ch_blk; // multiplier = 1 + +#ifdef BIAS + DATA_TYPE4 out0 = + READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; +#endif + + const short out_h = out_hb % out_height; +#if STRIDE == 1 + const short in_width0 = out_w_blk - padding_left; + const short in_width1 = in_width0 + out_w_blks; + const short in_width2 = in_width1 + out_w_blks; + const short in_width3 = in_width2 + out_w_blks; + const short height_idx = out_h - padding_top; +#elif STRIDE == 2 + int in_width0 = (out_w_blk << 1) - padding_left; + int in_width1 = ((out_w_blk + out_w_blks) << 1) - padding_left; + int in_width2 = ((out_w_blk + (out_w_blks << 1)) << 1) - padding_left; + int in_width3 = ((out_w_blk + (out_w_blks << 1) + out_w_blks) << 1) - padding_left; + int in_width4 = ((out_w_blk + (out_w_blks << 2)) << 1) - padding_left; + const int height_idx = (out_h << 1) - padding_top; +#else + const short in_width_stride = mul24(out_w_blks, STRIDE); + const short in_width0 = mad24(out_w_blk, STRIDE, -padding_left); + const short in_width1 = in_width0 + in_width_stride; + const short in_width2 = in_width1 + in_width_stride; + const short in_width3 = in_width2 + in_width_stride; + const short height_idx = mad24(out_h, STRIDE, -padding_top); +#endif + + const short batch_idx = mul24((out_hb / out_height), in_height); + const short rounded_in_ch_x_filter_width = mul24(rounded_in_ch, filter_width); + + const short in_idx = mul24(in_ch_blk, in_width); + short filter_idx = 0; + short in_hb_idx = height_idx; + for (short filter_h_idx = 0; filter_h_idx < filter_height; ++filter_h_idx) { + short in_hb = select(in_hb_idx + batch_idx, + -1, + (in_hb_idx < 0 || in_hb_idx >= in_height)); + short in_w_idx = 0; + for (short filter_w_idx = 0; filter_w_idx < filter_width; ++filter_w_idx) { + short in_w; + DATA_TYPE4 in0, in1, in2, in3; +#define READ_INPUT(i) \ + in_w = 
in_w_idx + in_width##i; \ + in_w = select(in_idx + in_w, \ + -1, \ + (in_w < 0 || in_w >= in_width)); \ + in##i = READ_IMAGET(input, SAMPLER, (int2)(in_w, in_hb)); + + READ_INPUT(0); + READ_INPUT(1); + READ_INPUT(2); + READ_INPUT(3); + +#undef READ_INPUT + + DATA_TYPE4 weights = READ_IMAGET(filter, SAMPLER, + (int2)(filter_idx, in_ch_blk)); + + out0 = mad(in0, weights, out0); + out1 = mad(in1, weights, out1); + out2 = mad(in2, weights, out2); + out3 = mad(in3, weights, out3); + ++filter_idx; + in_w_idx += dilation_w; + } + in_hb_idx += dilation_h; + } + +#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID) + out0 = do_activation(out0, relux_max_limit, prelu_alpha); + out1 = do_activation(out1, relux_max_limit, prelu_alpha); + out2 = do_activation(out2, relux_max_limit, prelu_alpha); + out3 = do_activation(out3, relux_max_limit, prelu_alpha); +#endif + + const short out_x_base = mul24(out_ch_blk, out_width); + short w = out_w_blk; + WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3); +} diff --git a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl deleted file mode 100644 index 29dbc340..00000000 --- a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl +++ /dev/null @@ -1,126 +0,0 @@ -#include - -VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s1(const DATA_TYPE *input_ptr, - const DATA_TYPE *filter_ptr) { - VEC_DATA_TYPE(DATA_TYPE,4) row0 = vload4(0, input_ptr); - VEC_DATA_TYPE(DATA_TYPE,2) input1 = vload2(0, input_ptr+4); - VEC_DATA_TYPE(DATA_TYPE,4) row1 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input1.s0); - VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s23, input1.s01); - VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr); - return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 + - (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 + - (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2; -} - -VEC_DATA_TYPE(DATA_TYPE,4) conv1x3_s2(const DATA_TYPE *input_ptr, - const DATA_TYPE *filter_ptr) { - VEC_DATA_TYPE(DATA_TYPE,8) input = vload8(0, input_ptr); - VEC_DATA_TYPE(DATA_TYPE,4) row0 = input.even; - VEC_DATA_TYPE(DATA_TYPE,4) row1 = input.odd; - VEC_DATA_TYPE(DATA_TYPE,4) row2 = (VEC_DATA_TYPE(DATA_TYPE,4))(row0.s123, input_ptr[8]); - VEC_DATA_TYPE(DATA_TYPE,3) filter_values = vload3(0, filter_ptr); - return (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s0 * row0 + - (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s1 * row1 + - (VEC_DATA_TYPE(DATA_TYPE,4))filter_values.s2 * row2; -} - -// Supported data type: half/float -DATA_TYPE conv3x3(const DATA_TYPE *input_ptr, - const DATA_TYPE *filter_ptr, - const int row_width) { - VEC_DATA_TYPE(DATA_TYPE,3) input_value = vload3(0, input_ptr); - VEC_DATA_TYPE(DATA_TYPE,3) filter_value = vload3(0, filter_ptr); - VEC_DATA_TYPE(DATA_TYPE,3) res = input_value * filter_value; - input_ptr += row_width; - input_value = vload3(0, input_ptr); - filter_value = vload3(1, filter_ptr); - res += input_value * filter_value; - input_ptr += row_width; - input_value = vload3(0, input_ptr); - filter_value = vload3(2, filter_ptr); - res += input_value * filter_value; - - return res.s0 + res.s1 + res.s2; -} -//TODO 
merge the depthwise with conv 3x3 to remove duplicate code. -__kernel void depthwise_conv_3x3(__global const DATA_TYPE *input, /* n, c, h, w */ - __global const DATA_TYPE *filter, /* m, i, kh, kw */ -#ifdef BIAS - __global const DATA_TYPE *bias, /* o */ -#endif - __global DATA_TYPE *output, /* n, c, h, w */ - __private const int in_chan_num, - __private const int out_chan_num, - __private const int in_height, - __private const int in_width, - __private const int out_height, - __private const int out_width) { - int batch = get_global_id(0); - int out_chan_blk = get_global_id(1); - int out_pixel_blk = get_global_id(2); - - const int in_pixel = in_height * in_width; - const int out_pixel = out_height * out_width; - const int multiplier = out_chan_num / in_chan_num; - - const int round_out_width = (out_width + 3) / 4; - const int out_pixel_height = out_pixel_blk / round_out_width; - const int out_pixel_width = out_pixel_blk % round_out_width; - - const int out_chan_begin = out_chan_blk * 4; - const int out_chan_end = min(out_chan_begin + 4, out_chan_num); - const int out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; - const int out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); -#ifdef STRIDE_1 - const int in_pixel_begin = out_pixel_height * in_width + out_pixel_width * 4; -#else - const int in_pixel_begin = out_pixel_height * 2 * in_width + out_pixel_width * 2 * 4; -#endif - - const int in_offset = batch * in_chan_num * in_pixel; - const int out_offset = batch * out_chan_num * out_pixel; - const DATA_TYPE *input_base = input + in_offset + in_pixel_begin; - DATA_TYPE *output_base = output + out_offset + out_pixel_begin; - - const int pixels = out_pixel_end - out_pixel_begin; - - for (int i = out_chan_begin; i < out_chan_end; ++i) { - const DATA_TYPE *input_ptr = input_base + (i / multiplier) * in_pixel; - const DATA_TYPE *filter_ptr = filter + i * 9; - DATA_TYPE *output_ptr = output_base + i * out_pixel; - if (pixels == 4) { -#ifdef BIAS - VEC_DATA_TYPE(DATA_TYPE,4) res = (VEC_DATA_TYPE(DATA_TYPE,4))bias[i]; -#else - VEC_DATA_TYPE(DATA_TYPE,4) res = 0; -#endif /* defined(BIAS) */ - -#ifdef STRIDE_1 - res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3); - res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3); - res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3); -#else - res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3); - res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3); - res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3); -#endif - vstore4(res, 0, output_ptr); - } else { - for (int p = 0; p < pixels; ++p) { -#ifdef BIAS - DATA_TYPE res = bias[i]; -#else - DATA_TYPE res = 0; -#endif - res += conv3x3(input_ptr, filter_ptr, in_width); - output_ptr[p] = res; -#ifdef STRIDE_1 - input_ptr += 1; -#else - input_ptr += 2; -#endif - } - } - } - -} diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 2a96d864..264add71 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -81,13 +81,14 @@ void Conv2dOpencl(const Tensor *input, *(static_cast(output->buffer()))); conv_2d_kernel.setArg(idx++, relux_max_limit); conv_2d_kernel.setArg(idx++, prelu_alpha); - conv_2d_kernel.setArg(idx++, static_cast(input->dim(1))); - conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); - conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); - conv_2d_kernel.setArg(idx++, 
static_cast(height)); - conv_2d_kernel.setArg(idx++, static_cast(width)); - conv_2d_kernel.setArg(idx++, static_cast(filter->dim(0))); - conv_2d_kernel.setArg(idx++, static_cast(filter->dim(1))); + conv_2d_kernel.setArg(idx++, static_cast(input->dim(1))); + conv_2d_kernel.setArg(idx++, static_cast(input->dim(2))); + conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); + conv_2d_kernel.setArg(idx++, static_cast(height)); + conv_2d_kernel.setArg(idx++, static_cast(width)); + conv_2d_kernel.setArg(idx++, static_cast(filter->dim(0))); + conv_2d_kernel.setArg(idx++, static_cast(filter->dim(1))); + conv_2d_kernel.setArg(idx++, static_cast(stride)); conv_2d_kernel.setArg(idx++, padding[0] / 2); conv_2d_kernel.setArg(idx++, padding[1] / 2); conv_2d_kernel.setArg(idx++, dilations[0]); diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index 67e15501..4397b508 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -2,60 +2,164 @@ // Copyright (c) 2017 XiaoMi All rights reserved. // +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/activation.h" #include "mace/kernels/depthwise_conv2d.h" +#include "mace/kernels/opencl/helper.h" +#include "mace/utils/tuner.h" namespace mace { namespace kernels { -extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, Tensor *output, - StatsFuture *future); - -extern void DepthwiseConvOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, Tensor *output, - StatsFuture *future); -template <> -void DepthwiseConv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, - const Tensor *bias, Tensor *output, - StatsFuture *future); - // Selection matrix: kernel_size x stride_size - static const Conv2dOpenclFunction selector[5][2] = { - {nullptr, nullptr}, - {nullptr, nullptr}, - {DepthwiseConvOpenclK3x3S1, DepthwiseConvOpenclK3x3S2}, - {nullptr, nullptr}, - {nullptr, nullptr}}; +void DepthwiseConv2d(const Tensor *input, // NHWC + const Tensor *filter, // HWIM + const Tensor *bias, + const int stride, + const int *paddings, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + const float prelu_alpha, + const DataType dt, + Tensor *output, + StatsFuture *future) { + const index_t batch = output->dim(0); + const index_t height = output->dim(1); + const index_t width = output->dim(2); + const index_t channels = output->dim(3); + const index_t input_batch = input->dim(0); + const index_t input_height = input->dim(1); + const index_t input_width = input->dim(2); + const index_t input_channels = input->dim(3); + + const index_t filter_height = filter->dim(0); + const index_t filter_width = filter->dim(1); + const index_t multiplier = filter->dim(3); + MACE_CHECK(multiplier == 1, "Multiplier > 1 not supported"); + MACE_CHECK(multiplier * input_channels == channels); + MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=", + input_channels); + + const index_t channel_blocks = RoundUpDiv4(channels); + const index_t input_channel_blocks = RoundUpDiv4(input_channels); + const index_t width_blocks = RoundUpDiv4(width); + + auto runtime = OpenCLRuntime::Global(); + std::set built_options; + std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); + 
built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); + built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + built_options.emplace("-DSTRIDE=" + ToString(stride)); + switch (activation) { + case NOOP: + break; + case RELU: + built_options.emplace("-DUSE_RELU"); + break; + case RELUX: + built_options.emplace("-DUSE_RELUX"); + break; + case PRELU: + built_options.emplace("-DUSE_PRELU"); + break; + case TANH: + built_options.emplace("-DUSE_TANH"); + break; + case SIGMOID: + built_options.emplace("-DUSE_SIGMOID"); + break; + defeult: + LOG(FATAL) << "Unknown activation type: " << activation; + } + + auto dw_conv2d_kernel = + runtime->BuildKernel("depthwise_conv2d", kernel_name, built_options); + + uint32_t idx = 0; + dw_conv2d_kernel.setArg(idx++, + *(static_cast(input->buffer()))); + dw_conv2d_kernel.setArg( + idx++, *(static_cast(filter->buffer()))); + if (bias != nullptr) { + dw_conv2d_kernel.setArg( + idx++, *(static_cast(bias->buffer()))); + } + dw_conv2d_kernel.setArg( + idx++, *(static_cast(output->buffer()))); + dw_conv2d_kernel.setArg(idx++, relux_max_limit); + dw_conv2d_kernel.setArg(idx++, prelu_alpha); + dw_conv2d_kernel.setArg(idx++, static_cast(input_height)); + dw_conv2d_kernel.setArg(idx++, static_cast(input_width)); + dw_conv2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); + dw_conv2d_kernel.setArg(idx++, static_cast(height)); + dw_conv2d_kernel.setArg(idx++, static_cast(width)); + dw_conv2d_kernel.setArg(idx++, static_cast(filter_height)); + dw_conv2d_kernel.setArg(idx++, static_cast(filter_width)); + dw_conv2d_kernel.setArg(idx++, static_cast(paddings[0] / 2)); + dw_conv2d_kernel.setArg(idx++, static_cast(paddings[1] / 2)); + dw_conv2d_kernel.setArg(idx++, static_cast(dilations[0])); + dw_conv2d_kernel.setArg(idx++, static_cast(dilations[1])); + + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + std::vector lws = {8, 16, 8, 1}; + std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, + batch, height, width, channels, multiplier); + TuningOrRun3DKernel(dw_conv2d_kernel, tuning_key, gws, lws, future); +} + +template +void DepthwiseConv2dFunctor::operator()( + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output, + StatsFuture *future) { + typedef void (*Conv2dOpenclFunction)(const Tensor *input, + const Tensor *filter, const Tensor *bias, + Tensor *output, StatsFuture *future); index_t kernel_h = filter->dim(2); index_t kernel_w = filter->dim(3); - if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] || - strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 || - selector[kernel_h - 1][strides_[0] - 1] == nullptr) { - LOG(WARNING) << "OpenCL conv2d kernel with " + if (strides_[0] != strides_[1]) { + LOG(WARNING) << "OpenCL depthwise conv2d kernel with " << "filter" << kernel_h << "x" << kernel_w << "," << " stride " << strides_[0] << "x" << strides_[1] << " is not implemented yet, using slow version"; // TODO(heliangliang) The CPU/NEON kernel should map the buffer - DepthwiseConv2dFunctor(strides_, paddings_, dilations_)( - input, filter, bias, output, future); + DepthwiseConv2dFunctor( + strides_, padding_, dilations_, activation_, relux_max_limit_, + prelu_alpha_)(input, filter, bias, output, future); return; } - auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - if 
(paddings_[0] > 0 || paddings_[1] > 0) { - Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum::v()); - ConstructInputWithPadding(input, paddings_.data(), &padded_input); - conv2d_func(&padded_input, filter, bias, output, future); - }else { - conv2d_func(input, filter, bias, output, future); - } + // Create a fake conv_2d filter to calculate the paddings and output size + std::vector fake_filter_shape(4); + fake_filter_shape[0] = filter->shape()[0]; + fake_filter_shape[1] = filter->shape()[1]; + fake_filter_shape[3] = filter->shape()[2] * filter->shape()[3]; + fake_filter_shape[2] = 1; + std::vector output_shape(4); + std::vector paddings(2); + kernels::CalcNHWCPaddingAndOutputSize( + input->shape().data(), fake_filter_shape.data(), dilations_, strides_, + padding_, output_shape.data(), paddings.data()); + + std::vector output_image_shape; + CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape); + output->ResizeImage(output_shape, output_image_shape); + + DepthwiseConv2d(input, filter, bias, strides_[0], paddings.data(), dilations_, + activation_, relux_max_limit_, prelu_alpha_, + DataTypeToEnum::value, output, future); } +template struct DepthwiseConv2dFunctor; +template struct DepthwiseConv2dFunctor; + } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc deleted file mode 100644 index ff8420fc..00000000 --- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc +++ /dev/null @@ -1,98 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#include "mace/core/common.h" -#include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/conv_2d.h" -#include "mace/kernels/opencl/helper.h" - -namespace mace { -namespace kernels { - -static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const uint32_t stride, - Tensor *output, - StatsFuture *future) { - const index_t batch = output->dim(0); - const index_t channels = output->dim(1); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - - const index_t input_batch = input->dim(0); - const index_t input_channels = input->dim(1); - const index_t input_height = input->dim(2); - const index_t input_width = input->dim(3); - - MACE_CHECK(input_batch == batch); - const index_t pixels = height * width; - const index_t channel_blocks = (channels + 3) / 4; - const index_t pixel_blocks = (width + 3) / 4 * height; - - auto runtime = OpenCLRuntime::Global(); - std::set built_options; - std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv_3x3"); - built_options.emplace("-Ddepthwise_conv_3x3=" + kernel_name); - built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(input->dtype())); - built_options.emplace(stride == 1 ? "-DSTRIDE_1" : ""); - built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); - auto conv_kernel = runtime->BuildKernel("depthwise_conv_3x3", kernel_name, built_options); - - uint32_t idx = 0; - conv_kernel.setArg(idx++, *(static_cast(input->buffer()))); - conv_kernel.setArg(idx++, *(static_cast(filter->buffer()))); - if (bias != nullptr) { - conv_kernel.setArg(idx++, *(static_cast(bias->buffer()))); - } - conv_kernel.setArg(idx++, *(static_cast(output->buffer()))); - conv_kernel.setArg(idx++, static_cast(input->dim(1))); - conv_kernel.setArg(idx++, static_cast(channels)); - conv_kernel.setArg(idx++, static_cast(input->dim(2))); - conv_kernel.setArg(idx++, static_cast(input->dim(3))); - conv_kernel.setArg(idx++, static_cast(height)); - conv_kernel.setArg(idx++, static_cast(width)); - - const uint32_t gws[3] = {static_cast(output->dim(0)), - static_cast(channel_blocks), - static_cast(pixel_blocks)}; - const uint32_t lws[3] = {static_cast(1), - static_cast(1), - static_cast(256)}; - cl::Event event; - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - conv_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), - nullptr, &event); - MACE_CHECK(error == CL_SUCCESS); - - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } -} - -extern void DepthwiseConvOpenclK3x3S1(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 1, output, future); -}; - -extern void DepthwiseConvOpenclK3x3S2(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - InnerDepthwiseConvOpenclK3x3S12(input, filter, bias, 2, output, future); -}; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index e220d344..56c157a6 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -9,7 +9,7 @@ namespace mace { namespace kernels { -// [(c+3)/4*W, N * H] +// [(C + 3) / 4 * W, N * H] void CalInOutputImageShape(const std::vector &shape, /* NHWC */ std::vector &image_shape) { MACE_CHECK(shape.size() == 4); @@ -18,13 +18,22 @@ void CalInOutputImageShape(const std::vector &shape, /* NHWC */ image_shape[1] = shape[0] * shape[1]; } -// [H * W * RoundUp<4>(Ic), (Oc + 3) / 4] -void CalFilterImageShape(const std::vector &shape, /* HWIO*/ - std::vector &image_shape) { +// [RoundUp<4>(Ic) * H * W, (Oc + 3) / 4] +void CalConv2dFilterImageShape(const std::vector &shape, /* HWIO */ + std::vector &image_shape) { MACE_CHECK(shape.size() == 4); image_shape.resize(2); image_shape[0] = shape[0] * shape[1] * RoundUp(shape[2], 4); - image_shape[1] = RoundUpDiv4(shape.back()); + image_shape[1] = RoundUpDiv4(shape[3]); +} + +// [H * W * M, (Ic + 3) / 4] +void CalDepthwiseConv2dFilterImageShape(const std::vector &shape, /* HWIM */ + std::vector &image_shape) { + MACE_CHECK(shape.size() == 4); + image_shape.resize(2); + image_shape[0] = shape[0] * shape[1] * shape[3]; + image_shape[1] = RoundUpDiv4(shape[2]); } // [(size + 3) / 4, 1] @@ -40,11 +49,17 @@ void CalImage2DShape(const std::vector &shape, /* NHWC */ const BufferType type, std::vector &image_shape) { switch (type) { - case FILTER:CalFilterImageShape(shape, image_shape); + case CONV2D_FILTER: + CalConv2dFilterImageShape(shape, image_shape); + break; + case DW_CONV2D_FILTER: + CalDepthwiseConv2dFilterImageShape(shape, 
image_shape); break; - case IN_OUT:CalInOutputImageShape(shape, image_shape); + case IN_OUT: + CalInOutputImageShape(shape, image_shape); break; - case ARGUMENT:CalArgImageShape(shape, image_shape); + case ARGUMENT: + CalArgImageShape(shape, image_shape); break; default:LOG(FATAL) << "Mace not supported yet."; } diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 466064b6..dc40514f 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -17,9 +17,10 @@ namespace kernels { const float kMaxKernelExeTime = 1000.0; // microseconds enum BufferType { - FILTER = 0, - IN_OUT = 1, - ARGUMENT = 2 + CONV2D_FILTER = 0, + DW_CONV2D_FILTER = 1, + IN_OUT = 2, + ARGUMENT = 3 }; void CalImage2DShape(const std::vector &shape, /* NHWC */ diff --git a/mace/ops/buffer_to_image.h b/mace/ops/buffer_to_image.h index 40c412b9..4b3cfb6e 100644 --- a/mace/ops/buffer_to_image.h +++ b/mace/ops/buffer_to_image.h @@ -20,7 +20,7 @@ class BufferToImageOp: public Operator { const Tensor *input_tensor = this->Input(INPUT); kernels::BufferType type = static_cast(OperatorBase::GetSingleArgument( - "buffer_type", static_cast(kernels::FILTER))); + "buffer_type", static_cast(kernels::CONV2D_FILTER))); Tensor *output = this->Output(OUTPUT); functor_(const_cast(input_tensor), type, output, future); diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index 43092084..f77bbde0 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -75,27 +75,27 @@ TEST(BufferToImageTest, InputLarge) { } TEST(BufferToImageTest, Filter1x1Small) { - TestBidirectionTransform(kernels::FILTER, {1, 1, 3, 5}); + TestBidirectionTransform(kernels::CONV2D_FILTER, {1, 1, 3, 5}); } TEST(BufferToImageTest, Filter1x1Media) { - TestBidirectionTransform(kernels::FILTER, {1, 1, 13, 17}); + TestBidirectionTransform(kernels::CONV2D_FILTER, {1, 1, 13, 17}); } TEST(BufferToImageTest, Filter1x1Large) { - TestBidirectionTransform(kernels::FILTER, {1, 1, 128, 512}); + TestBidirectionTransform(kernels::CONV2D_FILTER, {1, 1, 128, 512}); } TEST(BufferToImageTest, Filter3x3Small) { - TestBidirectionTransform(kernels::FILTER, {3, 3, 3, 5}); + TestBidirectionTransform(kernels::CONV2D_FILTER, {3, 3, 3, 5}); } TEST(BufferToImageTest, Filter3x3Meida) { - TestBidirectionTransform(kernels::FILTER, {3, 3, 13, 17}); + TestBidirectionTransform(kernels::CONV2D_FILTER, {3, 3, 13, 17}); } TEST(BufferToImageTest, Filter3x3Large) { - TestBidirectionTransform(kernels::FILTER, {3, 3, 128, 256}); + TestBidirectionTransform(kernels::CONV2D_FILTER, {3, 3, 128, 256}); } template diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 83e264d8..42d187b3 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -36,7 +36,7 @@ static void Conv2d(int iters, BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage(net, "Filter", "FilterImage", - kernels::BufferType::FILTER); + kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index d4df0df8..a12842e2 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -102,7 +102,7 @@ void TestNHWCSimple3x3VALID() { BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage(net, "Filter", "FilterImage", - kernels::BufferType::FILTER); + kernels::BufferType::CONV2D_FILTER); 
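For reference, the 2D image shapes implied by the three buffer types handled above can be written out directly. This is a standalone sketch of the same arithmetic as the helpers in mace/kernels/opencl/helper.cc; the function and variable names here are illustrative, not part of the patch.

#include <cstdint>
#include <vector>

// Round-up helpers mirroring RoundUpDiv4 / RoundUp<4> used by the patch.
static int64_t RoundUpDiv4(int64_t v) { return (v + 3) / 4; }
static int64_t RoundUp4(int64_t v) { return RoundUpDiv4(v) * 4; }

// IN_OUT, NHWC -> [ceil(C/4) * W, N * H]
std::vector<int64_t> InOutImageShape(const std::vector<int64_t> &nhwc) {
  return {RoundUpDiv4(nhwc[3]) * nhwc[2], nhwc[0] * nhwc[1]};
}

// CONV2D_FILTER, HWIO -> [H * W * RoundUp<4>(I), ceil(O/4)]
std::vector<int64_t> Conv2dFilterImageShape(const std::vector<int64_t> &hwio) {
  return {hwio[0] * hwio[1] * RoundUp4(hwio[2]), RoundUpDiv4(hwio[3])};
}

// DW_CONV2D_FILTER, HWIM -> [H * W * M, ceil(I/4)]
std::vector<int64_t> DepthwiseFilterImageShape(const std::vector<int64_t> &hwim) {
  return {hwim[0] * hwim[1] * hwim[3], RoundUpDiv4(hwim[2])};
}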
BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") @@ -159,7 +159,7 @@ void TestNHWCSimple3x3SAME() { BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage(net, "Filter", "FilterImage", - kernels::BufferType::FILTER); + kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") @@ -264,7 +264,7 @@ void TestNHWCSimple3x3WithoutBias() { BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage(net, "Filter", "FilterImage", - kernels::BufferType::FILTER); + kernels::BufferType::CONV2D_FILTER); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputImage") @@ -371,7 +371,7 @@ static void TestNHWCCombined3x3() { BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage(net, "Filter", "FilterImage", - kernels::BufferType::FILTER); + kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); @@ -444,7 +444,7 @@ void TestConv1x1() { BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage(net, "Filter", "FilterImage", - kernels::BufferType::FILTER); + kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); @@ -535,7 +535,7 @@ static void TestComplexConvNxNS12(const std::vector &shape) { BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage(net, "Filter", "FilterImage", - kernels::BufferType::FILTER); + kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); @@ -628,7 +628,7 @@ static void TestHalfComplexConvNxNS12(const std::vector &input_shape, BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); BufferToImage(net, "Filter", "FilterImage", - kernels::BufferType::FILTER); + kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); @@ -759,7 +759,7 @@ static void TestDilationConvNxN(const std::vector &shape, const int dil // run on gpu BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") diff --git a/mace/ops/core_test.cc b/mace/ops/core_test.cc index 206c4d6c..30731dfe 100644 --- a/mace/ops/core_test.cc +++ b/mace/ops/core_test.cc @@ -15,7 +15,7 @@ TEST(CoreTest, INIT_MODE) { OpDefBuilder("BufferToImage", "BufferToImageTest") .Input("Input") .Output("B2IOutput") - .AddIntArg("buffer_type", kernels::BufferType::FILTER) + .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER) .AddIntArg("mode", static_cast(NetMode::INIT)) .Finalize(&op_defs[op_defs.size() - 1]); @@ -33,7 +33,7 @@ TEST(CoreTest, INIT_MODE) { OpDefBuilder("ImageToBuffer", "ImageToBufferTest") .Input("B2IOutput") .Output("Output") - .AddIntArg("buffer_type", kernels::BufferType::FILTER) + .AddIntArg("buffer_type", kernels::BufferType::CONV2D_FILTER) .Finalize(&op_defs[op_defs.size() - 1]); NetDef net_def; diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index 4e99a378..da8a51d5 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -26,6 +26,12 @@ void Register_DepthwiseConv2d(OperatorRegistry 
*op_registry) { .TypeConstraint("T") .Build(), DepthwiseConv2dOp); + + REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") + .Device(DeviceType::OPENCL) + .TypeConstraint("T") + .Build(), + DepthwiseConv2dOp); } } // namespace mace diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h index 9b36e566..ed4ff152 100644 --- a/mace/ops/depthwise_conv2d.h +++ b/mace/ops/depthwise_conv2d.h @@ -2,8 +2,8 @@ // Copyright (c) 2017 XiaoMi All rights reserved. // -#ifndef MACE_OPS_DEPTHWISE_CONV_H_ -#define MACE_OPS_DEPTHWISE_CONV_H_ +#ifndef MACE_OPS_DEPTHWISE_CONV2D_H_ +#define MACE_OPS_DEPTHWISE_CONV2D_H_ #include @@ -18,10 +18,13 @@ template class DepthwiseConv2dOp : public ConvPool2dOpBase { public: DepthwiseConv2dOp(const OperatorDef &op_def, Workspace *ws) - : ConvPool2dOpBase(op_def, ws) { - functor_.strides_ = this->strides_.data(); - functor_.dilations_ = this->dilations_.data(); - } + : ConvPool2dOpBase(op_def, ws), + functor_(this->strides_.data(), + this->padding_, + this->dilations_.data(), + kernels::ActivationType::NOOP, + 0.0f, + 0.0f) {} bool Run(StatsFuture *future) override { const Tensor *input = this->Input(INPUT); @@ -31,23 +34,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase { bias = this->Input(BIAS); } Tensor *output = this->Output(OUTPUT); - - // resize filter shape. - std::vector filter_shape(filter->shape().begin(), - filter->shape().end()); - filter_shape[0] *= filter_shape[1]; - filter_shape[1] = 1; - std::vector output_shape(4); - std::vector paddings(2); - kernels::CalcPaddingAndOutputSize( - input->shape().data(), filter_shape.data(), this->dilations_.data(), - this->strides_.data(), this->padding_, output_shape.data(), - paddings.data()); - output->Resize(output_shape); - functor_.paddings_ = paddings; - functor_(input, filter, bias, output, future); - return true; } @@ -59,6 +46,6 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase { OP_OUTPUT_TAGS(OUTPUT); }; -} // namespace mace +} // namespace mace -#endif // MACE_OPS_DEPTHWISE_CONV_H_ +#endif // MACE_OPS_DEPTHWISE_CONV2D_H_ diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index e2f2872d..43950a49 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ -7,45 +7,200 @@ using namespace mace; +namespace { + class DepthwiseConv2dOpTest : public OpsTestBase {}; -template +template void SimpleValidTest() { testing::internal::LogToStderr(); // Construct graph OpsTestNet net; - OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray("Input", {1, 2, 2, 3}, - {1, 3, 5, 7, 9, 11, 2, 4, 6, 8, 10, 12}); net.AddInputFromArray( - "Filter", {2, 2, 2, 2}, - {1.0f, 5.0f, 9.0f, 13.0f, 2.0f, 6.0f, 10.0f, 14.0f, 3.0f, 7.0f, 11.0f, - 15.0f, 4.0f, 8.0f, 12.0f, 16.0f}); - net.AddInputFromArray("Bias", {4}, {.1f, .2f, .3f, .4f}); - // Run - net.RunOp(D); + "Input", {1, 3, 3, 2}, + {1, 2, 2, 4, 3, 6, 4, 8, 5, 10, 6, 12, 7, 14, 8, 16, 9, 18}); + net.AddInputFromArray( + "Filter", {2, 2, 2, 1}, {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f}); + net.AddInputFromArray("Bias", {2}, {.1f, .2f}); + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", + kernels::BufferType::DW_CONV2D_FILTER); 
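To make the expected tensors in these tests easy to check by hand, here is a minimal NHWC/HWIM reference loop that mirrors the CPU functor added earlier in this patch (VALID padding, stride 1, dilation 1, batch 1 only; the name DepthwiseRef and its signature are illustrative, not part of the patch).

#include <vector>

// in:   H x W x I       (NHWC with N == 1)
// flt:  Kh x Kw x I x M (HWIM)
// bias: I * M
std::vector<float> DepthwiseRef(const std::vector<float> &in,
                                const std::vector<float> &flt,
                                const std::vector<float> &bias,
                                int H, int W, int I, int Kh, int Kw, int M) {
  const int OH = H - Kh + 1, OW = W - Kw + 1, O = I * M;
  std::vector<float> out(OH * OW * O);
  for (int h = 0; h < OH; ++h)
    for (int w = 0; w < OW; ++w)
      for (int c = 0; c < O; ++c) {  // c == in_channel * M + m
        const int ic = c / M, m = c % M;
        float sum = bias[c];
        for (int kh = 0; kh < Kh; ++kh)
          for (int kw = 0; kw < Kw; ++kw)
            sum += in[((h + kh) * W + (w + kw)) * I + ic] *
                   flt[((kh * Kw + kw) * I + ic) * M + m];
        out[(h * OW + w) * O + c] = sum;
      }
  return out;
}

For the 3x3x2 input and 2x2x2x1 filter of SimpleValidTest above, this gives 1*1 + 2*2 + 4*3 + 5*4 + 0.1 = 37.1 for the first output channel at (0, 0) and 2*2 + 4*4 + 8*6 + 10*8 + 0.2 = 148.2 for the second, matching the expected {1, 2, 2, 2} tensor checked in the test.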
+ BufferToImage(net, "Bias", "BiasImage", + kernels::BufferType::ARGUMENT); + OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", + kernels::BufferType::IN_OUT); + + } else { + OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } // Check - auto expected = CreateTensor( - {1, 4, 1, 2}, - {196.1f, 252.1f, 216.2f, 280.2f, 272.3f, 344.3f, 296.4f, 376.4f}); + auto expected = CreateTensor({1, 2, 2, 2}, {37.1f, 148.2f, 47.1f, 188.2f, + 67.1f, 268.2f, 77.1f, 308.2f}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +} + +TEST_F(DepthwiseConv2dOpTest, SimpleCPU) { + SimpleValidTest(); +} + +TEST_F(DepthwiseConv2dOpTest, SimpleOpenCL) { + SimpleValidTest(); +} - ExpectTensorNear(*expected, *net.GetOutput("Output"), 1e-5); +TEST_F(DepthwiseConv2dOpTest, SimpleOpenCLHalf) { + SimpleValidTest(); } -TEST_F(DepthwiseConv2dOpTest, SimpleCPU) { SimpleValidTest(); } +template +void ComplexValidTest() { + testing::internal::LogToStderr(); + // Construct graph + OpsTestNet net; -template + // Add input data + net.AddInputFromArray( + "Input", {1, 10, 10, 3}, + {0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, + 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, + 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, + 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, + 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59, + 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, + 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.81, 0.82, 0.83, + 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, + 0.96, 0.97, 0.98, 0.99, 1.0, 1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, + 1.08, 1.09, 1.1, 1.11, 1.12, 1.13, 1.14, 1.15, 1.16, 1.17, 1.18, 1.19, + 1.2, 1.21, 1.22, 1.23, 1.24, 1.25, 1.26, 1.27, 1.28, 1.29, 1.3, 1.31, + 1.32, 1.33, 1.34, 1.35, 1.36, 1.37, 1.38, 1.39, 1.4, 1.41, 1.42, 1.43, + 1.44, 1.45, 1.46, 1.47, 1.48, 1.49, 1.5, 1.51, 1.52, 1.53, 1.54, 1.55, + 1.56, 1.57, 1.58, 1.59, 1.6, 1.61, 1.62, 1.63, 1.64, 1.65, 1.66, 1.67, + 1.68, 1.69, 1.7, 1.71, 1.72, 1.73, 1.74, 1.75, 1.76, 1.77, 1.78, 1.79, + 1.8, 1.81, 1.82, 1.83, 1.84, 1.85, 1.86, 1.87, 1.88, 1.89, 1.9, 1.91, + 1.92, 1.93, 1.94, 1.95, 1.96, 1.97, 1.98, 1.99, 2.0, 2.01, 2.02, 2.03, + 2.04, 2.05, 2.06, 2.07, 2.08, 2.09, 2.1, 2.11, 2.12, 2.13, 2.14, 2.15, + 2.16, 2.17, 2.18, 2.19, 2.2, 2.21, 2.22, 2.23, 2.24, 2.25, 2.26, 2.27, + 2.28, 2.29, 2.3, 2.31, 2.32, 2.33, 2.34, 2.35, 2.36, 2.37, 2.38, 2.39, + 2.4, 2.41, 2.42, 2.43, 2.44, 2.45, 2.46, 2.47, 2.48, 2.49, 2.5, 2.51, + 2.52, 2.53, 2.54, 2.55, 2.56, 2.57, 2.58, 2.59, 2.6, 2.61, 2.62, 2.63, + 2.64, 2.65, 2.66, 2.67, 2.68, 2.69, 2.7, 2.71, 2.72, 2.73, 2.74, 2.75, + 2.76, 2.77, 2.78, 2.79, 2.8, 2.81, 2.82, 2.83, 2.84, 2.85, 2.86, 2.87, + 2.88, 2.89, 2.9, 2.91, 2.92, 2.93, 2.94, 2.95, 
2.96, 2.97, 2.98, 2.99}); + net.AddInputFromArray( + "Filter", {5, 5, 3, 1}, + {0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, + 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, + 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, + 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, + 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, + 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61, 0.62, 0.63, 0.64, 0.65, + 0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74}); + net.AddInputFromArray("Bias", {6}, + {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", + kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", + kernels::BufferType::DW_CONV2D_FILTER); + BufferToImage(net, "Bias", "BiasImage", + kernels::BufferType::ARGUMENT); + OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", + kernels::BufferType::IN_OUT); + + } else { + OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {2, 2}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } + + // Check + auto expected = CreateTensor( + {1, 5, 5, 3}, + {4.48200035, 4.63479996, 4.79079962, 5.85899973, 6.05599976, + 6.25699997, 6.38100004, 6.59000015, 6.80300045, 6.90299988, + 7.1239996, 7.34899998, 4.03559971, 4.16820002, 4.30319977, + 8.90999985, 9.1760006, 9.44599915, 11.20499992, 11.54500103, + 11.89000034, 11.74499989, 12.09999943, 12.46000004, 12.28499985, + 12.65500069, 13.03000069, 7.00200033, 7.22399998, 7.44900036, + 13.4100008, 13.79599953, 14.18599987, 16.60500145, 17.09499741, + 17.59000015, 17.14500046, 17.65000153, 18.15999794, 17.68499947, + 18.20499992, 18.72999954, 9.97200012, 10.28399944, 10.59899998, + 17.90999985, 18.41600037, 18.92599869, 22.00500107, 22.64500046, + 23.28999901, 22.54500008, 23.19999886, 23.8599987, 23.0850029, + 23.75500107, 24.43000031, 12.94200039, 13.34400082, 13.7489996, + 6.97500038, 7.29659986, 7.62060022, 8.32049942, 8.72700024, + 9.13650036, 8.5095005, 8.92500019, 9.34349918, 8.69849968, + 9.12300014, 9.55049992, 4.55220032, 4.80690002, 5.06340027}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.2); +} + +TEST_F(DepthwiseConv2dOpTest, ComplexCPU) { + ComplexValidTest(); +} + +TEST_F(DepthwiseConv2dOpTest, ComplexOpenCL) { + ComplexValidTest(); +} + +TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) { + ComplexValidTest(); +} + +template void TestNxNS12(const index_t height, const index_t width) { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, @@ -53,11 +208,18 @@ void TestNxNS12(const index_t height, const index_t width) { srand(time(NULL)); // generate random input - index_t batch = 1; - index_t input_channels = 3; - index_t multiplier = 2; + index_t batch = 1 + rand() % 5; + index_t input_channels = 3 + rand() % 16; + index_t multiplier = 1; // Construct graph 
     OpsTestNet net;
+
+    // Add input data
+    net.AddRandomInput<D, float>("Input",
+                                 {batch, height, width, input_channels});
+    net.AddRandomInput<D, float>(
+        "Filter", {kernel_h, kernel_w, input_channels, multiplier});
+    net.AddRandomInput<D, float>("Bias", {multiplier * input_channels});
     OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
         .Input("Input")
         .Input("Filter")
@@ -66,24 +228,56 @@ void TestNxNS12(const index_t height, const index_t width) {
         .AddIntsArg("strides", {stride_h, stride_w})
         .AddIntArg("padding", type)
         .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
-    // Add input data
-    net.AddRandomInput<D, float>("Input",
-                                 {batch, input_channels, height, width});
-    net.AddRandomInput<D, float>(
-        "Filter", {multiplier, input_channels, kernel_h, kernel_w});
-    net.AddRandomInput<D, float>("Bias", {multiplier * input_channels});
-    // Run on device
-    net.RunOp(D);
-
+    // Run on cpu
+    net.RunOp();
     // Check
     Tensor expected;
     expected.Copy(*net.GetOutput("Output"));
-    // run cpu
-    net.RunOp();
-    ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-3);
+    if (D == DeviceType::OPENCL) {
+      BufferToImage<D, T>(net, "Input", "InputImage",
+                          kernels::BufferType::IN_OUT);
+      BufferToImage<D, T>(net, "Filter", "FilterImage",
+                          kernels::BufferType::DW_CONV2D_FILTER);
+      BufferToImage<D, T>(net, "Bias", "BiasImage",
+                          kernels::BufferType::ARGUMENT);
+      OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+          .Input("InputImage")
+          .Input("FilterImage")
+          .Input("BiasImage")
+          .Output("OutputImage")
+          .AddIntsArg("strides", {stride_h, stride_w})
+          .AddIntArg("padding", type)
+          .AddIntsArg("dilations", {1, 1})
+          .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+          .Finalize(net.NewOperatorDef());
+
+      net.RunOp(D);
+
+      // Transfer output
+      ImageToBuffer<D, T>(net, "OutputImage", "DeviceOutput",
+                          kernels::BufferType::IN_OUT);
+    } else {
+      OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
+          .Input("Input")
+          .Input("Filter")
+          .Input("Bias")
+          .Output("DeviceOutput")
+          .AddIntsArg("strides", {stride_h, stride_w})
+          .AddIntArg("padding", type)
+          .AddIntsArg("dilations", {1, 1})
+          .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+          .Finalize(net.NewOperatorDef());
+
+      // Run
+      net.RunOp(D);
+    }
+
+    // Check
+    ExpectTensorNear<float>(expected, *net.GetOutput("DeviceOutput"), 0.1);
   };
   for (int kernel_size : {3}) {
@@ -96,32 +290,47 @@ void TestNxNS12(const index_t height, const index_t width) {
 #if __ARM_NEON
 TEST_F(DepthwiseConv2dOpTest, NeonSimpleNxNS12) {
-  TestNxNS12<DeviceType::NEON>(4, 4);
+  TestNxNS12<DeviceType::NEON, float>(4, 4);
 }
 #endif
 TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) {
-  TestNxNS12<DeviceType::OPENCL>(4, 4);
+  TestNxNS12<DeviceType::OPENCL, float>(4, 4);
+}
+
+TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12Half) {
+  TestNxNS12<DeviceType::OPENCL, half>(4, 4);
 }
 #if __ARM_NEON
 TEST_F(DepthwiseConv2dOpTest, NeonAlignedNxNS12) {
-  TestNxNS12<DeviceType::NEON>(64, 64);
-  TestNxNS12<DeviceType::NEON>(128, 128);
+  TestNxNS12<DeviceType::NEON, float>(64, 64);
+  TestNxNS12<DeviceType::NEON, float>(128, 128);
 }
 #endif
 TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12) {
-  TestNxNS12<DeviceType::OPENCL>(64, 64);
-  TestNxNS12<DeviceType::OPENCL>(128, 128);
+  TestNxNS12<DeviceType::OPENCL, float>(64, 64);
+  TestNxNS12<DeviceType::OPENCL, float>(128, 128);
+}
+
+TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12Half) {
+  TestNxNS12<DeviceType::OPENCL, half>(64, 64);
+  TestNxNS12<DeviceType::OPENCL, half>(128, 128);
 }
 #if __ARM_NEON
 TEST_F(DepthwiseConv2dOpTest, NeonUnalignedNxNS12) {
-  TestNxNS12<DeviceType::NEON>(107, 113);
+  TestNxNS12<DeviceType::NEON, float>(107, 113);
 }
 #endif
 TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12) {
-  TestNxNS12<DeviceType::OPENCL>(107, 113);
+  TestNxNS12<DeviceType::OPENCL, float>(107, 113);
 }
+
+TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12Half) {
+  TestNxNS12<DeviceType::OPENCL, half>(107, 113);
+}
+
+}  // namespace
diff --git a/mace/ops/depthwise_conv_2d_benchmark.cc b/mace/ops/depthwise_conv_2d_benchmark.cc
index 8b4a5776..5acc514f 100644
--- a/mace/ops/depthwise_conv_2d_benchmark.cc
+++ b/mace/ops/depthwise_conv_2d_benchmark.cc
@@ -14,44 +14,65 @@ namespace mace {
 template <DeviceType D, typename T>
 static void DepthwiseConv2d(int iters,
                             int batch,
-                            int channels,
+                            int input_channels,
                             int height,
                             int width,
                             int kernel_h,
                             int kernel_w,
                             int stride,
                             Padding padding,
-                            int output_channels) {
+                            int multiplier) {
   mace::testing::StopTiming();
   OpsTestNet net;
-  OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
-      .Input("Input")
-      .Input("Filter")
-      .Input("Bias")
-      .Output("Output")
-      .AddIntsArg("strides", {stride, stride})
-      .AddIntArg("padding", padding)
-      .AddIntsArg("dilations", {1, 1})
-      .Finalize(net.NewOperatorDef());
   // Add input data
-  net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
-  net.AddRandomInput<D, float>("Filter",
-                               {output_channels, channels, kernel_h, kernel_w});
-  net.AddRandomInput<D, float>("Bias", {output_channels * channels});
+  net.AddRandomInput<D, float>("Input", {batch, height, width, input_channels});
+  net.AddRandomInput<D, float>(
+      "Filter", {kernel_h, kernel_w, input_channels, multiplier});
+  net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage",
+                        kernels::BufferType::IN_OUT);
+    BufferToImage<D, T>(net, "Filter", "FilterImage",
+                        kernels::BufferType::DW_CONV2D_FILTER);
+    BufferToImage<D, T>(net, "Bias", "BiasImage",
+                        kernels::BufferType::ARGUMENT);
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
+        .Input("InputImage")
+        .Input("FilterImage")
+        .Input("BiasImage")
+        .Output("Output")
+        .AddIntsArg("strides", {stride, stride})
+        .AddIntArg("padding", padding)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  } else {
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {stride, stride})
+        .AddIntArg("padding", padding)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  }
   // Warm-up
-  for (int i = 0; i < 5; ++i) {
+  for (int i = 0; i < 2; ++i) {
     net.RunOp(D);
+    net.Sync();
   }
-  net.Sync();
   mace::testing::StartTiming();
   while (iters--) {
     net.RunOp(D);
+    net.Sync();
   }
-  net.Sync();
 }
 #define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
@@ -68,20 +89,21 @@ static void DepthwiseConv2d(int iters,
   BENCHMARK( \
       BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
-#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE)        \
-  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU);  \
+#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
+  BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
   BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 2, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 1, SAME, 1, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 2, float);
-BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 2, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 1, float);
+BM_DEPTHWISE_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 1, float);
 BM_DEPTHWISE_CONV_2D(1, 3, 512, 512, 3, 3, 2, SAME, 1, float);
+
 }  // namespace mace
diff --git a/mace/ops/fused_conv_2d_test.cc b/mace/ops/fused_conv_2d_test.cc
index eef3b1a7..bdc4c3cf 100644
--- a/mace/ops/fused_conv_2d_test.cc
+++ b/mace/ops/fused_conv_2d_test.cc
@@ -26,7 +26,7 @@ void TestNHWCSimple3x3VALID() {
     BufferToImage(net, "Input", "InputImage",
                   kernels::BufferType::IN_OUT);
     BufferToImage(net, "Filter", "FilterImage",
-                  kernels::BufferType::FILTER);
+                  kernels::BufferType::CONV2D_FILTER);
     BufferToImage(net, "Bias", "BiasImage",
                   kernels::BufferType::ARGUMENT);
     OpDefBuilder("FusedConv2D", "FusedConv2dTest")
@@ -83,7 +83,7 @@ void TestNHWCSimple3x3SAME() {
     BufferToImage(net, "Input", "InputImage",
                   kernels::BufferType::IN_OUT);
     BufferToImage(net, "Filter", "FilterImage",
-                  kernels::BufferType::FILTER);
+                  kernels::BufferType::CONV2D_FILTER);
     BufferToImage(net, "Bias", "BiasImage",
                   kernels::BufferType::ARGUMENT);
     OpDefBuilder("FusedConv2D", "FusedConv2dTest")
@@ -151,7 +151,7 @@ void TestNHWCSimple3x3WithoutBias() {
     BufferToImage(net, "Input", "InputImage",
                   kernels::BufferType::IN_OUT);
     BufferToImage(net, "Filter", "FilterImage",
-                  kernels::BufferType::FILTER);
+                  kernels::BufferType::CONV2D_FILTER);
     OpDefBuilder("FusedConv2D", "FusedConv2dTest")
         .Input("InputImage")
@@ -220,7 +220,7 @@ void TestConv1x1() {
     BufferToImage(net, "Input", "InputImage",
                   kernels::BufferType::IN_OUT);
     BufferToImage(net, "Filter", "FilterImage",
-                  kernels::BufferType::FILTER);
+                  kernels::BufferType::CONV2D_FILTER);
     BufferToImage(net, "Bias", "BiasImage",
                   kernels::BufferType::ARGUMENT);
@@ -311,7 +311,7 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
     BufferToImage(net, "Input", "InputImage",
                   kernels::BufferType::IN_OUT);
     BufferToImage(net, "Filter", "FilterImage",
-                  kernels::BufferType::FILTER);
+                  kernels::BufferType::CONV2D_FILTER);
     BufferToImage(net, "Bias", "BiasImage",
                   kernels::BufferType::ARGUMENT);
@@ -397,7 +397,7 @@ static void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape) {
     BufferToImage(net, "Input", "InputImage",
                   kernels::BufferType::IN_OUT);
     BufferToImage(net, "Filter", "FilterImage",
-                  kernels::BufferType::FILTER);
+                  kernels::BufferType::CONV2D_FILTER);
     BufferToImage(net, "Bias", "BiasImage",
                   kernels::BufferType::ARGUMENT);
@@ -475,7 +475,7 @@ static void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
     BufferToImage(net, "Input", "InputImage",
                   kernels::BufferType::IN_OUT);
     BufferToImage(net, "Filter", "FilterImage",
-                  kernels::BufferType::FILTER);
+                  kernels::BufferType::CONV2D_FILTER);
     BufferToImage(net, "Bias", "BiasImage",
                   kernels::BufferType::ARGUMENT);
@@ -551,7 +551,7 @@ static void TestAtrousConvNxN(const std::vector<index_t> &shape, const int dilat
     // run on gpu
     BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT);
-    BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER);
+    BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER);
BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("FusedConv2D", "FusedConv2dTest") @@ -633,7 +633,7 @@ static void TestGeneralHalfAtrousConv(const std::vector &image_shape, // run on gpu BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::CONV2D_FILTER); BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("FusedConv2D", "FusedConv2dTest") diff --git a/mace/ops/image_to_buffer.h b/mace/ops/image_to_buffer.h index 9b07b829..777919d4 100644 --- a/mace/ops/image_to_buffer.h +++ b/mace/ops/image_to_buffer.h @@ -21,7 +21,7 @@ class ImageToBufferOp: public Operator { Tensor *output = this->Output(OUTPUT); kernels::BufferType type = static_cast(OperatorBase::GetSingleArgument( - "buffer_type", static_cast(kernels::FILTER))); + "buffer_type", static_cast(kernels::CONV2D_FILTER))); functor_(output, type, const_cast(input_tensor), future); return true; } -- GitLab