diff --git a/mace/core/half.h b/mace/core/half.h index dde806fb153f76982f26f1c9d6beb28eab516ab2..9df24bd43956aa56b5de833800d63cdda5281269 100644 --- a/mace/core/half.h +++ b/mace/core/half.h @@ -1098,7 +1098,7 @@ namespace half_float /// Conversion constructor. /// \param rhs float to convert - explicit half(float rhs) : data_(detail::float2half(rhs)) {} + half(float rhs) : data_(detail::float2half(rhs)) {} /// Conversion to single-precision. /// \return single precision value representing expression value diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index a717c6a48513eb075ae4b36124213a109a7f4786..e9a41cfcafef011da308a4df81b3dbc79874bfb2 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -11,13 +11,23 @@ namespace mace { namespace kernels { +struct Conv2dFunctorBase { + Conv2dFunctorBase(const int *strides, + const Padding &paddings, + const int *dilations) + : strides_(strides), dilations_(dilations), paddings_(paddings) {} + + const int *strides_; // [stride_h, stride_w] + const int *dilations_; // [dilation_h, dilation_w] + Padding paddings_; +}; + template -struct Conv2dFunctor { - Conv2dFunctor() {} +struct Conv2dFunctor : Conv2dFunctorBase { Conv2dFunctor(const int *strides, const Padding &paddings, const int *dilations) - : strides_(strides), dilations_(dilations), paddings_(paddings) {} + : Conv2dFunctorBase(strides, paddings, dilations) {} void operator()(const Tensor *input, const Tensor *filter, @@ -76,9 +86,10 @@ struct Conv2dFunctor { for (int h = 0; h < height; ++h) { for (int w = 0; w < width; ++w) { for (int c = 0; c < channels; ++c) { - T bias_channel = bias_data ? 
bias_data[c] : 0; + T bias_channel = 0.0f; + if (bias) bias_channel = bias_data[c]; *output_data = bias_channel; - T sum = 0; + T sum = 0.0f; const T *filter_ptr = filter_data + c; for (int kh = 0; kh < kernel_h; ++kh) { for (int kw = 0; kw < kernel_w; ++kw) { @@ -113,9 +124,6 @@ struct Conv2dFunctor { } - const int *strides_; // [stride_h, stride_w] - const int *dilations_; // [dilation_h, dilation_w] - Padding paddings_; }; template<> @@ -123,11 +131,19 @@ void Conv2dFunctor::operator()(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output); -template<> -void Conv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output); + +template +struct Conv2dFunctor : Conv2dFunctorBase { + Conv2dFunctor(const int *strides, + const Padding &paddings, + const int *dilations) + : Conv2dFunctorBase(strides, paddings, dilations) {} + + void operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output); +}; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 511e4598309561a5a453113784db9de4d933399b..61faa995ce86792a302068af11aed7b784b2834f 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -24,8 +24,8 @@ void BufferToImageFunctor::operator()(Tensor *buffer, } std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(image->dtype())); - built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(image->dtype())); + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(DataTypeToEnum::value)); + built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(DataTypeToEnum::value)); auto runtime = OpenCLRuntime::Get(); string kernel_name; switch (type) { diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index 
689a7df2f7663e3ed509faedc08eb871ba9c2a1b..7aaf367c560b5ef24242d18972dfd96ab1db8d61 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -10,7 +10,10 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] __read_only image2d_t bn_offset, /* cout%4 * cout/4 */ #endif __write_only image2d_t output, + __private const int in_height, + __private const int in_width, __private const int in_ch_blks, + __private const int height, __private const int width) { const int out_ch_blk = get_global_id(0); const int out_w_blk = get_global_id(1); @@ -32,24 +35,37 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] #endif int4 w; +#if STRIDE == 1 w.x = out_w_blk; w.y = w.x + out_w_blks; w.z = w.y + out_w_blks; w.w = w.z + out_w_blks; + int out_hb_idx = (out_hb % height); +#else + w.x = out_w_blk * 2; + w.y = (out_w_blk + out_w_blks) * 2; + w.z = (out_w_blk + 2 * out_w_blks) * 2; + w.w = (out_w_blk + 3 * out_w_blks) * 2; + int out_hb_idx = (out_hb % height) * 2; +#endif + + w.x = select(w.x, INT_MIN, w.x >= in_width); + w.y = select(w.y, INT_MIN, w.y >= in_width); + w.z = select(w.z, INT_MIN, w.z >= in_width); + w.w = select(w.w, INT_MIN, w.w >= in_width); + + out_hb_idx = select(out_hb_idx + (out_hb / height) * in_height, + -1, + out_hb_idx >= in_height); // Unrolling this loop hurt perfmance int in_x_base = 0; for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb)); - DATA_TYPE4 in1 = 0; - DATA_TYPE4 in2 = 0; - DATA_TYPE4 in3 = 0; - if (w.y < width) { - // conditional load hurt perf, this branching helps sometimes - in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb)); - in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb)); - in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb)); - } + + DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + 
w.x, out_hb_idx)); + DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx)); + DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx)); + DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx)); const int filter_x0 = in_ch_blk << 2; DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk)); @@ -78,7 +94,7 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] out3 += in3.z * weights2; out3 += in3.w * weights3; - in_x_base += width; + in_x_base += in_width; } #ifdef FUSED_BATCH_NORM @@ -111,14 +127,19 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] #endif const int out_x_base = out_ch_blk * width; - WRITE_IMAGET(output, (int2)(out_x_base + w.x, out_hb), out0); + int out_x_idx = out_w_blk; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out0); + + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out1); - if (w.y >= width) return; - WRITE_IMAGET(output, (int2)(out_x_base + w.y, out_hb), out1); + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out2); - if (w.z >= width) return; - WRITE_IMAGET(output, (int2)(out_x_base + w.z, out_hb), out2); + out_x_idx += out_w_blks; + if (out_x_idx >= width) return; + WRITE_IMAGET(output, (int2)(out_x_base + out_x_idx, out_hb), out3); - if (w.w >= width) return; - WRITE_IMAGET(output, (int2)(out_x_base + w.w, out_hb), out3); } diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index ef76aa79c56f310f77eda05a0a97c3091d9faba2..81d39c1eb8a08254d26112341bdeed827be32c39 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -20,143 +20,135 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const int 
rounded_in_ch = in_ch_blks * 4; const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - VEC_DATA_TYPE(DATA_TYPE, 4) out[5] = {0}; #ifdef BIAS - out[0] = - CMD_TYPE(read_image, CMD_DATA_TYPE)(bias, sampler, (int2)(out_ch_blk, 0)); - out[1] = out[0]; - out[2] = out[0]; - out[3] = out[0]; - out[4] = out[0]; + DATA_TYPE4 out0 = + READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out1 = out0; + DATA_TYPE4 out2 = out0; + DATA_TYPE4 out3 = out0; + DATA_TYPE4 out4 = out0; +#else + DATA_TYPE4 out0 = 0; + DATA_TYPE4 out1 = 0; + DATA_TYPE4 out2 = 0; + DATA_TYPE4 out3 = 0; + DATA_TYPE4 out4 = 0; +#endif + +#if STRIDE == 1 + int in_width0 = out_w_blk - padding_left; + int in_width1 = in_width0 + out_w_blks; + int in_width2 = in_width1 + out_w_blks; + int in_width3 = in_width2 + out_w_blks; + int in_width4 = in_width3 + out_w_blks; + const int height_idx = (out_hb % out_height) - padding_top; +#else + int in_width0 = out_w_blk * 2 - padding_left; + int in_width1 = (out_w_blk + out_w_blks) * 2 - padding_left; + int in_width2 = (out_w_blk + 2 * out_w_blks) * 2 - padding_left; + int in_width3 = (out_w_blk + 3 * out_w_blks) * 2 - padding_left; + int in_width4 = (out_w_blk + 4 * out_w_blks) * 2 - padding_left; + const int height_idx = (out_hb % out_height) * 2 - padding_top; #endif - int w[5]; - w[0] = out_w_blk - padding_left; - w[1] = w[0] + out_w_blks; - w[2] = w[1] + out_w_blks; - w[3] = w[2] + out_w_blks; - w[4] = w[3] + out_w_blks; - - const int batch_idx = out_hb / out_height; - const int height_idx = out_hb % out_height; - int in_hb[3]; - in_hb[0] = height_idx - padding_top; - in_hb[1] = in_hb[0] + 1; - in_hb[2] = in_hb[1] + 1; - // Judge the height border for padding input. - in_hb[0] = (in_hb[0] < 0 || in_hb[0] >= in_height) ? -1 : in_hb[0] + batch_idx * in_height; - in_hb[1] = (in_hb[1] < 0 || in_hb[1] >= in_height) ? -1 : in_hb[1] + batch_idx * in_height; - in_hb[2] = (in_hb[2] < 0 || in_hb[2] >= in_height) ? 
-1 : in_hb[2] + batch_idx * in_height; - - const int input_image_width = in_ch_blks * in_width; - - VEC_DATA_TYPE(DATA_TYPE, 4) in[5]; - VEC_DATA_TYPE(DATA_TYPE, 4) weights[4]; + const int batch_idx = (out_hb / out_height) * in_height; + + DATA_TYPE4 in0, in1, in2, in3, in4; + DATA_TYPE4 weights0, weights1, weights2, weights3; int in_idx, hb_idx, width_idx, in_width_idx; // Unrolling this loop hurt perfmance - for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - for (int i = 0; i < 9; ++i) { - - in_idx = in_ch_blk * in_width; - - hb_idx = i / 3; - width_idx = i % 3; - in_width_idx = w[0] + width_idx; - // Judge the width border for padding input. - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[0] = 0; - } else { - in[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); - } - in_width_idx = w[1] + width_idx; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[1] = 0; - } else { - in[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); + for (short in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { + for (short hb_idx = 0; hb_idx < 3; ++hb_idx) { + for (short width_idx = 0; width_idx < 3; ++width_idx) { + + in_idx = in_ch_blk * in_width; + + int in_hb_value = height_idx + hb_idx; + in_hb_value = select(in_hb_value + batch_idx, + -1, + (in_hb_value < 0 || in_hb_value >= in_height)); + + int in_width_value; +#define READ_INPUT(i) \ + in_width_value = in_width##i + width_idx; \ + in_width_value = select(in_idx + in_width_value, \ + -1, \ + (in_width_value < 0 || in_width_value >= in_width)); \ + in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value)); + + READ_INPUT(0); + READ_INPUT(1); + READ_INPUT(2); + READ_INPUT(3); + READ_INPUT(4); + +#undef READ_INPUT + + int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch; + weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); + 
weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); + weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk)); + weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); + + // Will prefetch L2 improve performance? How to prefetch image data? + + // Interleaving load and mul does not improve performance as expected + out0 += in0.x * weights0; + out0 += in0.y * weights1; + out0 += in0.z * weights2; + out0 += in0.w * weights3; + + out1 += in1.x * weights0; + out1 += in1.y * weights1; + out1 += in1.z * weights2; + out1 += in1.w * weights3; + + out2 += in2.x * weights0; + out2 += in2.y * weights1; + out2 += in2.z * weights2; + out2 += in2.w * weights3; + + out3 += in3.x * weights0; + out3 += in3.y * weights1; + out3 += in3.z * weights2; + out3 += in3.w * weights3; + + out4 += in4.x * weights0; + out4 += in4.y * weights1; + out4 += in4.z * weights2; + out4 += in4.w * weights3; } - in_width_idx = w[2] + width_idx; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[2] = 0; - } else { - in[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); - } - in_width_idx = w[3] + width_idx; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[3] = 0; - } else { - in[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); - } - in_width_idx = w[4] + width_idx; - if (in_width_idx < 0 || in_width_idx >= in_width) { - in[4] = 0; - } else { - in[4] = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, (int2)(in_idx + in_width_idx, in_hb[hb_idx])); - } - - - int filter_idx = (in_ch_blk << 2) + i * rounded_in_ch; - weights[0] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); - weights[1] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); - weights[2] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 2, 
out_ch_blk)); - weights[3] = CMD_TYPE(read_image, CMD_DATA_TYPE)(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); - - // Will prefetch L2 improve performance? How to pretch image data? - - // Interleaving load and mul does not improve performance as expected - out[0] += in[0].x * weights[0]; - out[0] += in[0].y * weights[1]; - out[0] += in[0].z * weights[2]; - out[0] += in[0].w * weights[3]; - - out[1] += in[1].x * weights[0]; - out[1] += in[1].y * weights[1]; - out[1] += in[1].z * weights[2]; - out[1] += in[1].w * weights[3]; - - out[2] += in[2].x * weights[0]; - out[2] += in[2].y * weights[1]; - out[2] += in[2].z * weights[2]; - out[2] += in[2].w * weights[3]; - - out[3] += in[3].x * weights[0]; - out[3] += in[3].y * weights[1]; - out[3] += in[3].z * weights[2]; - out[3] += in[3].w * weights[3]; - - out[4] += in[4].x * weights[0]; - out[4] += in[4].y * weights[1]; - out[4] += in[4].z * weights[2]; - out[4] += in[4].w * weights[3]; } } const int out_x_base = out_ch_blk * out_width; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[0] + padding_left, out_hb), - out[0]); - - w[1] += padding_left; - if (w[1] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[1], out_hb), - out[1]); - - w[2] += padding_left; - if (w[2] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[2], out_hb), - out[2]); - - w[3] += padding_left; - if (w[3] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[3], out_hb), - out[3]); - - w[4] += padding_left; - if (w[4] >= out_width) return; - CMD_TYPE(write_image, CMD_DATA_TYPE)(output, - (int2)(out_x_base + w[4], out_hb), - out[4]); + int w = out_w_blk; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out0); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out1); + + w += out_w_blks; + if (w >= out_width) return; + 
WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out2); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out3); + + w += out_w_blks; + if (w >= out_width) return; + WRITE_IMAGET(output, + (int2)(out_x_base + w, out_hb), + out4); + } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 528928e618abf37a0220ed1d9ebf6a5a7c602564..46066b01337db0a8567119259c1efc45d2a97e6e 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -10,33 +10,33 @@ namespace kernels { extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + const DataType dt, Tensor *output); extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + const DataType dt, Tensor *output); extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + const DataType dt, Tensor *output); extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + const DataType dt, Tensor *output); -template <> -void Conv2dFunctor::operator()(const Tensor *input, +template +void Conv2dFunctor::operator()(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output) { typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, - Tensor *output); + DataType dt, Tensor *output); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[5][2] = { {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, {nullptr, nullptr}, - {Conv2dOpenclK3x3S1, nullptr}, + {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2}, {nullptr, nullptr}, {nullptr, nullptr}}; @@ -50,7 +50,7 @@ void Conv2dFunctor::operator()(const 
Tensor *input, << " stride " << strides_[0] << "x" << strides_[1] << " is not implemented yet, using slow version"; // TODO(heliangliang) The CPU/NEON kernel should map the buffer - Conv2dFunctor(strides_, paddings_, dilations_)( + Conv2dFunctor(strides_, paddings_, dilations_)( input, filter, bias, output); return; } @@ -70,8 +70,11 @@ void Conv2dFunctor::operator()(const Tensor *input, } auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_func(input, filter, bias, paddings.data(), output); + conv2d_func(input, filter, bias, paddings.data(), DataTypeToEnum::value, output); } +template struct Conv2dFunctor; +template struct Conv2dFunctor; + } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index e8a88db510173d47d1c68d7cfb1c2678a10eb91a..e089e91251967a92a7b5a2bbfba340358bfbeb04 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -15,6 +15,7 @@ void Conv1x1(const Tensor *input, const Tensor *filter, const Tensor *bias, const int stride, + const DataType dt, Tensor *output) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -29,13 +30,11 @@ void Conv1x1(const Tensor *input, const index_t width_blocks = RoundUpDiv4(width); const index_t input_channel_blocks = RoundUpDiv4(input_channels); - MACE_CHECK(stride == 1); MACE_CHECK(input_batch == batch); - MACE_CHECK(stride != 1 || (input_height == height && input_width == width)); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt)); built_options.emplace("-DSTRIDE=" + ToString(stride)); if (bias != nullptr) { built_options.emplace("-DBIAS"); @@ -54,7 +53,10 
@@ void Conv1x1(const Tensor *input, conv_2d_kernel.setArg(idx++, *(static_cast(bias->buffer()))); } conv_2d_kernel.setArg(idx++, *(static_cast(output->buffer()))); + conv_2d_kernel.setArg(idx++, static_cast(input_height)); + conv_2d_kernel.setArg(idx++, static_cast(input_width)); conv_2d_kernel.setArg(idx++, static_cast(input_channel_blocks)); + conv_2d_kernel.setArg(idx++, static_cast(height)); conv_2d_kernel.setArg(idx++, static_cast(width)); auto command_queue = runtime->command_queue(); @@ -73,16 +75,18 @@ extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, + const DataType dt, Tensor *output) { - Conv1x1(input, filter, bias, 1, output); + Conv1x1(input, filter, bias, 1, dt, output); }; extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, const Tensor *bias, const int *padding, + const DataType dt, Tensor *output) { - Conv1x1(input, filter, bias, 2, output); + Conv1x1(input, filter, bias, 2, dt, output); }; } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 5836caa8842cfc1749b2ea78ca6d55fcf11e2e3b..b280b0420f67b7d96be87cbaa8cf8374fff6a33c 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -13,7 +13,8 @@ namespace kernels { static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, const Tensor *bias, const uint32_t stride, - const int *padding, Tensor *output) { + const int *padding, const DataType dt, + Tensor *output) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -25,9 +26,10 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, const index_t width_blocks = RoundUpDiv(width); std::set built_options; - built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(input->dtype())); - built_options.emplace("-DCMD_DATA_TYPE=" + 
DataTypeToOPENCLCMDDataType(input->dtype())); + built_options.emplace("-DDATA_TYPE=" + DataTypeToCLType(dt)); + built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); + built_options.emplace("-DSTRIDE=" + ToString(stride)); auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); @@ -62,12 +64,15 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, } void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, Tensor *output) { - Conv2d3x3S12(input, filter, bias, 1, padding, output); + const Tensor *bias, const int *padding, + const DataType dt, Tensor *output) { + Conv2d3x3S12(input, filter, bias, 1, padding, dt, output); }; void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, Tensor *output) { + const Tensor *bias, const int *padding, + const DataType dt, Tensor *output) { + Conv2d3x3S12(input, filter, bias, 2, padding, dt, output); }; } // namespace kernels diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 05221e55dedde3c7cc17d3f99d2818491d930b87..4f4d1c56147df61da58a5a6478f1958e2b289a39 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -57,9 +57,8 @@ void CalImage2DShape(const std::vector &shape, /* NHWC */ std::string DataTypeToCLType(const DataType dt) { switch (dt) { case DT_FLOAT: - return "float"; case DT_HALF: - return "half"; + return "float"; case DT_UINT8: return "uchar"; case DT_INT8: @@ -85,9 +84,8 @@ std::string DataTypeToCLType(const DataType dt) { std::string DataTypeToOPENCLCMDDataType(const DataType dt) { switch (dt) { case DT_FLOAT: - return "f"; case DT_HALF: - return "h"; + return "f"; default: LOG(FATAL) << "Not supported data type for opencl cmd data type"; return ""; diff --git a/mace/ops/buffer_to_image.cc b/mace/ops/buffer_to_image.cc index 
fe726d1be60c0cd83613fb1834396e01cab9cd04..56711794b7fef1546ec67e63d873289bea2ef1cc 100644 --- a/mace/ops/buffer_to_image.cc +++ b/mace/ops/buffer_to_image.cc @@ -14,6 +14,6 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage") REGISTER_OPENCL_OPERATOR(OpKeyBuilder("BufferToImage") .TypeConstraint("T") .Build(), - BufferToImageOp); + BufferToImageOp); } // namespace mace diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index ea5fbe21592830bcc31ef303311b15aba3b3a98c..7bd667ca3988320529a702224e3045a99ca38de8 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -15,6 +15,7 @@ void TestBidirectionTransform(const int type, const std::vector &input_ .Input("Input") .Output("B2IOutput") .AddIntArg("buffer_type", type) + .AddIntArg("T", DataTypeToEnum::value) .Finalize(net.NewOperatorDef()); // Add input data @@ -27,6 +28,7 @@ void TestBidirectionTransform(const int type, const std::vector &input_ .Input("B2IOutput") .Output("I2BOutput") .AddIntArg("buffer_type", type) + .AddIntArg("T", DataTypeToEnum::value) .Finalize(net.NewOperatorDef()); // Run @@ -40,6 +42,10 @@ TEST(BufferToImageTest, ArgSmall) { TestBidirectionTransform(kernels::ARGUMENT, {1}); } +TEST(BufferToImageTest, ArgHalfSmall) { + TestBidirectionTransform(kernels::ARGUMENT, {1}); +} + TEST(BufferToImageTest, ArgMedia) { TestBidirectionTransform(kernels::ARGUMENT, {11}); } diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index 128c849aa9978b569423f3b25afccf5b7c607f8c..617bd2c5600670513f67140979fd3ccee3ed6c98 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -11,6 +11,11 @@ REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D") .Build(), Conv2dOp); +REGISTER_CPU_OPERATOR(OpKeyBuilder("Conv2D") + .TypeConstraint("T") + .Build(), + Conv2dOp); + #if __ARM_NEON REGISTER_NEON_OPERATOR(OpKeyBuilder("Conv2D") .TypeConstraint("T") @@ -23,4 +28,9 @@ REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D") .Build(), Conv2dOp); 
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Conv2D") + .TypeConstraint("T") + .Build(), + Conv2dOp); + } // namespace mace diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 2409013ec322966fe2ffc4c682bc1bcbd17da19b..b7f6fc731dc0e092d74c5ef6b7434e61e79635f1 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -27,15 +27,15 @@ static void Conv2d(int iters, OpsTestNet net; // Add input data - net.AddRandomInput("Input", {batch, height, width, channels}); - net.AddRandomInput("Filter", + net.AddRandomInput("Input", {batch, height, width, channels}); + net.AddRandomInput("Filter", {kernel_h, kernel_w, channels, output_channels}); - net.AddRandomInput("Bias", {output_channels}); + net.AddRandomInput("Bias", {output_channels}); if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputImage") .Input("FilterImage") @@ -44,6 +44,7 @@ static void Conv2d(int iters, .AddIntsArg("strides", {stride, stride}) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); } else { OpDefBuilder("Conv2D", "Conv2dTest") @@ -54,6 +55,7 @@ static void Conv2d(int iters, .AddIntsArg("strides", {stride, stride}) .AddIntArg("padding", padding) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); } @@ -88,43 +90,42 @@ static void Conv2d(int iters, 
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE) #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \ - BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \ BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL); // ICNet -BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, float); -BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, float); +BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half); // SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105 -BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float); +BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half); // SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108 -BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float); +BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half); +BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half); // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 -BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, float); +BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half); // Test RGB <-> YUV -BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); -BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); - -BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float); -BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments -BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float); -BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float); -BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float); -BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float); -BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float); -BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float); -BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float); -BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float); -BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float); -BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float); -BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float); -BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float); -BM_CONV_2D(1, 64, 33, 31, 
3, 3, 2, VALID, 128, float); -BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float); -BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float); -BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float); -BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float); -BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float); -BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float); +//BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); +//BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float); +// +//BM_CONV_2D(1, 64, 32, 32, 1, 1, 1, VALID, 128, float); +//BM_CONV_2D(1, 64, 33, 31, 1, 1, 1, VALID, 128, float); // Test bad alignments +//BM_CONV_2D(1, 3, 512, 512, 1, 1, 1, VALID, 3, float); +//BM_CONV_2D(1, 32, 112, 112, 1, 1, 1, VALID, 64, float); +//BM_CONV_2D(1, 64, 56, 56, 1, 1, 1, VALID, 128, float); +//BM_CONV_2D(1, 256, 28, 28, 1, 1, 1, VALID, 256, float); +//BM_CONV_2D(1, 1024, 7, 7, 1, 1, 1, VALID, 1024, float); +//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, VALID, 128, float); +//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, VALID, 128, float); +//BM_CONV_2D(1, 3, 512, 512, 3, 3, 1, VALID, 3, float); +//BM_CONV_2D(1, 64, 32, 32, 3, 3, 1, SAME, 128, float); +//BM_CONV_2D(1, 64, 33, 31, 3, 3, 1, SAME, 128, float); +//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, VALID, 128, float); +//BM_CONV_2D(1, 3, 512, 512, 3, 3, 2, VALID, 3, float); +//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, VALID, 128, float); +//BM_CONV_2D(1, 64, 32, 32, 3, 3, 2, SAME, 128, float); +//BM_CONV_2D(1, 64, 33, 31, 3, 3, 2, SAME, 128, float); +//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, VALID, 128, float); +//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, VALID, 128, float); +//BM_CONV_2D(1, 64, 32, 32, 5, 5, 1, SAME, 128, float); +//BM_CONV_2D(1, 64, 32, 31, 5, 5, 1, SAME, 128, float); } // namespace mace diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 3ee0dd6085ae2c06d5aeac0d82c9704352b63152..b4fd374b578d3b1eef058f495d331c2182619246 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -98,9 +98,9 @@ void 
TestNHWCSimple3x3VALID() { net.AddInputFromArray("Bias", {1}, {0.1f}); if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputImage") .Input("FilterImage") @@ -109,12 +109,13 @@ void TestNHWCSimple3x3VALID() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); net.RunOp(D); // Transfer output - ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); } else { OpDefBuilder("Conv2D", "Conv2dTest") @@ -125,13 +126,14 @@ void TestNHWCSimple3x3VALID() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } - auto expected = CreateTensor({1, 1, 1, 1}, {18.1f}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); + auto expected = CreateTensor({1, 1, 1, 1}, {18.1f}); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); } template @@ -149,9 +151,9 @@ void TestNHWCSimple3x3SAME() { net.AddInputFromArray("Bias", {1}, {0.1f}); if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", 
"InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputImage") .Input("FilterImage") @@ -160,12 +162,13 @@ void TestNHWCSimple3x3SAME() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); // Transfer output - ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); } else { OpDefBuilder("Conv2D", "Conv2dTest") @@ -176,16 +179,17 @@ void TestNHWCSimple3x3SAME() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); } - auto expected = CreateTensor( + auto expected = CreateTensor( {1, 3, 3, 1}, {8.1f, 12.1f, 8.1f, 12.1f, 18.1f, 12.1f, 8.1f, 12.1f, 8.1f}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); } TEST_F(Conv2dOpTest, CPUSimple) { @@ -233,22 +237,22 @@ TEST_F(Conv2dOpTest, NEONWithouBias) { TestSimple3x3WithoutBias(); } -template +template void TestNHWCSimple3x3WithoutBias() { OpsTestNet net; // Add input data - net.AddInputFromArray( + net.AddInputFromArray( "Input", {1, 3, 3, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - net.AddInputFromArray( + net.AddInputFromArray( "Filter", {3, 3, 2, 1}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); 
+ BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputImage") @@ -257,11 +261,12 @@ void TestNHWCSimple3x3WithoutBias() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); // Transfer output - ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); } else { OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") @@ -270,6 +275,7 @@ void TestNHWCSimple3x3WithoutBias() { .AddIntsArg("strides", {1, 1}) .AddIntArg("padding", Padding::VALID) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run @@ -279,15 +285,15 @@ void TestNHWCSimple3x3WithoutBias() { // Check auto expected = CreateTensor({1, 1, 1, 1}, {18.0f}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); } TEST_F(Conv2dOpTest, CPUWithoutBias) { - TestNHWCSimple3x3WithoutBias(); + TestNHWCSimple3x3WithoutBias(); } TEST_F(Conv2dOpTest, OPENCLWithoutBias) { - TestNHWCSimple3x3WithoutBias(); + TestNHWCSimple3x3WithoutBias(); } template @@ -333,27 +339,27 @@ TEST_F(Conv2dOpTest, NEONCombined) { TestCombined3x3(); } -template +template static void TestNHWCCombined3x3() { // Construct graph OpsTestNet net; // Add input data - net.AddInputFromArray( + net.AddInputFromArray( "Input", {1, 5, 5, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - net.AddInputFromArray( + net.AddInputFromArray( "Filter", {3, 3, 2, 2}, {1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 
1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f, 1.0f, 0.5f}); - net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); + net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2DTest") .Input("InputImage") @@ -363,11 +369,12 @@ static void TestNHWCCombined3x3() { .AddIntsArg("strides", {2, 2}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); - ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); } else { OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") @@ -377,6 +384,7 @@ static void TestNHWCCombined3x3() { .AddIntsArg("strides", {2, 2}) .AddIntArg("padding", Padding::SAME) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run net.RunOp(D); @@ -388,16 +396,21 @@ static void TestNHWCCombined3x3() { {1, 3, 3, 2}, {8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 18.1f, 9.2f, 12.1f, 6.2f, 8.1f, 4.2f, 12.1f, 6.2f, 8.1f, 4.2f}); - ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); + +} +TEST_F(Conv2dOpTest, CPUStride2) { + TestNHWCCombined3x3(); } -TEST_F(Conv2dOpTest, CPUCombined) { - TestNHWCCombined3x3(); 
+TEST_F(Conv2dOpTest, OPENCLStride2) { + TestNHWCCombined3x3(); } template void TestConv1x1() { + // Construct graph OpsTestNet net; // Add input data @@ -415,37 +428,35 @@ void TestConv1x1() { {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}); net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); - // Construct graph if (D == DeviceType::OPENCL) { - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("InputImage") - .Input("FilterImage") - .Input("BiasImage") - .Output("OutputImage") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("Conv2D", "Conv2DTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + // Run net.RunOp(D); - // Transfer output - ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); - + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); } else { OpDefBuilder("Conv2D", "Conv2DTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {1, 1}) - .AddIntArg("padding", Padding::VALID) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); - + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 
1}) + .Finalize(net.NewOperatorDef()); + // Run net.RunOp(D); } @@ -470,7 +481,7 @@ TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1(); } -template +template static void TestComplexConvNxNS12(const std::vector &shape) { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, @@ -478,7 +489,6 @@ static void TestComplexConvNxNS12(const std::vector &shape) { srand(time(NULL)); // generate random input - // TODO test all sizes index_t batch = 3 + (rand() % 10); index_t height = shape[0]; index_t width = shape[1]; @@ -494,13 +504,14 @@ static void TestComplexConvNxNS12(const std::vector &shape) { .AddIntsArg("strides", {stride_h, stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {batch, height, width, input_channels}); - net.AddRandomInput( + net.AddRandomInput("Input", {batch, height, width, input_channels}); + net.AddRandomInput( "Filter", {kernel_h, kernel_w, input_channels, output_channels}); - net.AddRandomInput("Bias", {output_channels}); + net.AddRandomInput("Bias", {output_channels}); // run on cpu net.RunOp(); @@ -509,9 +520,9 @@ static void TestComplexConvNxNS12(const std::vector &shape) { expected.Copy(*net.GetOutput("Output")); // run on gpu - BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); - BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); - BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); OpDefBuilder("Conv2D", "Conv2dTest") .Input("InputImage") @@ -521,16 +532,17 @@ static void TestComplexConvNxNS12(const std::vector &shape) { .AddIntsArg("strides", {stride_h, 
stride_w}) .AddIntArg("padding", type) .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run on device net.RunOp(D); - ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); }; for (int kernel_size : {1, 3}) { - for (int stride : {1}) { + for (int stride : {1, 2}) { func(kernel_size, kernel_size, stride, stride, VALID); func(kernel_size, kernel_size, stride, stride, SAME); } @@ -538,9 +550,90 @@ static void TestComplexConvNxNS12(const std::vector &shape) { } TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { - TestComplexConvNxNS12({32, 32, 64, 128}); + TestComplexConvNxNS12({32, 32, 32, 64}); } TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { - TestComplexConvNxNS12({107, 113, 5, 7}); + TestComplexConvNxNS12({107, 113, 5, 7}); +} + +template +static void TestHalfComplexConvNxNS12(const std::vector &shape) { + testing::internal::LogToStderr(); + auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, + Padding type) { + srand(time(NULL)); + + // generate random input + index_t batch = 3 + (rand() % 10); + index_t height = shape[0]; + index_t width = shape[1]; + index_t input_channels = shape[2] + (rand() % 10); + index_t output_channels = shape[3] + (rand() % 10); + // Construct graph + OpsTestNet net; + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + std::vector float_input_data; + GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data); + std::vector float_filter_data; + GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, 
float_filter_data); + std::vector float_bias_data; + GenerateRandomRealTypeData({output_channels}, float_bias_data); + // Add input data + net.AddInputFromArray("Input", {batch, height, width, input_channels}, float_input_data); + net.AddInputFromArray( + "Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data); + net.AddInputFromArray("Bias", {output_channels}, float_bias_data); + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.2); + }; + + for (int kernel_size : {1, 3}) { + for (int stride : {1, 2}) { + func(kernel_size, kernel_size, stride, stride, VALID); + } + } +} + +TEST_F(Conv2dOpTest, OPENCLHalfAlignedConvNxNS12) { + TestHalfComplexConvNxNS12({32, 32, 32, 64}); +} + +TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConvNxNS12) { + TestHalfComplexConvNxNS12({107, 113, 5, 7}); } diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index 3bdb3ca5cd65152072b5074450331f76eebeadae..8d593940cf0c5059d5064a27c7edb3558b9f559b 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -210,13 +210,17 @@ void GenerateRandomRealTypeData(const std::vector &shape, std::vector &res) { std::random_device rd; std::mt19937 gen(rd()); 
- std::normal_distribution nd(0, 1); + std::normal_distribution nd(0, 1); index_t size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); res.resize(size); - std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); + if (DataTypeToEnum::value == DT_HALF) { + std::generate(res.begin(), res.end(), [&gen, &nd] { return half_float::half_cast(nd(gen)); }); + } else { + std::generate(res.begin(), res.end(), [&gen, &nd] { return nd(gen); }); + } } template @@ -290,39 +294,40 @@ inline void ExpectEqual(const double &a, const double &b) { EXPECT_DOUBLE_EQ(a, b); } -inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) { - ASSERT_EQ(x.dtype(), y.dtype()); +inline void AssertSameDims(const Tensor &x, const Tensor &y) { ASSERT_TRUE(IsSameSize(x, y)) << "x.shape [" << ShapeToString(x) << "] vs " << "y.shape [ " << ShapeToString(y) << "]"; } -template ::value> +template ::value> struct Expector; // Partial specialization for float and double. -template -struct Expector { - static void Equal(const T &a, const T &b) { ExpectEqual(a, b); } +template +struct Expector { + static void Equal(const EXP_TYPE &a, const RES_TYPE &b) { ExpectEqual(a, b); } static void Equal(const Tensor &x, const Tensor &y) { - ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); - AssertSameTypeDims(x, y); + ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); + ASSERT_EQ(y.dtype(), DataTypeToEnum::v()); + AssertSameDims(x, y); Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard y_mapper(&y); - auto a = x.data(); - auto b = y.data(); + auto a = x.data(); + auto b = y.data(); for (int i = 0; i < x.size(); ++i) { ExpectEqual(a(i), b(i)); } } static void Near(const Tensor &x, const Tensor &y, const double abs_err) { - ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); - AssertSameTypeDims(x, y); + ASSERT_EQ(x.dtype(), DataTypeToEnum::v()); + ASSERT_EQ(y.dtype(), DataTypeToEnum::v()); + AssertSameDims(x, y); Tensor::MappingGuard x_mapper(&x); Tensor::MappingGuard y_mapper(&y); - auto a = 
x.data(); - auto b = y.data(); + auto a = x.data(); + auto b = y.data(); for (int i = 0; i < x.size(); ++i) { EXPECT_NEAR(a[i], b[i], abs_err) << "a = " << a << " b = " << b << " index = " << i; @@ -335,10 +340,18 @@ template void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { static_assert(is_floating_point_type::value, "T is not a floating point type"); - Expector::Near(x, y, abs_err); + Expector::Near(x, y, abs_err); +} + +template +void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) { + static_assert(is_floating_point_type::value + && is_floating_point_type::value, + "T is not a floating point type"); + Expector::Near(x, y, abs_err); } -template +template void BufferToImage(OpsTestNet &net, const std::string &input_name, const std::string &output_name, @@ -347,6 +360,7 @@ void BufferToImage(OpsTestNet &net, .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", type) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run @@ -355,7 +369,7 @@ void BufferToImage(OpsTestNet &net, net.Sync(); } -template +template void ImageToBuffer(OpsTestNet &net, const std::string &input_name, const std::string &output_name, @@ -364,6 +378,7 @@ void ImageToBuffer(OpsTestNet &net, .Input(input_name) .Output(output_name) .AddIntArg("buffer_type", type) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) .Finalize(net.NewOperatorDef()); // Run