From 8eb55b6107b6ed61f32e37dbc8d8a9c21691c9c5 Mon Sep 17 00:00:00 2001
From: liuqi
Date: Fri, 1 Dec 2017 13:41:25 +0800
Subject: [PATCH] Finish fused conv: fold ReLU into convolution.

---
 mace/kernels/fused_conv_2d.h                |  71 ++++
 mace/kernels/opencl/cl/conv_2d_1x1.cl       |  25 --
 mace/kernels/opencl/cl/conv_2d_3x3.cl       |   9 +
 mace/kernels/opencl/conv_2d_opencl.cc       |  39 +-
 mace/kernels/opencl/conv_2d_opencl_1x1.cc   |  10 +-
 mace/kernels/opencl/conv_2d_opencl_3x3.cc   |  33 +-
 mace/kernels/opencl/fused_conv_2d_opencl.cc |  87 +++++
 mace/ops/fused_conv_2d.cc                   |  30 ++
 mace/ops/fused_conv_2d.h                    |  46 +++
 mace/ops/fused_conv_2d_test.cc              | 410 ++++++++++++++++++++
 10 files changed, 706 insertions(+), 54 deletions(-)
 create mode 100644 mace/kernels/fused_conv_2d.h
 create mode 100644 mace/kernels/opencl/fused_conv_2d_opencl.cc
 create mode 100644 mace/ops/fused_conv_2d.cc
 create mode 100644 mace/ops/fused_conv_2d.h
 create mode 100644 mace/ops/fused_conv_2d_test.cc

diff --git a/mace/kernels/fused_conv_2d.h b/mace/kernels/fused_conv_2d.h
new file mode 100644
index 00000000..4daf28e6
--- /dev/null
+++ b/mace/kernels/fused_conv_2d.h
@@ -0,0 +1,71 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_FUSED_CONV_2D_H_
+#define MACE_KERNELS_FUSED_CONV_2D_H_
+
+#include "mace/core/tensor.h"
+#include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/kernels/conv_2d.h"
+
+namespace mace {
+namespace kernels {
+
+struct FusedConv2dFunctorBase {
+  FusedConv2dFunctorBase(const int *strides,
+                         const Padding &paddings,
+                         const int *dilations)
+      : strides_(strides), dilations_(dilations), paddings_(paddings) {}
+
+  const int *strides_;    // [stride_h, stride_w]
+  const int *dilations_;  // [dilation_h, dilation_w]
+  Padding paddings_;
+};
+
+template <DeviceType D, typename T>
+struct FusedConv2dFunctor : FusedConv2dFunctorBase {
+  FusedConv2dFunctor(const int *strides,
+                     const Padding &paddings,
+                     const int *dilations)
+      : FusedConv2dFunctorBase(strides, paddings, dilations) {}
+
+  void operator()(const Tensor *input,
+                  const Tensor *filter,
+                  const Tensor *bias,
+                  Tensor *output) {
+    Conv2dFunctor<D, T>(strides_, paddings_, dilations_)(input, filter, bias, output);
+    T *output_data = output->mutable_data<T>();
+
+    T zero_value;
+    if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
+      zero_value = half_float::half_cast<half>(0.0f);
+    } else {
+      zero_value = 0;
+    }
+    auto output_size = output->size();
+    for (int n = 0; n < output_size; ++n) {
+      *output_data = *output_data < 0 ?
zero_value : *output_data; + output_data++; + } + } + +}; + +template +struct FusedConv2dFunctor : FusedConv2dFunctorBase { + FusedConv2dFunctor(const int *strides, + const Padding &paddings, + const int *dilations) + : FusedConv2dFunctorBase(strides, paddings, dilations) {} + + void operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_FUSED_CONV_2D_H_ diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index 7aaf367c..bf384467 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -4,10 +4,6 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] __read_only image2d_t filter, /* cout%4 * cin, cout/4 */ #ifdef BIAS __read_only image2d_t bias, /* cout%4 * cout/4 */ -#endif -#ifdef FUSED_BATCH_NORM - __read_only image2d_t bn_scale, /* cout%4 * cout/4 */ - __read_only image2d_t bn_offset, /* cout%4 * cout/4 */ #endif __write_only image2d_t output, __private const int in_height, @@ -97,27 +93,6 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] in_x_base += in_width; } -#ifdef FUSED_BATCH_NORM - // batch norm - DATA_TYPE4 bn_scale_value = - READ_IMAGET(bn_scale, sampler, (int2)(out_ch_blk, 0)); - DATA_TYPE4 scale0 = (DATA_TYPE4)(bn_scale_value.x); - DATA_TYPE4 scale1 = (DATA_TYPE4)(bn_scale_value.y); - DATA_TYPE4 scale2 = (DATA_TYPE4)(bn_scale_value.z); - DATA_TYPE4 scale3 = (DATA_TYPE4)(bn_scale_value.w); - DATA_TYPE4 bn_offset_value = - READ_IMAGET(bn_offset, sampler, (int2)(out_ch_blk, 0)); - DATA_TYPE4 offset0 = (DATA_TYPE4)(bn_offset_value.x); - DATA_TYPE4 offset1 = (DATA_TYPE4)(bn_offset_value.y); - DATA_TYPE4 offset2 = (DATA_TYPE4)(bn_offset_value.z); - DATA_TYPE4 offset3 = (DATA_TYPE4)(bn_offset_value.w); - - out0 = out0 * scale0 + offset0; - out1 = out1 * scale1 + offset1; - out2 = out2 * scale2 + offset2; - out3 = out3 * scale3 + offset3; -#endif - #ifdef FUSED_RELU // TODO relux out0 = fmax(out0, 0); diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 81d39c1e..98847ab0 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -121,6 +121,15 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] } } +#ifdef FUSED_RELU + // TODO relux + out0 = fmax(out0, 0); + out1 = fmax(out1, 0); + out2 = fmax(out2, 0); + out3 = fmax(out3, 0); + out4 = fmax(out4, 0); +#endif + const int out_x_base = out_ch_blk * out_width; int w = out_w_blk; WRITE_IMAGET(output, diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 46066b01..40663ea0 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -9,29 +9,34 @@ namespace mace { namespace kernels { extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, - const DataType dt, Tensor *output); + const Tensor *bias, const bool is_relu, + const int *padding, const DataType dt, + Tensor *output); extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, - const DataType dt, Tensor *output); + const Tensor *bias, const bool is_relu, + const int *padding, const DataType dt, + Tensor *output); extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const 
int *padding, - const DataType dt, Tensor *output); + const Tensor *bias, const bool is_relu, + const int *padding, const DataType dt, + Tensor *output); extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, - const DataType dt, Tensor *output); + const Tensor *bias, const bool is_relu, + const int *padding, const DataType dt, + Tensor *output); template void Conv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output) { + const Tensor *filter, + const Tensor *bias, + Tensor *output) { typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, - DataType dt, Tensor *output); + const Tensor *bias, const bool is_relu, + const int *padding, const DataType dt, + Tensor *output); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[5][2] = { {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, @@ -70,11 +75,13 @@ void Conv2dFunctor::operator()(const Tensor *input, } auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_func(input, filter, bias, paddings.data(), DataTypeToEnum::value, output); + conv2d_func(input, filter, bias, false, paddings.data(), DataTypeToEnum::value, output); } -template struct Conv2dFunctor; -template struct Conv2dFunctor; +template +struct Conv2dFunctor; +template +struct Conv2dFunctor; } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index e089e912..b460eee9 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -14,6 +14,7 @@ namespace kernels { void Conv1x1(const Tensor *input, const Tensor *filter, const Tensor *bias, + const bool is_relu, const int stride, const DataType dt, Tensor *output) { @@ -39,6 +40,9 @@ void Conv1x1(const Tensor *input, if (bias != nullptr) { built_options.emplace("-DBIAS"); } + if (is_relu) { + built_options.emplace("-DFUSED_RELU"); + } auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); @@ -74,19 +78,21 @@ void Conv1x1(const Tensor *input, extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, const Tensor *bias, + const bool is_relu, const int *padding, const DataType dt, Tensor *output) { - Conv1x1(input, filter, bias, 1, dt, output); + Conv1x1(input, filter, bias, is_relu, 1, dt, output); }; extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, const Tensor *bias, + const bool is_relu, const int *padding, const DataType dt, Tensor *output) { - Conv1x1(input, filter, bias, 2, dt, output); + Conv1x1(input, filter, bias, is_relu, 2, dt, output); }; } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index b280b042..cd12c3b7 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -12,9 +12,9 @@ namespace mace { namespace kernels { static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, - const Tensor *bias, const uint32_t stride, - const int *padding, const DataType dt, - Tensor *output) { + const Tensor *bias, const bool is_relu, + const uint32_t stride, const int *padding, + const DataType dt, Tensor *output) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -30,6 +30,9 @@ static void Conv2d3x3S12(const Tensor *input, const 
Tensor *filter, built_options.emplace("-DCMD_DATA_TYPE=" + DataTypeToOPENCLCMDDataType(dt)); built_options.emplace(bias != nullptr ? "-DBIAS" : ""); built_options.emplace("-DSTRIDE=" + ToString(stride)); + if (is_relu) { + built_options.emplace("-DFUSED_RELU"); + } auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); @@ -63,16 +66,24 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, MACE_CHECK(error == CL_SUCCESS, error); } -void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, - const DataType dt, Tensor *output) { - Conv2d3x3S12(input, filter, bias, 1, padding, dt, output); +void Conv2dOpenclK3x3S1(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const bool is_relu, + const int *padding, + const DataType dt, + Tensor *output) { + Conv2d3x3S12(input, filter, bias, is_relu, 1, padding, dt, output); }; -void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, - const Tensor *bias, const int *padding, - const DataType dt, Tensor *output) { - Conv2d3x3S12(input, filter, bias, 2, padding, dt, output); +void Conv2dOpenclK3x3S2(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const bool is_relu, + const int *padding, + const DataType dt, + Tensor *output) { + Conv2d3x3S12(input, filter, bias, is_relu, 2, padding, dt, output); }; } // namespace kernels diff --git a/mace/kernels/opencl/fused_conv_2d_opencl.cc b/mace/kernels/opencl/fused_conv_2d_opencl.cc new file mode 100644 index 00000000..786b4557 --- /dev/null +++ b/mace/kernels/opencl/fused_conv_2d_opencl.cc @@ -0,0 +1,87 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/kernels/fused_conv_2d.h" +#include "mace/kernels/opencl/helper.h" + +namespace mace { +namespace kernels { + +extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool is_relu, + const int *padding, const DataType dt, + Tensor *output); + +extern void Conv2dOpenclK1x1S2(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool is_relu, + const int *padding, const DataType dt, + Tensor *output); + +extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool is_relu, + const int *padding, const DataType dt, + Tensor *output); + +extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool is_relu, + const int *padding, const DataType dt, + Tensor *output); + +template +void FusedConv2dFunctor::operator()(const Tensor *input, + const Tensor *filter, + const Tensor *bias, + Tensor *output) { + typedef void (*Conv2dOpenclFunction)(const Tensor *input, const Tensor *filter, + const Tensor *bias, const bool is_relu, + const int *padding, const DataType dt, + Tensor *output); + // Selection matrix: kernel_size x stride_size + static const Conv2dOpenclFunction selector[5][2] = { + {Conv2dOpenclK1x1S1, Conv2dOpenclK1x1S2}, + {nullptr, nullptr}, + {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2}, + {nullptr, nullptr}, + {nullptr, nullptr}}; + + index_t kernel_h = filter->dim(0); + index_t kernel_w = filter->dim(1); + if (kernel_h != kernel_w || kernel_h > 5 || strides_[0] != strides_[1] || + strides_[0] > 2 || dilations_[0] != 1 || dilations_[1] != 1 || + selector[kernel_h - 1][strides_[0] - 1] == nullptr) { + LOG(WARNING) << "OpenCL conv2d kernel with " + << "filter" << kernel_h << "x" << kernel_w << "," + << " stride " << strides_[0] << "x" << strides_[1] + << " is 
not implemented yet, using slow version";
+    // TODO(heliangliang) The CPU/NEON kernel should map the buffer
+    FusedConv2dFunctor<DeviceType::CPU, T>(strides_, paddings_, dilations_)(
+        input, filter, bias, output);
+    return;
+  }
+
+  std::vector<index_t> output_shape(4);
+  std::vector<int> paddings(2);
+  kernels::CalcNHWCPaddingAndOutputSize(
+      input->shape().data(), filter->shape().data(), dilations_,
+      strides_, paddings_, output_shape.data(), paddings.data());
+
+  if (input->is_image()) {
+    std::vector<size_t> output_image_shape;
+    CalImage2DShape(output_shape, BufferType::IN_OUT, output_image_shape);
+    output->ResizeImage(output_shape, output_image_shape);
+  } else {
+    output->Resize(output_shape);
+  }
+
+  auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1];
+  conv2d_func(input, filter, bias, true, paddings.data(), DataTypeToEnum<T>::value, output);
+}
+
+template
+struct FusedConv2dFunctor<DeviceType::OPENCL, float>;
+template
+struct FusedConv2dFunctor<DeviceType::OPENCL, half>;
+
+} // namespace kernels
+} // namespace mace
diff --git a/mace/ops/fused_conv_2d.cc b/mace/ops/fused_conv_2d.cc
new file mode 100644
index 00000000..6e6b0172
--- /dev/null
+++ b/mace/ops/fused_conv_2d.cc
@@ -0,0 +1,30 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/ops/fused_conv_2d.h"
+
+namespace mace {
+
+REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
+                          .TypeConstraint<float>("T")
+                          .Build(),
+                      FusedConv2dOp<DeviceType::CPU, float>);
+
+REGISTER_CPU_OPERATOR(OpKeyBuilder("FusedConv2D")
+                          .TypeConstraint<half>("T")
+                          .Build(),
+                      FusedConv2dOp<DeviceType::CPU, half>);
+
+
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
+                             .TypeConstraint<float>("T")
+                             .Build(),
+                         FusedConv2dOp<DeviceType::OPENCL, float>);
+
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("FusedConv2D")
+                             .TypeConstraint<half>("T")
+                             .Build(),
+                         FusedConv2dOp<DeviceType::OPENCL, half>);
+
+} // namespace mace
diff --git a/mace/ops/fused_conv_2d.h b/mace/ops/fused_conv_2d.h
new file mode 100644
index 00000000..c6baafea
--- /dev/null
+++ b/mace/ops/fused_conv_2d.h
@@ -0,0 +1,46 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_OPS_FUSED_CONV_2D_H_
+#define MACE_OPS_FUSED_CONV_2D_H_
+
+#include 
+
+#include "mace/core/operator.h"
+#include "mace/kernels/fused_conv_2d.h"
+#include "mace/ops/conv_pool_2d_base.h"
+
+namespace mace {
+
+template <DeviceType D, typename T>
+class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
+ public:
+  FusedConv2dOp(const OperatorDef &op_def, Workspace *ws)
+      : ConvPool2dOpBase<D, T>(op_def, ws),
+        functor_(this->strides_.data(), this->padding_,
+                 this->dilations_.data()) {
+  }
+
+  bool Run() override {
+    const Tensor *input = this->Input(INPUT);
+    const Tensor *filter = this->Input(FILTER);
+    const Tensor *bias = this->InputSize() > 2 ? this->Input(BIAS) : nullptr;
+    Tensor *output = this->Output(OUTPUT);
+
+    functor_(input, filter, bias, output);
+
+    return true;
+  }
+
+ private:
+  kernels::FusedConv2dFunctor<D, T> functor_;
+
+ protected:
+  OP_INPUT_TAGS(INPUT, FILTER, BIAS);
+  OP_OUTPUT_TAGS(OUTPUT);
+};
+
+} // namespace mace
+
+#endif // MACE_OPS_FUSED_CONV_2D_H_
diff --git a/mace/ops/fused_conv_2d_test.cc b/mace/ops/fused_conv_2d_test.cc
new file mode 100644
index 00000000..896fbbc6
--- /dev/null
+++ b/mace/ops/fused_conv_2d_test.cc
@@ -0,0 +1,410 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+// + +#include "mace/ops/fused_conv_2d.h" +#include "mace/ops/ops_test_util.h" + +using namespace mace; + +class FusedConv2dOpTest : public OpsTestBase {}; + +template +void TestNHWCSimple3x3VALID() { + OpsTestNet net; + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + net.AddInputFromArray( + "Filter", {3, 3, 2, 1}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + net.AddInputFromArray("Bias", {1}, {-0.1f}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + + } else { + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } + + auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); +} + +template +void TestNHWCSimple3x3SAME() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + net.AddInputFromArray( + "Filter", {3, 3, 2, 1}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + net.AddInputFromArray("Bias", {1}, {-0.1f}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + + } else { + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::SAME) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } + + auto expected = CreateTensor( + {1, 3, 3, 1}, + {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); +} + +TEST_F(FusedConv2dOpTest, CPUSimple) { + TestNHWCSimple3x3VALID(); + 
TestNHWCSimple3x3SAME(); +} + +TEST_F(FusedConv2dOpTest, OPENCLSimple) { + TestNHWCSimple3x3VALID(); + TestNHWCSimple3x3SAME(); +} + +template +void TestNHWCSimple3x3WithoutBias() { + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 3, 2}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}); + net.AddInputFromArray( + "Filter", {3, 3, 2, 1}, + {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + // Transfer output + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Run + net.RunOp(D); + } + + // Check + auto expected = CreateTensor({1, 1, 1, 1}, {0.0f}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.01); +} + +TEST_F(FusedConv2dOpTest, CPUWithoutBias) { + TestNHWCSimple3x3WithoutBias(); +} + +TEST_F(FusedConv2dOpTest, OPENCLWithoutBias) { + TestNHWCSimple3x3WithoutBias(); +} + +template +void TestConv1x1() { + // Construct graph + OpsTestNet net; + + // Add input data + net.AddInputFromArray( + "Input", {1, 3, 10, 5}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + net.AddInputFromArray( + "Filter", {1, 1, 5, 2}, + {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}); + net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); + + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "Output", kernels::BufferType::IN_OUT); + } else { + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {1, 1}) + .AddIntArg("padding", Padding::VALID) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + // Run + net.RunOp(D); + } + + // Check + auto 
expected = CreateTensor( + {1, 3, 10, 2}, + {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, + 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f}); + + ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); +} + +TEST_F(FusedConv2dOpTest, CPUConv1x1) { + TestConv1x1(); +} + +TEST_F(FusedConv2dOpTest, OPENCLConv1x1) { + TestConv1x1(); +} + +template +static void TestComplexConvNxNS12(const std::vector &shape) { + testing::internal::LogToStderr(); + auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, + Padding type) { + srand(time(NULL)); + + // generate random input + index_t batch = 3 + (rand() % 10); + index_t height = shape[0]; + index_t width = shape[1]; + index_t input_channels = shape[2] + (rand() % 10); + index_t output_channels = shape[3] + (rand() % 10); + // Construct graph + OpsTestNet net; + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + + // Add input data + net.AddRandomInput("Input", {batch, height, width, input_channels}); + net.AddRandomInput( + "Filter", {kernel_h, kernel_w, input_channels, output_channels}); + net.AddRandomInput("Bias", {output_channels}); + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataTypeToEnum::value)) + .Finalize(net.NewOperatorDef()); + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.001); + }; + + for (int kernel_size : {1, 3}) { + for (int stride : {1, 2}) { + func(kernel_size, kernel_size, stride, stride, VALID); + func(kernel_size, kernel_size, stride, stride, SAME); + } + } +} + +TEST_F(FusedConv2dOpTest, OPENCLUnalignedConvNxNS12) { + TestComplexConvNxNS12({107, 113, 5, 7}); +} + +template +static void TestHalfComplexConvNxNS12(const std::vector &shape) { + testing::internal::LogToStderr(); + auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, + Padding type) { + srand(time(NULL)); + + // generate random input + index_t batch = 3 + (rand() % 10); + index_t height = shape[0]; + index_t width = shape[1]; + index_t input_channels = shape[2] + (rand() % 10); + index_t output_channels = shape[3] + (rand() % 10); + // Construct graph + OpsTestNet net; + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride_h, stride_w}) + 
.AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + + std::vector float_input_data; + GenerateRandomRealTypeData({batch, height, width, input_channels}, float_input_data); + std::vector float_filter_data; + GenerateRandomRealTypeData({kernel_h, kernel_w, input_channels, output_channels}, float_filter_data); + std::vector float_bias_data; + GenerateRandomRealTypeData({output_channels}, float_bias_data); + // Add input data + net.AddInputFromArray("Input", {batch, height, width, input_channels}, float_input_data); + net.AddInputFromArray( + "Filter", {kernel_h, kernel_w, input_channels, output_channels}, float_filter_data); + net.AddInputFromArray("Bias", {output_channels}, float_bias_data); + + // run on cpu + net.RunOp(); + // Check + Tensor expected; + expected.Copy(*net.GetOutput("Output")); + + // run on gpu + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + + OpDefBuilder("FusedConv2D", "FusedConv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("OutputImage") + .AddIntsArg("strides", {stride_h, stride_w}) + .AddIntArg("padding", type) + .AddIntsArg("dilations", {1, 1}) + .AddIntArg("T", static_cast(DataType::DT_HALF)) + .Finalize(net.NewOperatorDef()); + // Run on device + net.RunOp(D); + + ImageToBuffer(net, "OutputImage", "OPENCLOutput", kernels::BufferType::IN_OUT); + + ExpectTensorNear(expected, *net.GetOutput("OPENCLOutput"), 0.2); + }; + + for (int kernel_size : {1, 3}) { + for (int stride : {1, 2}) { + func(kernel_size, kernel_size, stride, stride, VALID); + } + } +} + +TEST_F(FusedConv2dOpTest, OPENCLHalfAlignedConvNxNS12) { + TestHalfComplexConvNxNS12({32, 32, 32, 64}); +} + -- GitLab
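Note on the fusion strategy in this patch: the CPU path of FusedConv2dFunctor runs the ordinary Conv2dFunctor and then clamps negative values in place over the output buffer, while the OpenCL path folds the clamp into the kernel itself by adding -DFUSED_RELU to built_options so the .cl code applies fmax(out, 0) before the image writes. The sketch below only illustrates that "convolve, then clamp in place" structure with simplified stand-ins (plain float vectors and a callable instead of Conv2dFunctor/Tensor); the names FusedConvThenRelu and the toy lambda are hypothetical and not part of the MACE API.

// Illustrative sketch (not MACE code): run a convolution, then apply the
// folded ReLU as an in-place pass over the output buffer.
#include <algorithm>
#include <cstdio>
#include <vector>

template <typename ConvFn>
void FusedConvThenRelu(const std::vector<float> &input,
                       const std::vector<float> &filter,
                       std::vector<float> *output,
                       ConvFn conv) {
  conv(input, filter, output);   // 1. plain convolution fills *output
  for (float &v : *output) {     // 2. folded ReLU: clamp negatives to zero
    v = std::max(v, 0.0f);
  }
}

int main() {
  const std::vector<float> in{1.0f, -2.0f, 3.0f};
  const std::vector<float> w{1.0f};
  std::vector<float> out;
  // Toy "convolution" (identity copy) just to exercise the fused wrapper.
  FusedConvThenRelu(in, w, &out,
                    [](const std::vector<float> &x,
                       const std::vector<float> & /*filter*/,
                       std::vector<float> *y) { *y = x; });
  for (float v : out) std::printf("%g ", v);  // prints: 1 0 3
  std::printf("\n");
  return 0;
}

On the GPU side no such extra pass over memory is needed: Conv1x1 and Conv2d3x3S12 receive the new is_relu flag and compile the kernel with -DFUSED_RELU, so the clamp happens on the accumulators (out0..out4) right before WRITE_IMAGET, which is why the tests with all-negative inputs expect exactly zero outputs.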