diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl new file mode 100644 index 0000000000000000000000000000000000000000..1c97a581b9527a41ff48c4d40e1c97e74329bba0 --- /dev/null +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -0,0 +1,76 @@ +float4 conv1x3_s1(const float *input_ptr, + const float *filter_ptr); +float4 conv1x3_s2(const float *input_ptr, + const float *filter_ptr); +float conv3x3(const float *input_ptr, + const float *filter_ptr, + const int row_width); + +void kernel conv_2d_3x3(global const float *input, + global const float *filter, + global const float *bias, + global float *output, + private const uint in_chan_num, + private const uint out_chan_num, + private const uint in_height, + private const uint in_width, + private const uint out_height, + private const uint out_width, + private const uint stride_h, + private const uint stride_w) { + int batch = get_global_id(0); + int out_chan_blk = get_global_id(1); + int out_pixel_blk = get_global_id(2); + + const uint in_pixel = in_height * in_width; + const uint out_pixel = out_height * out_width; + + const uint round_out_width = (out_width + 3) / 4; + const uint out_pixel_height = out_pixel_blk / round_out_width; + const uint out_pixel_width = out_pixel_blk % round_out_width; + + const uint out_chan_begin = out_chan_blk * 4; + const uint out_chan_end = min(out_chan_begin + 4, out_chan_num); + const uint out_pixel_begin = out_pixel_height * out_width + out_pixel_width * 4; + const uint out_pixel_end = min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width); + const uint in_pixel_begin = out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4; + + const uint in_offset = batch * in_chan_num * in_pixel; + const uint out_offset = batch * out_chan_num * out_pixel; + const float *input_base = input + in_offset + in_pixel_begin; + float *output_base = output + out_offset + out_pixel_begin; + + uint pixels = out_pixel_end - out_pixel_begin; + + for (uint i = out_chan_begin; i < out_chan_end; ++i) { + float4 res = (float4)bias[i]; + float *output_ptr = output_base + i * out_pixel; + const float *filter_base = filter + i * in_chan_num * 9; + if (pixels == 4) { + for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) { + const float* input_ptr = input_base + in_chan_idx * in_pixel; + const float* filter_ptr = filter_base + in_chan_idx * 9; + if (stride_w == 1) { + res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3); + res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3); + res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3); + } else { + res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3); + res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3); + res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3); + } + } + vstore4(res, 0, output_ptr); + } else { + for (uint p = 0; p < pixels; ++p) { + float res = bias[i]; + for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) { + const float* input_ptr = input_base + in_chan_idx * in_pixel + p * stride_w; + const float* filter_ptr = filter_base + in_chan_idx * 9; + res += conv3x3(input_ptr, filter_ptr, in_width); + } + output_ptr[p] = res; + } + } + } +} diff --git a/mace/kernels/opencl/cl/conv_helper.cl b/mace/kernels/opencl/cl/conv_helper.cl new file mode 100644 index 0000000000000000000000000000000000000000..f569ef3a51dd6a5069644ecf9e2780f8a5bd3920 --- /dev/null +++ b/mace/kernels/opencl/cl/conv_helper.cl @@ -0,0 +1,41 @@ +float4 conv1x3_s1(const float *input_ptr, + const float *filter_ptr) { + float4 row0 = vload4(0, input_ptr); + float2 input1 = vload2(0, input_ptr+4); + float4 row1 = (float4)(row0.s123, input1.s0); + float4 row2 = (float4)(row0.s23, input1.s01); + float3 filter_values = vload3(0, filter_ptr); + return (float4)filter_values.s0 * row0 + + (float4)filter_values.s1 * row1 + + (float4)filter_values.s2 * row2; +} + +float4 conv1x3_s2(const float *input_ptr, + const float *filter_ptr) { + float8 input = vload8(0, input_ptr); + float4 row0 = input.even; + float4 row1 = input.odd; + float4 row2 = (float4)(row0.s123, input_ptr[8]); + float3 filter_values = vload3(0, filter_ptr); + return (float4)filter_values.s0 * row0 + + (float4)filter_values.s1 * row1 + + (float4)filter_values.s2 * row2; +} + +float conv3x3(const float *input_ptr, + const float *filter_ptr, + const int row_width) { + float3 input_value = vload3(0, input_ptr); + float3 filter_value = vload3(0, filter_ptr); + float3 res = input_value * filter_value; + input_ptr += row_width; + input_value = vload3(0, input_ptr); + filter_value = vload3(1, filter_ptr); + res += input_value * filter_value; + input_ptr += row_width; + input_value = vload3(0, input_ptr); + filter_value = vload3(2, filter_ptr); + res += input_value * filter_value; + + return res.s0 + res.s1 + res.s2; +} diff --git a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl index 084156e13788e53769458ca901cbbc19cbc84f10..2cdbe3fa2a7c696678c921d1d39d9ab34751710c 100644 --- a/mace/kernels/opencl/cl/depthwise_conv_3x3.cl +++ b/mace/kernels/opencl/cl/depthwise_conv_3x3.cl @@ -1,37 +1,8 @@ -inline float4 conv1x3(const float *input_ptr, - const float *filter_ptr) { - float8 input = vload8(0, input_ptr); - float4 row0 = convert_float4(input.s0123); - float4 row1 = convert_float4(input.s1234); - float4 row2 = convert_float4(input.s2345); - return (float4)filter_ptr[0] * row0 + (float4)filter_ptr[1] * row1 - + (float4)filter_ptr[2] * row2; -} - -inline float4 conv3x3x4(const float *input_ptr, - const float *filter_ptr, - const int row_width) { - float4 res; - res = conv1x3(input_ptr + 0 * row_width, filter_ptr + 0 * 3); - res += conv1x3(input_ptr + 1 * row_width, filter_ptr + 1 * 3); - res += conv1x3(input_ptr + 2 * row_width, filter_ptr + 2 * 3); - - return res; -} - -inline float conv3x3(const float *input_ptr, +float4 conv1x3_s1(const float *input_ptr, + const float *filter_ptr); +float conv3x3(const float *input_ptr, const float *filter_ptr, - const int row_width) { - float res = input_ptr[0] * filter_ptr[0] + input_ptr[1] * filter_ptr[1] + input_ptr[2] * filter_ptr[2]; - input_ptr += row_width; - filter_ptr += 3; - res += input_ptr[0] * filter_ptr[0] + input_ptr[1] * filter_ptr[1] + input_ptr[2] * filter_ptr[2]; - input_ptr += row_width; - filter_ptr += 3; - res += input_ptr[0] * filter_ptr[0] + input_ptr[1] * filter_ptr[1] + input_ptr[2] * filter_ptr[2]; - - return res; -} + const int row_width); void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */ global const float *filter, /* m, i, kh, kw */ @@ -80,8 +51,10 @@ void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */ input_ptr += 1; } } else { - float4 res = conv3x3x4(input_ptr, filter_ptr, in_width); - res += (float4)bias_value; + float4 res = (float4)bias_value; + res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3); + res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3); + res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3); vstore4(res, 0, output_ptr); } } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 9eeec07ac57315c1ffa0ced1aee33f11eaeece8a..fcdb3de208fa6da2997dc391d94f514107bc60d7 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -3,7 +3,6 @@ // #include "mace/kernels/conv_2d.h" -#include "mace/kernels/conv_pool_2d_util.h" namespace mace { namespace kernels { @@ -11,6 +10,11 @@ namespace kernels { extern void Conv2dOpenclK1x1S1(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output); +extern void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, + const Tensor *bias, Tensor *output); + +extern void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, + const Tensor *bias, Tensor *output); template <> void Conv2dFunctor::operator()(const Tensor *input, const Tensor *filter, @@ -22,7 +26,7 @@ void Conv2dFunctor::operator()(const Tensor *input, static const Conv2dOpenclFunction selector[5][2] = { {Conv2dOpenclK1x1S1, nullptr}, {nullptr, nullptr}, - {nullptr, nullptr}, + {Conv2dOpenclK3x3S1, Conv2dOpenclK3x3S2}, {nullptr, nullptr}, {nullptr, nullptr}}; @@ -40,11 +44,16 @@ void Conv2dFunctor::operator()(const Tensor *input, input, filter, bias, output); return; } - - MACE_CHECK(paddings_[0] == 0 && paddings_[1] == 0, "Padding not supported"); - auto conv2d_func = selector[kernel_h - 1][strides_[0] - 1]; - conv2d_func(input, filter, bias, output); + if (paddings_[0] > 0 || paddings_[1] > 0) { + Tensor padded_input(GetDeviceAllocator(DeviceType::OPENCL), DataTypeToEnum::v()); + Tensor::MappingGuard input_mapper(input); + ConstructInputWithPadding(input->data(), input->shape().data(), paddings_.data(), + &padded_input); + conv2d_func(&padded_input, filter, bias, output); + }else { + conv2d_func(input, filter, bias, output); + } } } // namespace kernels diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc new file mode 100644 index 0000000000000000000000000000000000000000..448f31d6f2c940855b8ba308fd49b0df8f71dd41 --- /dev/null +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -0,0 +1,64 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/core/common.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" +#include "mace/kernels/conv_2d.h" + +namespace mace { +namespace kernels { + +static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter, + const Tensor *bias, const uint32_t stride, Tensor *output) { + const index_t channels = output->shape()[1]; + const index_t height = output->shape()[2]; + const index_t width = output->shape()[3]; + + MACE_CHECK(input->dim(0) == output->dim(0)); + + const index_t channel_blocks = (channels + 3) / 4; + const index_t pixel_blocks = (width + 3) / 4 * height; + + auto runtime = OpenCLRuntime::Get(); + auto program = runtime->program(); + auto bm_kernel = cl::Kernel(program, "conv_2d_3x3"); + + uint32_t idx = 0; + bm_kernel.setArg(idx++, *(static_cast(input->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(filter->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(bias->buffer()))); + bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); + bm_kernel.setArg(idx++, static_cast(input->dim(1))); + bm_kernel.setArg(idx++, static_cast(channels)); + bm_kernel.setArg(idx++, static_cast(input->dim(2))); + bm_kernel.setArg(idx++, static_cast(input->dim(3))); + bm_kernel.setArg(idx++, static_cast(height)); + bm_kernel.setArg(idx++, static_cast(width)); + bm_kernel.setArg(idx++, stride); + bm_kernel.setArg(idx++, stride); + const uint32_t gws[3] = {static_cast(output->dim(0)), + static_cast(channel_blocks), + static_cast(pixel_blocks)}; + const uint32_t lws[3] = {static_cast(1), + static_cast(1), + static_cast(256)}; + cl_int error = runtime->command_queue().enqueueNDRangeKernel( + bm_kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2])); + MACE_CHECK(error == CL_SUCCESS); +} + +void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, + const Tensor *bias, Tensor *output) { + InnerConv2dK3x3S12(input, filter, bias, 1, output); +}; + +void Conv2dOpenclK3x3S2(const Tensor *input, const Tensor *filter, + const Tensor *bias, Tensor *output) { + InnerConv2dK3x3S12(input, filter, bias, 2, output); +}; + +} // namespace kernels +} // namespace mace diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index ca3ccf72b96e642e34fc1f6c8b45d787e000dfa9..d408b05bfe7eeb1acdddbcf103f1c9bb434dca9a 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -3,7 +3,6 @@ // #include -#include #include "mace/core/operator.h" #include "mace/core/testing/test_benchmark.h" @@ -14,7 +13,6 @@ namespace mace { template static void Conv2d(int iters, - int iters_to_sync, int batch, int channels, int height, @@ -32,37 +30,32 @@ static void Conv2d(int iters, .Input("Filter") .Input("Bias") .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); + .Finalize(net.operator_def()); + + // Add args + net.AddIntsArg("strides", {stride, stride}); + net.AddIntArg("padding", padding); + net.AddIntsArg("dilations", {1, 1}); // Add input data net.AddRandomInput("Input", {batch, channels, height, width}); net.AddRandomInput("Filter", - {output_channels, channels, kernel_h, kernel_w}); + {output_channels, channels, kernel_h, kernel_w}); net.AddRandomInput("Bias", {output_channels}); // Warm-up for (int i = 0; i < 5; ++i) { net.RunOp(D); - net.Sync(); } + net.Sync(); mace::testing::StartTiming(); while (iters--) { net.RunOp(D); - if (iters % iters_to_sync == 0) { - net.Sync(); - } } + net.Sync(); } -// In common network, there are usually more than 1 layers, this is used to -// approximate the amortized latency. The OpenCL runtime for Mali/Adreno is -// in-order. -constexpr int kItersToSync = 10; - #define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, DEVICE) \ static void \ BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \ @@ -70,8 +63,8 @@ constexpr int kItersToSync = 10; const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::ItemsProcessed(tot); \ mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - Conv2d(iters, kItersToSync, N, C, H, W, KH, KW, STRIDE, \ - mace::Padding::P, OC); \ + Conv2d(iters, N, C, H, W, KH, KW, STRIDE, mace::Padding::P, \ + OC); \ } \ BENCHMARK( \ BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE) diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index a8e9f189c255e09178b5b6aa02f0e9ab28606786..31d3130a88796bd21776e2a67ea12be1ed9a7fdb 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -3,16 +3,15 @@ // #include "mace/ops/conv_2d.h" -#include "mace/core/operator.h" #include "mace/ops/ops_test_util.h" using namespace mace; class Conv2dOpTest : public OpsTestBase {}; -TEST_F(Conv2dOpTest, Simple_VALID) { - // Construct graph - auto &net = test_net(); +template +void TestSimple3x3VALID() { + OpsTestNet net; OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") .Input("Filter") @@ -26,27 +25,28 @@ TEST_F(Conv2dOpTest, Simple_VALID) { // Add args // Add input data - net.AddInputFromArray( + net.AddInputFromArray( "Input", {1, 2, 3, 3}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - net.AddInputFromArray( + net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - net.AddInputFromArray("Bias", {1}, {0.1f}); + net.AddInputFromArray("Bias", {1}, {0.1f}); // Run - net.RunOp(); + net.RunOp(D); // Check auto expected = CreateTensor({1, 1, 1, 1}, {18.1f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); + } -TEST_F(Conv2dOpTest, Simple_SAME) { - // Construct graph - auto &net = test_net(); +template +void TestSimple3x3SAME() { + OpsTestNet net; OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") .Input("Filter") @@ -58,17 +58,17 @@ TEST_F(Conv2dOpTest, Simple_SAME) { .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray( + net.AddInputFromArray( "Input", {1, 2, 3, 3}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - net.AddInputFromArray( + net.AddInputFromArray( "Filter", {1, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - net.AddInputFromArray("Bias", {1}, {0.1f}); + net.AddInputFromArray("Bias", {1}, {0.1f}); // Run - net.RunOp(); + net.RunOp(D); // Check auto expected = CreateTensor( @@ -78,9 +78,25 @@ TEST_F(Conv2dOpTest, Simple_SAME) { ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -TEST_F(Conv2dOpTest, Combined) { +TEST_F(Conv2dOpTest, CPUSimple) { + TestSimple3x3VALID(); + TestSimple3x3SAME(); +} + +TEST_F(Conv2dOpTest, NEONSimple) { + TestSimple3x3VALID(); + TestSimple3x3SAME(); +} + +TEST_F(Conv2dOpTest, OPENCLSimple) { + TestSimple3x3VALID(); + TestSimple3x3SAME(); +} + +template +static void TestCombined3x3() { // Construct graph - auto &net = test_net(); + OpsTestNet net; OpDefBuilder("Conv2D", "Conv2DTest") .Input("Input") .Input("Filter") @@ -92,19 +108,19 @@ TEST_F(Conv2dOpTest, Combined) { .Finalize(net.NewOperatorDef()); // Add input data - net.AddInputFromArray( + net.AddInputFromArray( "Input", {1, 2, 5, 5}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - net.AddInputFromArray( + net.AddInputFromArray( "Filter", {2, 2, 3, 3}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}); - net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); + net.AddInputFromArray("Bias", {2}, {0.1f, 0.2f}); // Run - net.RunOp(); + net.RunOp(D); // Check auto expected = CreateTensor( @@ -112,6 +128,19 @@ TEST_F(Conv2dOpTest, Combined) { 4.2f, 6.2f, 4.2f, 6.2f, 9.2f, 6.2f, 4.2f, 6.2f, 4.2f}); ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); + +} + +TEST_F(Conv2dOpTest, CPUCombined) { + TestCombined3x3(); +} + +TEST_F(Conv2dOpTest, NEONCombined) { + TestCombined3x3(); +} + +TEST_F(Conv2dOpTest, OPENCLCombined) { + TestCombined3x3(); } template @@ -159,13 +188,16 @@ void TestConv1x1() { ExpectTensorNear(*expected, *net.GetOutput("Output"), 0.001); } -TEST_F(Conv2dOpTest, Conv1x1) { +TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1(); +} + +TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1(); } -// TODO we need more tests -TEST_F(Conv2dOpTest, AlignedConvNxNS12) { +template +static void TestAlignedConvNxNS12() { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, Padding type) { @@ -178,7 +210,7 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) { index_t width = 32; index_t output_channels = 128; // Construct graph - auto &net = test_net(); + OpsTestNet net; OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") .Input("Filter") @@ -190,19 +222,19 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) { .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {batch, input_channels, height, width}); - net.AddRandomInput( + net.AddRandomInput("Input", {batch, input_channels, height, width}); + net.AddRandomInput( "Filter", {output_channels, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {output_channels}); - // run cpu - net.RunOp(); + net.AddRandomInput("Bias", {output_channels}); + // Run on device + net.RunOp(D); // Check Tensor expected; expected.Copy(*net.GetOutput("Output")); - // Run NEON - net.RunOp(DeviceType::NEON); + // run cpu + net.RunOp(); ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); }; @@ -214,7 +246,16 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) { } } -TEST_F(Conv2dOpTest, UnalignedConvNxNS12) { +TEST_F(Conv2dOpTest, NEONAlignedConvNxNS12) { + TestAlignedConvNxNS12(); +} + +TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) { + TestAlignedConvNxNS12(); +} + +template +static void TestUnalignedConvNxNS12() { testing::internal::LogToStderr(); auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w, Padding type) { @@ -227,7 +268,7 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) { index_t width = 113; index_t output_channels = 3 + rand() % 10; // Construct graph - auto &net = test_net(); + OpsTestNet net; OpDefBuilder("Conv2D", "Conv2dTest") .Input("Input") .Input("Filter") @@ -239,19 +280,19 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) { .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {batch, input_channels, height, width}); - net.AddRandomInput( + net.AddRandomInput("Input", {batch, input_channels, height, width}); + net.AddRandomInput( "Filter", {output_channels, input_channels, kernel_h, kernel_w}); - net.AddRandomInput("Bias", {output_channels}); - // run cpu - net.RunOp(); + net.AddRandomInput("Bias", {output_channels}); + // Run on device + net.RunOp(D); // Check Tensor expected; expected.Copy(*net.GetOutput("Output")); - // Run NEON - net.RunOp(DeviceType::NEON); + // run cpu + net.RunOp(); ExpectTensorNear(expected, *net.GetOutput("Output"), 0.001); }; @@ -262,3 +303,11 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) { } } } + +TEST_F(Conv2dOpTest, NEONUnalignedConvNxNS12) { + TestUnalignedConvNxNS12(); +} + +TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) { + TestUnalignedConvNxNS12(); +} diff --git a/mace/ops/depthwise_conv_2d_benchmark.cc b/mace/ops/depthwise_conv_2d_benchmark.cc index 13f64ddfbcdeb62378fd6b9bfc43a647910c0fdb..71dca4cc212428a16de996344a06e5ed3ebfb1c5 100644 --- a/mace/ops/depthwise_conv_2d_benchmark.cc +++ b/mace/ops/depthwise_conv_2d_benchmark.cc @@ -57,7 +57,7 @@ static void DepthwiseConv2d(int iters, #define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \ DEVICE) \ static void \ - BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \ + BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \ int iters) { \ const int64_t tot = static_cast(iters) * N * C * H * W; \ mace::testing::ItemsProcessed(tot); \ @@ -66,7 +66,7 @@ static void DepthwiseConv2d(int iters, mace::Padding::P, OC); \ } \ BENCHMARK( \ - BM_DEPTHWISE_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE) + BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE) #define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \ BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \