// Patch summary for mace/ops/conv_2d_benchmark.cc (text before the "diff --git"
// header is preamble only; the patch body below is unchanged).
//
// Hunk 1 (inside the Conv2d benchmark helper):
//   * "Input" is now created with shape {batch, height, width, channels}
//     instead of {batch, channels, height, width}; "Filter" is created with
//     {kernel_h, kernel_w, channels, output_channels} instead of
//     {output_channels, channels, kernel_h, kernel_w}.
//   * The operator definition is built after the inputs are added. When
//     D == DeviceType::OPENCL, Input/Filter/Bias are first passed through
//     BufferToImage (IN_OUT / FILTER / ARGUMENT buffer types) and the op reads
//     "InputImage"/"FilterImage"/"BiasImage"; otherwise it reads the original
//     "Input"/"Filter"/"Bias" names. All other op arguments are the same in
//     both branches.
//   * Warm-up drops from 5 to 2 iterations, and net.Sync() moves inside both
//     the warm-up loop and the timed loop, so it is called after every
//     net.RunOp(D) rather than once after each loop.
//   * The kItersToSync constant is deleted.
// Hunk 2: BM_CONV_2D no longer expands the CPU and NEON variants; only the
//   OPENCL variant remains.
// Hunk 3: adds the case BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, VALID, 32, float).
//
// NOTE(review): the context comment "In common network, there are usually more
//   than 1 layers ..." sat directly above kItersToSync; with that constant
//   removed it presumably no longer describes anything -- confirm and drop or
//   reword it in the target file.
// NOTE(review): the two OpDefBuilder chains differ only in their input names;
//   consider building the names conditionally to avoid the duplication.
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 8bfee960e777c7e90f08d2ae508efe620bc2b230..24211ca1832921c89828b6ec00f45e33a152b77c 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -25,39 +25,54 @@ static void Conv2d(int iters, mace::testing::StopTiming(); OpsTestNet net; - OpDefBuilder("Conv2D", "Conv2dTest") - .Input("Input") - .Input("Filter") - .Input("Bias") - .Output("Output") - .AddIntsArg("strides", {stride, stride}) - .AddIntArg("padding", padding) - .AddIntsArg("dilations", {1, 1}) - .Finalize(net.NewOperatorDef()); // Add input data - net.AddRandomInput("Input", {batch, channels, height, width}); + net.AddRandomInput("Input", {batch, height, width, channels}); net.AddRandomInput("Filter", - {output_channels, channels, kernel_h, kernel_w}); + {kernel_h, kernel_w, channels, output_channels}); net.AddRandomInput("Bias", {output_channels}); + if (D == DeviceType::OPENCL) { + BufferToImage(net, "Input", "InputImage", kernels::BufferType::IN_OUT); + BufferToImage(net, "Filter", "FilterImage", kernels::BufferType::FILTER); + BufferToImage(net, "Bias", "BiasImage", kernels::BufferType::ARGUMENT); + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("InputImage") + .Input("FilterImage") + .Input("BiasImage") + .Output("Output") + .AddIntsArg("strides", {stride, stride}) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + } else { + OpDefBuilder("Conv2D", "Conv2dTest") + .Input("Input") + .Input("Filter") + .Input("Bias") + .Output("Output") + .AddIntsArg("strides", {stride, stride}) + .AddIntArg("padding", padding) + .AddIntsArg("dilations", {1, 1}) + .Finalize(net.NewOperatorDef()); + } + // Warm-up - for (int i = 0; i < 5; ++i) { + for (int i = 0; i < 2; ++i) { net.RunOp(D); + net.Sync(); } - net.Sync(); mace::testing::StartTiming(); while (iters--) { net.RunOp(D); + net.Sync(); } - net.Sync(); } // In common network, there are usually more than 1 
layers, this is used to // approximate the amortized latency. The OpenCL runtime for Mali/Adreno is // in-order. -constexpr int kItersToSync = 10; #define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, DEVICE) \ static void \ @@ -73,8 +88,6 @@ constexpr int kItersToSync = 10; BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE) #define BM_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \ - BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \ - BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, NEON); \ BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL); // ICNet @@ -85,6 +98,9 @@ BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, float); // SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108 BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, float); +// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 +BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, VALID, 32, float); + // Test RGB <-> YUV BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); BM_CONV_2D(1, 3, 480, 480, 1, 1, 1, VALID, 3, float);