diff --git a/test/ccbenchmark/mace/ops/buffer_to_image_benchmark.cc b/test/ccbenchmark/mace/ops/buffer_to_image_benchmark.cc index 07685255c407a59e57f2edd2d01570bddf2e54bd..00ed98927d034a4da4cdfe6235ac491792a47dd6 100644 --- a/test/ccbenchmark/mace/ops/buffer_to_image_benchmark.cc +++ b/test/ccbenchmark/mace/ops/buffer_to_image_benchmark.cc @@ -14,8 +14,8 @@ #ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/benchmark_utils/test_benchmark.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/ops/opencl/buffer_transformer.h" #include "mace/ops/ops_test_util.h" @@ -25,29 +25,88 @@ namespace test { namespace { template -void FilterBufferToImage(int iters, - int out_channel, int in_channel, - int height, int width) { +void FilterBufferToImage( + int iters, int out_channel, int in_channel, int height, int width) { + mace::testing::StopTiming(); + + OpsTestNet net; + OpContext context(net.ws(), OpTestContext::Get()->GetDevice(DeviceType::GPU)); + + // Add input data + net.AddRandomInput("Input", {out_channel, in_channel, height, width}); + // Create output + Tensor *b2i_output = net.ws()->CreateTensor( + "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); + + auto transform_func = [&]() { + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + .Transform(&context, net.ws()->GetTensor("Input"), + OpenCLBufferType::CONV2D_FILTER, MemoryType::GPU_IMAGE, 0, + b2i_output); + }; + + for (int i = 0; i < 5; ++i) { + transform_func(); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + transform_func(); + } + net.Sync(); +} +} // namespace + +#define MACE_BM_FILTER_B2I_MACRO(O, I, H, W, TYPE, DEVICE) \ + static void MACE_BM_FILTER_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * O * I * H * W; \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + FilterBufferToImage(iters, O, I, H, W); \ + } \ + MACE_BENCHMARK(MACE_BM_FILTER_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE) + +#define MACE_BM_FILTER_B2I(O, I, H, W) \ + MACE_BM_FILTER_B2I_MACRO(O, I, H, W, float, GPU); \ + MACE_BM_FILTER_B2I_MACRO(O, I, H, W, half, GPU); + +MACE_BM_FILTER_B2I(5, 3, 3, 3); +MACE_BM_FILTER_B2I(5, 3, 7, 7); +MACE_BM_FILTER_B2I(32, 16, 1, 1); +MACE_BM_FILTER_B2I(32, 16, 3, 3); +MACE_BM_FILTER_B2I(32, 16, 5, 5); +MACE_BM_FILTER_B2I(32, 16, 7, 7); +MACE_BM_FILTER_B2I(64, 32, 1, 1); +MACE_BM_FILTER_B2I(64, 32, 3, 3); +MACE_BM_FILTER_B2I(64, 32, 5, 5); +MACE_BM_FILTER_B2I(64, 32, 7, 7); +MACE_BM_FILTER_B2I(128, 64, 1, 1); +MACE_BM_FILTER_B2I(128, 64, 3, 3); +MACE_BM_FILTER_B2I(128, 32, 1, 1); +MACE_BM_FILTER_B2I(128, 32, 3, 3); +MACE_BM_FILTER_B2I(256, 32, 1, 1); +MACE_BM_FILTER_B2I(256, 32, 3, 3); + +namespace { +template +void InOutBufferToImage( + int iters, int batch, int height, int width, int channel) { mace::testing::StopTiming(); OpsTestNet net; - OpContext context(net.ws(), - OpTestContext::Get()->GetDevice(DeviceType::GPU)); + OpContext context(net.ws(), OpTestContext::Get()->GetDevice(DeviceType::GPU)); // Add input data - net.AddRandomInput("Input", - {out_channel, in_channel, height, width}); + net.AddRandomInput("Input", {batch, height, width, channel}); // Create output Tensor *b2i_output = net.ws()->CreateTensor( "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); auto transform_func = [&]() { OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) - .Transform(&context, - net.ws()->GetTensor("Input"), - OpenCLBufferType::IN_OUT_CHANNEL, - MemoryType::GPU_IMAGE, - 0, + .Transform(&context, net.ws()->GetTensor("Input"), + OpenCLBufferType::IN_OUT_CHANNEL, MemoryType::GPU_IMAGE, 0, b2i_output); }; @@ -64,36 +123,83 @@ void FilterBufferToImage(int iters, } } // namespace -#define MACE_BM_B2I_MACRO(O, I, H, W, TYPE, DEVICE) \ - static void MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE( \ - int iters) { \ - const int64_t tot = static_cast(iters) * O * I * H * W; \ - mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ - FilterBufferToImage(iters, O, I, H, W); \ - } \ - MACE_BENCHMARK(MACE_BM_B2I_##O##_##I##_##H##_##W##_##TYPE##_##DEVICE) - -#define MACE_BM_B2I(O, I, H, W) \ - MACE_BM_B2I_MACRO(O, I, H, W, float, GPU); \ - MACE_BM_B2I_MACRO(O, I, H, W, half, GPU); - -MACE_BM_B2I(5, 3, 3, 3); -MACE_BM_B2I(5, 3, 7, 7); -MACE_BM_B2I(32, 16, 1, 1); -MACE_BM_B2I(32, 16, 3, 3); -MACE_BM_B2I(32, 16, 5, 5); -MACE_BM_B2I(32, 16, 7, 7); -MACE_BM_B2I(64, 32, 1, 1); -MACE_BM_B2I(64, 32, 3, 3); -MACE_BM_B2I(64, 32, 5, 5); -MACE_BM_B2I(64, 32, 7, 7); -MACE_BM_B2I(128, 64, 1, 1); -MACE_BM_B2I(128, 64, 3, 3); -MACE_BM_B2I(128, 32, 1, 1); -MACE_BM_B2I(128, 32, 3, 3); -MACE_BM_B2I(256, 32, 1, 1); -MACE_BM_B2I(256, 32, 3, 3); +#define MACE_BM_IN_OUT_B2I_MACRO(N, H, W, C, TYPE, DEVICE) \ + static void MACE_BM_IN_OUT_B2I_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W * C; \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + InOutBufferToImage(iters, N, H, W, C); \ + } \ + MACE_BENCHMARK(MACE_BM_IN_OUT_B2I_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) + +#define MACE_BM_IN_OUT_B2I(N, H, W, C) \ + MACE_BM_IN_OUT_B2I_MACRO(N, H, W, C, float, GPU); \ + MACE_BM_IN_OUT_B2I_MACRO(N, H, W, C, half, GPU); + +MACE_BM_IN_OUT_B2I(256, 1, 1, 32); +MACE_BM_IN_OUT_B2I(256, 3, 3, 32); +MACE_BM_IN_OUT_B2I(1, 4096, 4096, 3); + +namespace { +template +void InOutImageToBuffer( + int iters, int batch, int height, int width, int channel) { + mace::testing::StopTiming(); + + OpsTestNet net; + OpContext context(net.ws(), OpTestContext::Get()->GetDevice(DeviceType::GPU)); + + // Add input data + net.AddRandomInput("Input", {batch, height, width, channel}); + // Create output + Tensor *b2i_output = net.ws()->CreateTensor( + "B2IOutput", context.device()->allocator(), DataTypeToEnum::value); + + auto transform_func_b2i = [&]() { + OpenCLBufferTransformer(MemoryType::GPU_BUFFER, MemoryType::GPU_IMAGE) + .Transform(&context, net.ws()->GetTensor("Input"), + OpenCLBufferType::IN_OUT_CHANNEL, MemoryType::GPU_IMAGE, 0, + b2i_output); + }; + transform_func_b2i(); + + Tensor *i2b_output = net.ws()->CreateTensor( + "I2BOutput", context.device()->allocator(), DataTypeToEnum::value); + auto transform_func_i2b = [&]() { + OpenCLBufferTransformer(MemoryType::GPU_IMAGE, MemoryType::GPU_BUFFER) + .Transform(&context, b2i_output, OpenCLBufferType::IN_OUT_CHANNEL, + MemoryType::GPU_BUFFER, 0, i2b_output); + }; + + for (int i = 0; i < 5; ++i) { + transform_func_i2b(); + } + net.Sync(); + + mace::testing::StartTiming(); + while (iters--) { + transform_func_i2b(); + } + net.Sync(); +} +} // namespace + +#define MACE_BM_IN_OUT_I2B_MACRO(N, H, W, C, TYPE, DEVICE) \ + static void MACE_BM_IN_OUT_I2B_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE( \ + int iters) { \ + const int64_t tot = static_cast(iters) * N * H * W * C; \ + mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \ + InOutImageToBuffer(iters, N, H, W, C); \ + } \ + MACE_BENCHMARK(MACE_BM_IN_OUT_I2B_##N##_##H##_##W##_##C##_##TYPE##_##DEVICE) + +#define MACE_BM_IN_OUT_I2B(N, H, W, C) \ + MACE_BM_IN_OUT_I2B_MACRO(N, H, W, C, float, GPU); \ + MACE_BM_IN_OUT_I2B_MACRO(N, H, W, C, half, GPU); +MACE_BM_IN_OUT_I2B(256, 1, 1, 32); +MACE_BM_IN_OUT_I2B(256, 3, 3, 32); +MACE_BM_IN_OUT_I2B(1, 4096, 4096, 3); } // namespace test } // namespace ops } // namespace mace