From 225b7e09d15328272b8f8af95d546d8c68c59fe7 Mon Sep 17 00:00:00 2001
From: liyin
Date: Thu, 4 Apr 2019 18:19:58 +0800
Subject: [PATCH] Benchmark thread pool and openmp

---
 mace/ops/thread_pool_benchmark.cc | 168 ++++++++++++++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 mace/ops/thread_pool_benchmark.cc

diff --git a/mace/ops/thread_pool_benchmark.cc b/mace/ops/thread_pool_benchmark.cc
new file mode 100644
index 00000000..1fd14713
--- /dev/null
+++ b/mace/ops/thread_pool_benchmark.cc
@@ -0,0 +1,168 @@
+// Copyright 2019 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// OpenMP and the MACE thread pool should be benchmarked separately.
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/utils/thread_pool.h"
+
+#define MACE_EMPTY_STATEMENT asm volatile("":::"memory");
+
+namespace mace {
+namespace ops {
+namespace test {
+
+namespace {
+
+const size_t kMaxSize = 100000000;
+const size_t image_size = 56 * 56;
+std::vector<float> output_data(kMaxSize), bias_data(kMaxSize);
+
+void OpenMPBenchmark1D(int iters, int size) {
+  while (iters--) {
+    const int b = 0;
+#pragma omp parallel for schedule(runtime)
+    for (int c = 0; c < size; ++c) {
+      for (size_t i = 0; i < image_size; ++i) {
+        output_data[(b * size + c) * image_size + i] += bias_data[c];
+      }
+    }
+  }
+}
+
+void ThreadPoolBenchmark1D(int iters, int size) {
+  mace::testing::StopTiming();
+  utils::ThreadPool thread_pool(4, CPUAffinityPolicy::AFFINITY_BIG_ONLY);
+  thread_pool.Init();
+  mace::testing::StartTiming();
+
+  while (iters--) {
+    const int b = 0;  // 'const' keyword affects performance
+    int batch_size = size * image_size;
+    thread_pool.Compute1D([&](size_t start0, size_t end0, size_t step0) {
+      for (size_t c = start0; c < end0; c += step0) {
+        for (size_t i = 0; i < image_size; ++i) {
+          output_data[(b * size + c) * image_size + i] += bias_data[c];
+        }
+      }
+    }, 0, size, 1);
+  }
+}
+
+void OpenMPBenchmark2D(int iters, int size0, int size1) {
+  while (iters--) {
+#pragma omp parallel for collapse(2) schedule(runtime)
+    for (int b = 0; b < size0; ++b) {
+      for (int c = 0; c < size1; ++c) {
+        for (size_t i = 0; i < image_size; ++i) {
+          // it seems that OpenMP optimizes the following MAC
+          output_data[(b * size1 + c) * image_size + i] += bias_data[c];
+        }
+      }
+    }
+  }
+}
+
+void ThreadPoolBenchmark2D(int iters, int size0, int size1) {
+  mace::testing::StopTiming();
+  utils::ThreadPool thread_pool(4, CPUAffinityPolicy::AFFINITY_BIG_ONLY);
+  thread_pool.Init();
+  mace::testing::StartTiming();
+
+  while (iters--) {
+    thread_pool.Compute2D([&](size_t start0, size_t end0, size_t step0,
+                              size_t start1, size_t end1, size_t step1) {
+      for (size_t b = start0; b < end0; b += step0) {
+        for (size_t c = start1; c < end1; c += step1) {
+          for (size_t i = 0; i < image_size; ++i) {
+            output_data[(b * size1 + c) * image_size + i] += bias_data[c];
+          }
+        }
+      }
+    }, 0, size0, 1, 0, size1, 1);
+  }
+}
+
+}  // namespace
+
+#define MACE_BM_THREADPOOL_OPENMP_1D(SIZE)                                \
+  static void MACE_BM_THREADPOOL_OPENMP_1D_##SIZE(int iters) {            \
+    const int64_t tot = static_cast<int64_t>(iters) * SIZE;               \
+    mace::testing::MacsProcessed(static_cast<int64_t>(iters) * SIZE);     \
+    mace::testing::BytesProcessed(tot * sizeof(float));                   \
+    OpenMPBenchmark1D(iters, SIZE);                                       \
+  }                                                                       \
+  MACE_BENCHMARK(MACE_BM_THREADPOOL_OPENMP_1D_##SIZE)
+
+#define MACE_BM_THREADPOOL_MACE_1D(SIZE)                                  \
+  static void MACE_BM_THREADPOOL_MACE_1D_##SIZE(int iters) {              \
+    const int64_t tot = static_cast<int64_t>(iters) * SIZE;               \
+    mace::testing::MacsProcessed(static_cast<int64_t>(iters) * SIZE);     \
+    mace::testing::BytesProcessed(tot * sizeof(float));                   \
+    ThreadPoolBenchmark1D(iters, SIZE);                                   \
+  }                                                                       \
+  MACE_BENCHMARK(MACE_BM_THREADPOOL_MACE_1D_##SIZE)
+
+#define MACE_BM_THREADPOOL_OPENMP_2D(SIZE0, SIZE1)                        \
+  static void MACE_BM_THREADPOOL_OPENMP_2D_##SIZE0##_##SIZE1(int iters) { \
+    const int64_t tot = static_cast<int64_t>(iters) * SIZE0 * SIZE1;      \
+    mace::testing::MacsProcessed(static_cast<int64_t>(iters) * SIZE0 * SIZE1); \
+    mace::testing::BytesProcessed(tot * sizeof(float));                   \
+    OpenMPBenchmark2D(iters, SIZE0, SIZE1);                               \
+  }                                                                       \
+  MACE_BENCHMARK(MACE_BM_THREADPOOL_OPENMP_2D_##SIZE0##_##SIZE1)
+
+#define MACE_BM_THREADPOOL_MACE_2D(SIZE0, SIZE1)                          \
+  static void MACE_BM_THREADPOOL_MACE_2D_##SIZE0##_##SIZE1(int iters) {   \
+    const int64_t tot = static_cast<int64_t>(iters) * SIZE0 * SIZE1;      \
+    mace::testing::MacsProcessed(static_cast<int64_t>(iters) * SIZE0 * SIZE1); \
+    mace::testing::BytesProcessed(tot * sizeof(float));                   \
+    ThreadPoolBenchmark2D(iters, SIZE0, SIZE1);                           \
+  }                                                                       \
+  MACE_BENCHMARK(MACE_BM_THREADPOOL_MACE_2D_##SIZE0##_##SIZE1)
+
+// OpenMP and the MACE thread pool need to be benchmarked separately.
+
+MACE_BM_THREADPOOL_OPENMP_1D(64);
+MACE_BM_THREADPOOL_OPENMP_1D(128);
+MACE_BM_THREADPOOL_OPENMP_1D(256);
+MACE_BM_THREADPOOL_OPENMP_1D(512);
+MACE_BM_THREADPOOL_OPENMP_1D(1024);
+
+MACE_BM_THREADPOOL_OPENMP_2D(1, 64);
+MACE_BM_THREADPOOL_OPENMP_2D(1, 128);
+MACE_BM_THREADPOOL_OPENMP_2D(1, 256);
+MACE_BM_THREADPOOL_OPENMP_2D(1, 512);
+MACE_BM_THREADPOOL_OPENMP_2D(1, 1024);
+
+
+MACE_BM_THREADPOOL_MACE_1D(64);
+MACE_BM_THREADPOOL_MACE_1D(128);
+MACE_BM_THREADPOOL_MACE_1D(256);
+MACE_BM_THREADPOOL_MACE_1D(512);
+MACE_BM_THREADPOOL_MACE_1D(1024);
+
+MACE_BM_THREADPOOL_MACE_2D(1, 64);
+MACE_BM_THREADPOOL_MACE_2D(1, 128);
+MACE_BM_THREADPOOL_MACE_2D(1, 256);
+MACE_BM_THREADPOOL_MACE_2D(1, 512);
+MACE_BM_THREADPOOL_MACE_2D(1, 1024);
+
+}  // namespace test
+}  // namespace ops
+}  // namespace mace
--
GitLab
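
Note on the Compute1D/Compute2D contract exercised by this patch: MACE's
utils::ThreadPool hands each worker a contiguous (start, end, step) slice of
the iteration range, which is why the benchmark lambdas loop as
"for (c = start0; c < end0; c += step0)". Below is a minimal standalone
sketch of that style of dispatch, assuming a fixed worker count and ignoring
MACE's cost-based splitting and core-affinity logic; Compute1DSketch is a
hypothetical helper for illustration, not MACE's implementation.

// Hypothetical Compute1D-style dispatcher (not MACE's implementation):
// splits [start, end) into one contiguous chunk per worker and runs
// func(chunk_start, chunk_end, step) on a plain std::thread.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

void Compute1DSketch(const std::function<void(size_t, size_t, size_t)> &func,
                     size_t start, size_t end, size_t step,
                     size_t num_threads) {
  // Iteration count, rounded up so a non-unit step is fully covered.
  const size_t count = (end - start + step - 1) / step;
  const size_t per_thread = (count + num_threads - 1) / num_threads;
  std::vector<std::thread> workers;
  for (size_t t = 0; t < num_threads; ++t) {
    const size_t chunk_start = start + t * per_thread * step;
    if (chunk_start >= end) break;  // fewer chunks than workers
    const size_t chunk_end = std::min(end, chunk_start + per_thread * step);
    workers.emplace_back(func, chunk_start, chunk_end, step);
  }
  for (auto &w : workers) w.join();
}

// Usage mirroring ThreadPoolBenchmark1D's lambda:
//   Compute1DSketch([&](size_t s, size_t e, size_t st) {
//     for (size_t c = s; c < e; c += st) { /* per-channel work */ }
//   }, 0, size, 1, 4);

One caveat when reading the OpenMP numbers: schedule(runtime) takes its
scheduling policy from the OMP_SCHEDULE environment variable (or from
omp_set_schedule), so that policy should be pinned before the OpenMP
benchmarks are run if the comparison is to be reproducible.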