提交 9ff29fbf 编写于 作者: L Liangliang He

Merge branch 'mthreadpool' into 'master'

Benchmark thread pool and openmp

See merge request !1051
// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// OpenMP and Mace thread pool should be benchmarked separately.
#include <algorithm>
#include <string>
#include <vector>
#include "mace/core/testing/test_benchmark.h"
#include "mace/utils/thread_pool.h"
// Expands to an empty inline-asm statement with a "memory" clobber. The
// expansion already ends with a semicolon, so a use site does not need one.
// NOTE(review): no use of this macro is visible in this file — confirm whether
// it is still needed.
#define MACE_EMPTY_STATEMENT asm volatile("":::"memory");
namespace mace {
namespace ops {
namespace test {
namespace {
// Number of floats held by each of the two benchmark buffers below.
const size_t kMaxSize = 100000000;
// Number of consecutive floats updated per channel (56 * 56); this is the trip
// count of the innermost loop in every benchmark in this file.
const size_t image_size = 56 * 56;
// Buffers shared by all benchmarks: output_data is accumulated into and
// bias_data is read once per channel index. Each is constructed with kMaxSize
// value-initialized floats.
// NOTE(review): the benchmarks index these buffers without any bounds check, so
// (size0 * size1) * image_size must stay <= kMaxSize for every registered
// size — verify when adding new sizes.
std::vector<float> output_data(kMaxSize), bias_data(kMaxSize);
// OpenMP flavour of the 1D bias-add benchmark.
//
// Repeats the following `iters` times: for every channel c in [0, size), add
// bias_data[c] to the image_size consecutive floats of output_data that start
// at offset c * image_size. The channel loop is split across OpenMP threads;
// schedule(runtime) defers the choice of schedule to the OpenMP runtime
// settings.
//
// iters: number of repetitions of the whole channel sweep.
// size:  number of channels processed per repetition.
void OpenMPBenchmark1D(int iters, int size) {
  for (; iters != 0; --iters) {
    // Fixed batch index, kept so the offset expression has the same shape as
    // in the 2D benchmark.
    const int b = 0;
#pragma omp parallel for schedule(runtime)
    for (int c = 0; c < size; ++c) {
      // Offset of the first float belonging to channel c.
      const size_t channel_base = (b * size + c) * image_size;
      for (size_t px = 0; px < image_size; ++px) {
        output_data[channel_base + px] += bias_data[c];
      }
    }
  }
}
// MACE thread pool flavour of the 1D bias-add benchmark.
//
// Repeats the following `iters` times: the channel range [0, size) with step 1
// is handed to ThreadPool::Compute1D, and for every channel c the callback
// receives, bias_data[c] is added to the image_size consecutive floats of
// output_data that start at offset c * image_size.
//
// iters: number of repetitions of the whole channel sweep.
// size:  number of channels processed per repetition.
void ThreadPoolBenchmark1D(int iters, int size) {
  // Keep construction and Init() of the pool out of the measured interval.
  mace::testing::StopTiming();
  // NOTE(review): the first argument (4) is presumably the thread count —
  // confirm against utils::ThreadPool's constructor.
  utils::ThreadPool thread_pool(4, CPUAffinityPolicy::AFFINITY_BIG_ONLY);
  thread_pool.Init();
  mace::testing::StartTiming();

  while (iters--) {
    const int b = 0;  // 'const' keyword affects performance
    // The previously declared `batch_size` local was never read (and narrowed
    // a size_t product into an int), so it has been dropped.
    thread_pool.Compute1D([&](size_t start0, size_t end0, size_t step0) {
      for (size_t c = start0; c < end0; c += step0) {
        for (size_t i = 0; i < image_size; ++i) {
          output_data[(b * size + c) * image_size + i] += bias_data[c];
        }
      }
    }, 0, size, 1);
  }
}
// OpenMP flavour of the 2D bias-add benchmark.
//
// Repeats the following `iters` times: for every (b, c) with b in [0, size0)
// and c in [0, size1), add bias_data[c] to the image_size consecutive floats
// of output_data that start at offset (b * size1 + c) * image_size. The two
// outer loops are collapsed into one OpenMP iteration space; schedule(runtime)
// defers the choice of schedule to the OpenMP runtime settings.
//
// iters: number of repetitions of the whole sweep.
// size0: extent of the outer (batch) dimension.
// size1: extent of the inner (channel) dimension.
void OpenMPBenchmark2D(int iters, int size0, int size1) {
  for (; iters != 0; --iters) {
    // collapse(2) needs the b and c loops to stay perfectly nested, so nothing
    // may be placed between their headers.
#pragma omp parallel for collapse(2) schedule(runtime)
    for (int b = 0; b < size0; ++b) {
      for (int c = 0; c < size1; ++c) {
        // Offset of the first float belonging to (b, c).
        const size_t channel_base = (b * size1 + c) * image_size;
        for (size_t px = 0; px < image_size; ++px) {
          // it seems like OpenMP optimize the following mac
          output_data[channel_base + px] += bias_data[c];
        }
      }
    }
  }
}
// MACE thread pool flavour of the 2D bias-add benchmark.
//
// Repeats the following `iters` times: the ranges [0, size0) and [0, size1),
// both with step 1, are handed to ThreadPool::Compute2D, and for every (b, c)
// pair the callback receives, bias_data[c] is added to the image_size
// consecutive floats of output_data starting at (b * size1 + c) * image_size.
//
// iters: number of repetitions of the whole sweep.
// size0: extent of the outer (batch) dimension.
// size1: extent of the inner (channel) dimension.
void ThreadPoolBenchmark2D(int iters, int size0, int size1) {
  // Keep construction and Init() of the pool out of the measured interval.
  mace::testing::StopTiming();
  utils::ThreadPool thread_pool(4, CPUAffinityPolicy::AFFINITY_BIG_ONLY);
  thread_pool.Init();
  mace::testing::StartTiming();

  for (; iters != 0; --iters) {
    thread_pool.Compute2D(
        [&](size_t batch_begin, size_t batch_end, size_t batch_step,
            size_t chan_begin, size_t chan_end, size_t chan_step) {
          for (size_t b = batch_begin; b < batch_end; b += batch_step) {
            for (size_t c = chan_begin; c < chan_end; c += chan_step) {
              // Offset of the first float belonging to (b, c).
              const size_t channel_base = (b * size1 + c) * image_size;
              for (size_t px = 0; px < image_size; ++px) {
                output_data[channel_base + px] += bias_data[c];
              }
            }
          }
        },
        0, size0, 1, 0, size1, 1);
  }
}
} // namespace
// Defines the benchmark function MACE_BM_THREADPOOL_OPENMP_1D_<SIZE> and hands
// it to MACE_BENCHMARK. The function reports iters * SIZE as both the MAC count
// and (times sizeof(float)) the byte count, then runs OpenMPBenchmark1D with
// SIZE channels.
// NOTE(review): OpenMPBenchmark1D updates image_size floats per channel, but
// the reported counts do not include that factor — confirm this is intended.
#define MACE_BM_THREADPOOL_OPENMP_1D(SIZE)                              \
  static void MACE_BM_THREADPOOL_OPENMP_1D_##SIZE(int iters) {          \
    const int64_t item_count = static_cast<int64_t>(iters) * SIZE;      \
    mace::testing::MacsProcessed(item_count);                           \
    mace::testing::BytesProcessed(item_count * sizeof(float));          \
    OpenMPBenchmark1D(iters, SIZE);                                     \
  }                                                                     \
  MACE_BENCHMARK(MACE_BM_THREADPOOL_OPENMP_1D_##SIZE)
// Defines the benchmark function MACE_BM_THREADPOOL_MACE_1D_<SIZE> and hands it
// to MACE_BENCHMARK. The function reports iters * SIZE as both the MAC count
// and (times sizeof(float)) the byte count, then runs ThreadPoolBenchmark1D
// with SIZE channels.
// NOTE(review): ThreadPoolBenchmark1D updates image_size floats per channel,
// but the reported counts do not include that factor — confirm this is
// intended.
#define MACE_BM_THREADPOOL_MACE_1D(SIZE)                                \
  static void MACE_BM_THREADPOOL_MACE_1D_##SIZE(int iters) {            \
    const int64_t item_count = static_cast<int64_t>(iters) * SIZE;      \
    mace::testing::MacsProcessed(item_count);                           \
    mace::testing::BytesProcessed(item_count * sizeof(float));          \
    ThreadPoolBenchmark1D(iters, SIZE);                                 \
  }                                                                     \
  MACE_BENCHMARK(MACE_BM_THREADPOOL_MACE_1D_##SIZE)
// Defines the benchmark function MACE_BM_THREADPOOL_OPENMP_2D_<SIZE0>_<SIZE1>
// and hands it to MACE_BENCHMARK. The function reports iters * SIZE0 * SIZE1 as
// both the MAC count and (times sizeof(float)) the byte count, then runs
// OpenMPBenchmark2D with extents SIZE0 x SIZE1.
// NOTE(review): OpenMPBenchmark2D updates image_size floats per (b, c) pair,
// but the reported counts do not include that factor — confirm this is
// intended.
#define MACE_BM_THREADPOOL_OPENMP_2D(SIZE0, SIZE1)                            \
  static void MACE_BM_THREADPOOL_OPENMP_2D_##SIZE0##_##SIZE1(int iters) {     \
    const int64_t item_count = static_cast<int64_t>(iters) * SIZE0 * SIZE1;   \
    mace::testing::MacsProcessed(item_count);                                 \
    mace::testing::BytesProcessed(item_count * sizeof(float));                \
    OpenMPBenchmark2D(iters, SIZE0, SIZE1);                                   \
  }                                                                           \
  MACE_BENCHMARK(MACE_BM_THREADPOOL_OPENMP_2D_##SIZE0##_##SIZE1)
// Defines the benchmark function MACE_BM_THREADPOOL_MACE_2D_<SIZE0>_<SIZE1> and
// hands it to MACE_BENCHMARK. The function reports iters * SIZE0 * SIZE1 as
// both the MAC count and (times sizeof(float)) the byte count, then runs
// ThreadPoolBenchmark2D with extents SIZE0 x SIZE1.
// NOTE(review): ThreadPoolBenchmark2D updates image_size floats per (b, c)
// pair, but the reported counts do not include that factor — confirm this is
// intended.
#define MACE_BM_THREADPOOL_MACE_2D(SIZE0, SIZE1)                              \
  static void MACE_BM_THREADPOOL_MACE_2D_##SIZE0##_##SIZE1(int iters) {       \
    const int64_t item_count = static_cast<int64_t>(iters) * SIZE0 * SIZE1;   \
    mace::testing::MacsProcessed(item_count);                                 \
    mace::testing::BytesProcessed(item_count * sizeof(float));                \
    ThreadPoolBenchmark2D(iters, SIZE0, SIZE1);                               \
  }                                                                           \
  MACE_BENCHMARK(MACE_BM_THREADPOOL_MACE_2D_##SIZE0##_##SIZE1)
// OpenMP and Mace threadpool need to be benchmarked separately.
// Each line below expands one of the MACE_BM_THREADPOOL_* macros, which defines
// a benchmark function named after its arguments and passes it to
// MACE_BENCHMARK.
// OpenMP, 1D: 64 to 1024 channels.
MACE_BM_THREADPOOL_OPENMP_1D(64);
MACE_BM_THREADPOOL_OPENMP_1D(128);
MACE_BM_THREADPOOL_OPENMP_1D(256);
MACE_BM_THREADPOOL_OPENMP_1D(512);
MACE_BM_THREADPOOL_OPENMP_1D(1024);
// OpenMP, 2D: outer extent 1, inner extent 64 to 1024.
MACE_BM_THREADPOOL_OPENMP_2D(1, 64);
MACE_BM_THREADPOOL_OPENMP_2D(1, 128);
MACE_BM_THREADPOOL_OPENMP_2D(1, 256);
MACE_BM_THREADPOOL_OPENMP_2D(1, 512);
MACE_BM_THREADPOOL_OPENMP_2D(1, 1024);
// Mace thread pool, 1D: same channel counts as the OpenMP 1D set.
MACE_BM_THREADPOOL_MACE_1D(64);
MACE_BM_THREADPOOL_MACE_1D(128);
MACE_BM_THREADPOOL_MACE_1D(256);
MACE_BM_THREADPOOL_MACE_1D(512);
MACE_BM_THREADPOOL_MACE_1D(1024);
// Mace thread pool, 2D: same extents as the OpenMP 2D set.
MACE_BM_THREADPOOL_MACE_2D(1, 64);
MACE_BM_THREADPOOL_MACE_2D(1, 128);
MACE_BM_THREADPOOL_MACE_2D(1, 256);
MACE_BM_THREADPOOL_MACE_2D(1, 512);
MACE_BM_THREADPOOL_MACE_2D(1, 1024);
} // namespace test
} // namespace ops
} // namespace mace
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册