提交 d7fbfee1 编写于 作者: B Bin Li 提交者: 李寅

Add sgemm benchmark

上级 285604f7
......@@ -22,6 +22,7 @@
#include "mace/core/testing/test_benchmark.h"
#include "mace/kernels/gemm.h"
#include "mace/kernels/gemmlowp_util.h"
#include "mace/kernels/sgemm.h"
namespace gemmlowp {
......@@ -107,6 +108,26 @@ void MatmulBenchmark_Mace(int iters, int m, int k, int n) {
}
}
void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) {
mace::testing::StopTiming();
std::vector<float> lhs(m * k);
std::vector<float> rhs(k * n);
std::vector<float> result(m * n);
kernels::MatrixMap<const float> matrix_lhs(m, k, RowMajor, lhs.data(), true);
kernels::MatrixMap<const float> matrix_rhs(k, n, RowMajor, rhs.data(), true);
kernels::MatrixMap<float> matrix_result(m, n, RowMajor, result.data());
kernels::SGemm sgemm;
sgemm(matrix_lhs, matrix_rhs, &matrix_result);
mace::testing::StartTiming();
while (iters--) {
sgemm(matrix_lhs, matrix_rhs, &matrix_result);
}
}
void MatmulBenchmark_Eigen(int iters, int m, int k, int n) {
mace::testing::StopTiming();
Eigen::MatrixXf lhs = Eigen::MatrixXf::Random(m, k);
......@@ -202,6 +223,7 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) {
#define MACE_BM_MATMUL(M, K, N) \
MACE_BM_MATMUL_FUNC(M, K, N, Mace, float); \
MACE_BM_MATMUL_FUNC(M, K, N, Mace_SGemm, float); \
MACE_BM_MATMUL_FUNC(M, K, N, Eigen, float); \
MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_uint8, uint8_t); \
MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_int32, uint8_t);
......@@ -215,15 +237,43 @@ MACE_BM_MATMUL(15, 384, 384);
MACE_BM_MATMUL(15, 384, 1536);
MACE_BM_MATMUL(15, 1536, 384);
MACE_BM_MATMUL(1, 384, 384);
MACE_BM_MATMUL(1, 384, 1536);
MACE_BM_MATMUL(1, 1536, 384);
MACE_BM_MATMUL(1, 384, 44678);
MACE_BM_MATMUL(1, 256, 256);
MACE_BM_MATMUL(1, 256, 1536);
MACE_BM_MATMUL(1, 1536, 256);
MACE_BM_MATMUL(256, 256, 1);
MACE_BM_MATMUL(1536, 256, 1);
MACE_BM_MATMUL(256, 1536, 1);
MACE_BM_MATMUL(29792, 256, 1);
MACE_BM_MATMUL(1, 256, 29792);
MACE_BM_MATMUL(2, 256, 256);
MACE_BM_MATMUL(2, 256, 1536);
MACE_BM_MATMUL(2, 1536, 256);
MACE_BM_MATMUL(3, 256, 256);
MACE_BM_MATMUL(3, 256, 1536);
MACE_BM_MATMUL(3, 1536, 256);
MACE_BM_MATMUL(4, 256, 256);
MACE_BM_MATMUL(4, 256, 1536);
MACE_BM_MATMUL(4, 1536, 256);
MACE_BM_MATMUL(8, 256, 256);
MACE_BM_MATMUL(8, 256, 1536);
MACE_BM_MATMUL(8, 1536, 256);
MACE_BM_MATMUL(10, 256, 256);
MACE_BM_MATMUL(10, 256, 1536);
MACE_BM_MATMUL(10, 1536, 256);
MACE_BM_MATMUL(15, 256, 256);
MACE_BM_MATMUL(15, 256, 1536);
MACE_BM_MATMUL(15, 1536, 256);
// Embedding size 128
MACE_BM_MATMUL(1, 128, 1536);
MACE_BM_MATMUL(1, 128, 44678);
// MobileNet
MACE_BM_MATMUL(128, 128, 3136);
MACE_BM_MATMUL(256, 256, 784);
MACE_BM_MATMUL(512, 512, 196);
MACE_BM_MATMUL(1024, 1024, 49);
} // namespace test
} // namespace kernels
} // namespace mace
......@@ -505,12 +505,12 @@ void SGemm::operator()(const PackedBlock<float> &lhs,
#pragma omp parallel for
for (index_t bw = 0; bw < remain_w; ++bw) {
index_t remain_h = height;
index_t block_h = 0;
const float *lhs_ptr = lhs_data;
float *res_ptr = result_data + height * bw;
#if defined(MACE_ENABLE_NEON)
index_t block_h = 0;
#if defined(__aarch64__)
block_h = remain_h >> 3;
remain_h -= (block_h << 3);
......@@ -555,12 +555,13 @@ void SGemm::operator()(const PackedBlock<float> &lhs,
for (index_t d = 0; d < remain_d; ++d) {
// 8.1.1
float32x4_t b0, b1;
float32x4_t a0 = vdupq_n_f32(rhs_ptr[0]);
b0 = vld1q_f32(lhs_ptr);
b1 = vld1q_f32(lhs_ptr + 4);
c0 += b0 * rhs_ptr[0];
c1 += b1 * rhs_ptr[0];
c0 = vfmaq_laneq_f32(c0, b0, a0, 0);
c1 = vfmaq_laneq_f32(c1, b1, a0, 0);
lhs_ptr += 8;
rhs_ptr += 1;
......@@ -611,10 +612,11 @@ void SGemm::operator()(const PackedBlock<float> &lhs,
for (index_t d = 0; d < remain_d; ++d) {
// 4.1.1
float32x4_t b0, b1;
float32x2_t a0 = vdup_n_f32(rhs_ptr[0]);
b0 = vld1q_f32(lhs_ptr);
c0 += b0 * rhs_ptr[0];
c0 = vmlaq_lane_f32(c0, b0, a0, 0);
lhs_ptr += 4;
rhs_ptr += 1;
......@@ -631,11 +633,12 @@ void SGemm::operator()(const PackedBlock<float> &lhs,
const float *rhs_ptr = rhs_data + depth * bw;
index_t remain_d = depth;
index_t block_d = 0;
float sum = 0.f;
#if defined(MACE_ENABLE_NEON)
index_t block_d = 0;
float32x4_t c0;
c0 = vdupq_n_f32(0.f);
......
......@@ -53,7 +53,7 @@ class MatrixMap {
// Returns a transposed view of this matrix: rows and columns are swapped
// and the storage major order is flipped (RowMajor <-> ColMajor) so the
// view still addresses the same underlying data.
// The is_const_ flag is propagated so a transposed view of a const matrix
// stays const; the previous code dropped the flag, silently yielding a
// mutable view (and the stray pre-fix return made the fix unreachable).
MatrixMap transpose() const {
  Major transpose_major = major_ == RowMajor ? ColMajor : RowMajor;
  return MatrixMap(col_, row_, transpose_major, data_, is_const_);
}
index_t row() const {
......
......@@ -82,10 +82,12 @@ TEST(SGemmTest, Pack) {
std::vector<float> data =
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36};
// For no-transpose lhs
TestPack(data,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
3, 4, Major::RowMajor, PackOrder::ColMajor);
#if defined(MACE_ENABLE_NEON)
TestPack(data,
{1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16},
4, 4, Major::RowMajor, PackOrder::ColMajor);
......@@ -93,14 +95,18 @@ TEST(SGemmTest, Pack) {
{1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19,
20},
5, 4, Major::RowMajor, PackOrder::ColMajor);
#if defined(__aarch64__)
TestPack(data,
{1, 5, 9, 13, 17, 21, 25, 29, 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11,
15, 19, 23, 27, 31, 4, 8, 12, 16, 20, 24, 28, 32, 33, 34, 35, 36},
9, 4, Major::RowMajor, PackOrder::ColMajor);
#endif
#endif
// For transpose-needed lhs
TestPack(data,
{1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12},
3, 4, Major::ColMajor, PackOrder::ColMajor);
#if defined(MACE_ENABLE_NEON)
TestPack(data,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
4, 4, Major::ColMajor, PackOrder::ColMajor);
......@@ -108,14 +114,18 @@ TEST(SGemmTest, Pack) {
{1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15,
20},
5, 4, Major::ColMajor, PackOrder::ColMajor);
#if defined(__aarch64__)
TestPack(data,
{1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21,
22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 9, 18, 27, 36},
9, 4, Major::ColMajor, PackOrder::ColMajor);
#endif
#endif
// For no-transpose rhs
TestPack(data,
{1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12},
4, 3, Major::RowMajor, PackOrder::RowMajor);
#if defined(MACE_ENABLE_NEON)
TestPack(data,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
4, 4, Major::RowMajor, PackOrder::RowMajor);
......@@ -123,10 +133,12 @@ TEST(SGemmTest, Pack) {
{1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15,
20},
4, 5, Major::RowMajor, PackOrder::RowMajor);
#endif
// For transpose-needed rhs
TestPack(data,
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
4, 3, Major::ColMajor, PackOrder::RowMajor);
#if defined(MACE_ENABLE_NEON)
TestPack(data,
{1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16},
4, 4, Major::ColMajor, PackOrder::RowMajor);
......@@ -134,6 +146,7 @@ TEST(SGemmTest, Pack) {
{1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19,
20},
4, 5, Major::ColMajor, PackOrder::RowMajor);
#endif
}
TEST(SGemmTest, UnPack) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册