From d7fbfee14ca7885f954afe1a3292555ef8e0c7d7 Mon Sep 17 00:00:00 2001 From: Bin Li Date: Mon, 3 Sep 2018 11:18:29 +0800 Subject: [PATCH] Add sgemm benchmark --- mace/kernels/matmul_benchmark.cc | 58 +++++++++++++++++++++++++++++--- mace/kernels/sgemm.cc | 13 ++++--- mace/kernels/sgemm.h | 2 +- mace/kernels/sgemm_test.cc | 13 +++++++ 4 files changed, 76 insertions(+), 10 deletions(-) diff --git a/mace/kernels/matmul_benchmark.cc b/mace/kernels/matmul_benchmark.cc index d109de47..0b789456 100644 --- a/mace/kernels/matmul_benchmark.cc +++ b/mace/kernels/matmul_benchmark.cc @@ -22,6 +22,7 @@ #include "mace/core/testing/test_benchmark.h" #include "mace/kernels/gemm.h" #include "mace/kernels/gemmlowp_util.h" +#include "mace/kernels/sgemm.h" namespace gemmlowp { @@ -107,6 +108,26 @@ void MatmulBenchmark_Mace(int iters, int m, int k, int n) { } } +void MatmulBenchmark_Mace_SGemm(int iters, int m, int k, int n) { + mace::testing::StopTiming(); + std::vector lhs(m * k); + std::vector rhs(k * n); + std::vector result(m * n); + + kernels::MatrixMap matrix_lhs(m, k, RowMajor, lhs.data(), true); + kernels::MatrixMap matrix_rhs(k, n, RowMajor, rhs.data(), true); + kernels::MatrixMap matrix_result(m, n, RowMajor, result.data()); + + kernels::SGemm sgemm; + + sgemm(matrix_lhs, matrix_rhs, &matrix_result); + + mace::testing::StartTiming(); + while (iters--) { + sgemm(matrix_lhs, matrix_rhs, &matrix_result); + } +} + void MatmulBenchmark_Eigen(int iters, int m, int k, int n) { mace::testing::StopTiming(); Eigen::MatrixXf lhs = Eigen::MatrixXf::Random(m, k); @@ -202,6 +223,7 @@ void MatmulBenchmark_gemmlowp_int32(int iters, int rows, int depth, int cols) { #define MACE_BM_MATMUL(M, K, N) \ MACE_BM_MATMUL_FUNC(M, K, N, Mace, float); \ + MACE_BM_MATMUL_FUNC(M, K, N, Mace_SGemm, float); \ MACE_BM_MATMUL_FUNC(M, K, N, Eigen, float); \ MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_uint8, uint8_t); \ MACE_BM_MATMUL_FUNC(M, K, N, gemmlowp_int32, uint8_t); @@ -215,15 +237,43 @@ MACE_BM_MATMUL(15, 384, 384); MACE_BM_MATMUL(15, 384, 1536); MACE_BM_MATMUL(15, 1536, 384); -MACE_BM_MATMUL(1, 384, 384); -MACE_BM_MATMUL(1, 384, 1536); -MACE_BM_MATMUL(1, 1536, 384); -MACE_BM_MATMUL(1, 384, 44678); +MACE_BM_MATMUL(1, 256, 256); +MACE_BM_MATMUL(1, 256, 1536); +MACE_BM_MATMUL(1, 1536, 256); +MACE_BM_MATMUL(256, 256, 1); +MACE_BM_MATMUL(1536, 256, 1); +MACE_BM_MATMUL(256, 1536, 1); +MACE_BM_MATMUL(29792, 256, 1); +MACE_BM_MATMUL(1, 256, 29792); +MACE_BM_MATMUL(2, 256, 256); +MACE_BM_MATMUL(2, 256, 1536); +MACE_BM_MATMUL(2, 1536, 256); +MACE_BM_MATMUL(3, 256, 256); +MACE_BM_MATMUL(3, 256, 1536); +MACE_BM_MATMUL(3, 1536, 256); +MACE_BM_MATMUL(4, 256, 256); +MACE_BM_MATMUL(4, 256, 1536); +MACE_BM_MATMUL(4, 1536, 256); +MACE_BM_MATMUL(8, 256, 256); +MACE_BM_MATMUL(8, 256, 1536); +MACE_BM_MATMUL(8, 1536, 256); +MACE_BM_MATMUL(10, 256, 256); +MACE_BM_MATMUL(10, 256, 1536); +MACE_BM_MATMUL(10, 1536, 256); +MACE_BM_MATMUL(15, 256, 256); +MACE_BM_MATMUL(15, 256, 1536); +MACE_BM_MATMUL(15, 1536, 256); // Embedding size 128 MACE_BM_MATMUL(1, 128, 1536); MACE_BM_MATMUL(1, 128, 44678); +// MobileNet +MACE_BM_MATMUL(128, 128, 3136); +MACE_BM_MATMUL(256, 256, 784); +MACE_BM_MATMUL(512, 512, 196); +MACE_BM_MATMUL(1024, 1024, 49); + } // namespace test } // namespace kernels } // namespace mace diff --git a/mace/kernels/sgemm.cc b/mace/kernels/sgemm.cc index d1c3ed07..5cf3264e 100644 --- a/mace/kernels/sgemm.cc +++ b/mace/kernels/sgemm.cc @@ -505,12 +505,12 @@ void SGemm::operator()(const PackedBlock &lhs, #pragma omp parallel for for (index_t bw = 0; bw < remain_w; ++bw) { index_t remain_h = height; - index_t block_h = 0; const float *lhs_ptr = lhs_data; float *res_ptr = result_data + height * bw; #if defined(MACE_ENABLE_NEON) + index_t block_h = 0; #if defined(__aarch64__) block_h = remain_h >> 3; remain_h -= (block_h << 3); @@ -555,12 +555,13 @@ void SGemm::operator()(const PackedBlock &lhs, for (index_t d = 0; d < remain_d; ++d) { // 8.1.1 float32x4_t b0, b1; + float32x4_t a0 = vdupq_n_f32(rhs_ptr[0]); b0 = vld1q_f32(lhs_ptr); b1 = vld1q_f32(lhs_ptr + 4); - c0 += b0 * rhs_ptr[0]; - c1 += b1 * rhs_ptr[0]; + c0 = vfmaq_laneq_f32(c0, b0, a0, 0); + c1 = vfmaq_laneq_f32(c1, b1, a0, 0); lhs_ptr += 8; rhs_ptr += 1; @@ -611,10 +612,11 @@ void SGemm::operator()(const PackedBlock &lhs, for (index_t d = 0; d < remain_d; ++d) { // 4.1.1 float32x4_t b0, b1; + float32x2_t a0 = vdup_n_f32(rhs_ptr[0]); b0 = vld1q_f32(lhs_ptr); - c0 += b0 * rhs_ptr[0]; + c0 = vmlaq_lane_f32(c0, b0, a0, 0); lhs_ptr += 4; rhs_ptr += 1; @@ -631,11 +633,12 @@ void SGemm::operator()(const PackedBlock &lhs, const float *rhs_ptr = rhs_data + depth * bw; index_t remain_d = depth; - index_t block_d = 0; float sum = 0.f; #if defined(MACE_ENABLE_NEON) + index_t block_d = 0; + float32x4_t c0; c0 = vdupq_n_f32(0.f); diff --git a/mace/kernels/sgemm.h b/mace/kernels/sgemm.h index 02562952..263aed80 100644 --- a/mace/kernels/sgemm.h +++ b/mace/kernels/sgemm.h @@ -53,7 +53,7 @@ class MatrixMap { MatrixMap transpose() const { Major transpose_major = major_ == RowMajor ? ColMajor : RowMajor; - return MatrixMap(col_, row_, transpose_major, data_); + return MatrixMap(col_, row_, transpose_major, data_, is_const_); } index_t row() const { diff --git a/mace/kernels/sgemm_test.cc b/mace/kernels/sgemm_test.cc index 55b88e0e..095ea1b1 100644 --- a/mace/kernels/sgemm_test.cc +++ b/mace/kernels/sgemm_test.cc @@ -82,10 +82,12 @@ TEST(SGemmTest, Pack) { std::vector data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36}; + // For no-transpose lhs TestPack(data, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 3, 4, Major::RowMajor, PackOrder::ColMajor); +#if defined(MACE_ENABLE_NEON) TestPack(data, {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16}, 4, 4, Major::RowMajor, PackOrder::ColMajor); @@ -93,14 +95,18 @@ TEST(SGemmTest, Pack) { {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19, 20}, 5, 4, Major::RowMajor, PackOrder::ColMajor); +#if defined(__aarch64__) TestPack(data, {1, 5, 9, 13, 17, 21, 25, 29, 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31, 4, 8, 12, 16, 20, 24, 28, 32, 33, 34, 35, 36}, 9, 4, Major::RowMajor, PackOrder::ColMajor); +#endif +#endif // For transpose-needed lhs TestPack(data, {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12}, 3, 4, Major::ColMajor, PackOrder::ColMajor); +#if defined(MACE_ENABLE_NEON) TestPack(data, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 4, 4, Major::ColMajor, PackOrder::ColMajor); @@ -108,14 +114,18 @@ TEST(SGemmTest, Pack) { {1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15, 20}, 5, 4, Major::ColMajor, PackOrder::ColMajor); +#if defined(__aarch64__) TestPack(data, {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 9, 18, 27, 36}, 9, 4, Major::ColMajor, PackOrder::ColMajor); +#endif +#endif // For no-transpose rhs TestPack(data, {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12}, 4, 3, Major::RowMajor, PackOrder::RowMajor); +#if defined(MACE_ENABLE_NEON) TestPack(data, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, 4, 4, Major::RowMajor, PackOrder::RowMajor); @@ -123,10 +133,12 @@ TEST(SGemmTest, Pack) { {1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 5, 10, 15, 20}, 4, 5, Major::RowMajor, PackOrder::RowMajor); +#endif // For transpose-needed rhs TestPack(data, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, 4, 3, Major::ColMajor, PackOrder::RowMajor); +#if defined(MACE_ENABLE_NEON) TestPack(data, {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16}, 4, 4, Major::ColMajor, PackOrder::RowMajor); @@ -134,6 +146,7 @@ TEST(SGemmTest, Pack) { {1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16, 17, 18, 19, 20}, 4, 5, Major::ColMajor, PackOrder::RowMajor); +#endif } TEST(SGemmTest, UnPack) { -- GitLab