From ef7dfb19c5bc9d633dd44f82a3992a85249da170 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= Date: Fri, 27 Jul 2018 16:40:56 +0800 Subject: [PATCH] Add gemm comments --- mace/kernels/gemm.cc | 11 +++++++++++ mace/kernels/gemm.h | 8 ++++++++ 2 files changed, 19 insertions(+) diff --git a/mace/kernels/gemm.cc b/mace/kernels/gemm.cc index 0e05106f..2003d3ec 100644 --- a/mace/kernels/gemm.cc +++ b/mace/kernels/gemm.cc @@ -18,6 +18,17 @@ #include "mace/core/tensor.h" #include "mace/kernels/gemm.h" +/** + * Gemm does fast batched matrix multiplication. + * It is optimized for arm64-v8a and armeabi-v7a using NEON. + * + * We adopt two-level tiling to make better use of the L1 cache and registers. + * For register tiling, functions like GemmXYZ compute GEMM for + * matrix[X, Y] * matrix[Y, Z] with all data fitting in registers. + * For cache tiling, we try to compute one block of the multiplication so that + * the two input matrices and the one output matrix fit in the L1 cache. + */ + #if defined(MACE_ENABLE_NEON) #include <arm_neon.h> #endif diff --git a/mace/kernels/gemm.h b/mace/kernels/gemm.h index ade517a0..f6ea31c4 100644 --- a/mace/kernels/gemm.h +++ b/mace/kernels/gemm.h @@ -21,9 +21,15 @@ #include "mace/core/types.h" +// The Gemm function does fast batched matrix-matrix multiplication. +// The Gemv function does fast batched matrix-vector multiplication. + namespace mace { namespace kernels { +// Gemm calculates A[batch, height, K] dot B[batch, K, width] within each batch, +// and outputs to C[batch, height, width]. +// height, K, width correspond to the matrix dimension sizes after transpose (if any). void Gemm(const float *A, const float *B, const index_t batch, @@ -44,6 +50,8 @@ void GemmRef(const float *A, const bool transpose_a = false, const bool transpose_b = false); +// Gemv calculates M[height, width] dot V[batch, height] within each batch of V, +// and outputs to out[batch, width]. void Gemv(const float *m_ptr, const float *v_ptr, const index_t batch, -- GitLab