提交 ef7dfb19 编写于 作者: 李寅

Add gemm comments

上级 cd8471c1
......@@ -18,6 +18,17 @@
#include "mace/core/tensor.h"
#include "mace/kernels/gemm.h"
/**
 * Gemm does fast matrix multiplications with batch.
 * It is optimized for arm64-v8 and armeabi-v7a using NEON.
 *
 * We adopt two-level tiling to make better use of the L1 cache and registers.
 * For register tiling, functions like GemmXYZ compute gemm for
 * matrix[X, Y] * matrix[Y, Z] with all data able to fit in registers.
 * For cache tiling, we try to compute one block of the multiplication with
 * the two input matrices and the one output matrix fitting in the L1 cache.
 */
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif
......
......@@ -21,9 +21,15 @@
#include "mace/core/types.h"
// Gemm function does fast matrix-matrix multiplications with batch.
// Gemv function does fast matrix-vector multiplications with batch.
namespace mace {
namespace kernels {
// Gemm calculates A[batch, height, K] dot B[batch, K, width] within each batch,
// and outputs to C[batch, height, width].
// height, K and width are the matrix dimension sizes after transpose (if any).
void Gemm(const float *A,
const float *B,
const index_t batch,
......@@ -44,6 +50,8 @@ void GemmRef(const float *A,
const bool transpose_a = false,
const bool transpose_b = false);
// Gemv calculates M[height, width] dot V[batch, width] within each batch of V,
// and outputs to out[batch, height].
void Gemv(const float *m_ptr,
const float *v_ptr,
const index_t batch,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册