Optimize Gemm: fuse add relu batchnorm op, dynamic block, add AddDot4x8,...

Optimize Gemm: fuse add relu batchnorm op, dynamic block, add AddDot4x8, optimize memory write back.

Optimize Gemm: fuse add relu batchnorm op, dynamic block, add AddDot4x8,...
Optimize Gemm: fuse add relu batchnorm op, dynamic block, add AddDot4x8, optimize memory write back.
2b5c8381 · zhaojiaying01 · f6d34bd0 · 2b5c8381 · 2b5c8381 · 2b5c8381
4 changed file
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -22,9 +22,14 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-alignas(64) float packedA[MC * KC];
+int MC = 0;
-alignas(64) float packedB[KC * NC];
+int KC = 0;
-alignas(64) float ab[MR * NR];
+int NC = 0;
+float *packedA;
+float *packedB;
+float *packedC;
+float *zero;
 // 将A矩阵分块复制到连续内存(ColMajor)
 void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                 float *buffer) {
@@ -55,28 +60,39 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
 // 将A矩阵分块复制到连续内存(RowMajor)
 void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
                  float *buffer) {
-  int i, j;
+  const float *a0, *a1, *a2, *a3;
-  const float *Ai, *Ai1, *Ai2, *Ai3;
+  for (int i = 0; i < m - m_tail; i += MR) {
-  for (i = 0; i < m - m_tail; i += MR) {
+    a0 = A + i * lda;
-    Ai = &A(i, 0);
+    a1 = A + (i + 1) * lda;
-    Ai1 = &A(i + 1, 0);
+    a2 = A + (i + 2) * lda;
-    Ai2 = &A(i + 2, 0);
+    a3 = A + (i + 3) * lda;
-    Ai3 = &A(i + 3, 0);
    for (int j = 0; j < k; ++j) {
-      *buffer++ = *Ai++;
+      *buffer++ = *a0++;
-      *buffer++ = *Ai1++;
+      *buffer++ = *a1++;
-      *buffer++ = *Ai2++;
+      *buffer++ = *a2++;
-      *buffer++ = *Ai3++;
+      *buffer++ = *a3++;
    }
  }
+  int i = m - m_tail;
+  a0 = &A(i, 0);
+  a1 = a0 + lda;
+  a2 = a0 + 2 * lda;
+  a3 = a0 + 3 * lda;
  if (m_tail != 0) {
-    for (j = 0; j < k; ++j) {
+    if (m_tail <= 3) {
-      for (i = m - m_tail; i < m; ++i) {
+      a3 = zero;
-        *buffer++ = A(i, j);
    }
-      for (i = m; i < m + (MR - m_tail); ++i) {
+    if (m_tail <= 2) {
-        *buffer++ = 0;
+      a2 = zero;
    }
+    if (m_tail <= 1) {
+      a1 = zero;
+    }
+    for (int j = 0; j < k; ++j) {
+      *buffer++ = *a0++;
+      *buffer++ = *a1++;
+      *buffer++ = *a2++;
+      *buffer++ = *a3++;
    }
  }
 }
@@ -113,35 +129,24 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
 // 将B矩阵分块复制到连续内存(RowMajor)
 void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
                  float *buffer) {
-  int i, j;
+  const float *b0;
-  const float *Bij;
+  for (int j = 0; j < n - n_tail; j += NR) {
-  for (j = 0; j < n - n_tail; j += NR) {
+    for (int i = 0; i < k; ++i) {
-#ifdef ARMV7
+      b0 = &B(i, j);
-    for (i = 0; i < k; ++i) {
-      Bij = &B(i, j);
      asm volatile(
-          "vld1.32    {q0}, [%[Bij]]        \n\t"
+          "pld        [%[b0]]               \n\t"
-          "vst1.32    {q0}, [%[buffer]]!    \n\t"
+          "vld1.32    {q0, q1}, [%[b0]]         \n\t"
+          "vst1.32    {q0, q1}, [%[buffer]]!    \n\t"
          : [buffer] "+r"(buffer)
-          : [Bij] "r"(Bij)
+          : [b0] "r"(b0)
-          : "memory", "q0");
+          : "memory", "q0", "q0");
-    }
-#else
-    for (i = 0; i < k; ++i) {
-      Bij = &B(i, j);
-      *buffer++ = *Bij;
-      *buffer++ = *(Bij + 1);
-      *buffer++ = *(Bij + 2);
-      *buffer++ = *(Bij + 3);
    }
-#endif
  }
  if (n_tail != 0) {
-    for (i = 0; i < k; ++i) {
+    for (int i = 0; i < k; ++i) {
-      Bij = &B(i, n - n_tail);
+      b0 = &B(i, n - n_tail);
      for (int j = n - n_tail; j < n; ++j) {
-        *buffer++ = *Bij++;
+        *buffer++ = *b0++;
      }
      for (int j = n; j < n + (NR - n_tail); ++j) {
        *buffer++ = 0;
@@ -151,118 +156,53 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
 }
 // 分块矩阵乘法
-void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
+void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
-                 const float *B, int ldb, float beta, float *C, int ldc,
+                 float beta, float *c, float *C, int ldc, bool relu) {
-                 int first_time) {
+  for (int j = 0; j < nc; j += NR) {
-  int m_block = (m + MR - 1) / MR * MR;
+    for (int i = 0; i < mc; i += MR) {
-  int n_block = (n + NR - 1) / NR * NR;
+      // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
-  int m_tail = m % MR;
-  int n_tail = n % NR;
-  if (first_time) {
-    PackMatrixB_(k, n, n_tail, B, ldb, packedB);
-  }
-  PackMatrixA_(m, k, m_tail, A, lda, packedA);
-  int i, j, mc, nc;
-  // B 取 4 列, 打包预热
-  for (j = 0; j < n_block; j += NR) {
-    nc = (n - j) < NR ? n_tail : NR;
-    // A 取 4 行，打包预热
-    for (i = 0; i < m_block; i += MR) {
-      mc = (m - i) < MR ? m_tail : MR;
-      AddDot4x4(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
-                &C(i, j), ldc, mc, nc);
    }
  }
-}
-// 分块矩阵乘法
-void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda,
-                      const float *B, int ldb, float beta, float *C, int ldc,
-                      int first_time, bool relu = false) {
-  int m_block = (m + MR - 1) / MR * MR;
-  int n_block = (n + NR - 1) / NR * NR;
-  int m_tail = m % MR;
-  int n_tail = n % NR;
-  if (first_time) {
+  if (alpha != 1) {
-    PackMatrixB_(k, n, n_tail, B, ldb, packedB);
+    WriteWithAlphaBeta(mc, nc, c, C, ldc);
+    return;
  }
-  PackMatrixA_(m, k, m_tail, A, lda, packedA);
+  if (beta == 0) {
+    WriteBasic(mc, nc, c, C, ldc);
-  int i, j, mc, nc;
+    return;
+  }
-  // B 取 4 列, 打包预热
+  if (beta == 1 && !relu) {
-  for (j = 0; j < n_block; j += NR) {
+    WriteWithAdd(mc, nc, c, C, ldc);
-    nc = (n - j) < NR ? n_tail : NR;
+    return;
-    // A 取 4 行，打包预热
-    for (i = 0; i < m_block; i += MR) {
-      mc = (m - i) < MR ? m_tail : MR;
-      AddDot4x4_relu(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta,
-                     &C(i, j), ldc, mc, nc, relu);
  }
+  if (beta == 1 && relu) {
+    WriteWithAddRelu(mc, nc, c, C, ldc);
+    return;
  }
 }
-// 计算一个更小的 4 * 4 的 C 矩阵分块
+// 分块矩阵乘法
-#if defined(IOS)
+void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
-void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
+                       const float *b, float beta, float *c, float *C, int ldc,
-               int ldb, float beta, float *C, int ldc, int mc, int nc) {
+                       bool relu, float *new_scale, float *new_bias) {
-  // init C
+  for (int j = 0; j < nc; j += NR) {
-  float32x4_t cv0 = vdupq_n_f32(0.0);
+    for (int i = 0; i < mc; i += MR) {
-  float32x4_t cv1 = vdupq_n_f32(0.0);
+      // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
-  float32x4_t cv2 = vdupq_n_f32(0.0);
+      AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
-  float32x4_t cv3 = vdupq_n_f32(0.0);
-  float32x4_t av;
-  float32x4_t bv;
-  float32x2_t av01;
-  float32x2_t av23;
-  for (int p = 0; p < k; p += 1) {
-    av = vld1q_f32(a);
-    bv = vld1q_f32(b);
-    av01 = vget_low_f32(av);
-    cv0 = vmlaq_lane_f32(cv0, bv, av01, 0);
-    cv1 = vmlaq_lane_f32(cv1, bv, av01, 1);
-    av23 = vget_high_f32(av);
-    cv2 = vmlaq_lane_f32(cv2, bv, av23, 0);
-    cv3 = vmlaq_lane_f32(cv3, bv, av23, 1);
-    a += MR;
-    b += NR;
-  }
-  float32x4x4_t cv = {cv0, cv1, cv2, cv3};
-  int i, j;
-  for (i = 0; i < mc; ++i) {
-    for (j = 0; j < nc; ++j) {
-      if (beta == 0.0) {
-        C(i, j) = 0.0;
-      } else if (beta != 1.0) {
-        C(i, j) *= beta;
-      }
-      if (j == 0) {
-        C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0);
-      } else if (j == 1) {
-        C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1);
-      } else if (j == 2) {
-        C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2);
-      } else if (j == 3) {
-        C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3);
    }
  }
+  if (relu) {
+    WriteWithBnRelu(mc, nc, c, C, ldc, new_scale, new_bias);
+  } else {
+    WriteWithBn(mc, nc, c, C, ldc, new_scale, new_bias);
  }
 }
-void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
+#if defined(IOS)
-                    int ldb, float beta, float *C, int ldc, int mc, int nc,
+void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
-                    bool relu = false) {
  // init C
  float32x4_t cv0 = vdupq_n_f32(0.0);
  float32x4_t cv1 = vdupq_n_f32(0.0);
@@ -307,183 +247,22 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
      } else if (j == 3) {
        C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3);
      }
-      if (C(i, j) < 0) {
-        C(i, j) = 0;
-      }
    }
  }
 }
+}  // namespace math
 #elif defined(ARMV7)
-void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
+void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
-               int ldb, float beta, float *C, int ldc, int mc, int nc) {
+  const float *a_ptr, *b_ptr;
-  int kc1 = k / 4, kc2 = k % 4;
+  a_ptr = a;
-  int bytes_ldc = 4 * ldc;
+  b_ptr = b;
-  int flag_alpha = (alpha == 1.0) ? 1 : 2;
+  int kc1 = k / 4;
-  int flag_beta;
+  int kc2 = k % 4;
-  if (beta == 0.0) {
+  int step = 4 * ldc;
-    flag_beta = 0;
-  } else if (beta == 1.0) {
-    flag_beta = 1;
-  } else {
-    flag_beta = 2;
-  }
-  asm volatile(
-      "pld        [%[a]]              \n\t"
-      "pld        [%[b]]              \n\t"
-      "vmov.f32   q10,    #0.0        \n\t"
-      "vmov.f32   q11,    #0.0        \n\t"
-      "vmov.f32   q12,    #0.0        \n\t"
-      "vmov.f32   q13,    #0.0        \n\t"
-      "subs       %[kc1], %[kc1], #1  \n\t"
-      "blt        end_kc1_%=          \n\t"
-      "loop_kc1_%=:                   \n\t"
-      "pld        [%[a], #64]         \n\t"
-      "pld        [%[b], #64]         \n\t"
-      "vld1.32    {q0, q1}, [%[a]]!   \n\t"
-      "vld1.32    {q2, q3}, [%[b]]!   \n\t"
-      "vmla.f32   q10, q2, d0[0]      \n\t"
-      "vmla.f32   q11, q2, d0[1]      \n\t"
-      "vmla.f32   q12, q2, d1[0]      \n\t"
-      "vmla.f32   q13, q2, d1[1]      \n\t"
-      "vmla.f32   q10, q3, d2[0]      \n\t"
-      "vmla.f32   q11, q3, d2[1]      \n\t"
-      "vmla.f32   q12, q3, d3[0]      \n\t"
-      "vmla.f32   q13, q3, d3[1]      \n\t"
-      "vld1.32    {q0, q1}, [%[a]]!   \n\t"
-      "vld1.32    {q2, q3}, [%[b]]!   \n\t"
-      "vmla.f32   q10, q2, d0[0]      \n\t"
-      "vmla.f32   q11, q2, d0[1]      \n\t"
-      "vmla.f32   q12, q2, d1[0]      \n\t"
-      "vmla.f32   q13, q2, d1[1]      \n\t"
-      "vmla.f32   q10, q3, d2[0]      \n\t"
-      "vmla.f32   q11, q3, d2[1]      \n\t"
-      "vmla.f32   q12, q3, d3[0]      \n\t"
-      "vmla.f32   q13, q3, d3[1]      \n\t"
-      "subs       %[kc1], %[kc1], #1  \n\t"
-      "bge        loop_kc1_%=         \n\t"
-      "end_kc1_%=:                    \n\t"
-      "subs       %[kc2], %[kc2], #1  \n\t"
-      "blt        end_kc2_%=          \n\t"
-      "loop_kc2_%=:                   \n\t"
-      "vld1.32    {q0}, [%[a]]!       \n\t"
-      "vld1.32    {q1}, [%[b]]!       \n\t"
-      "vmla.f32   q10, q1, d0[0]      \n\t"
-      "vmla.f32   q11, q1, d0[1]      \n\t"
-      "vmla.f32   q12, q1, d1[0]      \n\t"
-      "vmla.f32   q13, q1, d1[1]      \n\t"
-      "subs       %[kc2], %[kc2], #1  \n\t"
-      "bge        loop_kc2_%=         \n\t"
-      "end_kc2_%=:                    \n\t"
-      "cmp        %[mc],      #4      \n\t"
-      "bne        temp_%=             \n\t"
-      "cmp        %[nc],      #4      \n\t"
-      "bne        temp_%=             \n\t"
-      "vmov.f32   d8[0],    %[alpha]  \n\t"
-      "vmov.f32   d8[1],    %[beta]   \n\t"
-      "cmp        %[flag_alpha],  #1  \n\t"
-      "bne        alpha_%=            \n\t"
-      "alpha_%=:                      \n\t"
-      "vmul.f32   q10, q10, d8[0]     \n\t"
-      "vmul.f32   q11, q11, d8[0]     \n\t"
-      "vmul.f32   q12, q12, d8[0]     \n\t"
-      "vmul.f32   q13, q13, d8[0]     \n\t"
-      "beta_%=:                       \n\t"
-      "cmp        %[flag_beta],   #0  \n\t"
-      "beq        memory_%=           \n\t"
-      "mov        r4,     %[C]        \n\t"
-      "mov        r6,     %[bytes_ldc]\n\t"
-      "vld1.32    {q0}, [r4], r6      \n\t"
-      "vld1.32    {q1}, [r4], r6      \n\t"
-      "vld1.32    {q2}, [r4], r6      \n\t"
-      "vld1.32    {q3}, [r4]          \n\t"
-      "cmp        %[flag_beta],   #1  \n\t"
-      "beq        beta_eq1_%=         \n\t"
-      "bne        beta_ne1_%=         \n\t"
-      "beta_eq1_%=:                   \n\t"
-      "vadd.f32   q10, q10, q0        \n\t"
-      "vadd.f32   q11, q11, q1        \n\t"
-      "vadd.f32   q12, q12, q2        \n\t"
-      "vadd.f32   q13, q13, q3        \n\t"
-      "b          memory_%=           \n\t"
-      "beta_ne1_%=:                   \n\t"
-      "vmla.f32   q10, q0, d8[1]      \n\t"
-      "vmla.f32   q11, q1, d8[1]      \n\t"
-      "vmla.f32   q12, q2, d8[1]      \n\t"
-      "vmla.f32   q13, q3, d8[1]      \n\t"
-      "memory_%=:                     \n\t"
-      "mov        r5,     %[C]        \n\t"
-      "mov        r6,     %[bytes_ldc]\n\t"
-      "vst1.32    {q10}, [r5], r6     \n\t"
-      "vst1.32    {q11}, [r5], r6     \n\t"
-      "vst1.32    {q12}, [r5], r6     \n\t"
-      "vst1.32    {q13}, [r5]         \n\t"
-      "b          end_%=              \n\t"
-      "temp_%=:                       \n\t"
-      "vst1.32    {q10, q11}, [%[ab]]!\n\t"
-      "vst1.32    {q12, q13}, [%[ab]] \n\t"
-      "end_%=:                        \n\t"
-      :
-      : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1),
-        [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
-        [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
-        [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
-      : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13");
-  if (mc != MR || nc != NR) {
-    int i, j;
-    for (i = 0; i < mc; ++i) {
-      for (j = 0; j < nc; ++j) {
-        if (beta == 0.0) {
-          if (alpha != 1.0) {
-            C(i, j) = alpha * ab[i * MR + j];
-          } else {
-            C(i, j) = ab[i * MR + j];
-          }
-        } else {
-          if (beta != 1.0) {
-            C(i, j) *= beta;
-          }
-          if (alpha != 1.0) {
-            C(i, j) += alpha * ab[i * MR + j];
-          } else {
-            C(i, j) += ab[i * MR + j];
-          }
-        }
-      }
-    }
-  }
-}
-void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
-                    int ldb, float beta, float *C, int ldc, int mc, int nc,
-                    bool relu = false) {
-  int kc1 = k / 4, kc2 = k % 4;
-  int bytes_ldc = 4 * ldc;
-  int flag_alpha = (alpha == 1.0) ? 1 : 2;
-  int flag_beta;
-  if (beta == 0.0) {
-    flag_beta = 0;
-  } else if (beta == 1.0) {
-    flag_beta = 1;
-  } else {
-    flag_beta = 2;
-  }
  asm volatile(
-      "pld        [%[a]]              \n\t"
+      "pld        [%[a_ptr]]          \n\t"
-      "pld        [%[b]]              \n\t"
+      "pld        [%[b_ptr]]          \n\t"
      "vmov.f32   q10,    #0.0        \n\t"
      "vmov.f32   q11,    #0.0        \n\t"
      "vmov.f32   q12,    #0.0        \n\t"
@@ -492,20 +271,10 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
      "subs       %[kc1], %[kc1], #1  \n\t"
      "blt        end_kc1_%=          \n\t"
      "loop_kc1_%=:                   \n\t"
-      "pld        [%[a], #64]         \n\t"
+      "pld        [%[a_ptr], #64]     \n\t"
-      "pld        [%[b], #64]         \n\t"
+      "pld        [%[b_ptr], #64]     \n\t"
-      "vld1.32    {q0, q1}, [%[a]]!   \n\t"
+      "vld1.32    {q0, q1}, [%[a_ptr]]!   \n\t"
-      "vld1.32    {q2, q3}, [%[b]]!   \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]!   \n\t"
-      "vmla.f32   q10, q2, d0[0]      \n\t"
-      "vmla.f32   q11, q2, d0[1]      \n\t"
-      "vmla.f32   q12, q2, d1[0]      \n\t"
-      "vmla.f32   q13, q2, d1[1]      \n\t"
-      "vmla.f32   q10, q3, d2[0]      \n\t"
-      "vmla.f32   q11, q3, d2[1]      \n\t"
-      "vmla.f32   q12, q3, d3[0]      \n\t"
-      "vmla.f32   q13, q3, d3[1]      \n\t"
-      "vld1.32    {q0, q1}, [%[a]]!   \n\t"
-      "vld1.32    {q2, q3}, [%[b]]!   \n\t"
      "vmla.f32   q10, q2, d0[0]      \n\t"
      "vmla.f32   q11, q2, d0[1]      \n\t"
      "vmla.f32   q12, q2, d1[0]      \n\t"
@@ -514,6 +283,16 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
      "vmla.f32   q11, q3, d2[1]      \n\t"
      "vmla.f32   q12, q3, d3[0]      \n\t"
      "vmla.f32   q13, q3, d3[1]      \n\t"
+      "vld1.32    {q4, q5}, [%[a_ptr]]!   \n\t"
+      "vld1.32    {q6, q7}, [%[b_ptr]]!   \n\t"
+      "vmla.f32   q10, q6, d8[0]      \n\t"
+      "vmla.f32   q11, q6, d8[1]      \n\t"
+      "vmla.f32   q12, q6, d9[0]      \n\t"
+      "vmla.f32   q13, q6, d9[1]      \n\t"
+      "vmla.f32   q10, q7, d10[0]     \n\t"
+      "vmla.f32   q11, q7, d10[1]     \n\t"
+      "vmla.f32   q12, q7, d11[0]     \n\t"
+      "vmla.f32   q13, q7, d11[1]     \n\t"
      "subs       %[kc1], %[kc1], #1  \n\t"
      "bge        loop_kc1_%=         \n\t"
      "end_kc1_%=:                    \n\t"
@@ -521,8 +300,8 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
      "subs       %[kc2], %[kc2], #1  \n\t"
      "blt        end_kc2_%=          \n\t"
      "loop_kc2_%=:                   \n\t"
-      "vld1.32    {q0}, [%[a]]!       \n\t"
+      "vld1.32    {q0}, [%[a_ptr]]!   \n\t"
-      "vld1.32    {q1}, [%[b]]!       \n\t"
+      "vld1.32    {q1}, [%[b_ptr]]!   \n\t"
      "vmla.f32   q10, q1, d0[0]      \n\t"
      "vmla.f32   q11, q1, d0[1]      \n\t"
      "vmla.f32   q12, q1, d1[0]      \n\t"
@@ -531,290 +310,168 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
      "bge        loop_kc2_%=         \n\t"
      "end_kc2_%=:                    \n\t"
-      "cmp        %[mc],      #4      \n\t"
+      "mov        r5,     %[c]        \n\t"
-      "bne        temp_%=             \n\t"
+      "mov        r6,     %[step]     \n\t"
-      "cmp        %[nc],      #4      \n\t"
-      "bne        temp_%=             \n\t"
-      "vmov.f32   d8[0],    %[alpha]  \n\t"
-      "vmov.f32   d8[1],    %[beta]   \n\t"
-      "cmp        %[flag_alpha],  #1  \n\t"
-      "bne        alpha_%=            \n\t"
-      "alpha_%=:                      \n\t"
-      "vmul.f32   q10, q10, d8[0]     \n\t"
-      "vmul.f32   q11, q11, d8[0]     \n\t"
-      "vmul.f32   q12, q12, d8[0]     \n\t"
-      "vmul.f32   q13, q13, d8[0]     \n\t"
-      "beta_%=:                       \n\t"
-      "cmp        %[flag_beta],   #0  \n\t"
-      "beq        memory_%=           \n\t"
-      "mov        r4,     %[C]        \n\t"
-      "mov        r6,     %[bytes_ldc]\n\t"
-      "vld1.32    {q0}, [r4], r6      \n\t"
-      "vld1.32    {q1}, [r4], r6      \n\t"
-      "vld1.32    {q2}, [r4], r6      \n\t"
-      "vld1.32    {q3}, [r4]          \n\t"
-      "cmp        %[flag_beta],   #1  \n\t"
-      "beq        beta_eq1_%=         \n\t"
-      "bne        beta_ne1_%=         \n\t"
-      "beta_eq1_%=:                   \n\t"
-      "vadd.f32   q10, q10, q0        \n\t"
-      "vadd.f32   q11, q11, q1        \n\t"
-      "vadd.f32   q12, q12, q2        \n\t"
-      "vadd.f32   q13, q13, q3        \n\t"
-      "b          memory_%=           \n\t"
-      "beta_ne1_%=:                   \n\t"
-      "vmla.f32   q10, q0, d8[1]      \n\t"
-      "vmla.f32   q11, q1, d8[1]      \n\t"
-      "vmla.f32   q12, q2, d8[1]      \n\t"
-      "vmla.f32   q13, q3, d8[1]      \n\t"
-      "memory_%=:                     \n\t"
-      "vmax.f32 q10, q10, q14         \n\t"
-      "vmax.f32 q11, q11, q14         \n\t"
-      "vmax.f32 q12, q12, q14         \n\t"
-      "vmax.f32 q13, q13, q14         \n\t"
-      "mov        r5,     %[C]        \n\t"
-      "mov        r6,     %[bytes_ldc]\n\t"
      "vst1.32    {q10}, [r5], r6     \n\t"
      "vst1.32    {q11}, [r5], r6     \n\t"
      "vst1.32    {q12}, [r5], r6     \n\t"
      "vst1.32    {q13}, [r5]         \n\t"
-      "b          end_%=              \n\t"
-      "temp_%=:                       \n\t"
-      "vst1.32    {q10, q11}, [%[ab]]!\n\t"
-      "vst1.32    {q12, q13}, [%[ab]] \n\t"
-      "end_%=:                        \n\t"
      :
-      : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1),
+      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
-        [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha),
+        [kc2] "r"(kc2), [step] "r"(step)
-        [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc),
+      : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-        [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta)
+        "q10", "q11", "q12", "q13");
-      : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13",
-        "q14");
-  if (mc != MR || nc != NR) {
-    int i, j;
-    for (i = 0; i < mc; ++i) {
-      for (j = 0; j < nc; ++j) {
-        if (beta == 0.0) {
-          if (alpha != 1.0) {
-            C(i, j) = alpha * ab[i * MR + j];
-          } else {
-            C(i, j) = ab[i * MR + j];
-          }
-        } else {
-          if (beta != 1.0) {
-            C(i, j) *= beta;
-          }
-          if (alpha != 1.0) {
-            C(i, j) += alpha * ab[i * MR + j];
-          } else {
-            C(i, j) += ab[i * MR + j];
-          }
-        }
-        if (relu) {
-          if (C(i, j) < 0) {
-            C(i, j) = 0;
-          }
-        }
-      }
-    }
-  }
 }
 #else
-void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
+void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
-               int ldb, float beta, float *C, int ldc, int mc, int nc) {
+  float *c0, *c1, *c2, *c3;
-  float c[16] = {0};
+  c0 = c;
-  float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3;
+  c1 = c + ldc;
+  c2 = c + 2 * ldc;
+  c3 = c + 3 * ldc;
  for (int p = 0; p < k; p += 1) {
-    reg_b0 = *b++;
-    reg_b1 = *b++;
-    reg_b2 = *b++;
-    reg_b3 = *b++;
-    reg_a0 = *a++;
-    reg_a1 = *a++;
-    reg_a2 = *a++;
-    reg_a3 = *a++;
    // first row
-    c[0] += reg_a0 * reg_b0;
+    c0[0] += a[0] * b[0];
-    c[1] += reg_a0 * reg_b1;
+    c0[1] += a[0] * b[1];
-    c[2] += reg_a0 * reg_b2;
+    c0[2] += a[0] * b[2];
-    c[3] += reg_a0 * reg_b3;
+    c0[3] += a[0] * b[3];
    // second row
-    c[4] += reg_a1 * reg_b0;
+    c1[0] += a[1] * b[0];
-    c[5] += reg_a1 * reg_b1;
+    c1[1] += a[1] * b[1];
-    c[6] += reg_a1 * reg_b2;
+    c1[2] += a[1] * b[2];
-    c[7] += reg_a1 * reg_b3;
+    c1[3] += a[1] * b[3];
    // third row
-    c[8] += reg_a2 * reg_b0;
+    c2[0] += a[2] * b[0];
-    c[9] += reg_a2 * reg_b1;
+    c2[1] += a[2] * b[1];
-    c[10] += reg_a2 * reg_b2;
+    c2[2] += a[2] * b[2];
-    c[11] += reg_a2 * reg_b3;
+    c2[3] += a[2] * b[3];
    // fourth row
-    c[12] += reg_a3 * reg_b0;
+    c3[0] += a[3] * b[0];
-    c[13] += reg_a3 * reg_b1;
+    c3[1] += a[3] * b[1];
-    c[14] += reg_a3 * reg_b2;
+    c3[2] += a[3] * b[2];
-    c[15] += reg_a3 * reg_b3;
+    c3[3] += a[3] * b[3];
-  }
-  int i, j;
-  for (i = 0; i < mc; ++i) {
-    for (j = 0; j < nc; ++j) {
-      if (beta == 0.0) {
-        C(i, j) = 0.0;
-      } else if (beta != 1.0) {
-        C(i, j) *= beta;
-      }
-      if (alpha != 1.0) {
-        C(i, j) += alpha * c[i * MR + j];
-      } else {
-        C(i, j) += c[i * MR + j];
-      }
-    }
-  }
-}
-void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
-                    int ldb, float beta, float *C, int ldc, int mc, int nc,
-                    bool relu) {
-  float c[16] = {0};
-  float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3;
-  for (int p = 0; p < k; p += 1) {
-    reg_b0 = *b++;
-    reg_b1 = *b++;
-    reg_b2 = *b++;
-    reg_b3 = *b++;
-    reg_a0 = *a++;
-    reg_a1 = *a++;
-    reg_a2 = *a++;
-    reg_a3 = *a++;
-    // first row
-    c[0] += reg_a0 * reg_b0;
-    c[1] += reg_a0 * reg_b1;
-    c[2] += reg_a0 * reg_b2;
-    c[3] += reg_a0 * reg_b3;
-    // second row
-    c[4] += reg_a1 * reg_b0;
-    c[5] += reg_a1 * reg_b1;
-    c[6] += reg_a1 * reg_b2;
-    c[7] += reg_a1 * reg_b3;
-    // third row
-    c[8] += reg_a2 * reg_b0;
-    c[9] += reg_a2 * reg_b1;
-    c[10] += reg_a2 * reg_b2;
-    c[11] += reg_a2 * reg_b3;
-    // fourth row
+    a += 4;
-    c[12] += reg_a3 * reg_b0;
+    b += 4;
-    c[13] += reg_a3 * reg_b1;
-    c[14] += reg_a3 * reg_b2;
-    c[15] += reg_a3 * reg_b3;
-  }
-  int i, j;
-  for (i = 0; i < mc; ++i) {
-    for (j = 0; j < nc; ++j) {
-      if (beta == 0.0) {
-        C(i, j) = 0.0;
-      } else if (beta != 1.0) {
-        C(i, j) *= beta;
-      }
-      if (alpha != 1.0) {
-        C(i, j) += alpha * c[i * MR + j];
-      } else {
-        C(i, j) += c[i * MR + j];
-      }
-      if (relu) {
-        if (C(i, j) < 0) {
-          C(i, j) = 0;
-        }
-      }
-    }
  }
 }
 #endif
 // 32位 float 矩阵乘法
-void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc) {
+           const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
-  int i, j, p, mc, nc, kc;
+  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
-  float beta_;
+  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
+  int L1 = 30 * 1024;
-#ifdef ARMV7
+  int L2 = 1 * 1024 * 1024;
-  if (m == 1) {
-    VectorKernel(1, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+  KC = k;
-    return;
+  MC = L2 / (2 * KC * sizeof(float));
-  }
+  NC = MC;
-#endif
+  // make sure MC is multiple of 4, and NC is multiple of 8
-  for (j = 0; j < n; j += NC) {
+  int mblock_num = (m + MC - 1) / MC;
+  MC = (m + mblock_num - 1) / mblock_num;
+  MC = (MC + 4 - 1) / 4 * 4;
+  //  DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
+  int nblock_num = (n + NC - 1) / NC;
+  NC = (n + nblock_num - 1) / nblock_num;
+  NC = (NC + 8 - 1) / 8 * 8;
+  //  DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
+  packedA = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+  packedB = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+  packedC = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
+  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+  for (int l = 0; l < KC; ++l) {
+    zero[l] = 0;
+  }
+  int mc, nc;
+  for (int j = 0; j < n; j += NC) {
    nc = s_min(n - j, NC);
-    for (p = 0; p < k; p += KC) {
+    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
-      kc = s_min(k - p, KC);
+    for (int i = 0; i < m; i += MC) {
-      for (i = 0; i < m; i += MC) {
      mc = s_min(m - i, MC);
-        if (p != 0) {
+      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
-          beta_ = 1.0;
+      InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
-        } else {
+                  relu);
-          beta_ = beta;
-        }
-        InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_,
-                    &C(i, j), ldc, i == 0);
-      }
    }
  }
+  paddle_mobile::memory::Free(packedA);
+  paddle_mobile::memory::Free(packedB);
+  paddle_mobile::memory::Free(packedC);
+  paddle_mobile::memory::Free(zero);
 }
-void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
+void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
-                const float *B, int ldb, float beta, float *C, int ldc) {
+                 const float *B, int ldb, float beta, float *C, int ldc,
-  int i, j, p, mc, nc, kc;
+                 bool relu, float *new_scale, float *new_bias) {
-  float beta_;
+  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
-  for (j = 0; j < n; j += NC) {
+  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
+  int L1 = 30 * 1024;
+  int L2 = 1 * 1024 * 1024;
+  KC = k;
+  MC = L2 / (2 * KC * sizeof(float));
+  NC = MC;
+  // make sure MC is multiple of 4, and NC is multiple of 8
+  int mblock_num = (m + MC - 1) / MC;
+  MC = (m + mblock_num - 1) / mblock_num;
+  MC = (MC + 4 - 1) / 4 * 4;
+  //  DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
+  int nblock_num = (n + NC - 1) / NC;
+  NC = (n + nblock_num - 1) / nblock_num;
+  NC = (NC + 8 - 1) / 8 * 8;
+  //  DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
+  packedA = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
+  packedB = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
+  packedC = static_cast<float *>(
+      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
+  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
+  for (int l = 0; l < KC; ++l) {
+    zero[l] = 0;
+  }
+  int mc, nc;
+  for (int j = 0; j < n; j += NC) {
    nc = s_min(n - j, NC);
-    for (p = 0; p < k; p += KC) {
+    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
-      kc = s_min(k - p, KC);
+    for (int i = 0; i < m; i += MC) {
-      for (i = 0; i < m; i += MC) {
      mc = s_min(m - i, MC);
-        if (p != 0) {
+      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
-          beta_ = 1.0;
+      InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
-        } else {
+                        &C(i, j), ldc, relu, new_scale + ldc * i + j,
-          beta_ = beta;
+                        new_bias + ldc * i + j);
-        }
-        if (p + KC >= k) {
-          InnerKernel_relu(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb,
-                           beta_, &C(i, j), ldc, i == 0, true);
-        } else {
-          InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_,
-                      &C(i, j), ldc, i == 0);
-        }
-      }
    }
  }
+  paddle_mobile::memory::Free(packedA);
+  paddle_mobile::memory::Free(packedB);
+  paddle_mobile::memory::Free(packedC);
+  paddle_mobile::memory::Free(zero);
 }
-#ifdef ARMV7
 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
-                  const float *B, int ldb, float beta, float *C, int ldc) {
+                  const float *B, int ldb, float beta, float *C, int ldc,
+                  bool relu) {
  float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
  const float *a0, *b0, *b1, *b2, *b3;
@@ -1016,18 +673,995 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
    }
  }
-  c0 = bufferC;
+  if (alpha != 1) {
-  C0 = C;
+    VecWriteWithAlphaBeta(n, bufferC, C, ldc);
-  for (int i = 0; i < n; i++) {
+    return;
-    if (beta == 1.0) {
-      *C0++ += *c0++;
-    } else {
-      *C0++ = *c0++;
  }
+  if (beta == 0) {
+    VecWriteBasic(n, bufferC, C, ldc);
+    return;
  }
-}
+  if (beta == 1 && !relu) {
-#endif
+    VecWriteWithAdd(n, bufferC, C, ldc);
+    return;
-}  // namespace math
+  }
+  if (beta == 1 && relu) {
+    VecWriteWithAddRelu(n, bufferC, C, ldc);
+    return;
+  }
+}
+void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
+                        int lda, const float *B, int ldb, float beta, float *C,
+                        int ldc, bool relu, float *new_scale, float *new_bias) {
+  float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
+  const float *a0, *b0, *b1, *b2, *b3;
+  float *c0, *C0;
+  int volatile kc1 = k / 4;
+  int volatile kc2 = k % 4;
+  int volatile nc1 = n / 16;
+  int _nc1 = n % 16;
+  int volatile nc2 = _nc1 / 4;
+  int volatile nc3 = _nc1 % 4;
+  for (int i = 0; i < kc1; i++) {
+    a0 = A + i * 4;
+    b0 = B + i * 4 * ldb;
+    b1 = b0 + ldb;
+    b2 = b1 + ldb;
+    b3 = b2 + ldb;
+    c0 = bufferC;
+    asm volatile(
+        "pld        [%[a0], #16]          \n\t"
+        "vld1.32    {q0}, [%[a0]]         \n\t"
+        "subs       %[nc1], %[nc1], #1    \n\t"
+        "blt        end_nc1_%=            \n\t"
+        "loop_nc1_%=:                     \n\t"
+        "cmp        %[i],       #0        \n\t"
+        "beq        i_eq0_%=              \n\t"
+        "bne        i_ne0_%=              \n\t"
+        "i_eq0_%=:                        \n\t"
+        "vmov.f32   q10,    #0.0          \n\t"
+        "vmov.f32   q11,    #0.0          \n\t"
+        "vmov.f32   q12,    #0.0          \n\t"
+        "vmov.f32   q13,    #0.0          \n\t"
+        "b          gemm_nc1_%=           \n\t"
+        "i_ne0_%=:                        \n\t"
+        "pld        [%[c0], #64]          \n\t"
+        "vld1.32    {q10, q11}, [%[c0]]!  \n\t"
+        "vld1.32    {q12, q13}, [%[c0]]   \n\t"
+        "sub        %[c0], %[c0], #32     \n\t"
+        "gemm_nc1_%=:                     \n\t"
+        "pld        [%[b0], #64]          \n\t"
+        "vld1.32    {q2, q3}, [%[b0]]!    \n\t"
+        "vld1.32    {q4, q5}, [%[b0]]!    \n\t"
+        "vmla.f32   q10, q2, d0[0]        \n\t"
+        "vmla.f32   q11, q3, d0[0]        \n\t"
+        "vmla.f32   q12, q4, d0[0]        \n\t"
+        "vmla.f32   q13, q5, d0[0]        \n\t"
+        "pld        [%[b1], #64]          \n\t"
+        "vld1.32    {q2, q3}, [%[b1]]!    \n\t"
+        "vld1.32    {q4, q5}, [%[b1]]!    \n\t"
+        "vmla.f32   q10, q2, d0[1]        \n\t"
+        "vmla.f32   q11, q3, d0[1]        \n\t"
+        "vmla.f32   q12, q4, d0[1]        \n\t"
+        "vmla.f32   q13, q5, d0[1]        \n\t"
+        "pld        [%[b2], #64]          \n\t"
+        "vld1.32    {q2, q3}, [%[b2]]!    \n\t"
+        "vld1.32    {q4, q5}, [%[b2]]!    \n\t"
+        "vmla.f32   q10, q2, d1[0]        \n\t"
+        "vmla.f32   q11, q3, d1[0]        \n\t"
+        "vmla.f32   q12, q4, d1[0]        \n\t"
+        "vmla.f32   q13, q5, d1[0]        \n\t"
+        "pld        [%[b3], #64]          \n\t"
+        "vld1.32    {q2, q3}, [%[b3]]!    \n\t"
+        "vld1.32    {q4, q5}, [%[b3]]!    \n\t"
+        "vmla.f32   q10, q2, d1[1]        \n\t"
+        "vmla.f32   q11, q3, d1[1]        \n\t"
+        "vmla.f32   q12, q4, d1[1]        \n\t"
+        "vmla.f32   q13, q5, d1[1]        \n\t"
+        "vst1.32    {q10, q11}, [%[c0]]!  \n\t"
+        "vst1.32    {q12, q13}, [%[c0]]!  \n\t"
+        "subs       %[nc1], %[nc1], #1    \n\t"
+        "bge        loop_nc1_%=           \n\t"
+        "end_nc1_%=:                      \n\t"
+        "subs       %[nc2], %[nc2], #1    \n\t"
+        "blt        end_nc2_%=            \n\t"
+        "loop_nc2_%=:                     \n\t"
+        "cmp        %[i],       #0        \n\t"
+        "beq        ii_eq0_%=             \n\t"
+        "bne        ii_ne0_%=             \n\t"
+        "ii_eq0_%=:                       \n\t"
+        "vmov.f32   q10,    #0.0          \n\t"
+        "b          gemm_nc2_%=           \n\t"
+        "ii_ne0_%=:                       \n\t"
+        "pld        [%[c0], #16]          \n\t"
+        "vld1.32    {q10}, [%[c0]]        \n\t"
+        "gemm_nc2_%=:                     \n\t"
+        "pld        [%[b0], #16]          \n\t"
+        "vld1.32    {q2}, [%[b0]]!        \n\t"
+        "vmla.f32   q10, q2, d0[0]        \n\t"
+        "pld        [%[b1], #16]          \n\t"
+        "vld1.32    {q3}, [%[b1]]!        \n\t"
+        "vmla.f32   q10, q3, d0[1]        \n\t"
+        "pld        [%[b2], #16]          \n\t"
+        "vld1.32    {q4}, [%[b2]]!        \n\t"
+        "vmla.f32   q10, q4, d1[0]        \n\t"
+        "pld        [%[b3], #16]          \n\t"
+        "vld1.32    {q5}, [%[b3]]!        \n\t"
+        "vmla.f32   q10, q5, d1[1]        \n\t"
+        "vst1.32    {q10}, [%[c0]]!       \n\t"
+        "subs       %[nc2], %[nc2], #1    \n\t"
+        "bge        loop_nc2_%=           \n\t"
+        "end_nc2_%=:                      \n\t"
+        : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3),
+          [c0] "+r"(c0)
+        : [a0] "r"(a0), [i] "r"(i), [nc1] "r"(nc1), [nc2] "r"(nc2)
+        : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13");
+    for (int j = 0; j < nc3; j++) {
+      if (i == 0) {
+        *c0 = (*a0) * (*b0++);
+      } else {
+        *c0 += (*a0) * (*b0++);
+      }
+      *c0 += (*(a0 + 1)) * (*b1++);
+      *c0 += (*(a0 + 2)) * (*b2++);
+      *c0 += (*(a0 + 3)) * (*b3++);
+      c0++;
+    }
+  }
+  for (int i = 0; i < kc2; ++i) {
+    a0 = A + 4 * kc1 + i;
+    b0 = B + (4 * kc1 + i) * ldb;
+    c0 = bufferC;
+    asm volatile(
+        "pld        [%[a0], #16]          \n\t"
+        "vld1.32    {d0}, [%[a0]]         \n\t"
+        "subs       %[nc1], %[nc1], #1    \n\t"
+        "blt        end_nc1_%=            \n\t"
+        "loop_nc1_%=:                     \n\t"
+        "pld        [%[c0], #64]          \n\t"
+        "vld1.32    {q10, q11}, [%[c0]]!  \n\t"
+        "vld1.32    {q12, q13}, [%[c0]]   \n\t"
+        "sub        %[c0], %[c0], #32     \n\t"
+        "gemm_nc1_%=:                     \n\t"
+        "pld        [%[b0], #64]          \n\t"
+        "vld1.32    {q2, q3}, [%[b0]]!    \n\t"
+        "vld1.32    {q4, q5}, [%[b0]]!    \n\t"
+        "vmla.f32   q10, q2, d0[0]        \n\t"
+        "vmla.f32   q11, q3, d0[0]        \n\t"
+        "vmla.f32   q12, q4, d0[0]        \n\t"
+        "vmla.f32   q13, q5, d0[0]        \n\t"
+        "vst1.32    {q10, q11}, [%[c0]]!  \n\t"
+        "vst1.32    {q12, q13}, [%[c0]]!  \n\t"
+        "subs       %[nc1], %[nc1], #1    \n\t"
+        "bge        loop_nc1_%=           \n\t"
+        "end_nc1_%=:                      \n\t"
+        "subs       %[nc2], %[nc2], #1    \n\t"
+        "blt        end_nc2_%=            \n\t"
+        "loop_nc2_%=:                     \n\t"
+        "pld        [%[c0], #16]          \n\t"
+        "vld1.32    {q10}, [%[c0]]        \n\t"
+        "gemm_nc2_%=:                     \n\t"
+        "vld1.32    {q2}, [%[b0]]!        \n\t"
+        "vmla.f32   q10, q2, d0[0]        \n\t"
+        "vst1.32    {q10}, [%[c0]]!       \n\t"
+        "subs       %[nc2], %[nc2], #1    \n\t"
+        "bge        loop_nc2_%=           \n\t"
+        "end_nc2_%=:                      \n\t"
+        : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3),
+          [c0] "+r"(c0)
+        : [a0] "r"(a0), [nc1] "r"(nc1), [nc2] "r"(nc2)
+        : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13");
+    for (int j = 0; j < nc3; j++) {
+      *c0 += (*a0) * (*b0++);
+      c0++;
+    }
+  }
+  if (relu) {
+    VecWriteWithBnRelu(n, bufferC, C, ldc, new_scale, new_bias);
+  } else {
+    VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias);
+  }
+}
+void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
+  const float *a_ptr, *b_ptr;
+  a_ptr = a;
+  b_ptr = b;
+  int kc1 = k / 4;
+  int kc2 = k % 4;
+  int step = 4 * ldc;
+  asm volatile(
+      "pld        [%[a_ptr]]          \n\t"
+      "pld        [%[b_ptr]]          \n\t"
+      "vmov.f32   q8,     #0.0        \n\t"
+      "vmov.f32   q9,     #0.0        \n\t"
+      "vmov.f32   q10,    #0.0        \n\t"
+      "vmov.f32   q11,    #0.0        \n\t"
+      "vmov.f32   q12,    #0.0        \n\t"
+      "vmov.f32   q13,    #0.0        \n\t"
+      "vmov.f32   q14,    #0.0        \n\t"
+      "vmov.f32   q15,    #0.0        \n\t"
+      "subs       %[kc1], %[kc1], #1  \n\t"
+      "blt        end_kc1_%=          \n\t"
+      "loop_kc1_%=:                   \n\t"
+      "pld        [%[a_ptr], #64]     \n\t"
+      "pld        [%[b_ptr], #64]     \n\t"
+      "vld1.32    {q0, q1}, [%[a_ptr]]!   \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]!   \n\t"
+      "vld1.32    {q4, q5}, [%[b_ptr]]!   \n\t"
+      "vmla.f32   q8,   q2,   d0[0]      \n\t"
+      "vmla.f32   q9,   q3,   d0[0]      \n\t"
+      "vmla.f32   q10,  q2,   d0[1]      \n\t"
+      "vmla.f32   q11,  q3,   d0[1]      \n\t"
+      "vmla.f32   q12,  q2,   d1[0]      \n\t"
+      "vmla.f32   q13,  q3,   d1[0]      \n\t"
+      "vmla.f32   q14,  q2,   d1[1]      \n\t"
+      "vmla.f32   q15,  q3,   d1[1]      \n\t"
+      "vmla.f32   q8,   q4,   d2[0]      \n\t"
+      "vmla.f32   q9,   q5,   d2[0]      \n\t"
+      "vmla.f32   q10,  q4,   d2[1]      \n\t"
+      "vmla.f32   q11,  q5,   d2[1]      \n\t"
+      "vmla.f32   q12,  q4,   d3[0]      \n\t"
+      "vmla.f32   q13,  q5,   d3[0]      \n\t"
+      "vmla.f32   q14,  q4,   d3[1]      \n\t"
+      "vmla.f32   q15,  q5,   d3[1]      \n\t"
+      "pld        [%[b_ptr], #64]     \n\t"
+      "vld1.32    {q0, q1}, [%[a_ptr]]!   \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]!   \n\t"
+      "vld1.32    {q4, q5}, [%[b_ptr]]!   \n\t"
+      "vmla.f32   q8,   q2,   d0[0]      \n\t"
+      "vmla.f32   q9,   q3,   d0[0]      \n\t"
+      "vmla.f32   q10,  q2,   d0[1]      \n\t"
+      "vmla.f32   q11,  q3,   d0[1]      \n\t"
+      "vmla.f32   q12,  q2,   d1[0]      \n\t"
+      "vmla.f32   q13,  q3,   d1[0]      \n\t"
+      "vmla.f32   q14,  q2,   d1[1]      \n\t"
+      "vmla.f32   q15,  q3,   d1[1]      \n\t"
+      "vmla.f32   q8,   q4,   d2[0]      \n\t"
+      "vmla.f32   q9,   q5,   d2[0]      \n\t"
+      "vmla.f32   q10,  q4,   d2[1]      \n\t"
+      "vmla.f32   q11,  q5,   d2[1]      \n\t"
+      "vmla.f32   q12,  q4,   d3[0]      \n\t"
+      "vmla.f32   q13,  q5,   d3[0]      \n\t"
+      "vmla.f32   q14,  q4,   d3[1]      \n\t"
+      "vmla.f32   q15,  q5,   d3[1]      \n\t"
+      "subs       %[kc1], %[kc1], #1  \n\t"
+      "bge        loop_kc1_%=         \n\t"
+      "end_kc1_%=:                    \n\t"
+      "subs       %[kc2], %[kc2], #1  \n\t"
+      "blt        end_kc2_%=          \n\t"
+      "loop_kc2_%=:                   \n\t"
+      "vld1.32    {q0},     [%[a_ptr]]!   \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]!   \n\t"
+      "vmla.f32   q8,   q2,   d0[0]      \n\t"
+      "vmla.f32   q9,   q3,   d0[0]      \n\t"
+      "vmla.f32   q10,  q2,   d0[1]      \n\t"
+      "vmla.f32   q11,  q3,   d0[1]      \n\t"
+      "vmla.f32   q12,  q2,   d1[0]      \n\t"
+      "vmla.f32   q13,  q3,   d1[0]      \n\t"
+      "vmla.f32   q14,  q2,   d1[1]      \n\t"
+      "vmla.f32   q15,  q3,   d1[1]      \n\t"
+      "subs       %[kc2], %[kc2], #1  \n\t"
+      "bge        loop_kc2_%=         \n\t"
+      "end_kc2_%=:                    \n\t"
+      "mov        r5,     %[c]        \n\t"
+      "mov        r6,     %[step]     \n\t"
+      "vst1.32    {q8, q9},   [r5], r6     \n\t"
+      "vst1.32    {q10, q11}, [r5], r6     \n\t"
+      "vst1.32    {q12, q13}, [r5], r6     \n\t"
+      "vst1.32    {q14, q15}, [r5]         \n\t"
+      :
+      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
+        [kc2] "r"(kc2), [step] "r"(step)
+      : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9",
+        "q10", "q11", "q12", "q13", "q14", "q15");
+}
+// C = A * B
+void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
+  int nc1 = nc / 16;
+  int _nc1 = nc % 16;
+  int step = 4 * ldc;
+  int step1 = 4 * (NC - 16 * nc1);
+  int volatile m = mc;
+  float *volatile c_ptr, *volatile C_ptr;
+  float *C0, *c0;
+  c_ptr = c;
+  C_ptr = C;
+  if (nc1 > 0) {
+    asm volatile(
+        "subs       %[mc], %[mc], #1        \n\t"
+        "blt        end_mc_%=               \n\t"
+        "loop_mc_%=:                        \n\t"
+        "mov        r6,   %[C_ptr]          \n\t"
+        "mov        r5,   %[nc1]            \n\t"
+        "subs       r5,   r5,   #1          \n\t"
+        "blt        end_nc1_%=              \n\t"
+        "loop_nc1_%=:                       \n\t"
+        "vld1.32    {q0, q1}, [%[c_ptr]]!   \n\t"
+        "vst1.32    {q0, q1}, [r6]!         \n\t"
+        "vld1.32    {q2, q3}, [%[c_ptr]]!   \n\t"
+        "vst1.32    {q2, q3}, [r6]!         \n\t"
+        "subs       r5,   r5,   #1          \n\t"
+        "bge        loop_nc1_%=             \n\t"
+        "end_nc1_%=:                        \n\t"
+        "add        %[C_ptr], %[C_ptr], %[step]   \n\t"
+        "add        %[c_ptr], %[c_ptr], %[step1]  \n\t"
+        "subs       %[mc], %[mc], #1        \n\t"
+        "bge        loop_mc_%=              \n\t"
+        "end_mc_%=:                         \n\t"
+        :
+        : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1),
+          [step] "r"(step), [step1] "r"(step1)
+        : "memory", "r5", "r6", "q0", "q1", "q2", "q3");
+  }
+  if (_nc1 != 0) {
+    for (int i = 0; i < mc; i++) {
+      C0 = C_ptr + nc1 * 16 + i * ldc;
+      c0 = c_ptr + nc1 * 16 + i * NC;
+      for (int j = 0; j < _nc1; j++) {
+        *C0++ = *c0++;
+      }
+    }
+  }
+}
+// C = alpha * A * B + beta * C
+void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
+// C = A * B + C
+void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
+  int nc1 = nc / 16;
+  int _nc1 = nc % 16;
+  int step = 4 * ldc;
+  int step1 = 4 * (NC - 16 * nc1);
+  int volatile m = mc;
+  float *volatile c_ptr, *volatile C_ptr;
+  float *C0, *c0;
+  c_ptr = c;
+  C_ptr = C;
+  if (nc1 > 0) {
+    asm volatile(
+        "subs       %[mc], %[mc], #1        \n\t"
+        "blt        end_mc_%=               \n\t"
+        "loop_mc_%=:                        \n\t"
+        "mov        r6,   %[C_ptr]          \n\t"
+        "mov        r5,   %[nc1]            \n\t"
+        "subs       r5,   r5,   #1          \n\t"
+        "blt        end_nc1_%=              \n\t"
+        "loop_nc1_%=:                       \n\t"
+        "vld1.32    {q0, q1},   [r6]        \n\t"
+        "vld1.32    {q2, q3},   [%[c_ptr]]! \n\t"
+        "vadd.f32   q10,  q0,   q2          \n\t"
+        "vadd.f32   q11,  q1,   q3          \n\t"
+        "vst1.32    {q10, q11}, [r6]!       \n\t"
+        "vld1.32    {q4, q5},   [r6]        \n\t"
+        "vld1.32    {q6, q7},   [%[c_ptr]]! \n\t"
+        "vadd.f32   q12,  q4,   q6          \n\t"
+        "vadd.f32   q13,  q5,   q7          \n\t"
+        "vst1.32    {q12, q13}, [r6]!       \n\t"
+        "subs       r5,   r5,   #1          \n\t"
+        "bge        loop_nc1_%=             \n\t"
+        "end_nc1_%=:                        \n\t"
+        "add        %[C_ptr], %[C_ptr], %[step]     \n\t"
+        "add        %[c_ptr], %[c_ptr], %[step1]    \n\t"
+        "subs       %[mc], %[mc], #1        \n\t"
+        "bge        loop_mc_%=              \n\t"
+        "end_mc_%=:                         \n\t"
+        :
+        : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1),
+          [step] "r"(step), [step1] "r"(step1)
+        : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+          "q10", "q11", "q12", "q13");
+  }
+  if (_nc1 != 0) {
+    for (int i = 0; i < mc; i++) {
+      C0 = C_ptr + nc1 * 16 + i * ldc;
+      c0 = c_ptr + nc1 * 16 + i * NC;
+      for (int j = 0; j < _nc1; j++) {
+        *C0++ += *c0++;
+      }
+    }
+  }
+}
+// C = A * B + C, relu(C)
+void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
+  int nc1 = nc / 16;
+  int _nc1 = nc % 16;
+  int step = 4 * ldc;
+  int step1 = 4 * (NC - 16 * nc1);
+  int volatile m = mc;
+  float *volatile c_ptr, *volatile C_ptr;
+  float *C0, *c0;
+  c_ptr = c;
+  C_ptr = C;
+  if (nc1 > 0) {
+    asm volatile(
+        "vmov.f32   q14,    #0.0            \n\t"
+        "subs       %[mc], %[mc], #1        \n\t"
+        "blt        end_mc_%=               \n\t"
+        "loop_mc_%=:                        \n\t"
+        "mov        r6,   %[C_ptr]          \n\t"
+        "mov        r5,   %[nc1]            \n\t"
+        "subs       r5,   r5,   #1          \n\t"
+        "blt        end_nc1_%=              \n\t"
+        "loop_nc1_%=:                       \n\t"
+        "vld1.32    {q0, q1},   [r6]        \n\t"
+        "vld1.32    {q2, q3},   [%[c_ptr]]! \n\t"
+        "vadd.f32   q10,  q0,   q2          \n\t"
+        "vadd.f32   q11,  q1,   q3          \n\t"
+        "vmax.f32   q10,  q10,  q14         \n\t"
+        "vmax.f32   q11,  q11,  q14         \n\t"
+        "vst1.32    {q10, q11}, [r6]!       \n\t"
+        "vld1.32    {q4, q5},   [r6]        \n\t"
+        "vld1.32    {q6, q7},   [%[c_ptr]]! \n\t"
+        "vadd.f32   q12,  q4,   q6          \n\t"
+        "vadd.f32   q13,  q5,   q7          \n\t"
+        "vmax.f32   q12,  q12,  q14         \n\t"
+        "vmax.f32   q13,  q13,  q14         \n\t"
+        "vst1.32    {q12, q13}, [r6]!       \n\t"
+        "subs       r5,   r5,   #1          \n\t"
+        "bge        loop_nc1_%=             \n\t"
+        "end_nc1_%=:                        \n\t"
+        "add        %[C_ptr], %[C_ptr], %[step]     \n\t"
+        "add        %[c_ptr], %[c_ptr], %[step1]    \n\t"
+        "subs       %[mc], %[mc], #1        \n\t"
+        "bge        loop_mc_%=              \n\t"
+        "end_mc_%=:                         \n\t"
+        :
+        : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1),
+          [step] "r"(step), [step1] "r"(step1)
+        : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+          "q10", "q11", "q12", "q13");
+  }
+  if (_nc1 != 0) {
+    for (int i = 0; i < mc; i++) {
+      C0 = C_ptr + nc1 * 16 + i * ldc;
+      c0 = c_ptr + nc1 * 16 + i * NC;
+      for (int j = 0; j < _nc1; j++) {
+        *C0 += *c0;
+        if (*C0 < 0) {
+          *C0 = 0;
+        }
+        C0++;
+        c0++;
+      }
+    }
+  }
+}
+// C = A * B, batchnorm(C)
+void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale,
+                 float *bias) {
+  int nc1 = nc / 16;
+  int _nc1 = nc % 16;
+  int nc2 = _nc1 / 4;
+  int nc3 = 16 - 4 * (_nc1 % 4);
+  int step = 4 * (ldc - nc);
+  int step1 = 4 * (NC - nc);
+  asm volatile(
+      "subs       %[mc], %[mc], #1        \n\t"
+      "blt        end_mc_%=               \n\t"
+      "loop_mc_%=:                        \n\t"
+      "mov        r5,   %[nc1]            \n\t"
+      "mov        r6,   %[nc2]            \n\t"
+      "subs       r5,   r5,   #1          \n\t"
+      "blt        end_nc1_%=              \n\t"
+      "loop_nc1_%=:                       \n\t"
+      "vld1.32    {q0, q1},   [%[c]]!     \n\t"
+      "vld1.32    {q2, q3},   [%[scale]]! \n\t"
+      "vld1.32    {q10, q11}, [%[bias]]!  \n\t"
+      "vmla.f32   q10,  q0,   q2          \n\t"
+      "vmla.f32   q11,  q1,   q3          \n\t"
+      "vst1.32    {q10, q11}, [%[C]]!     \n\t"
+      "vld1.32    {q4, q5},   [%[c]]!     \n\t"
+      "vld1.32    {q6, q7},   [%[scale]]! \n\t"
+      "vld1.32    {q12, q13}, [%[bias]]!  \n\t"
+      "vmla.f32   q12,  q4,   q6          \n\t"
+      "vmla.f32   q13,  q5,   q7          \n\t"
+      "vst1.32    {q12, q13}, [%[C]]!     \n\t"
+      "subs       r5,   r5,   #1          \n\t"
+      "bge        loop_nc1_%=             \n\t"
+      "end_nc1_%=:                        \n\t"
+      "subs       r6,  r6,   #1           \n\t"
+      "blt        end_nc2_%=              \n\t"
+      "loop_nc2_%=:                       \n\t"
+      "vld1.32    {q0},   [%[c]]!           \n\t"
+      "vld1.32    {q1},   [%[scale]]!       \n\t"
+      "vld1.32    {q10},  [%[bias]]!        \n\t"
+      "vmla.f32   q10,    q0,   q1          \n\t"
+      "vst1.32    {q10},  [%[C]]!           \n\t"
+      "subs       r6,   r6,   #1          \n\t"
+      "bge        loop_nc2_%=             \n\t"
+      "end_nc2_%=:                        \n\t"
+      "cmp        %[nc3],    #16          \n\t"
+      "beq        end_nc3_%=              \n\t"
+      "sub        %[c],     %[c],   %[nc3]      \n\t"
+      "sub        %[scale], %[scale],  %[nc3]   \n\t"
+      "sub        %[bias],  %[bias],   %[nc3]   \n\t"
+      "sub        %[C],     %[C],   %[nc3]      \n\t"
+      "vld1.32    {q0},   [%[c]]!         \n\t"
+      "vld1.32    {q1},   [%[scale]]!     \n\t"
+      "vld1.32    {q10},  [%[bias]]!      \n\t"
+      "vmla.f32   q10,    q0,   q1        \n\t"
+      "vst1.32    {q10},  [%[C]]!         \n\t"
+      "end_nc3_%=:                        \n\t"
+      "add        %[c],     %[c],     %[step1]  \n\t"
+      "add        %[scale], %[scale], %[step]   \n\t"
+      "add        %[bias],  %[bias],  %[step]   \n\t"
+      "add        %[C],     %[C],     %[step]   \n\t"
+      "subs       %[mc], %[mc], #1        \n\t"
+      "bge        loop_mc_%=              \n\t"
+      "end_mc_%=:                         \n\t"
+      :
+      : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2),
+        [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1),
+        [scale] "r"(scale), [bias] "r"(bias)
+      : "memory", "cc", "r5", "r6", "r7", "r8", "q0", "q1", "q2", "q3", "q4",
+        "q5", "q6", "q7", "q10", "q11", "q12", "q13");
+}
+// C = A * B, batchnorm(C), relu(C)
+void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
+                     float *bias) {
+  int nc1 = nc / 16;
+  int _nc1 = nc % 16;
+  int nc2 = _nc1 / 4;
+  int nc3 = 16 - 4 * (_nc1 % 4);
+  int step = 4 * (ldc - nc);
+  int step1 = 4 * (NC - nc);
+  asm volatile(
+      "vmov.f32   q14,    #0.0            \n\t"
+      "subs       %[mc], %[mc], #1        \n\t"
+      "blt        end_mc_%=               \n\t"
+      "loop_mc_%=:                        \n\t"
+      "mov        r5,   %[nc1]            \n\t"
+      "mov        r6,   %[nc2]            \n\t"
+      "subs       r5,   r5,   #1          \n\t"
+      "blt        end_nc1_%=              \n\t"
+      "loop_nc1_%=:                       \n\t"
+      "vld1.32    {q0, q1},   [%[c]]!     \n\t"
+      "vld1.32    {q2, q3},   [%[scale]]! \n\t"
+      "vld1.32    {q10, q11}, [%[bias]]!  \n\t"
+      "vmla.f32   q10,  q0,   q2          \n\t"
+      "vmla.f32   q11,  q1,   q3          \n\t"
+      "vmax.f32   q10,  q10,  q14         \n\t"
+      "vmax.f32   q11,  q11,  q14         \n\t"
+      "vst1.32    {q10, q11}, [%[C]]!     \n\t"
+      "vld1.32    {q4, q5},   [%[c]]!     \n\t"
+      "vld1.32    {q6, q7},   [%[scale]]! \n\t"
+      "vld1.32    {q12, q13}, [%[bias]]!  \n\t"
+      "vmla.f32   q12,  q4,   q6          \n\t"
+      "vmla.f32   q13,  q5,   q7          \n\t"
+      "vmax.f32   q12,  q12,  q14         \n\t"
+      "vmax.f32   q13,  q13,  q14         \n\t"
+      "vst1.32    {q12, q13}, [%[C]]!     \n\t"
+      "subs       r5,   r5,   #1          \n\t"
+      "bge        loop_nc1_%=             \n\t"
+      "end_nc1_%=:                        \n\t"
+      "subs       r6,  r6,   #1           \n\t"
+      "blt        end_nc2_%=              \n\t"
+      "loop_nc2_%=:                       \n\t"
+      "vld1.32    {q0},   [%[c]]!           \n\t"
+      "vld1.32    {q1},   [%[scale]]!       \n\t"
+      "vld1.32    {q10},  [%[bias]]!        \n\t"
+      "vmla.f32   q10,    q0,   q1          \n\t"
+      "vmax.f32   q10,    q10,  q14         \n\t"
+      "vst1.32    {q10},  [%[C]]!           \n\t"
+      "subs       r6,   r6,   #1          \n\t"
+      "bge        loop_nc2_%=             \n\t"
+      "end_nc2_%=:                        \n\t"
+      "cmp        %[nc3],    #16          \n\t"
+      "beq        end_nc3_%=              \n\t"
+      "sub        %[c],     %[c],   %[nc3]      \n\t"
+      "sub        %[scale], %[scale],  %[nc3]   \n\t"
+      "sub        %[bias],  %[bias],   %[nc3]   \n\t"
+      "sub        %[C],     %[C],   %[nc3]      \n\t"
+      "vld1.32    {q0},   [%[c]]!         \n\t"
+      "vld1.32    {q1},   [%[scale]]!     \n\t"
+      "vld1.32    {q10},  [%[bias]]!      \n\t"
+      "vmla.f32   q10,    q0,   q1        \n\t"
+      "vmax.f32   q10,    q10,  q14       \n\t"
+      "vst1.32    {q10},  [%[C]]!         \n\t"
+      "end_nc3_%=:                        \n\t"
+      "add        %[c],     %[c],     %[step1]  \n\t"
+      "add        %[scale], %[scale], %[step]   \n\t"
+      "add        %[bias],  %[bias],  %[step]   \n\t"
+      "add        %[C],     %[C],     %[step]   \n\t"
+      "subs       %[mc], %[mc], #1        \n\t"
+      "bge        loop_mc_%=              \n\t"
+      "end_mc_%=:                         \n\t"
+      :
+      : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2),
+        [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1),
+        [scale] "r"(scale), [bias] "r"(bias)
+      : "memory", "r5", "r6", "r7", "r8", "q0", "q1", "q2", "q3", "q4", "q5",
+        "q6", "q7", "q10", "q11", "q12", "q13", "q14");
+}
+// C = A * B
+void VecWriteBasic(int n, float *c, float *C, int ldc) {
+  int nc1 = n / 16;
+  int _nc1 = n % 16;
+  int nc2 = _nc1 / 4;
+  int nc3 = 16 - 4 * (_nc1 % 4);
+  asm volatile(
+      "subs       %[nc1],   %[nc1],   #1  \n\t"
+      "blt        end_nc1_%=              \n\t"
+      "loop_nc1_%=:                       \n\t"
+      "vld1.32    {q0, q1}, [%[c]]!       \n\t"
+      "vst1.32    {q0, q1}, [%[C]]!       \n\t"
+      "vld1.32    {q2, q3}, [%[c]]!       \n\t"
+      "vst1.32    {q2, q3}, [%[C]]!       \n\t"
+      "subs       %[nc1],   %[nc1],   #1  \n\t"
+      "bge        loop_nc1_%=             \n\t"
+      "end_nc1_%=:                        \n\t"
+      "subs       %[nc2],   %[nc2],   #1  \n\t"
+      "blt        end_nc2_%=              \n\t"
+      "loop_nc2_%=:                       \n\t"
+      "vld1.32    {q4},     [%[c]]!       \n\t"
+      "vst1.32    {q4},     [%[C]]!       \n\t"
+      "subs       %[nc2],   %[nc2],   #1  \n\t"
+      "bge        loop_nc2_%=             \n\t"
+      "end_nc2_%=:                        \n\t"
+      "cmp        %[nc3],    #16          \n\t"
+      "beq        end_nc3_%=              \n\t"
+      "sub        %[c],     %[c],   %[nc3]    \n\t"
+      "sub        %[C],     %[C],   %[nc3]    \n\t"
+      "vld1.32    {q5},     [%[c]]!       \n\t"
+      "vst1.32    {q5},     [%[C]]!       \n\t"
+      "end_nc3_%=:                        \n\t"
+      :
+      : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3)
+      : "memory", "q0", "q1", "q2", "q3", "q4", "q5");
+}
+// C = alpha * A * B + beta * C
+void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
+// C = A * B + C
+void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
+  int nc1 = n / 16;
+  int _nc1 = n % 16;
+  asm volatile(
+      "subs       %[nc1],   %[nc1],   #1  \n\t"
+      "blt        end_nc1_%=              \n\t"
+      "loop_nc1_%=:                       \n\t"
+      "vld1.32    {q0, q1},   [%[c]]!     \n\t"
+      "vld1.32    {q2, q3},   [%[C]]      \n\t"
+      "vadd.f32   q10,  q0,   q2          \n\t"
+      "vadd.f32   q11,  q1,   q3          \n\t"
+      "vst1.32    {q10, q11}, [%[C]]!     \n\t"
+      "vld1.32    {q4, q5},   [%[c]]!     \n\t"
+      "vld1.32    {q6, q7},   [%[C]]      \n\t"
+      "vadd.f32   q12,  q4,   q6          \n\t"
+      "vadd.f32   q13,  q5,   q7          \n\t"
+      "vst1.32    {q12, q13}, [%[C]]!     \n\t"
+      "subs       %[nc1],   %[nc1],   #1  \n\t"
+      "bge        loop_nc1_%=             \n\t"
+      "end_nc1_%=:                        \n\t"
+      : [C] "+r"(C), [c] "+r"(c)
+      : [nc1] "r"(nc1)
+      : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
+        "q12", "q13");
+  if (_nc1 != 0) {
+    for (int j = 0; j < _nc1; j++) {
+      *C++ += *c++;
+    }
+  }
+}
+// C = A * B + C, relu(C)
+void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
+  int nc1 = n / 16;
+  int _nc1 = n % 16;
+  asm volatile(
+      "vmov.f32   q14,      #0.0          \n\t"
+      "subs       %[nc1],   %[nc1],   #1  \n\t"
+      "blt        end_nc1_%=              \n\t"
+      "loop_nc1_%=:                       \n\t"
+      "vld1.32    {q0, q1},   [%[c]]!     \n\t"
+      "vld1.32    {q2, q3},   [%[C]]      \n\t"
+      "vadd.f32   q10,  q0,   q2          \n\t"
+      "vadd.f32   q11,  q1,   q3          \n\t"
+      "vmax.f32   q10,  q10,  q14         \n\t"
+      "vmax.f32   q11,  q11,  q14         \n\t"
+      "vst1.32    {q10, q11}, [%[C]]!     \n\t"
+      "vld1.32    {q4, q5},   [%[c]]!     \n\t"
+      "vld1.32    {q6, q7},   [%[C]]      \n\t"
+      "vadd.f32   q12,  q4,   q6          \n\t"
+      "vadd.f32   q13,  q5,   q7          \n\t"
+      "vmax.f32   q12,  q12,  q14         \n\t"
+      "vmax.f32   q13,  q13,  q14         \n\t"
+      "vst1.32    {q12, q13}, [%[C]]!     \n\t"
+      "subs       %[nc1],   %[nc1],   #1  \n\t"
+      "bge        loop_nc1_%=             \n\t"
+      "end_nc1_%=:                        \n\t"
+      : [C] "+r"(C), [c] "+r"(c)
+      : [nc1] "r"(nc1)
+      : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
+        "q12", "q13");
+  if (_nc1 != 0) {
+    for (int j = 0; j < _nc1; j++) {
+      *C += *c;
+      if (*C < 0) {
+        *C = 0;
+      }
+      C++;
+      c++;
+    }
+  }
+}
+// C = A * B, batchnorm(C)
+void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
+                    float *bias) {
+  int nc1 = n / 16;
+  int _nc1 = n % 16;
+  int nc2 = _nc1 / 4;
+  int nc3 = 16 - 4 * (_nc1 % 4);
+  asm volatile(
+      "subs       %[nc1],   %[nc1],   #1  \n\t"
+      "blt        end_nc1_%=              \n\t"
+      "loop_nc1_%=:                       \n\t"
+      "vld1.32    {q0, q1},   [%[c]]!     \n\t"
+      "vld1.32    {q2, q3},   [%[scale]]! \n\t"
+      "vld1.32    {q10, q11}, [%[bias]]!  \n\t"
+      "vmla.f32   q10,  q0,   q2          \n\t"
+      "vmla.f32   q11,  q1,   q3          \n\t"
+      "vst1.32    {q10, q11}, [%[C]]!     \n\t"
+      "vld1.32    {q4, q5},   [%[c]]!     \n\t"
+      "vld1.32    {q6, q7},   [%[scale]]! \n\t"
+      "vld1.32    {q12, q13}, [%[bias]]!  \n\t"
+      "vmla.f32   q12,  q4,   q6          \n\t"
+      "vmla.f32   q13,  q5,   q7          \n\t"
+      "vst1.32    {q12, q13}, [%[C]]!     \n\t"
+      "subs       %[nc1],   %[nc1],   #1  \n\t"
+      "bge        loop_nc1_%=             \n\t"
+      "end_nc1_%=:                        \n\t"
+      "subs       %[nc2],   %[nc2],   #1  \n\t"
+      "blt        end_nc2_%=              \n\t"
+      "loop_nc2_%=:                       \n\t"
+      "vld1.32    {q0},   [%[c]]!         \n\t"
+      "vld1.32    {q1},   [%[scale]]!     \n\t"
+      "vld1.32    {q10},  [%[bias]]!      \n\t"
+      "vmla.f32   q10,    q0,   q1        \n\t"
+      "vst1.32    {q10},  [%[C]]!         \n\t"
+      "subs       %[nc2],   %[nc2],   #1  \n\t"
+      "bge        loop_nc2_%=             \n\t"
+      "end_nc2_%=:                        \n\t"
+      "cmp        %[nc3],    #16          \n\t"
+      "beq        end_nc3_%=              \n\t"
+      "sub        %[c],     %[c],   %[nc3]      \n\t"
+      "sub        %[scale], %[scale],  %[nc3]   \n\t"
+      "sub        %[bias],  %[bias],   %[nc3]   \n\t"
+      "sub        %[C],     %[C],   %[nc3]      \n\t"
+      "vld1.32    {q0},   [%[c]]!         \n\t"
+      "vld1.32    {q1},   [%[scale]]!     \n\t"
+      "vld1.32    {q10},  [%[bias]]!      \n\t"
+      "vmla.f32   q10,    q0,   q1        \n\t"
+      "vst1.32    {q10},  [%[C]]!         \n\t"
+      "end_nc3_%=:                        \n\t"
+      :
+      : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3),
+        [scale] "r"(scale), [bias] "r"(bias)
+      : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
+        "q12", "q13");
+}
+// C = A * B, batchnorm(C), relu(C)
+void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
+                        float *bias) {
+  int nc1 = n / 16;
+  int _nc1 = n % 16;
+  int nc2 = _nc1 / 4;
+  int nc3 = 16 - 4 * (_nc1 % 4);
+  asm volatile(
+      "vmov.f32   q14,      #0.0          \n\t"
+      "subs       %[nc1],   %[nc1],   #1  \n\t"
+      "blt        end_nc1_%=              \n\t"
+      "loop_nc1_%=:                       \n\t"
+      "vld1.32    {q0, q1},   [%[c]]!     \n\t"
+      "vld1.32    {q2, q3},   [%[scale]]! \n\t"
+      "vld1.32    {q10, q11}, [%[bias]]!  \n\t"
+      "vmla.f32   q10,  q0,   q2          \n\t"
+      "vmla.f32   q11,  q1,   q3          \n\t"
+      "vmax.f32   q10,  q10,  q14         \n\t"
+      "vmax.f32   q11,  q11,  q14         \n\t"
+      "vst1.32    {q10, q11}, [%[C]]!     \n\t"
+      "vld1.32    {q4, q5},   [%[c]]!     \n\t"
+      "vld1.32    {q6, q7},   [%[scale]]! \n\t"
+      "vld1.32    {q12, q13}, [%[bias]]!  \n\t"
+      "vmla.f32   q12,  q4,   q6          \n\t"
+      "vmla.f32   q13,  q5,   q7          \n\t"
+      "vmax.f32   q12,  q12,  q14         \n\t"
+      "vmax.f32   q13,  q13,  q14         \n\t"
+      "vst1.32    {q12, q13}, [%[C]]!     \n\t"
+      "subs       %[nc1],   %[nc1],   #1  \n\t"
+      "bge        loop_nc1_%=             \n\t"
+      "end_nc1_%=:                        \n\t"
+      "subs       %[nc2],   %[nc2],   #1  \n\t"
+      "blt        end_nc2_%=              \n\t"
+      "loop_nc2_%=:                       \n\t"
+      "vld1.32    {q0},   [%[c]]!         \n\t"
+      "vld1.32    {q1},   [%[scale]]!     \n\t"
+      "vld1.32    {q10},  [%[bias]]!      \n\t"
+      "vmla.f32   q10,    q0,   q1        \n\t"
+      "vmax.f32   q10,    q10,  q14       \n\t"
+      "vst1.32    {q10},  [%[C]]!         \n\t"
+      "subs       %[nc2],   %[nc2],   #1  \n\t"
+      "bge        loop_nc2_%=             \n\t"
+      "end_nc2_%=:                        \n\t"
+      "cmp        %[nc3],    #16          \n\t"
+      "beq        end_nc3_%=              \n\t"
+      "sub        %[c],     %[c],   %[nc3]      \n\t"
+      "sub        %[scale], %[scale],  %[nc3]   \n\t"
+      "sub        %[bias],  %[bias],   %[nc3]   \n\t"
+      "sub        %[C],     %[C],   %[nc3]      \n\t"
+      "vld1.32    {q0},   [%[c]]!         \n\t"
+      "vld1.32    {q1},   [%[scale]]!     \n\t"
+      "vld1.32    {q10},  [%[bias]]!      \n\t"
+      "vmla.f32   q10,    q0,   q1        \n\t"
+      "vmax.f32   q10,    q10,  q14       \n\t"
+      "vst1.32    {q10},  [%[C]]!         \n\t"
+      "end_nc3_%=:                        \n\t"
+      :
+      : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3),
+        [scale] "r"(scale), [bias] "r"(bias)
+      : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
+        "q12", "q13", "q14");
+}
 }  // namespace operators
 }  // namespace paddle_mobile
+}  // namespace paddle_mobile
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -19,12 +19,8 @@ limitations under the License. */
 #define B(i, j) B[(i)*ldb + (j)]
 #define C(i, j) C[(i)*ldc + (j)]
-// 分块计算的块大小，mc 与 kc 分别对应分块计算时的 m 与 k
-#define MC 128
-#define KC 128
-#define NC 1024
 #define MR 4
-#define NR 4
+#define NR 8
 #define s_min(i, j) ((i) < (j) ? (i) : (j))
@@ -49,28 +45,66 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
                  float *buffer);
 // 分块矩阵乘法
-void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
+void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
-                 const float *B, int ldb, float beta, float *C, int ldc,
+                 float beta, float *c, float *C, int ldc, bool relu);
-                 int first_time);
+void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
+                       const float *b, float beta, float *c, float *C, int ldc,
+                       bool relu, float *new_scale, float *new_bias);
 // 向量矩阵乘法 (M = 1)
 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
-                  const float *B, int ldb, float beta, float *C, int ldc);
+                  const float *B, int ldb, float beta, float *C, int ldc,
-// 计算一个更小的 4 * 4 的 C 矩阵分块
-void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,
-               int ldb, float beta, float *C, int ldc, int mc, int nc);
-void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
-                    int ldb, float beta, float *C, int ldc, int mc, int nc,
                  bool relu);
+void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
+                        int lda, const float *B, int ldb, float beta, float *C,
+                        int ldc, bool relu, float *new_scale, float *new_bias);
+// 计算一个更小的 C 矩阵分块
+void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
+// 分块矩阵乘法结果回写
+// C = A * B
+void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
+// C = alpha * A * B + beta * C
+void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B + C
+void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B + C, relu(C)
+void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B, batchnorm(C)
+void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
+                 float *new_bias);
+// C = A * B, batchnorm(C), relu(C)
+void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+                     float *new_scale, float *new_bias);
+// 向量矩阵乘法结果回写
+// C = A * B
+void VecWriteBasic(int n, float *c, float *C, int ldc);
+// C = alpha * A * B + beta * C
+void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
+// C = A * B + C
+void VecWriteWithAdd(int n, float *c, float *C, int ldc);
+// C = A * B + C, relu(C)
+void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
+// C = A * B, batchnorm(C)
+void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
+                    float *new_bias);
+// C = A * B, batchnorm(C), relu(C)
+void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
+                        float *new_bias);
 // 32位 float 矩阵乘法
-void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc);
+           const float *B, int ldb, float beta, float *C, int ldc, bool relu);
-void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
+// 32位 float 矩阵乘法, 并对结果进行 batchnrom
-                const float *B, int ldb, float beta, float *C, int ldc);
+void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+                 const float *B, int ldb, float beta, float *C, int ldc,
+                 bool relu, float *new_scale, float *new_bias);
 // 64位 double 矩阵乘法
 void dgemm(int m, int n, int k, float alpha, const double *A, int lda,

--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
@@ -39,22 +39,18 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
  int M = dim_out[0];
  int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+  int K = (!trans_a) ? dim_a[1] : dim_a[0];
-  if (relu) {
+  Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
-    sgemm_relu(M, N, K, alpha, matrix_a.data<float>(), K,
+        beta, matrix_out->data<float>(), N, relu);
-               matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N);
-  } else {
-    sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
-          beta, matrix_out->data<float>(), N);
-  }
 }
 template <>
-void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
+void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
                         const framework::Tensor &matrix_b, bool trans_b,
-                    double alpha, framework::Tensor *matrix_out, double beta,
+                         float alpha, framework::Tensor *matrix_out, float beta,
-                    bool relu) {
+                         bool relu, framework::Tensor *new_scale,
+                         framework::Tensor *new_bias) {
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
@@ -71,7 +67,11 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
  int M = dim_out[0];
  int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+  int K = (!trans_a) ? dim_a[1] : dim_a[0];
+  SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
+              N, beta, matrix_out->data<float>(), N, relu,
+              new_scale->data<float>(), new_bias->data<float>());
 }
 }  // namespace math

--- a/src/operators/math/math_function.h
+++ b/src/operators/math/math_function.h
@@ -26,6 +26,12 @@ template <typename T>
 void matmul(const framework::Tensor &matrix_a, bool trans_a,
            const framework::Tensor &matrix_b, bool trans_b, T alpha,
            framework::Tensor *matrix_out, T beta, bool relu = false);
+template <typename T>
+void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
+                  const framework::Tensor &matrix_b, bool trans_b, T alpha,
+                  framework::Tensor *matrix_out, T beta, bool relu,
+                  framework::Tensor *new_scale, framework::Tensor *new_bias);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile