Merge branch 'develop' into develop

009273ef · Ruilong Liu · GitHub · 74c491c2 · f2f0217c · 009273ef
隐藏空白更改
内联并排

Showing with 545 addition and 282 deletion

src/operators/math/gemm.cpp src/operators/math/gemm.cpp +539 -281

src/operators/math/gemm.h src/operators/math/gemm.h +6 -1

未找到文件。
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -33,6 +33,7 @@ float *packedA;
 float *packedB;
 float *packedC;
 float *zero;
+/*
 // 将A矩阵分块复制到连续内存(ColMajor)
 void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                 float *buffer) {
@@ -60,6 +61,36 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
  }
 }

+// 将B矩阵分块复制到连续内存(ColMajor)
+void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
+                 float *buffer) {
+  int i, j;
+  const float *Bj, *Bj1, *Bj2, *Bj3;
+  for (j = 0; j < n - n_tail; j += NR) {
+    Bj = &B(0, j);
+    Bj1 = &B(0, j + 1);
+    Bj2 = &B(0, j + 2);
+    Bj3 = &B(0, j + 3);
+    for (i = 0; i < k; ++i) {
+      *buffer++ = *Bj++;
+      *buffer++ = *Bj1++;
+      *buffer++ = *Bj2++;
+      *buffer++ = *Bj3++;
+    }
+  }
+  if (n_tail != 0) {
+    for (i = 0; i < k; ++i) {
+      for (int j = n - n_tail; j < n; ++j) {
+        *buffer++ = B(i, j);
+      }
+      for (int j = n; j < n + (NR - n_tail); ++j) {
+        *buffer++ = 0;
+      }
+    }
+  }
+}
+*/
+
 // 将A矩阵分块复制到连续内存(RowMajor)
 void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
                  float *buffer) {
@@ -100,35 +131,6 @@ void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
  }
 }

-// 将B矩阵分块复制到连续内存(ColMajor)
-void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
-                 float *buffer) {
-  int i, j;
-  const float *Bj, *Bj1, *Bj2, *Bj3;
-  for (j = 0; j < n - n_tail; j += NR) {
-    Bj = &B(0, j);
-    Bj1 = &B(0, j + 1);
-    Bj2 = &B(0, j + 2);
-    Bj3 = &B(0, j + 3);
-    for (i = 0; i < k; ++i) {
-      *buffer++ = *Bj++;
-      *buffer++ = *Bj1++;
-      *buffer++ = *Bj2++;
-      *buffer++ = *Bj3++;
-    }
-  }
-  if (n_tail != 0) {
-    for (i = 0; i < k; ++i) {
-      for (int j = n - n_tail; j < n; ++j) {
-        *buffer++ = B(i, j);
-      }
-      for (int j = n; j < n + (NR - n_tail); ++j) {
-        *buffer++ = 0;
-      }
-    }
-  }
-}
-
 // 将B矩阵分块复制到连续内存(RowMajor)
 void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
                  float *buffer) {
@@ -138,18 +140,31 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
      b0 = &B(i, j);
 #if __ARM_NEON
 #if __aarch64__
-
+      asm volatile(
+          "prfm   pldl1keep,        [%[b0]]           \n\t"
+          "ld1    {v0.4s, v1.4s},   [%[b0]]           \n\t"
+          "st1    {v0.4s, v1.4s},   [%[buffer]],  #32 \n\t"
+          : [buffer] "+r"(buffer)
+          : [b0] "r"(b0)
+          : "memory", "v0", "v1");
 #else
      asm volatile(
-          "pld        [%[b0]]               \n\t"
-          "vld1.32    {q0, q1}, [%[b0]]         \n\t"
-          "vst1.32    {q0, q1}, [%[buffer]]!    \n\t"
+          "pld        [%[b0]]                     \n\t"
+          "vld1.32    {q0, q1},   [%[b0]]         \n\t"
+          "vst1.32    {q0, q1},   [%[buffer]]!    \n\t"
          : [buffer] "+r"(buffer)
          : [b0] "r"(b0)
-          : "memory", "q0", "q0");
+          : "memory", "q0", "q1");
 #endif  // __aarch64__
 #else
-
+      *buffer++ = *b0++;
+      *buffer++ = *b0++;
+      *buffer++ = *b0++;
+      *buffer++ = *b0++;
+      *buffer++ = *b0++;
+      *buffer++ = *b0++;
+      *buffer++ = *b0++;
+      *buffer++ = *b0++;
 #endif  // __ARM_NEON
    }
  }
@@ -217,7 +232,7 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
 #if __ARM_NEON
 #if __aarch64__

-void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
+void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
  // init C
  float32x4_t cv0 = vdupq_n_f32(0.0);
  float32x4_t cv1 = vdupq_n_f32(0.0);
@@ -244,23 +259,264 @@ void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) {
    a += MR;
    b += NR;
  }
-  float32x4x4_t cv = {cv0, cv1, cv2, cv3};
-  int i, j;
-  for (i = 0; i < mc; ++i) {
-    for (j = 0; j < nc; ++j) {
-      if (beta == 0.0) {
-        C(i, j) = 0.0;
-      } else if (beta != 1.0) {
-        C(i, j) *= beta;
+
+  vst1q_f32(c, cv0);
+  vst1q_f32(c + ldc, cv1);
+  vst1q_f32(c + 2 * ldc, cv2);
+  vst1q_f32(c + 3 * ldc, cv3);
+  //  float32x4x4_t cv = {cv0, cv1, cv2, cv3};
+}
+
+void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
+  // init C
+  float32x4_t cv0 = vdupq_n_f32(0.0);
+  float32x4_t cv1 = vdupq_n_f32(0.0);
+  float32x4_t cv2 = vdupq_n_f32(0.0);
+  float32x4_t cv3 = vdupq_n_f32(0.0);
+  float32x4_t cv4 = vdupq_n_f32(0.0);
+  float32x4_t cv5 = vdupq_n_f32(0.0);
+  float32x4_t cv6 = vdupq_n_f32(0.0);
+  float32x4_t cv7 = vdupq_n_f32(0.0);
+
+  float32x4_t av;
+  float32x4_t bv0;
+  float32x4_t bv1;
+
+  float32x2_t av01;
+  float32x2_t av23;
+
+  for (int p = 0; p < k; p += 1) {
+    av = vld1q_f32(a);
+    bv0 = vld1q_f32(b);
+    bv1 = vld1q_f32(b + 4);
+
+    av01 = vget_low_f32(av);
+    cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0);
+    cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0);
+    cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1);
+    cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1);
+    av23 = vget_high_f32(av);
+    cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0);
+    cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0);
+    cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1);
+    cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1);
+
+    a += MR;
+    b += NR;
+  }
+
+  vst1q_f32(c, cv0);
+  vst1q_f32(c + 4, cv1);
+  vst1q_f32(c + ldc, cv2);
+  vst1q_f32(c + ldc + 4, cv3);
+  vst1q_f32(c + 2 * ldc, cv4);
+  vst1q_f32(c + 2 * ldc + 4, cv5);
+  vst1q_f32(c + 3 * ldc, cv6);
+  vst1q_f32(c + 3 * ldc + 4, cv7);
+}
+
+// 分块矩阵乘法结果回写
+// C = A * B
+void WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
+  int nc1 = nc / 4;
+  int _nc1 = nc % 4;
+
+  float *c_ptr, *C_ptr;
+  float32x4_t cv;
+  for (int i = 0; i < mc; ++i) {
+    c_ptr = c + i * NC;
+    C_ptr = C + i * ldc;
+    for (int j = 0; j < nc1; ++j) {
+      cv = vld1q_f32(c_ptr);
+      vst1q_f32(C_ptr, cv);
+      c_ptr += 4;
+      C_ptr += 4;
+    }
+    if (_nc1 != 0) {
+      cv = vld1q_f32(c_ptr);
+      if (_nc1 >= 1) {
+        vst1q_lane_f32(C_ptr, cv, 0);
+        C_ptr++;
+      }
+      if (_nc1 >= 2) {
+        vst1q_lane_f32(C_ptr, cv, 1);
+        C_ptr++;
+      }
+      if (_nc1 >= 3) {
+        vst1q_lane_f32(C_ptr, cv, 2);
+      }
+    }
+  }
+}
+
+// C = alpha * A * B + beta * C
+void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
+
+// C = A * B + C
+void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
+  int nc1 = nc / 4;
+  int _nc1 = nc % 4;
+
+  float *c_ptr, *C_ptr;
+  float32x4_t cv;
+  float32x4_t cv1;
+  for (int i = 0; i < mc; ++i) {
+    c_ptr = c + i * NC;
+    C_ptr = C + i * ldc;
+    for (int j = 0; j < nc1; ++j) {
+      cv = vld1q_f32(c_ptr);
+      cv1 = vld1q_f32(C_ptr);
+      cv = vaddq_f32(cv, cv1);
+      vst1q_f32(C_ptr, cv);
+      c_ptr += 4;
+      C_ptr += 4;
+    }
+    if (_nc1 != 0) {
+      cv = vld1q_f32(c_ptr);
+      cv1 = vld1q_f32(C_ptr);
+      cv = vaddq_f32(cv, cv1);
+      if (_nc1 >= 1) {
+        vst1q_lane_f32(C_ptr, cv, 0);
+        C_ptr++;
+      }
+      if (_nc1 >= 2) {
+        vst1q_lane_f32(C_ptr, cv, 1);
+        C_ptr++;
      }
-      if (j == 0) {
-        C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0);
-      } else if (j == 1) {
-        C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1);
-      } else if (j == 2) {
-        C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2);
-      } else if (j == 3) {
-        C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3);
+      if (_nc1 >= 3) {
+        vst1q_lane_f32(C_ptr, cv, 2);
+      }
+    }
+  }
+}
+
+// C = A * B + C, relu(C)
+void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
+  int nc1 = nc / 4;
+  int _nc1 = nc % 4;
+
+  float *c_ptr, *C_ptr;
+  float32x4_t cv;
+  float32x4_t cv1;
+  float32x4_t zero = vdupq_n_f32(0.0);
+  for (int i = 0; i < mc; ++i) {
+    c_ptr = c + i * NC;
+    C_ptr = C + i * ldc;
+    for (int j = 0; j < nc1; ++j) {
+      cv = vld1q_f32(c_ptr);
+      cv1 = vld1q_f32(C_ptr);
+      cv = vaddq_f32(cv, cv1);
+      cv = vmaxq_f32(cv, zero);
+      vst1q_f32(C_ptr, cv);
+      c_ptr += 4;
+      C_ptr += 4;
+    }
+    if (_nc1 != 0) {
+      cv = vld1q_f32(c_ptr);
+      cv1 = vld1q_f32(C_ptr);
+      cv = vaddq_f32(cv, cv1);
+      cv = vmaxq_f32(cv, zero);
+      if (_nc1 >= 1) {
+        vst1q_lane_f32(C_ptr, cv, 0);
+        C_ptr++;
+      }
+      if (_nc1 >= 2) {
+        vst1q_lane_f32(C_ptr, cv, 1);
+        C_ptr++;
+      }
+      if (_nc1 >= 3) {
+        vst1q_lane_f32(C_ptr, cv, 2);
+      }
+    }
+  }
+}
+
+// C = A * B, batchnorm(C)
+void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
+                 float *new_bias) {
+  int nc1 = nc / 4;
+  int _nc1 = nc % 4;
+
+  float *c_ptr, *C_ptr;
+  float32x4_t cv;
+  float32x4_t cv1;
+  float32x4_t bias;
+  float32x2_t scale;
+  for (int i = 0; i < mc; ++i) {
+    c_ptr = c + i * NC;
+    C_ptr = C + i * ldc;
+    bias = vld1q_dup_f32(new_bias);
+    scale = vld1_dup_f32(new_scale);
+    new_bias++;
+    new_scale++;
+    float scale0 = vget_lane_f32(scale, 0);
+    for (int j = 0; j < nc1; ++j) {
+      cv = vld1q_f32(c_ptr);
+      cv = vmlaq_n_f32(bias, cv, scale0);
+      vst1q_f32(C_ptr, cv);
+      c_ptr += 4;
+      C_ptr += 4;
+    }
+    if (_nc1 != 0) {
+      cv = vld1q_f32(c_ptr);
+      cv = vmlaq_n_f32(bias, cv, scale0);
+      if (_nc1 >= 1) {
+        vst1q_lane_f32(C_ptr, cv, 0);
+        C_ptr++;
+      }
+      if (_nc1 >= 2) {
+        vst1q_lane_f32(C_ptr, cv, 1);
+        C_ptr++;
+      }
+      if (_nc1 >= 3) {
+        vst1q_lane_f32(C_ptr, cv, 2);
+        C_ptr++;
+      }
+    }
+  }
+}
+
+// C = A * B, batchnorm(C), relu(C)
+void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+                     float *new_scale, float *new_bias) {
+  int nc1 = nc / 4;
+  int _nc1 = nc % 4;
+
+  float *c_ptr, *C_ptr;
+  float32x4_t cv;
+  float32x4_t bias;
+  float32x2_t scale;
+  float32x4_t zero = vdupq_n_f32(0.0);
+  for (int i = 0; i < mc; ++i) {
+    c_ptr = c + i * NC;
+    C_ptr = C + i * ldc;
+    bias = vld1q_dup_f32(new_bias);
+    scale = vld1_dup_f32(new_scale);
+    new_bias++;
+    new_scale++;
+    float scale0 = vget_lane_f32(scale, 0);
+    for (int j = 0; j < nc1; ++j) {
+      cv = vld1q_f32(c_ptr);
+      cv = vmlaq_n_f32(bias, cv, scale0);
+      cv = vmaxq_f32(cv, zero);
+      vst1q_f32(C_ptr, cv);
+      c_ptr += 4;
+      C_ptr += 4;
+    }
+    if (_nc1 != 0) {
+      cv = vld1q_f32(c_ptr);
+      cv = vmlaq_n_f32(bias, cv, scale0);
+      cv = vmaxq_f32(cv, zero);
+      if (_nc1 >= 1) {
+        vst1q_lane_f32(C_ptr, cv, 0);
+        C_ptr++;
+      }
+      if (_nc1 >= 2) {
+        vst1q_lane_f32(C_ptr, cv, 1);
+        C_ptr++;
+      }
+      if (_nc1 >= 3) {
+        vst1q_lane_f32(C_ptr, cv, 2);
      }
    }
  }
@@ -338,6 +594,7 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
        "q10", "q11", "q12", "q13");
 }

+/*
 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
                  const float *B, int ldb, float beta, float *C, int ldc,
                  bool relu) {
@@ -770,6 +1027,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
    VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias);
  }
 }
+*/

 void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
  const float *a_ptr, *b_ptr;
@@ -1288,281 +1546,281 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale,
        "q8", "q10", "q11", "q12", "q13", "q14");
 }

-// C = A * B
-void VecWriteBasic(int n, float *c, float *C, int ldc) {
-  int nc1 = n / 16;
-  int _nc1 = n % 16;
-  int nc2 = _nc1 / 4;
-  int nc3 = 16 - 4 * (_nc1 % 4);
+  /*
+  // C = A * B
+  void VecWriteBasic(int n, float *c, float *C, int ldc) {
+    int nc1 = n / 16;
+    int _nc1 = n % 16;
+    int nc2 = _nc1 / 4;
+    int nc3 = 16 - 4 * (_nc1 % 4);

-  asm volatile(
-      "subs       %[nc1],   %[nc1],   #1  \n\t"
-      "blt        end_nc1_%=              \n\t"
-      "loop_nc1_%=:                       \n\t"
+    asm volatile(
+        "subs       %[nc1],   %[nc1],   #1  \n\t"
+        "blt        end_nc1_%=              \n\t"
+        "loop_nc1_%=:                       \n\t"

-      "vld1.32    {q0, q1}, [%[c]]!       \n\t"
-      "vst1.32    {q0, q1}, [%[C]]!       \n\t"
+        "vld1.32    {q0, q1}, [%[c]]!       \n\t"
+        "vst1.32    {q0, q1}, [%[C]]!       \n\t"

-      "vld1.32    {q2, q3}, [%[c]]!       \n\t"
-      "vst1.32    {q2, q3}, [%[C]]!       \n\t"
+        "vld1.32    {q2, q3}, [%[c]]!       \n\t"
+        "vst1.32    {q2, q3}, [%[C]]!       \n\t"

-      "subs       %[nc1],   %[nc1],   #1  \n\t"
-      "bge        loop_nc1_%=             \n\t"
-      "end_nc1_%=:                        \n\t"
+        "subs       %[nc1],   %[nc1],   #1  \n\t"
+        "bge        loop_nc1_%=             \n\t"
+        "end_nc1_%=:                        \n\t"

-      "subs       %[nc2],   %[nc2],   #1  \n\t"
-      "blt        end_nc2_%=              \n\t"
-      "loop_nc2_%=:                       \n\t"
+        "subs       %[nc2],   %[nc2],   #1  \n\t"
+        "blt        end_nc2_%=              \n\t"
+        "loop_nc2_%=:                       \n\t"

-      "vld1.32    {q4},     [%[c]]!       \n\t"
-      "vst1.32    {q4},     [%[C]]!       \n\t"
+        "vld1.32    {q4},     [%[c]]!       \n\t"
+        "vst1.32    {q4},     [%[C]]!       \n\t"

-      "subs       %[nc2],   %[nc2],   #1  \n\t"
-      "bge        loop_nc2_%=             \n\t"
-      "end_nc2_%=:                        \n\t"
+        "subs       %[nc2],   %[nc2],   #1  \n\t"
+        "bge        loop_nc2_%=             \n\t"
+        "end_nc2_%=:                        \n\t"

-      "cmp        %[nc3],    #16          \n\t"
-      "beq        end_nc3_%=              \n\t"
-      "sub        %[c],     %[c],   %[nc3]    \n\t"
-      "sub        %[C],     %[C],   %[nc3]    \n\t"
-      "vld1.32    {q5},     [%[c]]!       \n\t"
-      "vst1.32    {q5},     [%[C]]!       \n\t"
-      "end_nc3_%=:                        \n\t"
+        "cmp        %[nc3],    #16          \n\t"
+        "beq        end_nc3_%=              \n\t"
+        "sub        %[c],     %[c],   %[nc3]    \n\t"
+        "sub        %[C],     %[C],   %[nc3]    \n\t"
+        "vld1.32    {q5},     [%[c]]!       \n\t"
+        "vst1.32    {q5},     [%[C]]!       \n\t"
+        "end_nc3_%=:                        \n\t"

-      :
-      : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3)
-      : "memory", "q0", "q1", "q2", "q3", "q4", "q5");
-}
+        :
+        : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3)
+        : "memory", "q0", "q1", "q2", "q3", "q4", "q5");
+  }

-// C = alpha * A * B + beta * C
-void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
+  // C = alpha * A * B + beta * C
+  void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}

-// C = A * B + C
-void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
-  int nc1 = n / 16;
-  int _nc1 = n % 16;
+  // C = A * B + C
+  void VecWriteWithAdd(int n, float *c, float *C, int ldc) {
+    int nc1 = n / 16;
+    int _nc1 = n % 16;

-  asm volatile(
-      "subs       %[nc1],   %[nc1],   #1  \n\t"
-      "blt        end_nc1_%=              \n\t"
-      "loop_nc1_%=:                       \n\t"
+    asm volatile(
+        "subs       %[nc1],   %[nc1],   #1  \n\t"
+        "blt        end_nc1_%=              \n\t"
+        "loop_nc1_%=:                       \n\t"

-      "vld1.32    {q0, q1},   [%[c]]!     \n\t"
-      "vld1.32    {q2, q3},   [%[C]]      \n\t"
-      "vadd.f32   q10,  q0,   q2          \n\t"
-      "vadd.f32   q11,  q1,   q3          \n\t"
-      "vst1.32    {q10, q11}, [%[C]]!     \n\t"
+        "vld1.32    {q0, q1},   [%[c]]!     \n\t"
+        "vld1.32    {q2, q3},   [%[C]]      \n\t"
+        "vadd.f32   q10,  q0,   q2          \n\t"
+        "vadd.f32   q11,  q1,   q3          \n\t"
+        "vst1.32    {q10, q11}, [%[C]]!     \n\t"

-      "vld1.32    {q4, q5},   [%[c]]!     \n\t"
-      "vld1.32    {q6, q7},   [%[C]]      \n\t"
-      "vadd.f32   q12,  q4,   q6          \n\t"
-      "vadd.f32   q13,  q5,   q7          \n\t"
-      "vst1.32    {q12, q13}, [%[C]]!     \n\t"
+        "vld1.32    {q4, q5},   [%[c]]!     \n\t"
+        "vld1.32    {q6, q7},   [%[C]]      \n\t"
+        "vadd.f32   q12,  q4,   q6          \n\t"
+        "vadd.f32   q13,  q5,   q7          \n\t"
+        "vst1.32    {q12, q13}, [%[C]]!     \n\t"

-      "subs       %[nc1],   %[nc1],   #1  \n\t"
-      "bge        loop_nc1_%=             \n\t"
-      "end_nc1_%=:                        \n\t"
+        "subs       %[nc1],   %[nc1],   #1  \n\t"
+        "bge        loop_nc1_%=             \n\t"
+        "end_nc1_%=:                        \n\t"

-      : [C] "+r"(C), [c] "+r"(c)
-      : [nc1] "r"(nc1)
-      : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
-        "q12", "q13");
+        : [C] "+r"(C), [c] "+r"(c)
+        : [nc1] "r"(nc1)
+        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10",
+  "q11", "q12", "q13");

-  if (_nc1 != 0) {
-    for (int j = 0; j < _nc1; j++) {
-      *C++ += *c++;
+    if (_nc1 != 0) {
+      for (int j = 0; j < _nc1; j++) {
+        *C++ += *c++;
+      }
    }
  }
-}

-// C = A * B + C, relu(C)
-void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
-  int nc1 = n / 16;
-  int _nc1 = n % 16;
+  // C = A * B + C, relu(C)
+  void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
+    int nc1 = n / 16;
+    int _nc1 = n % 16;

-  asm volatile(
-      "vmov.f32   q14,      #0.0          \n\t"
-      "subs       %[nc1],   %[nc1],   #1  \n\t"
-      "blt        end_nc1_%=              \n\t"
-      "loop_nc1_%=:                       \n\t"
+    asm volatile(
+        "vmov.f32   q14,      #0.0          \n\t"
+        "subs       %[nc1],   %[nc1],   #1  \n\t"
+        "blt        end_nc1_%=              \n\t"
+        "loop_nc1_%=:                       \n\t"

-      "vld1.32    {q0, q1},   [%[c]]!     \n\t"
-      "vld1.32    {q2, q3},   [%[C]]      \n\t"
-      "vadd.f32   q10,  q0,   q2          \n\t"
-      "vadd.f32   q11,  q1,   q3          \n\t"
-      "vmax.f32   q10,  q10,  q14         \n\t"
-      "vmax.f32   q11,  q11,  q14         \n\t"
-      "vst1.32    {q10, q11}, [%[C]]!     \n\t"
+        "vld1.32    {q0, q1},   [%[c]]!     \n\t"
+        "vld1.32    {q2, q3},   [%[C]]      \n\t"
+        "vadd.f32   q10,  q0,   q2          \n\t"
+        "vadd.f32   q11,  q1,   q3          \n\t"
+        "vmax.f32   q10,  q10,  q14         \n\t"
+        "vmax.f32   q11,  q11,  q14         \n\t"
+        "vst1.32    {q10, q11}, [%[C]]!     \n\t"

-      "vld1.32    {q4, q5},   [%[c]]!     \n\t"
-      "vld1.32    {q6, q7},   [%[C]]      \n\t"
-      "vadd.f32   q12,  q4,   q6          \n\t"
-      "vadd.f32   q13,  q5,   q7          \n\t"
-      "vmax.f32   q12,  q12,  q14         \n\t"
-      "vmax.f32   q13,  q13,  q14         \n\t"
-      "vst1.32    {q12, q13}, [%[C]]!     \n\t"
+        "vld1.32    {q4, q5},   [%[c]]!     \n\t"
+        "vld1.32    {q6, q7},   [%[C]]      \n\t"
+        "vadd.f32   q12,  q4,   q6          \n\t"
+        "vadd.f32   q13,  q5,   q7          \n\t"
+        "vmax.f32   q12,  q12,  q14         \n\t"
+        "vmax.f32   q13,  q13,  q14         \n\t"
+        "vst1.32    {q12, q13}, [%[C]]!     \n\t"

-      "subs       %[nc1],   %[nc1],   #1  \n\t"
-      "bge        loop_nc1_%=             \n\t"
-      "end_nc1_%=:                        \n\t"
+        "subs       %[nc1],   %[nc1],   #1  \n\t"
+        "bge        loop_nc1_%=             \n\t"
+        "end_nc1_%=:                        \n\t"

-      : [C] "+r"(C), [c] "+r"(c)
-      : [nc1] "r"(nc1)
-      : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
-        "q12", "q13");
+        : [C] "+r"(C), [c] "+r"(c)
+        : [nc1] "r"(nc1)
+        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10",
+  "q11", "q12", "q13");

-  if (_nc1 != 0) {
-    for (int j = 0; j < _nc1; j++) {
-      *C += *c;
-      if (*C < 0) {
-        *C = 0;
+    if (_nc1 != 0) {
+      for (int j = 0; j < _nc1; j++) {
+        *C += *c;
+        if (*C < 0) {
+          *C = 0;
+        }
+        C++;
+        c++;
      }
-      C++;
-      c++;
    }
  }
-}

-// C = A * B, batchnorm(C)
-void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
-                    float *bias) {
-  int nc1 = n / 16;
-  int _nc1 = n % 16;
-  int nc2 = _nc1 / 4;
-  int nc3 = 16 - 4 * (_nc1 % 4);
-
-  asm volatile(
-      "subs       %[nc1],   %[nc1],   #1  \n\t"
-      "blt        end_nc1_%=              \n\t"
-      "loop_nc1_%=:                       \n\t"
-
-      "vld1.32    {q0, q1},   [%[c]]!     \n\t"
-      "vld1.32    {q2, q3},   [%[scale]]! \n\t"
-      "vld1.32    {q10, q11}, [%[bias]]!  \n\t"
-      "vmla.f32   q10,  q0,   q2          \n\t"
-      "vmla.f32   q11,  q1,   q3          \n\t"
-      "vst1.32    {q10, q11}, [%[C]]!     \n\t"
-
-      "vld1.32    {q4, q5},   [%[c]]!     \n\t"
-      "vld1.32    {q6, q7},   [%[scale]]! \n\t"
-      "vld1.32    {q12, q13}, [%[bias]]!  \n\t"
-      "vmla.f32   q12,  q4,   q6          \n\t"
-      "vmla.f32   q13,  q5,   q7          \n\t"
-      "vst1.32    {q12, q13}, [%[C]]!     \n\t"
+  // C = A * B, batchnorm(C)
+  void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
+                      float *bias) {
+    int nc1 = n / 16;
+    int _nc1 = n % 16;
+    int nc2 = _nc1 / 4;
+    int nc3 = 16 - 4 * (_nc1 % 4);

-      "subs       %[nc1],   %[nc1],   #1  \n\t"
-      "bge        loop_nc1_%=             \n\t"
-      "end_nc1_%=:                        \n\t"
-
-      "subs       %[nc2],   %[nc2],   #1  \n\t"
-      "blt        end_nc2_%=              \n\t"
-      "loop_nc2_%=:                       \n\t"
+    asm volatile(
+        "subs       %[nc1],   %[nc1],   #1  \n\t"
+        "blt        end_nc1_%=              \n\t"
+        "loop_nc1_%=:                       \n\t"

-      "vld1.32    {q0},   [%[c]]!         \n\t"
-      "vld1.32    {q1},   [%[scale]]!     \n\t"
-      "vld1.32    {q10},  [%[bias]]!      \n\t"
-      "vmla.f32   q10,    q0,   q1        \n\t"
-      "vst1.32    {q10},  [%[C]]!         \n\t"
+        "vld1.32    {q0, q1},   [%[c]]!     \n\t"
+        "vld1.32    {q2, q3},   [%[scale]]! \n\t"
+        "vld1.32    {q10, q11}, [%[bias]]!  \n\t"
+        "vmla.f32   q10,  q0,   q2          \n\t"
+        "vmla.f32   q11,  q1,   q3          \n\t"
+        "vst1.32    {q10, q11}, [%[C]]!     \n\t"
+
+        "vld1.32    {q4, q5},   [%[c]]!     \n\t"
+        "vld1.32    {q6, q7},   [%[scale]]! \n\t"
+        "vld1.32    {q12, q13}, [%[bias]]!  \n\t"
+        "vmla.f32   q12,  q4,   q6          \n\t"
+        "vmla.f32   q13,  q5,   q7          \n\t"
+        "vst1.32    {q12, q13}, [%[C]]!     \n\t"
+
+        "subs       %[nc1],   %[nc1],   #1  \n\t"
+        "bge        loop_nc1_%=             \n\t"
+        "end_nc1_%=:                        \n\t"

-      "subs       %[nc2],   %[nc2],   #1  \n\t"
-      "bge        loop_nc2_%=             \n\t"
-      "end_nc2_%=:                        \n\t"
+        "subs       %[nc2],   %[nc2],   #1  \n\t"
+        "blt        end_nc2_%=              \n\t"
+        "loop_nc2_%=:                       \n\t"

-      "cmp        %[nc3],    #16          \n\t"
-      "beq        end_nc3_%=              \n\t"
+        "vld1.32    {q0},   [%[c]]!         \n\t"
+        "vld1.32    {q1},   [%[scale]]!     \n\t"
+        "vld1.32    {q10},  [%[bias]]!      \n\t"
+        "vmla.f32   q10,    q0,   q1        \n\t"
+        "vst1.32    {q10},  [%[C]]!         \n\t"

-      "sub        %[c],     %[c],   %[nc3]      \n\t"
-      "sub        %[scale], %[scale],  %[nc3]   \n\t"
-      "sub        %[bias],  %[bias],   %[nc3]   \n\t"
-      "sub        %[C],     %[C],   %[nc3]      \n\t"
+        "subs       %[nc2],   %[nc2],   #1  \n\t"
+        "bge        loop_nc2_%=             \n\t"
+        "end_nc2_%=:                        \n\t"

-      "vld1.32    {q0},   [%[c]]!         \n\t"
-      "vld1.32    {q1},   [%[scale]]!     \n\t"
-      "vld1.32    {q10},  [%[bias]]!      \n\t"
-      "vmla.f32   q10,    q0,   q1        \n\t"
-      "vst1.32    {q10},  [%[C]]!         \n\t"
-      "end_nc3_%=:                        \n\t"
+        "cmp        %[nc3],    #16          \n\t"
+        "beq        end_nc3_%=              \n\t"

-      :
-      : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3),
-        [scale] "r"(scale), [bias] "r"(bias)
-      : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
-        "q12", "q13");
-}
+        "sub        %[c],     %[c],   %[nc3]      \n\t"
+        "sub        %[scale], %[scale],  %[nc3]   \n\t"
+        "sub        %[bias],  %[bias],   %[nc3]   \n\t"
+        "sub        %[C],     %[C],   %[nc3]      \n\t"

-// C = A * B, batchnorm(C), relu(C)
-void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
-                        float *bias) {
-  int nc1 = n / 16;
-  int _nc1 = n % 16;
-  int nc2 = _nc1 / 4;
-  int nc3 = 16 - 4 * (_nc1 % 4);
+        "vld1.32    {q0},   [%[c]]!         \n\t"
+        "vld1.32    {q1},   [%[scale]]!     \n\t"
+        "vld1.32    {q10},  [%[bias]]!      \n\t"
+        "vmla.f32   q10,    q0,   q1        \n\t"
+        "vst1.32    {q10},  [%[C]]!         \n\t"
+        "end_nc3_%=:                        \n\t"

-  asm volatile(
-      "vmov.f32   q14,      #0.0          \n\t"
-      "subs       %[nc1],   %[nc1],   #1  \n\t"
-      "blt        end_nc1_%=              \n\t"
-      "loop_nc1_%=:                       \n\t"
-
-      "vld1.32    {q0, q1},   [%[c]]!     \n\t"
-      "vld1.32    {q2, q3},   [%[scale]]! \n\t"
-      "vld1.32    {q10, q11}, [%[bias]]!  \n\t"
-      "vmla.f32   q10,  q0,   q2          \n\t"
-      "vmla.f32   q11,  q1,   q3          \n\t"
-      "vmax.f32   q10,  q10,  q14         \n\t"
-      "vmax.f32   q11,  q11,  q14         \n\t"
-      "vst1.32    {q10, q11}, [%[C]]!     \n\t"
-
-      "vld1.32    {q4, q5},   [%[c]]!     \n\t"
-      "vld1.32    {q6, q7},   [%[scale]]! \n\t"
-      "vld1.32    {q12, q13}, [%[bias]]!  \n\t"
-      "vmla.f32   q12,  q4,   q6          \n\t"
-      "vmla.f32   q13,  q5,   q7          \n\t"
-      "vmax.f32   q12,  q12,  q14         \n\t"
-      "vmax.f32   q13,  q13,  q14         \n\t"
-      "vst1.32    {q12, q13}, [%[C]]!     \n\t"
-
-      "subs       %[nc1],   %[nc1],   #1  \n\t"
-      "bge        loop_nc1_%=             \n\t"
-      "end_nc1_%=:                        \n\t"
+        :
+        : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3]
+  "r"(nc3), [scale] "r"(scale), [bias] "r"(bias) : "memory", "q0", "q1", "q2",
+  "q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13");
+  }

-      "subs       %[nc2],   %[nc2],   #1  \n\t"
-      "blt        end_nc2_%=              \n\t"
-      "loop_nc2_%=:                       \n\t"
+  // C = A * B, batchnorm(C), relu(C)
+  void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale,
+                          float *bias) {
+    int nc1 = n / 16;
+    int _nc1 = n % 16;
+    int nc2 = _nc1 / 4;
+    int nc3 = 16 - 4 * (_nc1 % 4);

-      "vld1.32    {q0},   [%[c]]!         \n\t"
-      "vld1.32    {q1},   [%[scale]]!     \n\t"
-      "vld1.32    {q10},  [%[bias]]!      \n\t"
-      "vmla.f32   q10,    q0,   q1        \n\t"
-      "vmax.f32   q10,    q10,  q14       \n\t"
-      "vst1.32    {q10},  [%[C]]!         \n\t"
+    asm volatile(
+        "vmov.f32   q14,      #0.0          \n\t"
+        "subs       %[nc1],   %[nc1],   #1  \n\t"
+        "blt        end_nc1_%=              \n\t"
+        "loop_nc1_%=:                       \n\t"

-      "subs       %[nc2],   %[nc2],   #1  \n\t"
-      "bge        loop_nc2_%=             \n\t"
-      "end_nc2_%=:                        \n\t"
+        "vld1.32    {q0, q1},   [%[c]]!     \n\t"
+        "vld1.32    {q2, q3},   [%[scale]]! \n\t"
+        "vld1.32    {q10, q11}, [%[bias]]!  \n\t"
+        "vmla.f32   q10,  q0,   q2          \n\t"
+        "vmla.f32   q11,  q1,   q3          \n\t"
+        "vmax.f32   q10,  q10,  q14         \n\t"
+        "vmax.f32   q11,  q11,  q14         \n\t"
+        "vst1.32    {q10, q11}, [%[C]]!     \n\t"

-      "cmp        %[nc3],    #16          \n\t"
-      "beq        end_nc3_%=              \n\t"
+        "vld1.32    {q4, q5},   [%[c]]!     \n\t"
+        "vld1.32    {q6, q7},   [%[scale]]! \n\t"
+        "vld1.32    {q12, q13}, [%[bias]]!  \n\t"
+        "vmla.f32   q12,  q4,   q6          \n\t"
+        "vmla.f32   q13,  q5,   q7          \n\t"
+        "vmax.f32   q12,  q12,  q14         \n\t"
+        "vmax.f32   q13,  q13,  q14         \n\t"
+        "vst1.32    {q12, q13}, [%[C]]!     \n\t"

-      "sub        %[c],     %[c],   %[nc3]      \n\t"
-      "sub        %[scale], %[scale],  %[nc3]   \n\t"
-      "sub        %[bias],  %[bias],   %[nc3]   \n\t"
-      "sub        %[C],     %[C],   %[nc3]      \n\t"
+        "subs       %[nc1],   %[nc1],   #1  \n\t"
+        "bge        loop_nc1_%=             \n\t"
+        "end_nc1_%=:                        \n\t"

-      "vld1.32    {q0},   [%[c]]!         \n\t"
-      "vld1.32    {q1},   [%[scale]]!     \n\t"
-      "vld1.32    {q10},  [%[bias]]!      \n\t"
-      "vmla.f32   q10,    q0,   q1        \n\t"
-      "vmax.f32   q10,    q10,  q14       \n\t"
-      "vst1.32    {q10},  [%[C]]!         \n\t"
-      "end_nc3_%=:                        \n\t"
+        "subs       %[nc2],   %[nc2],   #1  \n\t"
+        "blt        end_nc2_%=              \n\t"
+        "loop_nc2_%=:                       \n\t"
+
+        "vld1.32    {q0},   [%[c]]!         \n\t"
+        "vld1.32    {q1},   [%[scale]]!     \n\t"
+        "vld1.32    {q10},  [%[bias]]!      \n\t"
+        "vmla.f32   q10,    q0,   q1        \n\t"
+        "vmax.f32   q10,    q10,  q14       \n\t"
+        "vst1.32    {q10},  [%[C]]!         \n\t"
+
+        "subs       %[nc2],   %[nc2],   #1  \n\t"
+        "bge        loop_nc2_%=             \n\t"
+        "end_nc2_%=:                        \n\t"
+
+        "cmp        %[nc3],    #16          \n\t"
+        "beq        end_nc3_%=              \n\t"
+
+        "sub        %[c],     %[c],   %[nc3]      \n\t"
+        "sub        %[scale], %[scale],  %[nc3]   \n\t"
+        "sub        %[bias],  %[bias],   %[nc3]   \n\t"
+        "sub        %[C],     %[C],   %[nc3]      \n\t"
+
+        "vld1.32    {q0},   [%[c]]!         \n\t"
+        "vld1.32    {q1},   [%[scale]]!     \n\t"
+        "vld1.32    {q10},  [%[bias]]!      \n\t"
+        "vmla.f32   q10,    q0,   q1        \n\t"
+        "vmax.f32   q10,    q10,  q14       \n\t"
+        "vst1.32    {q10},  [%[C]]!         \n\t"
+        "end_nc3_%=:                        \n\t"

-      :
-      : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3),
-        [scale] "r"(scale), [bias] "r"(bias)
-      : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
-        "q12", "q13", "q14");
-}
+        :
+        : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3]
+  "r"(nc3), [scale] "r"(scale), [bias] "r"(bias) : "memory", "q0", "q1", "q2",
+  "q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13", "q14");
+  }
+  */

 #endif  // __aarch64__
 #else

--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -28,6 +28,7 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {

+/*
 // 将 A 矩阵分块复制到连续内存(ColMajor)
 void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                 float *buffer);
@@ -35,6 +36,7 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
 // 将 B 矩阵分块复制到连续内存(ColMajor)
 void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                 float *buffer);
+*/

 // 将 A 矩阵分块复制到连续内存(RowMajor)
 void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
@@ -51,7 +53,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
 void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
                       const float *b, float beta, float *c, float *C, int ldc,
                       bool relu, float *new_scale, float *new_bias);
-
+/*
 // 向量矩阵乘法 (M = 1)
 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
                  const float *B, int ldb, float beta, float *C, int ldc,
@@ -60,6 +62,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
 void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
                        int lda, const float *B, int ldb, float beta, float *C,
                        int ldc, bool relu, float *new_scale, float *new_bias);
+*/

 // 计算一个更小的 C 矩阵分块
 void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
@@ -81,6 +84,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
 void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
                     float *new_scale, float *new_bias);

+/*
 // 向量矩阵乘法结果回写
 // C = A * B
 void VecWriteBasic(int n, float *c, float *C, int ldc);
@@ -96,6 +100,7 @@ void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
 // C = A * B, batchnorm(C), relu(C)
 void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
                        float *new_bias);
+*/

 // 32位 float 矩阵乘法
 void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,