diff --git a/CMakeLists.txt b/CMakeLists.txt index ffbb8f68ad3efa2e9d767a5b73374c0727b9cd6f..097986546601ddf2f7f25e14c10ef4dc104c9e3a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,6 @@ option(LOG_PROFILE "log profile" ON) option(CPU "armv7 with neon" ON) option(MALI_GPU "mali gpu" OFF) option(FPGA "fpga" OFF) -set(DEBUGING ON) if (ARM_LINUX) include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake") diff --git a/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/MainActivity.java b/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/MainActivity.java index febe816681d3845a61c5a8b40630e82ac9b4ea95..6a6665dd334d1c7a47fea04ef708b84498f0e357 100755 --- a/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/MainActivity.java +++ b/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/MainActivity.java @@ -121,7 +121,14 @@ public class MainActivity extends Activity { String assetPath = "pml_demo"; String sdcardPath = Environment.getExternalStorageDirectory() + File.separator + assetPath + File.separator + type; - PML.load(sdcardPath); + //PML.load(sdcardPath); + String modelPath = Environment.getExternalStorageDirectory() + + File.separator + assetPath + + File.separator + "googlenet_combine" + File.separator + "model"; + String paramPath = Environment.getExternalStorageDirectory() + + File.separator + assetPath + + File.separator + "googlenet_combine" + File.separator + "params"; + PML.loadCombined(modelPath, paramPath); } }); diff --git a/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/PML.java b/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/PML.java index 7649d4c081223bace01b806d1eb7dca57129ed7c..e67f04e47a77b28bfd8ce98866b1539797c217cd 100644 --- a/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/PML.java +++ b/demo/android/PaddleMobile_Android/app/src/main/java/com/baidu/paddle/PML.java @@ -8,6 +8,14 @@ public class PML { */ public static native boolean load(String modelPath); + /** + * Load + * @param modelPath + * @param paramPath + * @return + */ + public static native boolean loadCombined(String modelPath,String paramPath); + /** * object detection diff --git a/src/jni/paddle_mobile_jni.cpp b/src/jni/paddle_mobile_jni.cpp index 01d4e52a4b1308a7ff97bc672d1a15d329dbf318..b14f095c1d82f167c1e3f15897b907e730a4a5a8 100644 --- a/src/jni/paddle_mobile_jni.cpp +++ b/src/jni/paddle_mobile_jni.cpp @@ -60,6 +60,15 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, optimize); } +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined( + JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) { + ANDROIDLOGI("load invoked"); + bool optimize = true; + return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), + jstring2cppstring(env, paramPath), + optimize); +} + JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf) { jfloatArray result = NULL; diff --git a/src/jni/paddle_mobile_jni.h b/src/jni/paddle_mobile_jni.h index 86caa9a273ab11124f6ea67efe27dc3529cea69f..ab88816dcb7ec6ba88f12cb270812c4af0923b32 100644 --- a/src/jni/paddle_mobile_jni.h +++ b/src/jni/paddle_mobile_jni.h @@ -22,11 +22,16 @@ extern "C" { namespace paddle_mobile { namespace jni { /** - * load model & params of the net for android + * load separated model for android */ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env, jclass thiz, jstring modelPath); +/** + * 
load combined model for android + */ +JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined( + JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath); /** * object detection for anroid diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index e9974df967b293317c3014803bec27d2da73fca3..9582c18cbcfb6e502c42ab4195b553bd3b20093b 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -22,9 +22,14 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { namespace math { -alignas(64) float packedA[MC * KC]; -alignas(64) float packedB[KC * NC]; -alignas(64) float ab[MR * NR]; +int MC = 0; +int KC = 0; +int NC = 0; + +float *packedA; +float *packedB; +float *packedC; +float *zero; // 将A矩阵分块复制到连续内存(ColMajor) void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, float *buffer) { @@ -55,28 +60,39 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, // 将A矩阵分块复制到连续内存(RowMajor) void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, float *buffer) { - int i, j; - const float *Ai, *Ai1, *Ai2, *Ai3; - for (i = 0; i < m - m_tail; i += MR) { - Ai = &A(i, 0); - Ai1 = &A(i + 1, 0); - Ai2 = &A(i + 2, 0); - Ai3 = &A(i + 3, 0); + const float *a0, *a1, *a2, *a3; + for (int i = 0; i < m - m_tail; i += MR) { + a0 = A + i * lda; + a1 = A + (i + 1) * lda; + a2 = A + (i + 2) * lda; + a3 = A + (i + 3) * lda; for (int j = 0; j < k; ++j) { - *buffer++ = *Ai++; - *buffer++ = *Ai1++; - *buffer++ = *Ai2++; - *buffer++ = *Ai3++; + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; } } + int i = m - m_tail; + a0 = &A(i, 0); + a1 = a0 + lda; + a2 = a0 + 2 * lda; + a3 = a0 + 3 * lda; if (m_tail != 0) { - for (j = 0; j < k; ++j) { - for (i = m - m_tail; i < m; ++i) { - *buffer++ = A(i, j); - } - for (i = m; i < m + (MR - m_tail); ++i) { - *buffer++ = 0; - } + if (m_tail <= 3) { + a3 = zero; + } + if (m_tail <= 2) { + a2 = zero; + } + if (m_tail <= 1) { + a1 = zero; + } + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; } } } @@ -113,35 +129,24 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, // 将B矩阵分块复制到连续内存(RowMajor) void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, float *buffer) { - int i, j; - const float *Bij; - for (j = 0; j < n - n_tail; j += NR) { -#ifdef ARMV7 - - for (i = 0; i < k; ++i) { - Bij = &B(i, j); + const float *b0; + for (int j = 0; j < n - n_tail; j += NR) { + for (int i = 0; i < k; ++i) { + b0 = &B(i, j); asm volatile( - "vld1.32 {q0}, [%[Bij]] \n\t" - "vst1.32 {q0}, [%[buffer]]! \n\t" + "pld [%[b0]] \n\t" + "vld1.32 {q0, q1}, [%[b0]] \n\t" + "vst1.32 {q0, q1}, [%[buffer]]! 
\n\t" : [buffer] "+r"(buffer) - : [Bij] "r"(Bij) - : "memory", "q0"); - } -#else - for (i = 0; i < k; ++i) { - Bij = &B(i, j); - *buffer++ = *Bij; - *buffer++ = *(Bij + 1); - *buffer++ = *(Bij + 2); - *buffer++ = *(Bij + 3); + : [b0] "r"(b0) + : "memory", "q0", "q0"); } -#endif } if (n_tail != 0) { - for (i = 0; i < k; ++i) { - Bij = &B(i, n - n_tail); + for (int i = 0; i < k; ++i) { + b0 = &B(i, n - n_tail); for (int j = n - n_tail; j < n; ++j) { - *buffer++ = *Bij++; + *buffer++ = *b0++; } for (int j = n; j < n + (NR - n_tail); ++j) { *buffer++ = 0; @@ -151,118 +156,53 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, } // 分块矩阵乘法 -void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - int first_time) { - int m_block = (m + MR - 1) / MR * MR; - int n_block = (n + NR - 1) / NR * NR; - - int m_tail = m % MR; - int n_tail = n % NR; +void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, + float beta, float *c, float *C, int ldc, bool relu) { + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + } + } - if (first_time) { - PackMatrixB_(k, n, n_tail, B, ldb, packedB); + if (alpha != 1) { + WriteWithAlphaBeta(mc, nc, c, C, ldc); + return; } - PackMatrixA_(m, k, m_tail, A, lda, packedA); - - int i, j, mc, nc; - - // B 取 4 列, 打包预热 - for (j = 0; j < n_block; j += NR) { - nc = (n - j) < NR ? n_tail : NR; - // A 取 4 行,打包预热 - for (i = 0; i < m_block; i += MR) { - mc = (m - i) < MR ? m_tail : MR; - AddDot4x4(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta, - &C(i, j), ldc, mc, nc); - } + if (beta == 0) { + WriteBasic(mc, nc, c, C, ldc); + return; + } + if (beta == 1 && !relu) { + WriteWithAdd(mc, nc, c, C, ldc); + return; + } + if (beta == 1 && relu) { + WriteWithAddRelu(mc, nc, c, C, ldc); + return; } } // 分块矩阵乘法 -void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - int first_time, bool relu = false) { - int m_block = (m + MR - 1) / MR * MR; - int n_block = (n + NR - 1) / NR * NR; - - int m_tail = m % MR; - int n_tail = n % NR; - - if (first_time) { - PackMatrixB_(k, n, n_tail, B, ldb, packedB); - } - PackMatrixA_(m, k, m_tail, A, lda, packedA); - - int i, j, mc, nc; - - // B 取 4 列, 打包预热 - for (j = 0; j < n_block; j += NR) { - nc = (n - j) < NR ? n_tail : NR; - // A 取 4 行,打包预热 - for (i = 0; i < m_block; i += MR) { - mc = (m - i) < MR ? 
m_tail : MR; - AddDot4x4_relu(k, alpha, &packedA[i * k], 4, &packedB[j * k], k, beta, - &C(i, j), ldc, mc, nc, relu); +void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, int ldc, + bool relu, float *new_scale, float *new_bias) { + for (int j = 0; j < nc; j += NR) { + for (int i = 0; i < mc; i += MR) { + // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); } } -} - -// 计算一个更小的 4 * 4 的 C 矩阵分块 -#if defined(IOS) -void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc) { - // init C - float32x4_t cv0 = vdupq_n_f32(0.0); - float32x4_t cv1 = vdupq_n_f32(0.0); - float32x4_t cv2 = vdupq_n_f32(0.0); - float32x4_t cv3 = vdupq_n_f32(0.0); - - float32x4_t av; - float32x4_t bv; - - float32x2_t av01; - float32x2_t av23; - - for (int p = 0; p < k; p += 1) { - av = vld1q_f32(a); - bv = vld1q_f32(b); - - av01 = vget_low_f32(av); - cv0 = vmlaq_lane_f32(cv0, bv, av01, 0); - cv1 = vmlaq_lane_f32(cv1, bv, av01, 1); - av23 = vget_high_f32(av); - cv2 = vmlaq_lane_f32(cv2, bv, av23, 0); - cv3 = vmlaq_lane_f32(cv3, bv, av23, 1); - a += MR; - b += NR; - } - float32x4x4_t cv = {cv0, cv1, cv2, cv3}; - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (j == 0) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0); - } else if (j == 1) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1); - } else if (j == 2) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2); - } else if (j == 3) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); - } - } + if (relu) { + WriteWithBnRelu(mc, nc, c, C, ldc, new_scale, new_bias); + } else { + WriteWithBn(mc, nc, c, C, ldc, new_scale, new_bias); } } -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu = false) { +#if defined(IOS) +void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) { // init C float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0); @@ -307,183 +247,22 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, } else if (j == 3) { C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); } - if (C(i, j) < 0) { - C(i, j) = 0; - } } } } +} // namespace math #elif defined(ARMV7) -void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc) { - int kc1 = k / 4, kc2 = k % 4; - int bytes_ldc = 4 * ldc; - int flag_alpha = (alpha == 1.0) ? 1 : 2; - int flag_beta; - if (beta == 0.0) { - flag_beta = 0; - } else if (beta == 1.0) { - flag_beta = 1; - } else { - flag_beta = 2; - } - asm volatile( - "pld [%[a]] \n\t" - "pld [%[b]] \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, #0.0 \n\t" - - "subs %[kc1], %[kc1], #1 \n\t" - "blt end_kc1_%= \n\t" - "loop_kc1_%=: \n\t" - "pld [%[a], #64] \n\t" - "pld [%[b], #64] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "vld1.32 {q0, q1}, [%[a]]! 
\n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "subs %[kc1], %[kc1], #1 \n\t" - "bge loop_kc1_%= \n\t" - "end_kc1_%=: \n\t" - - "subs %[kc2], %[kc2], #1 \n\t" - "blt end_kc2_%= \n\t" - "loop_kc2_%=: \n\t" - "vld1.32 {q0}, [%[a]]! \n\t" - "vld1.32 {q1}, [%[b]]! \n\t" - "vmla.f32 q10, q1, d0[0] \n\t" - "vmla.f32 q11, q1, d0[1] \n\t" - "vmla.f32 q12, q1, d1[0] \n\t" - "vmla.f32 q13, q1, d1[1] \n\t" - "subs %[kc2], %[kc2], #1 \n\t" - "bge loop_kc2_%= \n\t" - "end_kc2_%=: \n\t" - - "cmp %[mc], #4 \n\t" - "bne temp_%= \n\t" - "cmp %[nc], #4 \n\t" - "bne temp_%= \n\t" - - "vmov.f32 d8[0], %[alpha] \n\t" - "vmov.f32 d8[1], %[beta] \n\t" - - "cmp %[flag_alpha], #1 \n\t" - "bne alpha_%= \n\t" - - "alpha_%=: \n\t" - "vmul.f32 q10, q10, d8[0] \n\t" - "vmul.f32 q11, q11, d8[0] \n\t" - "vmul.f32 q12, q12, d8[0] \n\t" - "vmul.f32 q13, q13, d8[0] \n\t" - - "beta_%=: \n\t" - "cmp %[flag_beta], #0 \n\t" - "beq memory_%= \n\t" - - "mov r4, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" - "vld1.32 {q0}, [r4], r6 \n\t" - "vld1.32 {q1}, [r4], r6 \n\t" - "vld1.32 {q2}, [r4], r6 \n\t" - "vld1.32 {q3}, [r4] \n\t" - "cmp %[flag_beta], #1 \n\t" - "beq beta_eq1_%= \n\t" - "bne beta_ne1_%= \n\t" - - "beta_eq1_%=: \n\t" - "vadd.f32 q10, q10, q0 \n\t" - "vadd.f32 q11, q11, q1 \n\t" - "vadd.f32 q12, q12, q2 \n\t" - "vadd.f32 q13, q13, q3 \n\t" - "b memory_%= \n\t" - - "beta_ne1_%=: \n\t" - "vmla.f32 q10, q0, d8[1] \n\t" - "vmla.f32 q11, q1, d8[1] \n\t" - "vmla.f32 q12, q2, d8[1] \n\t" - "vmla.f32 q13, q3, d8[1] \n\t" - - "memory_%=: \n\t" - "mov r5, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" - "vst1.32 {q10}, [r5], r6 \n\t" - "vst1.32 {q11}, [r5], r6 \n\t" - "vst1.32 {q12}, [r5], r6 \n\t" - "vst1.32 {q13}, [r5] \n\t" - "b end_%= \n\t" - - "temp_%=: \n\t" - "vst1.32 {q10, q11}, [%[ab]]!\n\t" - "vst1.32 {q12, q13}, [%[ab]] \n\t" - "end_%=: \n\t" - : - : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1), - [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha), - [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc), - [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta) - : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13"); - - if (mc != MR || nc != NR) { - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - if (alpha != 1.0) { - C(i, j) = alpha * ab[i * MR + j]; - } else { - C(i, j) = ab[i * MR + j]; - } - } else { - if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * ab[i * MR + j]; - } else { - C(i, j) += ab[i * MR + j]; - } - } - } - } - } -} - -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu = false) { - int kc1 = k / 4, kc2 = k % 4; - int bytes_ldc = 4 * ldc; - int flag_alpha = (alpha == 1.0) ? 
1 : 2; - int flag_beta; - if (beta == 0.0) { - flag_beta = 0; - } else if (beta == 1.0) { - flag_beta = 1; - } else { - flag_beta = 2; - } +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 4; + int kc2 = k % 4; + int step = 4 * ldc; asm volatile( - "pld [%[a]] \n\t" - "pld [%[b]] \n\t" + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" "vmov.f32 q10, #0.0 \n\t" "vmov.f32 q11, #0.0 \n\t" "vmov.f32 q12, #0.0 \n\t" @@ -492,20 +271,10 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, "subs %[kc1], %[kc1], #1 \n\t" "blt end_kc1_%= \n\t" "loop_kc1_%=: \n\t" - "pld [%[a], #64] \n\t" - "pld [%[b], #64] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q2, d0[1] \n\t" - "vmla.f32 q12, q2, d1[0] \n\t" - "vmla.f32 q13, q2, d1[1] \n\t" - "vmla.f32 q10, q3, d2[0] \n\t" - "vmla.f32 q11, q3, d2[1] \n\t" - "vmla.f32 q12, q3, d3[0] \n\t" - "vmla.f32 q13, q3, d3[1] \n\t" - "vld1.32 {q0, q1}, [%[a]]! \n\t" - "vld1.32 {q2, q3}, [%[b]]! \n\t" + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" + "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" "vmla.f32 q10, q2, d0[0] \n\t" "vmla.f32 q11, q2, d0[1] \n\t" "vmla.f32 q12, q2, d1[0] \n\t" @@ -514,6 +283,16 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, "vmla.f32 q11, q3, d2[1] \n\t" "vmla.f32 q12, q3, d3[0] \n\t" "vmla.f32 q13, q3, d3[1] \n\t" + "vld1.32 {q4, q5}, [%[a_ptr]]! \n\t" + "vld1.32 {q6, q7}, [%[b_ptr]]! \n\t" + "vmla.f32 q10, q6, d8[0] \n\t" + "vmla.f32 q11, q6, d8[1] \n\t" + "vmla.f32 q12, q6, d9[0] \n\t" + "vmla.f32 q13, q6, d9[1] \n\t" + "vmla.f32 q10, q7, d10[0] \n\t" + "vmla.f32 q11, q7, d10[1] \n\t" + "vmla.f32 q12, q7, d11[0] \n\t" + "vmla.f32 q13, q7, d11[1] \n\t" "subs %[kc1], %[kc1], #1 \n\t" "bge loop_kc1_%= \n\t" "end_kc1_%=: \n\t" @@ -521,8 +300,8 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, "subs %[kc2], %[kc2], #1 \n\t" "blt end_kc2_%= \n\t" "loop_kc2_%=: \n\t" - "vld1.32 {q0}, [%[a]]! \n\t" - "vld1.32 {q1}, [%[b]]! \n\t" + "vld1.32 {q0}, [%[a_ptr]]! \n\t" + "vld1.32 {q1}, [%[b_ptr]]! 
\n\t" "vmla.f32 q10, q1, d0[0] \n\t" "vmla.f32 q11, q1, d0[1] \n\t" "vmla.f32 q12, q1, d1[0] \n\t" @@ -531,290 +310,168 @@ void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, "bge loop_kc2_%= \n\t" "end_kc2_%=: \n\t" - "cmp %[mc], #4 \n\t" - "bne temp_%= \n\t" - "cmp %[nc], #4 \n\t" - "bne temp_%= \n\t" - - "vmov.f32 d8[0], %[alpha] \n\t" - "vmov.f32 d8[1], %[beta] \n\t" - - "cmp %[flag_alpha], #1 \n\t" - "bne alpha_%= \n\t" - - "alpha_%=: \n\t" - "vmul.f32 q10, q10, d8[0] \n\t" - "vmul.f32 q11, q11, d8[0] \n\t" - "vmul.f32 q12, q12, d8[0] \n\t" - "vmul.f32 q13, q13, d8[0] \n\t" - - "beta_%=: \n\t" - "cmp %[flag_beta], #0 \n\t" - "beq memory_%= \n\t" - - "mov r4, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" - "vld1.32 {q0}, [r4], r6 \n\t" - "vld1.32 {q1}, [r4], r6 \n\t" - "vld1.32 {q2}, [r4], r6 \n\t" - "vld1.32 {q3}, [r4] \n\t" - "cmp %[flag_beta], #1 \n\t" - "beq beta_eq1_%= \n\t" - "bne beta_ne1_%= \n\t" - - "beta_eq1_%=: \n\t" - "vadd.f32 q10, q10, q0 \n\t" - "vadd.f32 q11, q11, q1 \n\t" - "vadd.f32 q12, q12, q2 \n\t" - "vadd.f32 q13, q13, q3 \n\t" - "b memory_%= \n\t" - - "beta_ne1_%=: \n\t" - "vmla.f32 q10, q0, d8[1] \n\t" - "vmla.f32 q11, q1, d8[1] \n\t" - "vmla.f32 q12, q2, d8[1] \n\t" - "vmla.f32 q13, q3, d8[1] \n\t" - - "memory_%=: \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vmax.f32 q12, q12, q14 \n\t" - "vmax.f32 q13, q13, q14 \n\t" - "mov r5, %[C] \n\t" - "mov r6, %[bytes_ldc]\n\t" + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" "vst1.32 {q10}, [r5], r6 \n\t" "vst1.32 {q11}, [r5], r6 \n\t" "vst1.32 {q12}, [r5], r6 \n\t" "vst1.32 {q13}, [r5] \n\t" - "b end_%= \n\t" - - "temp_%=: \n\t" - "vst1.32 {q10, q11}, [%[ab]]!\n\t" - "vst1.32 {q12, q13}, [%[ab]] \n\t" - "end_%=: \n\t" : - : [a] "r"(a), [b] "r"(b), [C] "r"(C), [ab] "r"(ab), [kc1] "r"(kc1), - [kc2] "r"(kc2), [mc] "r"(mc), [nc] "r"(nc), [alpha] "r"(alpha), - [beta] "r"(beta), [bytes_ldc] "r"(bytes_ldc), - [flag_alpha] "r"(flag_alpha), [flag_beta] "r"(flag_beta) - : "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11", "q12", "q13", - "q14"); - - if (mc != MR || nc != NR) { - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - if (alpha != 1.0) { - C(i, j) = alpha * ab[i * MR + j]; - } else { - C(i, j) = ab[i * MR + j]; - } - } else { - if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * ab[i * MR + j]; - } else { - C(i, j) += ab[i * MR + j]; - } - } - if (relu) { - if (C(i, j) < 0) { - C(i, j) = 0; - } - } - } - } - } + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q10", "q11", "q12", "q13"); } #else -void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc) { - float c[16] = {0}; - float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3; - +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { + float *c0, *c1, *c2, *c3; + c0 = c; + c1 = c + ldc; + c2 = c + 2 * ldc; + c3 = c + 3 * ldc; for (int p = 0; p < k; p += 1) { - reg_b0 = *b++; - reg_b1 = *b++; - reg_b2 = *b++; - reg_b3 = *b++; - - reg_a0 = *a++; - reg_a1 = *a++; - reg_a2 = *a++; - reg_a3 = *a++; - // first row - c[0] += reg_a0 * reg_b0; - c[1] += reg_a0 * reg_b1; - c[2] += reg_a0 * reg_b2; - c[3] += reg_a0 * reg_b3; + c0[0] += a[0] * b[0]; + c0[1] += a[0] * b[1]; + c0[2] += a[0] * b[2]; + c0[3] += a[0] 
* b[3]; // second row - c[4] += reg_a1 * reg_b0; - c[5] += reg_a1 * reg_b1; - c[6] += reg_a1 * reg_b2; - c[7] += reg_a1 * reg_b3; + c1[0] += a[1] * b[0]; + c1[1] += a[1] * b[1]; + c1[2] += a[1] * b[2]; + c1[3] += a[1] * b[3]; // third row - c[8] += reg_a2 * reg_b0; - c[9] += reg_a2 * reg_b1; - c[10] += reg_a2 * reg_b2; - c[11] += reg_a2 * reg_b3; + c2[0] += a[2] * b[0]; + c2[1] += a[2] * b[1]; + c2[2] += a[2] * b[2]; + c2[3] += a[2] * b[3]; // fourth row - c[12] += reg_a3 * reg_b0; - c[13] += reg_a3 * reg_b1; - c[14] += reg_a3 * reg_b2; - c[15] += reg_a3 * reg_b3; - } - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * c[i * MR + j]; - } else { - C(i, j) += c[i * MR + j]; - } - } - } -} - -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu) { - float c[16] = {0}; - float reg_a0, reg_a1, reg_a2, reg_a3, reg_b0, reg_b1, reg_b2, reg_b3; - - for (int p = 0; p < k; p += 1) { - reg_b0 = *b++; - reg_b1 = *b++; - reg_b2 = *b++; - reg_b3 = *b++; - - reg_a0 = *a++; - reg_a1 = *a++; - reg_a2 = *a++; - reg_a3 = *a++; - - // first row - c[0] += reg_a0 * reg_b0; - c[1] += reg_a0 * reg_b1; - c[2] += reg_a0 * reg_b2; - c[3] += reg_a0 * reg_b3; - - // second row - c[4] += reg_a1 * reg_b0; - c[5] += reg_a1 * reg_b1; - c[6] += reg_a1 * reg_b2; - c[7] += reg_a1 * reg_b3; - - // third row - c[8] += reg_a2 * reg_b0; - c[9] += reg_a2 * reg_b1; - c[10] += reg_a2 * reg_b2; - c[11] += reg_a2 * reg_b3; + c3[0] += a[3] * b[0]; + c3[1] += a[3] * b[1]; + c3[2] += a[3] * b[2]; + c3[3] += a[3] * b[3]; - // fourth row - c[12] += reg_a3 * reg_b0; - c[13] += reg_a3 * reg_b1; - c[14] += reg_a3 * reg_b2; - c[15] += reg_a3 * reg_b3; - } - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; - } - if (alpha != 1.0) { - C(i, j) += alpha * c[i * MR + j]; - } else { - C(i, j) += c[i * MR + j]; - } - if (relu) { - if (C(i, j) < 0) { - C(i, j) = 0; - } - } - } + a += 4; + b += 4; } } #endif // 32位 float 矩阵乘法 -void sgemm(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc) { - int i, j, p, mc, nc, kc; - float beta_; - -#ifdef ARMV7 - if (m == 1) { - VectorKernel(1, n, k, alpha, A, lda, B, ldb, beta, C, ldc); - return; +void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, bool relu) { + // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) + // L2 cache is 0.5~4 Mib (Contex-A72 cluster) + int L1 = 30 * 1024; + int L2 = 1 * 1024 * 1024; + + KC = k; + MC = L2 / (2 * KC * sizeof(float)); + NC = MC; + + // make sure MC is multiple of 4, and NC is multiple of 8 + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + 4 - 1) / 4 * 4; + // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; + + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + 8 - 1) / 8 * 8; + // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; + + packedA = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); + packedB = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); + packedC = static_cast( + 
paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + + for (int l = 0; l < KC; ++l) { + zero[l] = 0; } -#endif - for (j = 0; j < n; j += NC) { + int mc, nc; + for (int j = 0; j < n; j += NC) { nc = s_min(n - j, NC); - for (p = 0; p < k; p += KC) { - kc = s_min(k - p, KC); - for (i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - if (p != 0) { - beta_ = 1.0; - } else { - beta_ = beta; - } - InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_, - &C(i, j), ldc, i == 0); - } + PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); + for (int i = 0; i < m; i += MC) { + mc = s_min(m - i, MC); + PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); + InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc, + relu); } } + + paddle_mobile::memory::Free(packedA); + paddle_mobile::memory::Free(packedB); + paddle_mobile::memory::Free(packedC); + paddle_mobile::memory::Free(zero); } -void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc) { - int i, j, p, mc, nc, kc; - float beta_; - for (j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); - for (p = 0; p < k; p += KC) { - kc = s_min(k - p, KC); - for (i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - if (p != 0) { - beta_ = 1.0; - } else { - beta_ = beta; - } +void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu, float *new_scale, float *new_bias) { + // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) + // L2 cache is 0.5~4 Mib (Contex-A72 cluster) + int L1 = 30 * 1024; + int L2 = 1 * 1024 * 1024; + + KC = k; + MC = L2 / (2 * KC * sizeof(float)); + NC = MC; + + // make sure MC is multiple of 4, and NC is multiple of 8 + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + 4 - 1) / 4 * 4; + // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; + + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + 8 - 1) / 8 * 8; + // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; + + packedA = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); + packedB = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); + packedC = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + + for (int l = 0; l < KC; ++l) { + zero[l] = 0; + } - if (p + KC >= k) { - InnerKernel_relu(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, - beta_, &C(i, j), ldc, i == 0, true); - } else { - InnerKernel(mc, nc, kc, alpha, &A(i, p), lda, &B(p, j), ldb, beta_, - &C(i, j), ldc, i == 0); - } - } + int mc, nc; + for (int j = 0; j < n; j += NC) { + nc = s_min(n - j, NC); + PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); + for (int i = 0; i < m; i += MC) { + mc = s_min(m - i, MC); + PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); + InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC, + &C(i, j), ldc, relu, new_scale + ldc * i + j, + new_bias + ldc * i + j); } } + + paddle_mobile::memory::Free(packedA); + paddle_mobile::memory::Free(packedB); + paddle_mobile::memory::Free(packedC); + paddle_mobile::memory::Free(zero); } -#ifdef ARMV7 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, 
float *C, int ldc) { + const float *B, int ldb, float beta, float *C, int ldc, + bool relu) { float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); const float *a0, *b0, *b1, *b2, *b3; @@ -1016,18 +673,995 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, } } - c0 = bufferC; - C0 = C; - for (int i = 0; i < n; i++) { - if (beta == 1.0) { - *C0++ += *c0++; - } else { - *C0++ = *c0++; - } + if (alpha != 1) { + VecWriteWithAlphaBeta(n, bufferC, C, ldc); + return; + } + if (beta == 0) { + VecWriteBasic(n, bufferC, C, ldc); + return; + } + if (beta == 1 && !relu) { + VecWriteWithAdd(n, bufferC, C, ldc); + return; + } + if (beta == 1 && relu) { + VecWriteWithAddRelu(n, bufferC, C, ldc); + return; } } -#endif -} // namespace math -} // namespace operators +void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, + int lda, const float *B, int ldb, float beta, float *C, + int ldc, bool relu, float *new_scale, float *new_bias) { + float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); + + const float *a0, *b0, *b1, *b2, *b3; + float *c0, *C0; + + int volatile kc1 = k / 4; + int volatile kc2 = k % 4; + int volatile nc1 = n / 16; + int _nc1 = n % 16; + int volatile nc2 = _nc1 / 4; + int volatile nc3 = _nc1 % 4; + for (int i = 0; i < kc1; i++) { + a0 = A + i * 4; + b0 = B + i * 4 * ldb; + b1 = b0 + ldb; + b2 = b1 + ldb; + b3 = b2 + ldb; + c0 = bufferC; + asm volatile( + "pld [%[a0], #16] \n\t" + "vld1.32 {q0}, [%[a0]] \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "cmp %[i], #0 \n\t" + "beq i_eq0_%= \n\t" + "bne i_ne0_%= \n\t" + + "i_eq0_%=: \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "b gemm_nc1_%= \n\t" + + "i_ne0_%=: \n\t" + "pld [%[c0], #64] \n\t" + "vld1.32 {q10, q11}, [%[c0]]! \n\t" + "vld1.32 {q12, q13}, [%[c0]] \n\t" + "sub %[c0], %[c0], #32 \n\t" + + "gemm_nc1_%=: \n\t" + "pld [%[b0], #64] \n\t" + "vld1.32 {q2, q3}, [%[b0]]! \n\t" + "vld1.32 {q4, q5}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q3, d0[0] \n\t" + "vmla.f32 q12, q4, d0[0] \n\t" + "vmla.f32 q13, q5, d0[0] \n\t" + + "pld [%[b1], #64] \n\t" + "vld1.32 {q2, q3}, [%[b1]]! \n\t" + "vld1.32 {q4, q5}, [%[b1]]! \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q4, d0[1] \n\t" + "vmla.f32 q13, q5, d0[1] \n\t" + + "pld [%[b2], #64] \n\t" + "vld1.32 {q2, q3}, [%[b2]]! \n\t" + "vld1.32 {q4, q5}, [%[b2]]! \n\t" + "vmla.f32 q10, q2, d1[0] \n\t" + "vmla.f32 q11, q3, d1[0] \n\t" + "vmla.f32 q12, q4, d1[0] \n\t" + "vmla.f32 q13, q5, d1[0] \n\t" + + "pld [%[b3], #64] \n\t" + "vld1.32 {q2, q3}, [%[b3]]! \n\t" + "vld1.32 {q4, q5}, [%[b3]]! \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q4, d1[1] \n\t" + "vmla.f32 q13, q5, d1[1] \n\t" + + "vst1.32 {q10, q11}, [%[c0]]! \n\t" + "vst1.32 {q12, q13}, [%[c0]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "cmp %[i], #0 \n\t" + "beq ii_eq0_%= \n\t" + "bne ii_ne0_%= \n\t" + + "ii_eq0_%=: \n\t" + "vmov.f32 q10, #0.0 \n\t" + "b gemm_nc2_%= \n\t" + + "ii_ne0_%=: \n\t" + "pld [%[c0], #16] \n\t" + "vld1.32 {q10}, [%[c0]] \n\t" + + "gemm_nc2_%=: \n\t" + "pld [%[b0], #16] \n\t" + "vld1.32 {q2}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + + "pld [%[b1], #16] \n\t" + "vld1.32 {q3}, [%[b1]]! 
\n\t" + "vmla.f32 q10, q3, d0[1] \n\t" + + "pld [%[b2], #16] \n\t" + "vld1.32 {q4}, [%[b2]]! \n\t" + "vmla.f32 q10, q4, d1[0] \n\t" + + "pld [%[b3], #16] \n\t" + "vld1.32 {q5}, [%[b3]]! \n\t" + "vmla.f32 q10, q5, d1[1] \n\t" + + "vst1.32 {q10}, [%[c0]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), + [c0] "+r"(c0) + : [a0] "r"(a0), [i] "r"(i), [nc1] "r"(nc1), [nc2] "r"(nc2) + : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); + + for (int j = 0; j < nc3; j++) { + if (i == 0) { + *c0 = (*a0) * (*b0++); + } else { + *c0 += (*a0) * (*b0++); + } + *c0 += (*(a0 + 1)) * (*b1++); + *c0 += (*(a0 + 2)) * (*b2++); + *c0 += (*(a0 + 3)) * (*b3++); + c0++; + } + } + + for (int i = 0; i < kc2; ++i) { + a0 = A + 4 * kc1 + i; + b0 = B + (4 * kc1 + i) * ldb; + c0 = bufferC; + asm volatile( + "pld [%[a0], #16] \n\t" + "vld1.32 {d0}, [%[a0]] \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "pld [%[c0], #64] \n\t" + "vld1.32 {q10, q11}, [%[c0]]! \n\t" + "vld1.32 {q12, q13}, [%[c0]] \n\t" + "sub %[c0], %[c0], #32 \n\t" + + "gemm_nc1_%=: \n\t" + "pld [%[b0], #64] \n\t" + "vld1.32 {q2, q3}, [%[b0]]! \n\t" + "vld1.32 {q4, q5}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q3, d0[0] \n\t" + "vmla.f32 q12, q4, d0[0] \n\t" + "vmla.f32 q13, q5, d0[0] \n\t" + + "vst1.32 {q10, q11}, [%[c0]]! \n\t" + "vst1.32 {q12, q13}, [%[c0]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "pld [%[c0], #16] \n\t" + "vld1.32 {q10}, [%[c0]] \n\t" + + "gemm_nc2_%=: \n\t" + "vld1.32 {q2}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + + "vst1.32 {q10}, [%[c0]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + : [b0] "+r"(b0), [b1] "+r"(b1), [b2] "+r"(b2), [b3] "+r"(b3), + [c0] "+r"(c0) + : [a0] "r"(a0), [nc1] "r"(nc1), [nc2] "r"(nc2) + : "memory", "q0", "q2", "q3", "q4", "q5", "q10", "q11", "q12", "q13"); + + for (int j = 0; j < nc3; j++) { + *c0 += (*a0) * (*b0++); + c0++; + } + } + + if (relu) { + VecWriteWithBnRelu(n, bufferC, C, ldc, new_scale, new_bias); + } else { + VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias); + } +} + +void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 4; + int kc2 = k % 4; + int step = 4 * ldc; + asm volatile( + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" + + "vmov.f32 q8, #0.0 \n\t" + "vmov.f32 q9, #0.0 \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "vmov.f32 q14, #0.0 \n\t" + "vmov.f32 q15, #0.0 \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" + + "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[b_ptr]]! 
\n\t" + + "vmla.f32 q8, q2, d0[0] \n\t" + "vmla.f32 q9, q3, d0[0] \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q3, d1[0] \n\t" + "vmla.f32 q14, q2, d1[1] \n\t" + "vmla.f32 q15, q3, d1[1] \n\t" + + "vmla.f32 q8, q4, d2[0] \n\t" + "vmla.f32 q9, q5, d2[0] \n\t" + "vmla.f32 q10, q4, d2[1] \n\t" + "vmla.f32 q11, q5, d2[1] \n\t" + "vmla.f32 q12, q4, d3[0] \n\t" + "vmla.f32 q13, q5, d3[0] \n\t" + "vmla.f32 q14, q4, d3[1] \n\t" + "vmla.f32 q15, q5, d3[1] \n\t" + + "pld [%[b_ptr], #64] \n\t" + + "vld1.32 {q0, q1}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[b_ptr]]! \n\t" + + "vmla.f32 q8, q2, d0[0] \n\t" + "vmla.f32 q9, q3, d0[0] \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q3, d1[0] \n\t" + "vmla.f32 q14, q2, d1[1] \n\t" + "vmla.f32 q15, q3, d1[1] \n\t" + + "vmla.f32 q8, q4, d2[0] \n\t" + "vmla.f32 q9, q5, d2[0] \n\t" + "vmla.f32 q10, q4, d2[1] \n\t" + "vmla.f32 q11, q5, d2[1] \n\t" + "vmla.f32 q12, q4, d3[0] \n\t" + "vmla.f32 q13, q5, d3[0] \n\t" + "vmla.f32 q14, q4, d3[1] \n\t" + "vmla.f32 q15, q5, d3[1] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "blt end_kc2_%= \n\t" + "loop_kc2_%=: \n\t" + "vld1.32 {q0}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + "vmla.f32 q8, q2, d0[0] \n\t" + "vmla.f32 q9, q3, d0[0] \n\t" + "vmla.f32 q10, q2, d0[1] \n\t" + "vmla.f32 q11, q3, d0[1] \n\t" + "vmla.f32 q12, q2, d1[0] \n\t" + "vmla.f32 q13, q3, d1[0] \n\t" + "vmla.f32 q14, q2, d1[1] \n\t" + "vmla.f32 q15, q3, d1[1] \n\t" + "subs %[kc2], %[kc2], #1 \n\t" + "bge loop_kc2_%= \n\t" + "end_kc2_%=: \n\t" + + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" + "vst1.32 {q8, q9}, [r5], r6 \n\t" + "vst1.32 {q10, q11}, [r5], r6 \n\t" + "vst1.32 {q12, q13}, [r5], r6 \n\t" + "vst1.32 {q14, q15}, [r5] \n\t" + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +// C = A * B +void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int step = 4 * ldc; + int step1 = 4 * (NC - 16 * nc1); + int volatile m = mc; + + float *volatile c_ptr, *volatile C_ptr; + float *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" + "vst1.32 {q0, q1}, [r6]! \n\t" + + "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" + "vst1.32 {q2, q3}, [r6]! 
\n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3"); + } + + if (_nc1 != 0) { + for (int i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int j = 0; j < _nc1; j++) { + *C0++ = *c0++; + } + } + } +} + +// C = alpha * A * B + beta * C +void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} + +// C = A * B + C +void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int step = 4 * ldc; + int step1 = 4 * (NC - 16 * nc1); + int volatile m = mc; + + float *volatile c_ptr, *volatile C_ptr; + float *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [r6] \n\t" + "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [r6]! \n\t" + + "vld1.32 {q4, q5}, [r6] \n\t" + "vld1.32 {q6, q7}, [%[c_ptr]]! \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [r6]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q10", "q11", "q12", "q13"); + } + + if (_nc1 != 0) { + for (int i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int j = 0; j < _nc1; j++) { + *C0++ += *c0++; + } + } + } +} + +// C = A * B + C, relu(C) +void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int step = 4 * ldc; + int step1 = 4 * (NC - 16 * nc1); + int volatile m = mc; + + float *volatile c_ptr, *volatile C_ptr; + float *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [r6] \n\t" + "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [r6]! \n\t" + + "vld1.32 {q4, q5}, [r6] \n\t" + "vld1.32 {q6, q7}, [%[c_ptr]]! \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [r6]! 
\n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q10", "q11", "q12", "q13"); + } + + if (_nc1 != 0) { + for (int i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int j = 0; j < _nc1; j++) { + *C0 += *c0; + if (*C0 < 0) { + *C0 = 0; + } + C0++; + c0++; + } + } + } +} + +// C = A * B, batchnorm(C) +void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + int step = 4 * (ldc - nc); + int step1 = 4 * (NC - nc); + + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r5, %[nc1] \n\t" + "mov r6, %[nc2] \n\t" + + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! \n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs r6, r6, #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs r6, r6, #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + "add %[c], %[c], %[step1] \n\t" + "add %[scale], %[scale], %[step] \n\t" + "add %[bias], %[bias], %[step] \n\t" + "add %[C], %[C], %[step] \n\t" + + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), + [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), + [scale] "r"(scale), [bias] "r"(bias) + : "memory", "cc", "r5", "r6", "r7", "r8", "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q10", "q11", "q12", "q13"); +} + +// C = A * B, batchnorm(C), relu(C) +void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = nc / 16; + int _nc1 = nc % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + int step = 4 * (ldc - nc); + int step1 = 4 * (NC - nc); + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r5, %[nc1] \n\t" + "mov r6, %[nc2] \n\t" + + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! 
\n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs r6, r6, #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs r6, r6, #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + "add %[c], %[c], %[step1] \n\t" + "add %[scale], %[scale], %[step] \n\t" + "add %[bias], %[bias], %[step] \n\t" + "add %[C], %[C], %[step] \n\t" + + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [mc] "r"(mc), [nc1] "r"(nc1), [nc2] "r"(nc2), + [nc3] "r"(nc3), [step] "r"(step), [step1] "r"(step1), + [scale] "r"(scale), [bias] "r"(bias) + : "memory", "r5", "r6", "r7", "r8", "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q10", "q11", "q12", "q13", "q14"); +} + +// C = A * B +void VecWriteBasic(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vst1.32 {q0, q1}, [%[C]]! \n\t" + + "vld1.32 {q2, q3}, [%[c]]! \n\t" + "vst1.32 {q2, q3}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q4}, [%[c]]! \n\t" + "vst1.32 {q4}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + "sub %[c], %[c], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + "vld1.32 {q5}, [%[c]]! \n\t" + "vst1.32 {q5}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5"); +} + +// C = alpha * A * B + beta * C +void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} + +// C = A * B + C +void VecWriteWithAdd(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[C]] \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[C]] \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! 
\n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + : [C] "+r"(C), [c] "+r"(c) + : [nc1] "r"(nc1) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", + "q12", "q13"); + + if (_nc1 != 0) { + for (int j = 0; j < _nc1; j++) { + *C++ += *c++; + } + } +} + +// C = A * B + C, relu(C) +void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[C]] \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[C]] \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + : [C] "+r"(C), [c] "+r"(c) + : [nc1] "r"(nc1) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", + "q12", "q13"); + + if (_nc1 != 0) { + for (int j = 0; j < _nc1; j++) { + *C += *c; + if (*C < 0) { + *C = 0; + } + C++; + c++; + } + } +} + +// C = A * B, batchnorm(C) +void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! \n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3), + [scale] "r"(scale), [bias] "r"(bias) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", + "q12", "q13"); +} + +// C = A * B, batchnorm(C), relu(C) +void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! 
\n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3), + [scale] "r"(scale), [bias] "r"(bias) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", + "q12", "q13", "q14"); +} + +} // namespace operators +} // namespace paddle_mobile } // namespace paddle_mobile diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index 6d7ae6d2bcdbd7e24cb3c2389dd3cdf09a807892..b4bce43c7a29fba09ade7512cbc660f0ac2888ab 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -19,12 +19,8 @@ limitations under the License. */ #define B(i, j) B[(i)*ldb + (j)] #define C(i, j) C[(i)*ldc + (j)] -// 分块计算的块大小,mc 与 kc 分别对应分块计算时的 m 与 k -#define MC 128 -#define KC 128 -#define NC 1024 #define MR 4 -#define NR 4 +#define NR 8 #define s_min(i, j) ((i) < (j) ? 
(i) : (j)) @@ -49,28 +45,66 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, float *buffer); // 分块矩阵乘法 -void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - int first_time); +void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, + float beta, float *c, float *C, int ldc, bool relu); + +void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, + const float *b, float beta, float *c, float *C, int ldc, + bool relu, float *new_scale, float *new_bias); // 向量矩阵乘法 (M = 1) void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc); - -// 计算一个更小的 4 * 4 的 C 矩阵分块 -void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B, - int ldb, float beta, float *C, int ldc, int mc, int nc); - -void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b, - int ldb, float beta, float *C, int ldc, int mc, int nc, - bool relu); + const float *B, int ldb, float beta, float *C, int ldc, + bool relu); + +void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, + int lda, const float *B, int ldb, float beta, float *C, + int ldc, bool relu, float *new_scale, float *new_bias); + +// 计算一个更小的 C 矩阵分块 +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); +void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); + +// 分块矩阵乘法结果回写 +// C = A * B +void WriteBasic(int mc, int nc, float *c, float *C, int ldc); +// C = alpha * A * B + beta * C +void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc); +// C = A * B + C +void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc); +// C = A * B + C, relu(C) +void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc); +// C = A * B, batchnorm(C) +void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, + float *new_bias); +// C = A * B, batchnorm(C), relu(C) +void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias); + +// 向量矩阵乘法结果回写 +// C = A * B +void VecWriteBasic(int n, float *c, float *C, int ldc); +// C = alpha * A * B + beta * C +void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc); +// C = A * B + C +void VecWriteWithAdd(int n, float *c, float *C, int ldc); +// C = A * B + C, relu(C) +void VecWriteWithAddRelu(int n, float *c, float *C, int ldc); +// C = A * B, batchnorm(C) +void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale, + float *new_bias); +// C = A * B, batchnorm(C), relu(C) +void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale, + float *new_bias); // 32位 float 矩阵乘法 -void sgemm(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc); +void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, bool relu); -void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc); +// 32位 float 矩阵乘法, 并对结果进行 batchnrom +void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu, float *new_scale, float *new_bias); // 64位 double 矩阵乘法 void dgemm(int m, int n, int k, float alpha, const double *A, int lda, diff --git a/src/operators/math/math_function.cpp 
b/src/operators/math/math_function.cpp index fd4106038c7446e659736c6b3c61b5aa05127e72..ca5367788ed87da070dd19900e8d546e51caf337 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ -39,22 +39,18 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, int M = dim_out[0]; int N = dim_out[1]; - int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + int K = (!trans_a) ? dim_a[1] : dim_a[0]; - if (relu) { - sgemm_relu(M, N, K, alpha, matrix_a.data(), K, - matrix_b.data(), N, beta, matrix_out->data(), N); - } else { - sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, - beta, matrix_out->data(), N); - } + Sgemm(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), N, + beta, matrix_out->data(), N, relu); } template <> -void matmul(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, - double alpha, framework::Tensor *matrix_out, double beta, - bool relu) { +void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, + const framework::Tensor &matrix_b, bool trans_b, + float alpha, framework::Tensor *matrix_out, float beta, + bool relu, framework::Tensor *new_scale, + framework::Tensor *new_bias) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); @@ -71,7 +67,11 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, int M = dim_out[0]; int N = dim_out[1]; - int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + int K = (!trans_a) ? dim_a[1] : dim_a[0]; + + SgemmWithBn(M, N, K, alpha, matrix_a.data(), K, matrix_b.data(), + N, beta, matrix_out->data(), N, relu, + new_scale->data(), new_bias->data()); } } // namespace math diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h index 0b953ec6a3b2a03a94a91884b9daf3ed88523a22..0ca7815fc2bcff2be0345b581d3dfb26cf55794c 100644 --- a/src/operators/math/math_function.h +++ b/src/operators/math/math_function.h @@ -26,6 +26,12 @@ template void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, T alpha, framework::Tensor *matrix_out, T beta, bool relu = false); + +template +void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, + const framework::Tensor &matrix_b, bool trans_b, T alpha, + framework::Tensor *matrix_out, T beta, bool relu, + framework::Tensor *new_scale, framework::Tensor *new_bias); } // namespace math } // namespace operators } // namespace paddle_mobile
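
Note (not part of the patch): the removed sgemm/sgemm_relu entry points are folded into a single Sgemm(m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, relu), which packs A and B per block and then selects a write-back routine (WriteBasic, WriteWithAdd, WriteWithAddRelu, WriteWithAlphaBeta) from alpha, beta and relu. The following is a minimal scalar sketch of the intended row-major semantics, useful only as a reference when checking the NEON kernels; the name sgemm_reference and the code are illustrative assumptions, not code from this repository.

#include <algorithm>

// Scalar reference for the semantics of the new Sgemm entry point
// (row-major A: m x k with leading dimension lda, B: k x n with ldb,
// C: m x n with ldc). Hedged sketch only; sgemm_reference is not a
// function defined by this patch.
void sgemm_reference(int m, int n, int k, float alpha, const float *A, int lda,
                     const float *B, int ldb, float beta, float *C, int ldc,
                     bool relu) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.0f;
      for (int p = 0; p < k; ++p) {
        acc += A[i * lda + p] * B[p * ldb + j];
      }
      float out = alpha * acc + beta * C[i * ldc + j];
      // The patched InnerKernel only applies relu on its beta == 1 path
      // (WriteWithAddRelu); the reference applies it whenever requested.
      C[i * ldc + j] = relu ? std::max(out, 0.0f) : out;
    }
  }
}

Worth noting when comparing against this reference: WriteWithAlphaBeta and VecWriteWithAlphaBeta are left as empty stubs in this patch, so the alpha != 1 (and general beta) path of the new Sgemm does not yet write any output.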
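
Note (not part of the patch): the fixed MC/KC/NC macros in gemm.h are replaced by run-time blocking inside Sgemm/SgemmWithBn: KC = k, MC = NC = L2 / (2 * KC * sizeof(float)), then m and n are split into roughly equal blocks and MC is rounded up to a multiple of MR (4) while NC is rounded up to a multiple of NR (8). A small self-contained sketch of that arithmetic follows; compute_blocks and its signature are illustrative, not part of the patch.

#include <cstdio>

// Hedged sketch of the run-time block-size selection used by the new
// Sgemm/SgemmWithBn (MR = 4, NR = 8 after this patch).
void compute_blocks(int m, int n, int k, int *MC, int *KC, int *NC) {
  const int L2 = 1 * 1024 * 1024;  // L2 budget assumed in the patch
  *KC = k;
  int mc = L2 / (2 * (*KC) * static_cast<int>(sizeof(float)));
  int nc = mc;
  // Split m into mblock_num roughly equal blocks, then round MC up to 4.
  int mblock_num = (m + mc - 1) / mc;
  mc = (m + mblock_num - 1) / mblock_num;
  *MC = (mc + 4 - 1) / 4 * 4;
  // Same for n, rounding NC up to a multiple of 8 (the new NR).
  int nblock_num = (n + nc - 1) / nc;
  nc = (n + nblock_num - 1) / nblock_num;
  *NC = (nc + 8 - 1) / 8 * 8;
}

int main() {
  int MC, KC, NC;
  compute_blocks(1000, 1000, 256, &MC, &KC, &NC);
  printf("MC=%d KC=%d NC=%d\n", MC, KC, NC);  // prints MC=500 KC=256 NC=504
  return 0;
}

For example, with m = n = 1000, k = 256 and the 1 MiB L2 budget used above, the initial block size is 512, m and n each split into two blocks of 500, and rounding gives MC = 500, KC = 256, NC = 504; the packing buffers packedA, packedB and packedC are then allocated from these sizes.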