Commit e7b06b70 authored by qnqinan

Merge remote-tracking branch 'origin/develop' into develop

@@ -75,4 +75,5 @@ cmake-build-release
demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/
demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg
demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a
*.xcuserstate
\ No newline at end of file
*.xcuserstate
/tools/quantification/quantify
@@ -36,7 +36,7 @@ static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
static inline int do_ioctl(int req, void *arg) {
return ioctl(req, (long unsigned int)arg);
return ioctl(req, (uint64_t)arg);
}
int open_device() {
@@ -58,9 +58,13 @@ void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int ComputeFpgaConv(struct ConvArgs args) {}
int ComputeFpgaPool(struct PoolingArgs args) {}
int ComputeFpgaEWAdd(struct EWAddArgs args) {}
int ComputeFpgaConv(const struct ConvArgs &args) { return do_ioctl(21, &args); }
int ComputeFpgaPool(const struct PoolingArgs &args) {
return do_ioctl(22, &args);
}
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
return do_ioctl(23, &args);
}
} // namespace fpga
} // namespace paddle_mobile
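The request codes 21–23 passed to do_ioctl here line up with the IOCTL_CONFIG_CONV / IOCTL_CONFIG_POOLING / IOCTL_CONFIG_EW numbers defined in the header below. A hypothetical call sequence, not part of this commit, with placeholder argument values and assuming a negative open_device() return signals failure:
paddle_mobile::fpga::ConvArgs args = {};  // caller fills image/filter/kernel fields
if (paddle_mobile::fpga::open_device() >= 0) {  // assumption: < 0 means failure
  int ret = paddle_mobile::fpga::ComputeFpgaConv(args);  // -> do_ioctl(21, &args)
}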
@@ -31,6 +31,18 @@ void* fpga_malloc(size_t size);
void fpga_free(void* ptr);
void fpga_copy(void* dst, const void* src, size_t num);
enum DataConvertType {
DATA_NO_CONVERT = 0,
DATA_FP32_TO_FP16 = 1,
DATA_FP16_TO_FP32 = 2,
};
enum LayoutConvertType {
LAYOUT_NO_CONVERT = 0,
LAYOUT_CHW_TO_HWC = 1,
LAYOUT_HWC_TO_CHW = 2,
};
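The layout enums describe the reshuffle between PaddlePaddle's channel-major (CHW) tensors and the HWC order the FPGA consumes. A minimal host-side sketch of LAYOUT_CHW_TO_HWC, assuming dense float data (the driver-side implementation is not part of this diff):
void chw_to_hwc(const float *src, float *dst, int c, int h, int w) {
  for (int ch = 0; ch < c; ++ch) {
    for (int i = 0; i < h; ++i) {
      for (int j = 0; j < w; ++j) {
        // src index (ch, i, j) in CHW maps to dst index (i, j, ch) in HWC
        dst[(i * w + j) * c + ch] = src[(ch * h + i) * w + j];
      }
    }
  }
}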
struct VersionArgs {
void* buffer;
};
@@ -79,7 +91,7 @@ struct ConvArgs {
uint32_t filter_num;
uint32_t group_num;
struct BNArgs bn;
void* sb_address; // scale and bias are interlaced;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
@@ -102,6 +114,12 @@ struct EWAddArgs {
struct ImageOutputArgs output;
};
struct BypassArgs {
enum DataConvertType convert_type;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
struct FpgaRegWriteArgs {
uint64_t address; //
uint64_t value;
@@ -115,8 +133,6 @@ struct FpgaRegReadArgs {
#define IOCTL_FPGA_MAGIC 'FPGA'
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 02, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 03, struct FpgaRegWriteArgs)
#define IOCTL_SEPARATOR_0 10
@@ -127,6 +143,8 @@ struct FpgaRegReadArgs {
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
enum FPGA_ERR_TYPE {
ERR_IOCTL_CMD = -1,
@@ -154,9 +172,9 @@ enum FPGA_ERR_TYPE {
//============================== API =============================
int ComputeFpgaConv(struct ConvArgs args);
int ComputeFpgaPool(struct PoolingArgs args);
int ComputeFpgaEWAdd(struct EWAddArgs args);
int ComputeFpgaConv(const struct ConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args);
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "common/types.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/scope.h"
#include "framework/tensor.h"
namespace paddle_mobile {
bool is_conv(std::string type) {
if (type.compare(G_OP_TYPE_CONV) == 0) {
return true;
}
if (type.compare(G_OP_TYPE_FUSION_CONV_ADD) == 0) {
return true;
}
if (type.compare(G_OP_TYPE_FUSION_CONV_ADD_RELU) == 0) {
return true;
}
if (type.compare(G_OP_TYPE_FUSION_CONV_BN_RELU) == 0) {
return true;
}
if (type.compare(G_OP_TYPE_FUSION_CONV_ADD_BN) == 0) {
return true;
}
return false;
}
template <typename Dtype>
void quantilize_op(std::shared_ptr<framework::OperatorBase<Dtype>> op,
std::shared_ptr<framework::Scope> scope) {
if (!is_conv(op.get()->Type())) {
return;
}
framework::Tensor* filter = nullptr;
auto var_vec = op.get()->Inputs().at("Filter");
if (!var_vec.empty()) {
auto var = scope.get()->FindVar(var_vec[0]);
filter = var->template GetMutable<framework::LoDTensor>();
}
float scale = 0;
// 32bit filter -> 8bit filter;
if (filter->type() == typeid(float)) {
framework::Tensor* originalFilter = filter;
framework::Tensor* quantFilter = new framework::Tensor();
float* floatData = originalFilter->data<float>();
int8_t* intData = quantFilter->mutable_data<int8_t>();
}
}
} // namespace paddle_mobile
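quantilize_op stops after allocating the int8 buffer, so the 32-bit to 8-bit conversion itself is still missing at this point in the commit. A sketch of that step under one common choice, symmetric max-abs scaling (assuming <algorithm> and <cmath>; the authors' intended scheme is not shown here):
// Map [-max|f|, +max|f|] onto [-127, 127]; return the scale for later dequantization.
float quantize_to_int8(const float *src, int8_t *dst, int len) {
  float max_abs = 0.f;
  for (int i = 0; i < len; ++i) max_abs = std::max(max_abs, std::fabs(src[i]));
  const float scale = max_abs > 0.f ? 127.f / max_abs : 1.f;
  for (int i = 0; i < len; ++i) {
    dst[i] = static_cast<int8_t>(std::round(src[i] * scale));
  }
  return scale;
}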
@@ -260,7 +260,7 @@ class Tensor {
inline float *scale_pointer() { return &scale; }
};
struct FPGAArgs fpga_args() const {
struct FPGAArgs &fpga_args() {
return fpgaArgs_;
}
#endif
......
@@ -32,6 +32,10 @@ limitations under the License. */
#include "common/threadpool.h"
#endif
#ifdef PADDLE_MOBILE_FPGA
#include "fpga/fpga_quantilization.h"
#endif
namespace paddle_mobile {
using framework::Variable;
@@ -96,6 +100,11 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
for (const auto &op : ops) {
op->Init();
}
#ifdef PADDLE_MOBILE_FPGA
for (const auto &op : ops) {
quantilize_op(op, program_.scope);
}
#endif
}
template <typename Dtype, Precision P>
......
@@ -56,7 +56,8 @@ template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &model_path, const std::string &para_path, bool optimize,
bool quantification) {
auto program = this->LoadProgram(model_path, optimize);
auto program = this->LoadProgram(model_path, optimize, quantification);
program.para_path = para_path;
program.combined = true;
program.quantification = quantification;
......
@@ -61,6 +61,15 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
optimize);
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
JNIEnv *env, jclass thiz, jstring modelPath) {
ANDROIDLOGI("loadQualified invoked");
bool optimize = true;
bool qualified = true;
return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
optimize, qualified);
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) {
ANDROIDLOGI("loadCombined invoked");
@@ -70,6 +79,16 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
optimize);
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) {
ANDROIDLOGI("loadCombinedQualified invoked");
bool optimize = true;
bool qualified = true;
return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
jstring2cppstring(env, paramPath),
optimize, qualified);
}
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims) {
ANDROIDLOGI("predictImage invoked");
......
@@ -27,12 +27,24 @@ namespace jni {
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
jclass thiz,
jstring modelPath);
/**
* load separated qualified model for android
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
JNIEnv *env, jclass thiz, jstring modelPath);
/**
* load combined model for android
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
/**
* load combined qualified model for android
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
/**
* object detection for android
*/
......
@@ -107,20 +107,22 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
*buffer++ = *a3++;
}
}
int i = m - m_tail;
a0 = &A(i, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
if (m_tail != 0) {
if (m_tail <= 3) {
a3 = zero;
}
if (m_tail <= 2) {
a2 = zero;
}
if (m_tail <= 1) {
a1 = zero;
a0 = &A(m - m_tail, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
switch (m_tail) {
case 1:
a1 = zero;
case 2:
a2 = zero;
case 3:
a3 = zero;
break;
default:
break;
}
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
@@ -150,28 +152,89 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
*buffer++ = *a5++;
}
}
int i = m - m_tail;
a0 = &A(i, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
a4 = a0 + 4 * lda;
a5 = a0 + 5 * lda;
if (m_tail != 0) {
if (m_tail <= 5) {
a5 = zero;
a0 = &A(m - m_tail, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
a4 = a0 + 4 * lda;
a5 = a0 + 5 * lda;
switch (m_tail) {
case 1:
a1 = zero;
case 2:
a2 = zero;
case 3:
a3 = zero;
case 4:
a4 = zero;
case 5:
a5 = zero;
break;
default:
break;
}
if (m_tail <= 4) {
a4 = zero;
}
if (m_tail <= 3) {
a3 = zero;
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
*buffer++ = *a4++;
*buffer++ = *a5++;
}
if (m_tail <= 2) {
a2 = zero;
}
}
void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) {
const float *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
for (int i = 0; i < m - m_tail; i += MR) {
a0 = A + i * lda;
a1 = A + (i + 1) * lda;
a2 = A + (i + 2) * lda;
a3 = A + (i + 3) * lda;
a4 = A + (i + 4) * lda;
a5 = A + (i + 5) * lda;
a6 = A + (i + 6) * lda;
a7 = A + (i + 7) * lda;
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
*buffer++ = *a4++;
*buffer++ = *a5++;
*buffer++ = *a6++;
*buffer++ = *a7++;
}
if (m_tail <= 1) {
a1 = zero;
}
if (m_tail != 0) {
a0 = &A(m - m_tail, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
a4 = a0 + 4 * lda;
a5 = a0 + 5 * lda;
a6 = a0 + 6 * lda;
a7 = a0 + 7 * lda;
switch (m_tail) {
case 1:
a1 = zero;
case 2:
a2 = zero;
case 3:
a3 = zero;
case 4:
a4 = zero;
case 5:
a5 = zero;
case 6:
a6 = zero;
case 7:
a7 = zero;
break;
default:
break;
}
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
@@ -180,6 +243,8 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
*buffer++ = *a3++;
*buffer++ = *a4++;
*buffer++ = *a5++;
*buffer++ = *a6++;
*buffer++ = *a7++;
}
}
}
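Note that the switch blocks in these pack routines fall through on purpose: there is no break between the numbered cases, so m_tail == 1 cascades from case 1 through case 7 and zero-pads every trailing row pointer, reproducing the chain of if (m_tail <= N) tests in the code they replace.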
@@ -234,15 +299,78 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
}
}
#if __aarch64__
void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) {
const float *b0;
for (int j = 0; j < n - n_tail; j += NR) {
for (int i = 0; i < k; ++i) {
b0 = &B(i, j);
asm volatile(
"prfm pldl2keep, [%[b0], #64] \n\t"
"ld1 {v0.4s, v1.4s, v2.4s}, [%[b0]] \n\t"
"st1 {v0.4s, v1.4s, v2.4s}, [%[buffer]], #48 \n\t"
: [buffer] "+r"(buffer)
: [b0] "r"(b0)
: "memory", "v0", "v1", "v2");
}
}
if (n_tail != 0) {
for (int i = 0; i < k; ++i) {
b0 = &B(i, n - n_tail);
for (int j = n - n_tail; j < n; ++j) {
*buffer++ = *b0++;
}
for (int j = n; j < n + (NR - n_tail); ++j) {
*buffer++ = 0;
}
}
}
}
void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) {
const float *b0;
for (int j = 0; j < n - n_tail; j += NR) {
for (int i = 0; i < k; ++i) {
b0 = &B(i, j);
asm volatile(
"prfm pldl2keep, [%[b0], #64] \n\t"
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[b0]] \n\t"
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[buffer]], #64 \n\t"
: [buffer] "+r"(buffer)
: [b0] "r"(b0)
: "memory", "v0", "v1", "v2", "v3");
}
}
if (n_tail != 0) {
for (int i = 0; i < k; ++i) {
b0 = &B(i, n - n_tail);
for (int j = n - n_tail; j < n; ++j) {
*buffer++ = *b0++;
}
for (int j = n; j < n + (NR - n_tail); ++j) {
*buffer++ = 0;
}
}
}
}
#endif // __aarch64__
// Blocked matrix multiplication
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
float beta, float *c, float *C, int ldc, bool relu) {
#pragma omp parallel for
for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) {
#if __aarch64__
// AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#else
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
// AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif
}
}
@@ -271,9 +399,14 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
#pragma omp parallel for
for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) {
#if __aarch64__
// AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#else
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
// AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif
}
}
@@ -1956,10 +2089,20 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
int mc, nc;
for (int j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
#if __aarch64__
// PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#else
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#endif
for (int i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
#if __aarch64__
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
// PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#else
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#endif
InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
relu);
}
@@ -2009,10 +2152,20 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
int mc, nc;
for (int j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
#if __aarch64__
// PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#else
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#endif
for (int i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
#if __aarch64__
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
// PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#else
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#endif
InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
&C(i, j), ldc, relu, new_scale + i, new_bias + i);
}
@@ -2239,6 +2392,192 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
#endif // __ARM_NEON
}
#if __aarch64__
void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
const float *a_ptr, *b_ptr;
a_ptr = a;
b_ptr = b;
int kc1 = k;
int step = 4 * ldc;
asm volatile(
"dup v5.4s, wzr \n\t"
"dup v6.4s, wzr \n\t"
"dup v7.4s, wzr \n\t"
"dup v8.4s, wzr \n\t"
"dup v9.4s, wzr \n\t"
"dup v10.4s, wzr \n\t"
"dup v11.4s, wzr \n\t"
"dup v12.4s, wzr \n\t"
"dup v13.4s, wzr \n\t"
"dup v14.4s, wzr \n\t"
"dup v15.4s, wzr \n\t"
"dup v16.4s, wzr \n\t"
"dup v17.4s, wzr \n\t"
"dup v18.4s, wzr \n\t"
"dup v19.4s, wzr \n\t"
"dup v20.4s, wzr \n\t"
"dup v21.4s, wzr \n\t"
"dup v22.4s, wzr \n\t"
"dup v23.4s, wzr \n\t"
"dup v24.4s, wzr \n\t"
"dup v25.4s, wzr \n\t"
"dup v26.4s, wzr \n\t"
"dup v27.4s, wzr \n\t"
"dup v28.4s, wzr \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"prfm pldl1keep, [%[a_ptr], #32] \n\t"
"prfm pldl1keep, [%[b_ptr], #48] \n\t"
"ld1 {v0.4s, v1.4s}, [%[a_ptr]], #32 \n\t"
"ld1 {v2.4s, v3.4s, v4.4s}, [%[b_ptr]], #48 \n\t"
"fmla v5.4s, v2.4s, v0.s[0] \n\t"
"fmla v6.4s, v3.4s, v0.s[0] \n\t"
"fmla v7.4s, v4.4s, v0.s[0] \n\t"
"fmla v8.4s, v2.4s, v0.s[1] \n\t"
"fmla v9.4s, v3.4s, v0.s[1] \n\t"
"fmla v10.4s, v4.4s, v0.s[1] \n\t"
"fmla v11.4s, v2.4s, v0.s[2] \n\t"
"fmla v12.4s, v3.4s, v0.s[2] \n\t"
"fmla v13.4s, v4.4s, v0.s[2] \n\t"
"fmla v14.4s, v2.4s, v0.s[3] \n\t"
"fmla v15.4s, v3.4s, v0.s[3] \n\t"
"fmla v16.4s, v4.4s, v0.s[3] \n\t"
"fmla v17.4s, v2.4s, v1.s[0] \n\t"
"fmla v18.4s, v3.4s, v1.s[0] \n\t"
"fmla v19.4s, v4.4s, v1.s[0] \n\t"
"fmla v20.4s, v2.4s, v1.s[1] \n\t"
"fmla v21.4s, v3.4s, v1.s[1] \n\t"
"fmla v22.4s, v4.4s, v1.s[1] \n\t"
"fmla v23.4s, v2.4s, v1.s[2] \n\t"
"fmla v24.4s, v3.4s, v1.s[2] \n\t"
"fmla v25.4s, v4.4s, v1.s[2] \n\t"
"fmla v26.4s, v2.4s, v1.s[3] \n\t"
"fmla v27.4s, v3.4s, v1.s[3] \n\t"
"fmla v28.4s, v4.4s, v1.s[3] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"st1 {v5.4s, v6.4s, v7.4s}, [%[c]], %[step] \n\t"
"st1 {v8.4s, v9.4s, v10.4s}, [%[c]], %[step] \n\t"
"st1 {v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t"
"st1 {v14.4s, v15.4s, v16.4s}, [%[c]], %[step] \n\t"
"st1 {v17.4s, v18.4s, v19.4s}, [%[c]], %[step] \n\t"
"st1 {v20.4s, v21.4s, v22.4s}, [%[c]], %[step] \n\t"
"st1 {v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t"
"st1 {v26.4s, v27.4s, v28.4s}, [%[c]], %[step] \n\t"
:
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[step] "r"(step)
: "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28");
}
void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) {
const float *a_ptr, *b_ptr;
a_ptr = a;
b_ptr = b;
int kc1 = k;
int step = 4 * ldc;
int step1 = 4 * 6;
asm volatile(
"dup v6.4s, wzr \n\t"
"dup v7.4s, wzr \n\t"
"dup v8.4s, wzr \n\t"
"dup v9.4s, wzr \n\t"
"dup v10.4s, wzr \n\t"
"dup v11.4s, wzr \n\t"
"dup v12.4s, wzr \n\t"
"dup v13.4s, wzr \n\t"
"dup v14.4s, wzr \n\t"
"dup v15.4s, wzr \n\t"
"dup v16.4s, wzr \n\t"
"dup v17.4s, wzr \n\t"
"dup v18.4s, wzr \n\t"
"dup v19.4s, wzr \n\t"
"dup v20.4s, wzr \n\t"
"dup v21.4s, wzr \n\t"
"dup v22.4s, wzr \n\t"
"dup v23.4s, wzr \n\t"
"dup v24.4s, wzr \n\t"
"dup v25.4s, wzr \n\t"
"dup v26.4s, wzr \n\t"
"dup v27.4s, wzr \n\t"
"dup v28.4s, wzr \n\t"
"dup v29.4s, wzr \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"prfm pldl1keep, [%[a_ptr], #24] \n\t"
"prfm pldl1keep, [%[b_ptr], #64] \n\t"
"ld1 {v0.4s, v1.4s}, [%[a_ptr]], %[step1] \n\t"
"ld1 {v2.4s, v3.4s, v4.4s, v5.4s}, [%[b_ptr]], #64 \n\t"
"fmla v6.4s, v2.4s, v0.s[0] \n\t"
"fmla v7.4s, v3.4s, v0.s[0] \n\t"
"fmla v8.4s, v4.4s, v0.s[0] \n\t"
"fmla v9.4s, v5.4s, v0.s[0] \n\t"
"fmla v10.4s, v2.4s, v0.s[1] \n\t"
"fmla v11.4s, v3.4s, v0.s[1] \n\t"
"fmla v12.4s, v4.4s, v0.s[1] \n\t"
"fmla v13.4s, v5.4s, v0.s[1] \n\t"
"fmla v14.4s, v2.4s, v0.s[2] \n\t"
"fmla v15.4s, v3.4s, v0.s[2] \n\t"
"fmla v16.4s, v4.4s, v0.s[2] \n\t"
"fmla v17.4s, v5.4s, v0.s[2] \n\t"
"fmla v18.4s, v2.4s, v0.s[3] \n\t"
"fmla v19.4s, v3.4s, v0.s[3] \n\t"
"fmla v20.4s, v4.4s, v0.s[3] \n\t"
"fmla v21.4s, v5.4s, v0.s[3] \n\t"
"fmla v22.4s, v2.4s, v1.s[0] \n\t"
"fmla v23.4s, v3.4s, v1.s[0] \n\t"
"fmla v24.4s, v4.4s, v1.s[0] \n\t"
"fmla v25.4s, v5.4s, v1.s[0] \n\t"
"fmla v26.4s, v2.4s, v1.s[1] \n\t"
"fmla v27.4s, v3.4s, v1.s[1] \n\t"
"fmla v28.4s, v4.4s, v1.s[1] \n\t"
"fmla v29.4s, v5.4s, v1.s[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"st1 {v6.4s, v7.4s, v8.4s, v9.4s}, [%[c]], %[step] \n\t"
"st1 {v10.4s, v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t"
"st1 {v14.4s, v15.4s, v16.4s, v17.4s}, [%[c]], %[step] \n\t"
"st1 {v18.4s, v19.4s, v20.4s, v21.4s}, [%[c]], %[step] \n\t"
"st1 {v22.4s, v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t"
"st1 {v26.4s, v27.4s, v28.4s, v29.4s}, [%[c]], %[step] \n\t"
:
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[step] "r"(step), [step1] "r"(step1)
: "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29");
}
#endif // __aarch64__
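For reference when reading the assembly: given the interleaving produced by PackMatrixA_6r and PackMatrixB_16c, AddDot6x16 computes one 6x16 tile of A*B and stores it to c, with accumulators starting from zero (the dup/wzr block). An equivalent scalar sketch, for checking rather than use:
void AddDot6x16_ref(int k, const float *a, const float *b, float *c, int ldc) {
  float acc[6][16] = {};  // mirrors the 24 zeroed v-registers v6..v29
  for (int p = 0; p < k; ++p) {
    for (int i = 0; i < 6; ++i) {
      for (int j = 0; j < 16; ++j) {
        acc[i][j] += a[p * 6 + i] * b[p * 16 + j];  // packed-A row i, packed-B col j
      }
    }
  }
  for (int i = 0; i < 6; ++i) {  // write the finished 6x16 tile back to c
    for (int j = 0; j < 16; ++j) {
      c[i * ldc + j] = acc[i][j];
    }
  }
}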
} // namespace math
} // namespace operators
} // namespace paddle_mobile
@@ -19,8 +19,13 @@ limitations under the License. */
#define B(i, j) B[(i)*ldb + (j)]
#define C(i, j) C[(i)*ldc + (j)]
#if __aarch64__
#define MR 6
#define NR 16
#else
#define MR 6
#define NR 8
#endif
#define s_min(i, j) ((i) < (j) ? (i) : (j))
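The per-architecture split above is presumably a register-budget choice: a 6x16 accumulator tile is 96 floats, i.e. 24 of AArch64's 32 128-bit NEON registers (v6–v29 in AddDot6x16), while 32-bit ARM has only 16 quad registers, so it keeps the 6x8 tile with 12 accumulators.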
@@ -43,10 +48,16 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
// Pack blocks of matrix B into contiguous memory (row-major)
void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
// Blocked matrix multiplication
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
@@ -70,6 +81,8 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
// Write back the result of the blocked matrix multiplication
// C = A * B
@@ -114,10 +127,6 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
// 64-bit double-precision matrix multiplication
void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
const double *B, int ldb, float beta, double *C, int ldc);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
@@ -114,8 +114,12 @@ else ()
target_link_libraries(test-softmax paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gemm common/test_gemm.cpp)
target_link_libraries(test-gemm paddle-mobile)
ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp)
target_link_libraries(test-gemm-accuracy paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp)
target_link_libraries(test-gemm-perf paddle-mobile)
# gen test
ADD_EXECUTABLE(test-enforce common/test_enforce.cpp)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "operators/math/gemm.h"
#include "operators/math/math_function.h"
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
#define m 1024
#define n 1024
#define k 1024
int main() {
Tensor aa, bb, cc, scale, bias;
auto aaptr = aa.mutable_data<float>({m, k});
auto bbptr = bb.mutable_data<float>({k, n});
auto ccptr = cc.mutable_data<float>({m, n});
auto scaleptr = scale.mutable_data<float>({m});
auto biasptr = bias.mutable_data<float>({m});
for (int i = 0; i < m * k; ++i) {
aaptr[i] = 2;
}
for (int i = 0; i < k * n; ++i) {
bbptr[i] = 2;
}
for (int i = 0; i < m * n; ++i) {
ccptr[i] = 2;
}
for (int i = 0; i < m; ++i) {
scaleptr[i] = 1;
biasptr[i] = 0;
}
auto time1 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>(aa, false, bb, false,
static_cast<float>(1), &cc,
static_cast<float>(0), false);
// paddle_mobile::operators::math::matmulWithBn<float>(
// aa, false, bb, false, static_cast<float>(1), &cc,
// static_cast<float>(0), true, &scale, &bias, 0);
}
auto time2 = time();
std::cout << "gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
return 0;
}
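With m = n = k = 1024, each call performs 2·m·n·k ≈ 2.15 GFLOPs, so the averaged time converts directly to throughput. A hypothetical addition after the timing loop (time_diff returning milliseconds, as the print above assumes):
double avg_ms = time_diff(time1, time2) / 10;
double gflops = 2.0 * m * n * k / (avg_ms * 1e6);  // flops / (seconds * 1e9)
std::cout << "gemm perf :" << gflops << " GFLOPS\n";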
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
bool optimize = false;
auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) {
auto time2 = time();
DLOG << "load cost: " << time_diff(time1, time2) << "ms";
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
auto time3 = time();
auto vec_result = paddle_mobile.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
}
return 0;
}
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
@@ -23,15 +23,20 @@ int main() {
auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) {
auto time2 = time();
DLOG << "load cost: " << time_diff(time1, time1) << "ms";
std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
auto time3 = time();
// warm up once
auto vec_result = paddle_mobile.Predict(input, dims);
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
return 0;
}
@@ -68,60 +68,60 @@ std::shared_ptr<ProgramDesc> loadParams(const std::string &model_path) {
}
void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char *dataP, FILE *out_file) {
void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(dataP);
uint32_t version = *reinterpret_cast<uint32_t *>(*dataP);
// write version
fwrite(&version, kSize32, 1, out_file);
dataP += kSize32;
*dataP += kSize32;
// 2 Lod information
auto *lod_level_ptr = new uint64_t();
memcpy(lod_level_ptr, dataP, kSize64);
memcpy(lod_level_ptr, *dataP, kSize64);
uint64_t lod_level = 0;
// write lod Information
fwrite(&lod_level, kSize64, 1, out_file);
delete lod_level_ptr;
dataP += kSize64;
*dataP += kSize64;
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(dataP);
uint64_t size = *reinterpret_cast<uint64_t *>(*dataP);
// write lod size
fwrite(&size, kSize64, 1, out_file);
(dataP) += kSize64;
(*dataP) += kSize64;
std::vector<size_t> tmp(size / sizeof(size_t));
for (unsigned long &k : tmp) {
k = *reinterpret_cast<size_t *>(dataP);
(dataP) += sizeof(size_t);
k = *reinterpret_cast<size_t *>(*dataP);
(*dataP) += sizeof(size_t);
}
// write lod size vector
fwrite(tmp.data(), sizeof(size_t), tmp.size(), out_file);
}
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(dataP);
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*dataP);
// write tensor version
fwrite(&tensor_version, kSize32, 1, out_file);
(dataP) += kSize32;
(*dataP) += kSize32;
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(dataP);
int32_t size = *reinterpret_cast<int32_t *>(*dataP);
// write tensor desc
fwrite(&size, sizeof(int32_t), 1, out_file);
(dataP) += sizeof(int32_t);
(*dataP) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (dataP)[m];
buf.get()[m] = (*dataP)[m];
}
fwrite(buf.get(), sizeof(char), static_cast<size_t>(size), out_file);
(dataP) += (sizeof(char) * size);
(*dataP) += (sizeof(char) * size);
const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
@@ -158,9 +158,9 @@ void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char *dataP
memory = new char[tensorSize];
for (int n = 0; n < tensorSize; ++n) {
static_cast<char *>(memory)[n] = (dataP)[n];
static_cast<char *>(memory)[n] = (*dataP)[n];
}
dataP += tensorSize;
*dataP += tensorSize;
// for float 32
float min_value = std::numeric_limits<float>::max();
@@ -194,7 +194,7 @@ quantificate_combined(const std::string &model_path, const std::string &param_pa
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadWithDump(*var_desc, data, out_file);
LoadWithDump(*var_desc, &data, out_file);
}
}
}
@@ -220,7 +220,7 @@ void quantificate_seperated(const std::string model_dir, const std::string param
FILE *out_file = fopen(file_name.c_str(), "wb");
char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
char *data = origin_data;
LoadWithDump(*var_desc, data, out_file);
LoadWithDump(*var_desc, &data, out_file);
delete[] origin_data;
fclose(out_file);
}
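The elided hunk above scans each tensor for min_value and max_value; the byte-level write-out is not shown in this diff. A sketch of the affine uint8 encoding such a min/max scan typically feeds (memory_f, the element-count use of memory_size, and the 0–255 mapping are assumptions, not code from this tool):
const float *memory_f = reinterpret_cast<const float *>(memory);
float range = std::max(max_value - min_value, 1e-6f);  // guard a zero range
for (int g = 0; g < memory_size; ++g) {
  uint8_t q = static_cast<uint8_t>((memory_f[g] - min_value) / range * 255.0f);
  fwrite(&q, sizeof(uint8_t), 1, out_file);
}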
......