diff --git a/.gitignore b/.gitignore
index 547e94ea11f048c9e007996d4ee716d22a13742e..4c86068b7ee3024416094613d2f8f8d74ce89921 100644
--- a/.gitignore
+++ b/.gitignore
@@ -75,4 +75,5 @@ cmake-build-release
 demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/
 demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg
 demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a
-*.xcuserstate
\ No newline at end of file
+*.xcuserstate
+/tools/quantification/quantify
diff --git a/src/fpga/api/fpga_api.cpp b/src/fpga/api/fpga_api.cpp
index a913d6e39cddda97b347c0675717c265dfa89d18..f91c21beb2d6b5fbce86b56d49b7d8c6a3ec9219 100644
--- a/src/fpga/api/fpga_api.cpp
+++ b/src/fpga/api/fpga_api.cpp
@@ -29,15 +29,15 @@ limitations under the License. */
 
 #include "fpga/api/fpga_api.h"
 
-namespace paddle {
-namespace mobile {
+namespace paddle_mobile {
 namespace fpga {
-namespace api {
 
 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
 
-static inline int do_ioctl(int req, void *arg) { return ioctl(req, arg); }
+static inline int do_ioctl(int req, void *arg) {
+  return ioctl(req, (unsigned int64_t)arg);
+}
 
 int open_device() {
   if (fd == -1) {
@@ -48,8 +48,8 @@ int open_device() {
 
 // memory management;
 void *fpga_malloc(size_t size) {
-  return reinterpret_cast<(void *)> mmap64(NULL, size, PROT_READ | PROT_WRITE,
-                                           MAP_SHARED, fd, 0);
+  return reinterpret_cast<void *>(
+      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
 }
 
 void fpga_free(void *ptr) { munmap(ptr, 0); }
@@ -58,11 +58,13 @@ void fpga_copy(void *dest, const void *src, size_t num) {
   memcpy(dest, src, num);
 }
 
-int ComputeFpgaConv(struct FpgaConvArgs) {}
-int ComputeFpgaPool(struct FpgaPoolArgs) {}
-int ComputeFpgaEWAdd(struct FpgaEWAddArgs) {}
+int ComputeFpgaConv(const struct ConvArgs &args) { return do_ioctl(21, &args); }
+int ComputeFpgaPool(const struct PoolingArgs &args) {
+  return do_ioctl(22, &args);
+}
+int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
+  return do_ioctl(23, &args);
+}
 
-}  // namespace api
 }  // namespace fpga
-}  // namespace mobile
-}  // namespace paddle
+}  // namespace paddle_mobile
diff --git a/src/fpga/api/fpga_api.h b/src/fpga/api/fpga_api.h
index 42e99f4e4238d6974d23c1fb33bf238ca8a8626d..08635cdb5c01b50f59eb35554bba9a7b70f6ebfb 100644
--- a/src/fpga/api/fpga_api.h
+++ b/src/fpga/api/fpga_api.h
@@ -31,90 +31,150 @@ void* fpga_malloc(size_t size);
 void fpga_free(void* ptr);
 void fpga_copy(void* dst, const void* src, size_t num);
 
-struct FpgaVersionArgs {
-  void* buf;
+enum DataConvertType {
+  DATA_NO_CONVERT = 0,
+  DATA_FP32_TO_FP16 = 1,
+  DATA_FP16_TO_FP32 = 2,
 };
 
-struct MemoryToPhysicalArgs {
-  const void* src;
-  uint64_t physical;
+enum LayoutConvertType {
+  LAYOUT_NO_CONVERT = 0,
+  LAYOUT_CHW_TO_HWC = 1,
+  LAYOUT_HWC_TO_CHW = 2,
+};
+
+struct VersionArgs {
+  void* buffer;
 };
 
 struct MemoryCopyArgs {
   void* src;
-  void* dst;
+  void* dest;
   size_t size;
 };
 
-struct FpgaQuantArgs {
-  float scale;
-};
-
-struct FpgaBNArgs {
-  bool enabled = false;
-  void* bias_addr;
-  void* scale_addr;
+struct BNArgs {
+  bool enabled;
+  void* bias_address;
+  void* scale_address;
 };
 
-struct FpgaKernelArgs {
+/**
+Conv and Pooling kernel
+*/
+struct KernelArgs {
   uint32_t width;
   uint32_t height;
-  uint32_t stride_h;
   uint32_t stride_w;
+  uint32_t stride_h;
 };
 
-struct FpgaImageArgs {
-  uint32_t width;
-  uint32_t height;
+struct ImageInputArgs {
+  void* address;         // input featuremap virtual address
+  float* scale_address;  // input scale address;
   uint32_t channels;
-  uint32_t pad_h;
-  uint32_t pad_w;
+  uint32_t width;  // featuremap width
+  uint32_t height;
+  uint32_t pad_width;  // padding width;
+  uint32_t pad_height;
 };
 
-struct FpgaConvArgs {
+struct ImageOutputArgs {
+  void* address;         // output result address;
+  float* scale_address;  // output scale address;
+};
+
+struct ConvArgs {
   bool relu_enabled;
-  struct FpgaBNArgs BNargs;
-  void* image_addr;
-  void* filter_addr;
-  void* bias_addr;
-  void* output_addr;
-  float quant_scale;
-  struct FpgaImageArgs image;
+  void* bias_address;
+  void* filter_address;
   uint32_t filter_num;
   uint32_t group_num;
 
-  struct FpgaKernelArgs kernel;
+  void* sb_address;  // scale and bias are interlaced;
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
 };
 
-struct FpgaPoolArgs {
-  void* image_addr;
-  void* output_addr;
-  struct FpgaImageArgs image;
-  struct FpgaKernelArgs kernel;
+struct PoolingArgs {
+  struct KernelArgs kernel;
+  struct ImageInputArgs image;  // input image;
+  struct ImageOutputArgs output;
 };
 
-struct FpgaEWAddArgs {
+// elementwise add arguments
+struct EWAddArgs {
   bool relu_enabled;
-  void* image0_addr;
-  void* image1_addr;
-  void* result_addr;
-  uint32_t const0;
-  uint32_t const1;
-  uint32_t data_len;  // aligned element count
+
+  float const0;  // output0 = const0 x input0 + const1 x input1;
+  float const1;
+  struct ImageInputArgs image0;
+  struct ImageInputArgs image1;
+  struct ImageOutputArgs output;
 };
 
-int ComputeFpgaConv(struct FpgaConvArgs args);
-int ComputeFpgaPool(struct FpgaPoolArgs args);
-int ComputeFpgaEWAdd(struct FpgaEWAddArgs args);
+struct BypassArgs {
+  enum DataConvertType convert_type;
+  struct ImageInputArgs image;
+  struct ImageOutputArgs output;
+};
+
+struct FpgaRegWriteArgs {
+  uint64_t address;  //
+  uint64_t value;
+};
+
+struct FpgaRegReadArgs {
+  uint64_t address;
+  uint64_t value;
+};
+
+#define IOCTL_FPGA_MAGIC 'FPGA'
+
+#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
+
+#define IOCTL_SEPARATOR_0 10
 
-#define IOCTL_FPGA_MAGIC 'CNN'
-#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
-#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
-#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs)
 #define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
-#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
-#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
-#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)
+
+#define IOCTL_SEPARATOR_1 20
+
+#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
+#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
+#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
+#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
+#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
+
+enum FPGA_ERR_TYPE {
+  ERR_IOCTL_CMD = -1,
+  ERR_TIMEOUT = -2,
+  ERR_COMPLETION_TIMEOUT = -3,
+  ERR_INVALID_FPGA_ADDR = -4,
+  ERR_NOMEM = -5,
+  ERR_NO_RESERVE_MEM = -6,
+  ERR_COPY_FROM_USER = -7,
+  ERR_COPY_TO_USER = -8,
+  ERR_DEL_TIMER = -9,
+  ERR_ENABLE_MSI = -10,
+  ERR_REGISTER_IRQ = -11,
+  ERR_PCIE_REGISTER = -12,
+  ERR_PCIE_PROBE = -13,
+  ERR_REGISTER_BLOCK = -14,
+  ERR_ALLOC_GENDISK = -15,
+  ERR_INIT_QUEUE = -16,
+  ERR_WAIT = -17,
+  ERR_ECC_ERROR = -31,
+  ERR_FPGA_FAIL_STOP = -64,
+  ERR_FPGA_DEBUG_STOP = -113,
+  DEV_TMP_UNAVAILABLE = -128
+};
+
+//============================== API =============================
+
+int ComputeFpgaConv(const struct ConvArgs& args);
+int ComputeFpgaPool(const struct PoolingArgs& args);
+int ComputeFpgaEWAdd(const struct EWAddArgs& args);
 
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/fpga_quantilization.h b/src/fpga/fpga_quantilization.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2d2d61835de84c94760c10a25a973d4eaff1fbe
--- /dev/null
+++ b/src/fpga/fpga_quantilization.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <string>
+#include "common/types.h"
+#include "framework/lod_tensor.h"
+#include "framework/operator.h"
+#include "framework/scope.h"
+#include "framework/tensor.h"
+
+namespace paddle_mobile {
+
+bool is_conv(std::string type) {
+  if (type.compare(G_OP_TYPE_CONV) == 0) {
+    return true;
+  }
+  if (type.compare(G_OP_TYPE_FUSION_CONV_ADD) == 0) {
+    return true;
+  }
+  if (type.compare(G_OP_TYPE_FUSION_CONV_ADD_RELU) == 0) {
+    return true;
+  }
+  if (type.compare(G_OP_TYPE_FUSION_CONV_BN_RELU) == 0) {
+    return true;
+  }
+  if (type.compare(G_OP_TYPE_FUSION_CONV_ADD_BN) == 0) {
+    return true;
+  }
+  return false;
+}
+
+template <typename Dtype>
+void quantilize_op(std::shared_ptr<framework::OperatorBase<Dtype>> op,
+                   std::shared_ptr<framework::Scope> scope) {
+  if (!is_conv(op.get()->Type())) {
+    return;
+  }
+  framework::Tensor* filter = nullptr;
+  auto var_vec = op.get()->Inputs().at("Filter");
+  if (!var_vec.empty()) {
+    auto var = scope.get()->FindVar(var_vec[0]);
+    filter = var->template GetMutable<framework::LoDTensor>();
+  }
+  float scale = 0;
+
+  // 32bit filter -> 8bit filter;
+  if (filter->type() == typeid(float)) {
+    framework::Tensor* originalFilter = filter;
+    framework::Tensor* quantFilter = new framework::Tensor();
+    float* floatData = originalFilter->data<float>();
+    int8_t* intData = quantFilter->mutable_data<int8_t>();
+  }
+}
+
+}  // namespace paddle_mobile
diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index 954a65a3605c4d0204890d9414aeb074371b0d69..3dba76d790d44f154f359454250d15b81ff717a3 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -253,6 +253,18 @@ class Tensor {
                           "Tensor's dims_ is out of bound. ");
   }
 
+#ifdef PADDLE_MOBILE_FPGA
+  struct FPGAArgs {
+    float scale;
+
+    inline float *scale_pointer() { return &scale; }
+  };
+
+  struct FPGAArgs &fpga_args() {
+    return fpgaArgs_;
+  }
+#endif
+
  private:
   /**
    * @note    Placeholder hides type T, so it doesn't appear as a
@@ -319,6 +331,10 @@ class Tensor {
    * begins.
    */
   size_t offset_;
+
+#ifdef PADDLE_MOBILE_FPGA
+  FPGAArgs fpgaArgs_;
+#endif
 };
 
 #ifdef PADDLE_MOBILE_DEBUG
diff --git a/src/io/executor.cpp b/src/io/executor.cpp
index 6b0af3454e0cb9c41633bd793b76250028644abe..c09fe2c58532437336307ce007532d43689d8fd2 100644
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -32,6 +32,10 @@ limitations under the License. */
 #include "common/threadpool.h"
 #endif
 
+#ifdef PADDLE_MOBILE_FPGA
+#include "fpga/fpga_quantilization.h"
+#endif
+
 namespace paddle_mobile {
 using framework::Variable;
 
@@ -96,6 +100,11 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
   for (const auto &op : ops) {
     op->Init();
   }
+#ifdef PADDLE_MOBILE_FPGA
+  for (const auto &op : ops) {
+    quantilize_op(op, program_.scope);
+  }
+#endif
 }
 
 template <typename Dtype, Precision P>
@@ -420,6 +429,6 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
 
 template class Executor<CPU, Precision::FP32>;
 template class Executor<GPU_MALI, Precision::FP32>;
-template class Executor<FPGA, Precision::FP16>;
+template class Executor<FPGA, Precision::FP32>;
 
 }  // namespace paddle_mobile
diff --git a/src/io/loader.cpp b/src/io/loader.cpp
index 9ed877d05d51dfbe7139ea2289fdb6480c62f88f..cdcecf02ab8af22dec0e32113052ac26e9c5fcfc 100644
--- a/src/io/loader.cpp
+++ b/src/io/loader.cpp
@@ -56,7 +56,8 @@ template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
     const std::string &model_path, const std::string &para_path, bool optimize,
     bool quantification) {
-  auto program = this->LoadProgram(model_path, optimize);
+  auto program = this->LoadProgram(model_path, optimize, quantification);
+
   program.para_path = para_path;
   program.combined = true;
   program.quantification = quantification;
diff --git a/src/jni/paddle_mobile_jni.cpp b/src/jni/paddle_mobile_jni.cpp
index d4eb9e0f0733814cbe367a1873e241383127340a..66150e24e0ac957773161904948c10cf4637ee42 100644
--- a/src/jni/paddle_mobile_jni.cpp
+++ b/src/jni/paddle_mobile_jni.cpp
@@ -61,6 +61,15 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
                                          optimize);
 }
 
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
+    JNIEnv *env, jclass thiz, jstring modelPath) {
+  ANDROIDLOGI("loadQualified invoked");
+  bool optimize = true;
+  bool qualified = true;
+  return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
+                                         optimize, qualified);
+}
+
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
     JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) {
   ANDROIDLOGI("loadCombined invoked");
@@ -70,6 +79,16 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
                                          optimize);
 }
 
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
+    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) {
+  ANDROIDLOGI("loadCombinedQualified invoked");
+  bool optimize = true;
+  bool qualified = true;
+  return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
+                                         jstring2cppstring(env, paramPath),
+                                         optimize, qualified);
+}
+
 JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
     JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims) {
   ANDROIDLOGI("predictImage invoked");
diff --git a/src/jni/paddle_mobile_jni.h b/src/jni/paddle_mobile_jni.h
index a830ab43c8ee0598fbf75e1fef5f3eb7da06c27b..06fabe04c739dfcee06110a3592a88591e3d37b9 100644
--- a/src/jni/paddle_mobile_jni.h
+++ b/src/jni/paddle_mobile_jni.h
@@ -27,12 +27,24 @@ namespace jni {
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
                                                           jclass thiz,
                                                           jstring modelPath);
+
+/**
+ * load separated qualified model for android
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
+    JNIEnv *env, jclass thiz, jstring modelPath);
 /**
  * load combined model  for android
  */
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
     JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
 
+/**
+ * load combined qualified model for android
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
+    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
+
 /**
  * object detection for anroid
  */
diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp
index 178541953323b6ffd1a3339f8209c2839b37a784..42b8c4551871c58955251d94845ca13576d7735b 100644
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -27,17 +27,17 @@ namespace memory {
 const int MALLOC_ALIGN = 64;
 
 #ifdef PADDLE_MOBILE_FPGA
-namespace api = paddle::mobile::fpga::api;
+namespace fpga = paddle_mobile::fpga;
 
 void Copy(void *dst, const void *src, size_t num) {
   std::memcpy(dst, src, num);
 }
 
-void *Alloc(size_t size) { return api::malloc(size); }
+void *Alloc(size_t size) { return fpga::fpga_malloc(size); }
 
 void Free(void *ptr) {
   if (ptr) {
-    api::fpga_free(ptr);
+    fpga::fpga_free(ptr);
   }
 }
 
diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h
index 723747874da8fc8ee2c02eb1be4c89189c2af746..e45ad38fd68cb9b4616b7e363be117e2039c93a9 100644
--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -29,7 +29,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
          std::shared_ptr<framework::Scope> scope)
       : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
                                             scope),
-        param_(inputs, outputs, attrs, *scope) {}
+        param_(inputs, outputs, attrs, scope.get()) {}
   void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }
 
   void Init() {}
diff --git a/src/operators/kernel/arm/dropout_kernel.cpp b/src/operators/kernel/arm/dropout_kernel.cpp
index af16048a1b4eba2ff36f842b6cf968031989576e..db942b018d7085ca3986533937328101afb08ff9 100644
--- a/src/operators/kernel/arm/dropout_kernel.cpp
+++ b/src/operators/kernel/arm/dropout_kernel.cpp
@@ -14,8 +14,6 @@ limitations under the License. */
 
 #ifdef DROPOUT_OP
 
-#pragma once
-
 #include "operators/kernel/dropout_kernel.h"
 #include <operators/math/transform.h>
 
diff --git a/src/operators/kernel/dropout_kernel.h b/src/operators/kernel/dropout_kernel.h
index 5a3783971959db8fba9ca6b701fb6eb6340fcb3f..3ef6b9dd62d88f012eba3456c676ac0d33bf9e52 100644
--- a/src/operators/kernel/dropout_kernel.h
+++ b/src/operators/kernel/dropout_kernel.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "framework/operator.h"
 #include "operators/op_param.h"
 
-#pragma once;
+#pragma once
 
 namespace paddle_mobile {
 namespace operators {
diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp
index c691988f4a388c7835a7016602d7a1ac9cb5f9b6..627a94242ca4638640a7961120b36c9f763a0e85 100644
--- a/src/operators/kernel/fpga/concat_kernel.cpp
+++ b/src/operators/kernel/fpga/concat_kernel.cpp
@@ -39,7 +39,7 @@ void ConcatKernel<FPGA, half>::Compute(const ConcatParam &param) const {
 
   for (int i = 0; i < inputs.size(); ++i) {
     auto input = inputs[i];
-    auto channels = input[3];
+    auto channels = input->dims()[3];
     out_offset += channels;
     auto src = input->data<half>();
     for (int j = 0; j < pixels; ++j) {
diff --git a/src/operators/kernel/im2sequence_kernel.h b/src/operators/kernel/im2sequence_kernel.h
index cb592613f73d90dae5a7d6e515f8bc091981776e..aa798fd6af5592a062de207714dc9fee2afb93df 100644
--- a/src/operators/kernel/im2sequence_kernel.h
+++ b/src/operators/kernel/im2sequence_kernel.h
@@ -20,13 +20,11 @@ limitations under the License. */
 #include "operators/math/vol2col.h"
 #include "operators/op_param.h"
 
-#pragma once;
+#pragma once
 
 namespace paddle_mobile {
 namespace operators {
 
-using namespace framework;
-
 template <typename DeviceType, typename T>
 class Im2SequenceKernel
     : public framework::OpKernelBase<DeviceType, Im2SequenceParam> {
diff --git a/src/operators/kernel/mali/fushion_fc_kernel.cpp b/src/operators/kernel/mali/fushion_fc_kernel.cpp
index a76c3c46012a758a05cf8f846a15376ad1b9f33c..44a7ce2af62a1d27aff8181f6742bebda1d6d066 100755
--- a/src/operators/kernel/mali/fushion_fc_kernel.cpp
+++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp
@@ -14,8 +14,6 @@ limitations under the License. */
 
 #ifdef FUSION_FC_OP
 
-#pragma once
-
 #include "operators/kernel/fusion_fc_kernel.h"
 
 namespace paddle_mobile {
diff --git a/src/operators/kernel/prelu_kernel.h b/src/operators/kernel/prelu_kernel.h
index 9f5dcb23ee9bf44ffa8bbdd98879d533d07c39f9..15696174377f04ad9a62366e03ded1f2cdcdee9e 100644
--- a/src/operators/kernel/prelu_kernel.h
+++ b/src/operators/kernel/prelu_kernel.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "framework/operator.h"
 #include "operators/op_param.h"
 
-#pragma once;
+#pragma once
 
 namespace paddle_mobile {
 namespace operators {
diff --git a/src/operators/kernel/scale_kernel.h b/src/operators/kernel/scale_kernel.h
index cc76a4b59b0be24dd3a3fb82c0e3d9fb1a4dbf24..98ac71d0bbad86f595171ad7ac5b2a1cdf5908fa 100644
--- a/src/operators/kernel/scale_kernel.h
+++ b/src/operators/kernel/scale_kernel.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "framework/operator.h"
 #include "operators/op_param.h"
 
-#pragma once;
+#pragma once
 
 namespace paddle_mobile {
 namespace operators {
diff --git a/src/operators/kernel/slice_kernel.h b/src/operators/kernel/slice_kernel.h
index e308364602f401b1c6c6f8e2e35385aefa055360..fd3b8dc767076c5244509f6015c42bee87df100b 100644
--- a/src/operators/kernel/slice_kernel.h
+++ b/src/operators/kernel/slice_kernel.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "framework/operator.h"
 #include "operators/op_param.h"
 
-#pragma once;
+#pragma once
 
 namespace paddle_mobile {
 namespace operators {
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index b9b61f4d1c59a0e2c8e7822742c54472ad540981..20d71907ff9e391d97ce75e38b6e08dc1286a9a3 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -107,20 +107,22 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
       *buffer++ = *a3++;
     }
   }
-  int i = m - m_tail;
-  a0 = &A(i, 0);
-  a1 = a0 + lda;
-  a2 = a0 + 2 * lda;
-  a3 = a0 + 3 * lda;
+
   if (m_tail != 0) {
-    if (m_tail <= 3) {
-      a3 = zero;
-    }
-    if (m_tail <= 2) {
-      a2 = zero;
-    }
-    if (m_tail <= 1) {
-      a1 = zero;
+    a0 = &A(m - m_tail, 0);
+    a1 = a0 + lda;
+    a2 = a0 + 2 * lda;
+    a3 = a0 + 3 * lda;
+    switch (m_tail) {
+      case 1:
+        a1 = zero;
+      case 2:
+        a2 = zero;
+      case 3:
+        a3 = zero;
+        break;
+      default:
+        break;
     }
     for (int j = 0; j < k; ++j) {
       *buffer++ = *a0++;
@@ -150,28 +152,89 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
       *buffer++ = *a5++;
     }
   }
-  int i = m - m_tail;
-  a0 = &A(i, 0);
-  a1 = a0 + lda;
-  a2 = a0 + 2 * lda;
-  a3 = a0 + 3 * lda;
-  a4 = a0 + 4 * lda;
-  a5 = a0 + 5 * lda;
   if (m_tail != 0) {
-    if (m_tail <= 5) {
-      a5 = zero;
+    a0 = &A(m - m_tail, 0);
+    a1 = a0 + lda;
+    a2 = a0 + 2 * lda;
+    a3 = a0 + 3 * lda;
+    a4 = a0 + 4 * lda;
+    a5 = a0 + 5 * lda;
+    switch (m_tail) {
+      case 1:
+        a1 = zero;
+      case 2:
+        a2 = zero;
+      case 3:
+        a3 = zero;
+      case 4:
+        a4 = zero;
+      case 5:
+        a5 = zero;
+        break;
+      default:
+        break;
     }
-    if (m_tail <= 4) {
-      a4 = zero;
-    }
-    if (m_tail <= 3) {
-      a3 = zero;
+    for (int j = 0; j < k; ++j) {
+      *buffer++ = *a0++;
+      *buffer++ = *a1++;
+      *buffer++ = *a2++;
+      *buffer++ = *a3++;
+      *buffer++ = *a4++;
+      *buffer++ = *a5++;
     }
-    if (m_tail <= 2) {
-      a2 = zero;
+  }
+}
+
+void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer) {
+  const float *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
+  for (int i = 0; i < m - m_tail; i += MR) {
+    a0 = A + i * lda;
+    a1 = A + (i + 1) * lda;
+    a2 = A + (i + 2) * lda;
+    a3 = A + (i + 3) * lda;
+    a4 = A + (i + 4) * lda;
+    a5 = A + (i + 5) * lda;
+    a6 = A + (i + 6) * lda;
+    a7 = A + (i + 7) * lda;
+    for (int j = 0; j < k; ++j) {
+      *buffer++ = *a0++;
+      *buffer++ = *a1++;
+      *buffer++ = *a2++;
+      *buffer++ = *a3++;
+      *buffer++ = *a4++;
+      *buffer++ = *a5++;
+      *buffer++ = *a6++;
+      *buffer++ = *a7++;
     }
-    if (m_tail <= 1) {
-      a1 = zero;
+  }
+  if (m_tail != 0) {
+    a0 = &A(m - m_tail, 0);
+    a1 = a0 + lda;
+    a2 = a0 + 2 * lda;
+    a3 = a0 + 3 * lda;
+    a4 = a0 + 4 * lda;
+    a5 = a0 + 5 * lda;
+    a6 = a0 + 6 * lda;
+    a7 = a0 + 7 * lda;
+    switch (m_tail) {
+      case 1:
+        a1 = zero;
+      case 2:
+        a2 = zero;
+      case 3:
+        a3 = zero;
+      case 4:
+        a4 = zero;
+      case 5:
+        a5 = zero;
+      case 6:
+        a6 = zero;
+      case 7:
+        a7 = zero;
+        break;
+      default:
+        break;
     }
     for (int j = 0; j < k; ++j) {
       *buffer++ = *a0++;
@@ -180,6 +243,8 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
       *buffer++ = *a3++;
       *buffer++ = *a4++;
       *buffer++ = *a5++;
+      *buffer++ = *a6++;
+      *buffer++ = *a7++;
     }
   }
 }
@@ -234,15 +299,78 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
   }
 }
 
+#if __aarch64__
+void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
+                     float *buffer) {
+  const float *b0;
+  for (int j = 0; j < n - n_tail; j += NR) {
+    for (int i = 0; i < k; ++i) {
+      b0 = &B(i, j);
+      asm volatile(
+          "prfm   pldl2keep,        [%[b0], #64]           \n\t"
+          "ld1    {v0.4s, v1.4s, v2.4s},   [%[b0]]           \n\t"
+          "st1    {v0.4s, v1.4s, v2.4s},   [%[buffer]],  #48 \n\t"
+          : [buffer] "+r"(buffer)
+          : [b0] "r"(b0)
+          : "memory", "v0", "v1", "v2");
+    }
+  }
+  if (n_tail != 0) {
+    for (int i = 0; i < k; ++i) {
+      b0 = &B(i, n - n_tail);
+      for (int j = n - n_tail; j < n; ++j) {
+        *buffer++ = *b0++;
+      }
+      for (int j = n; j < n + (NR - n_tail); ++j) {
+        *buffer++ = 0;
+      }
+    }
+  }
+}
+
+void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
+                     float *buffer) {
+  const float *b0;
+  for (int j = 0; j < n - n_tail; j += NR) {
+    for (int i = 0; i < k; ++i) {
+      b0 = &B(i, j);
+      asm volatile(
+          "prfm   pldl2keep,        [%[b0], #64]           \n\t"
+          "ld1    {v0.4s, v1.4s, v2.4s, v3.4s},   [%[b0]]           \n\t"
+          "st1    {v0.4s, v1.4s, v2.4s, v3.4s},   [%[buffer]],  #64 \n\t"
+          : [buffer] "+r"(buffer)
+          : [b0] "r"(b0)
+          : "memory", "v0", "v1", "v2", "v3");
+    }
+  }
+  if (n_tail != 0) {
+    for (int i = 0; i < k; ++i) {
+      b0 = &B(i, n - n_tail);
+      for (int j = n - n_tail; j < n; ++j) {
+        *buffer++ = *b0++;
+      }
+      for (int j = n; j < n + (NR - n_tail); ++j) {
+        *buffer++ = 0;
+      }
+    }
+  }
+}
+#endif  // __aarch64__
+
 // 分块矩阵乘法
 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
                  float beta, float *c, float *C, int ldc, bool relu) {
 #pragma omp parallel for
   for (int j = 0; j < nc; j += NR) {
     for (int i = 0; i < mc; i += MR) {
+#if __aarch64__
+      // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+#else
       // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
       // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
       AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+#endif
     }
   }
 
@@ -271,9 +399,14 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
 #pragma omp parallel for
   for (int j = 0; j < nc; j += NR) {
     for (int i = 0; i < mc; i += MR) {
+#if __aarch64__
+      // AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+#else
       // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
       // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
       AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+#endif
     }
   }
 
@@ -1956,10 +2089,20 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
   int mc, nc;
   for (int j = 0; j < n; j += NC) {
     nc = s_min(n - j, NC);
+#if __aarch64__
+    // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#else
     PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#endif
     for (int i = 0; i < m; i += MC) {
       mc = s_min(m - i, MC);
+#if __aarch64__
       PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#else
+      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#endif
       InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
                   relu);
     }
@@ -2009,10 +2152,20 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
   int mc, nc;
   for (int j = 0; j < n; j += NC) {
     nc = s_min(n - j, NC);
+#if __aarch64__
+    // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#else
     PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+#endif
     for (int i = 0; i < m; i += MC) {
       mc = s_min(m - i, MC);
+#if __aarch64__
       PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#else
+      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+#endif
       InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
                         &C(i, j), ldc, relu, new_scale + i, new_bias + i);
     }
@@ -2239,6 +2392,192 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
 #endif  // __ARM_NEON
 }
 
+#if __aarch64__
+void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
+  const float *a_ptr, *b_ptr;
+  a_ptr = a;
+  b_ptr = b;
+  int kc1 = k;
+  int step = 4 * ldc;
+  asm volatile(
+      "dup      v5.4s,     wzr     \n\t"
+      "dup      v6.4s,     wzr     \n\t"
+      "dup      v7.4s,     wzr     \n\t"
+      "dup      v8.4s,     wzr     \n\t"
+      "dup      v9.4s,     wzr     \n\t"
+      "dup      v10.4s,    wzr     \n\t"
+      "dup      v11.4s,    wzr     \n\t"
+      "dup      v12.4s,    wzr     \n\t"
+      "dup      v13.4s,    wzr     \n\t"
+      "dup      v14.4s,    wzr     \n\t"
+      "dup      v15.4s,    wzr     \n\t"
+      "dup      v16.4s,    wzr     \n\t"
+
+      "dup      v17.4s,    wzr     \n\t"
+      "dup      v18.4s,    wzr     \n\t"
+      "dup      v19.4s,    wzr     \n\t"
+      "dup      v20.4s,    wzr     \n\t"
+      "dup      v21.4s,    wzr     \n\t"
+      "dup      v22.4s,    wzr     \n\t"
+      "dup      v23.4s,    wzr     \n\t"
+      "dup      v24.4s,    wzr     \n\t"
+      "dup      v25.4s,    wzr     \n\t"
+      "dup      v26.4s,    wzr     \n\t"
+      "dup      v27.4s,    wzr     \n\t"
+      "dup      v28.4s,    wzr     \n\t"
+
+      "subs       %[kc1], %[kc1], #1    \n\t"
+      "blt        end_kc1_%=            \n\t"
+      "loop_kc1_%=:                     \n\t"
+
+      "prfm     pldl1keep,         [%[a_ptr],   #32]  \n\t"
+      "prfm     pldl1keep,         [%[b_ptr],   #48]  \n\t"
+
+      "ld1      {v0.4s, v1.4s},         [%[a_ptr]],   #32   \n\t"
+      "ld1      {v2.4s, v3.4s, v4.4s},  [%[b_ptr]],   #48   \n\t"
+
+      "fmla     v5.4s,    v2.4s,   v0.s[0]       \n\t"
+      "fmla     v6.4s,    v3.4s,   v0.s[0]       \n\t"
+      "fmla     v7.4s,    v4.4s,   v0.s[0]       \n\t"
+      "fmla     v8.4s,    v2.4s,   v0.s[1]       \n\t"
+      "fmla     v9.4s,    v3.4s,   v0.s[1]       \n\t"
+      "fmla     v10.4s,   v4.4s,   v0.s[1]       \n\t"
+      "fmla     v11.4s,   v2.4s,   v0.s[2]       \n\t"
+      "fmla     v12.4s,   v3.4s,   v0.s[2]       \n\t"
+      "fmla     v13.4s,   v4.4s,   v0.s[2]       \n\t"
+      "fmla     v14.4s,   v2.4s,   v0.s[3]       \n\t"
+      "fmla     v15.4s,   v3.4s,   v0.s[3]       \n\t"
+      "fmla     v16.4s,   v4.4s,   v0.s[3]       \n\t"
+
+      "fmla     v17.4s,   v2.4s,   v1.s[0]       \n\t"
+      "fmla     v18.4s,   v3.4s,   v1.s[0]       \n\t"
+      "fmla     v19.4s,   v4.4s,   v1.s[0]       \n\t"
+      "fmla     v20.4s,   v2.4s,   v1.s[1]       \n\t"
+      "fmla     v21.4s,   v3.4s,   v1.s[1]       \n\t"
+      "fmla     v22.4s,   v4.4s,   v1.s[1]       \n\t"
+      "fmla     v23.4s,   v2.4s,   v1.s[2]       \n\t"
+      "fmla     v24.4s,   v3.4s,   v1.s[2]       \n\t"
+      "fmla     v25.4s,   v4.4s,   v1.s[2]       \n\t"
+      "fmla     v26.4s,   v2.4s,   v1.s[3]       \n\t"
+      "fmla     v27.4s,   v3.4s,   v1.s[3]       \n\t"
+      "fmla     v28.4s,   v4.4s,   v1.s[3]       \n\t"
+
+      "subs       %[kc1], %[kc1], #1      \n\t"
+      "bge        loop_kc1_%=             \n\t"
+      "end_kc1_%=:                        \n\t"
+
+      "st1      {v5.4s,   v6.4s,  v7.4s},    [%[c]],   %[step]   \n\t"
+      "st1      {v8.4s,   v9.4s,  v10.4s},   [%[c]],   %[step]   \n\t"
+      "st1      {v11.4s,  v12.4s, v13.4s},   [%[c]],   %[step]   \n\t"
+      "st1      {v14.4s,  v15.4s, v16.4s},   [%[c]],   %[step]   \n\t"
+      "st1      {v17.4s,  v18.4s, v19.4s},   [%[c]],   %[step]   \n\t"
+      "st1      {v20.4s,  v21.4s, v22.4s},   [%[c]],   %[step]   \n\t"
+      "st1      {v23.4s,  v24.4s, v25.4s},   [%[c]],   %[step]   \n\t"
+      "st1      {v26.4s,  v27.4s, v28.4s},   [%[c]],   %[step]   \n\t"
+      :
+      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
+        [step] "r"(step)
+      : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+        "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+        "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28");
+}
+
+void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) {
+  const float *a_ptr, *b_ptr;
+  a_ptr = a;
+  b_ptr = b;
+  int kc1 = k;
+  int step = 4 * ldc;
+  int step1 = 4 * 6;
+  asm volatile(
+
+      "dup      v6.4s,     wzr     \n\t"
+      "dup      v7.4s,     wzr     \n\t"
+      "dup      v8.4s,     wzr     \n\t"
+      "dup      v9.4s,     wzr     \n\t"
+      "dup      v10.4s,    wzr     \n\t"
+      "dup      v11.4s,    wzr     \n\t"
+      "dup      v12.4s,    wzr     \n\t"
+      "dup      v13.4s,    wzr     \n\t"
+
+      "dup      v14.4s,    wzr     \n\t"
+      "dup      v15.4s,    wzr     \n\t"
+      "dup      v16.4s,    wzr     \n\t"
+      "dup      v17.4s,    wzr     \n\t"
+      "dup      v18.4s,    wzr     \n\t"
+      "dup      v19.4s,    wzr     \n\t"
+      "dup      v20.4s,    wzr     \n\t"
+      "dup      v21.4s,    wzr     \n\t"
+
+      "dup      v22.4s,    wzr     \n\t"
+      "dup      v23.4s,    wzr     \n\t"
+      "dup      v24.4s,    wzr     \n\t"
+      "dup      v25.4s,    wzr     \n\t"
+      "dup      v26.4s,    wzr     \n\t"
+      "dup      v27.4s,    wzr     \n\t"
+      "dup      v28.4s,    wzr     \n\t"
+      "dup      v29.4s,    wzr     \n\t"
+
+      "subs       %[kc1], %[kc1], #1    \n\t"
+      "blt        end_kc1_%=            \n\t"
+      "loop_kc1_%=:                     \n\t"
+
+      "prfm   pldl1keep,  [%[a_ptr],  #24]  \n\t"
+      "prfm   pldl1keep,  [%[b_ptr],  #64]  \n\t"
+
+      "ld1      {v0.4s, v1.4s},  [%[a_ptr]],   %[step1]       \n\t"
+      "ld1      {v2.4s, v3.4s, v4.4s, v5.4s},  [%[b_ptr]],    #64   \n\t"
+
+      "fmla     v6.4s,    v2.4s,   v0.s[0]       \n\t"
+      "fmla     v7.4s,    v3.4s,   v0.s[0]       \n\t"
+      "fmla     v8.4s,    v4.4s,   v0.s[0]       \n\t"
+      "fmla     v9.4s,    v5.4s,   v0.s[0]       \n\t"
+
+      "fmla     v10.4s,   v2.4s,   v0.s[1]       \n\t"
+      "fmla     v11.4s,   v3.4s,   v0.s[1]       \n\t"
+      "fmla     v12.4s,   v4.4s,   v0.s[1]       \n\t"
+      "fmla     v13.4s,   v5.4s,   v0.s[1]       \n\t"
+
+      "fmla     v14.4s,   v2.4s,   v0.s[2]       \n\t"
+      "fmla     v15.4s,   v3.4s,   v0.s[2]       \n\t"
+      "fmla     v16.4s,   v4.4s,   v0.s[2]       \n\t"
+      "fmla     v17.4s,   v5.4s,   v0.s[2]       \n\t"
+
+      "fmla     v18.4s,   v2.4s,   v0.s[3]       \n\t"
+      "fmla     v19.4s,   v3.4s,   v0.s[3]       \n\t"
+      "fmla     v20.4s,   v4.4s,   v0.s[3]       \n\t"
+      "fmla     v21.4s,   v5.4s,   v0.s[3]       \n\t"
+
+      "fmla     v22.4s,   v2.4s,   v1.s[0]       \n\t"
+      "fmla     v23.4s,   v3.4s,   v1.s[0]       \n\t"
+      "fmla     v24.4s,   v4.4s,   v1.s[0]       \n\t"
+      "fmla     v25.4s,   v5.4s,   v1.s[0]       \n\t"
+
+      "fmla     v26.4s,   v2.4s,   v1.s[1]       \n\t"
+      "fmla     v27.4s,   v3.4s,   v1.s[1]       \n\t"
+      "fmla     v28.4s,   v4.4s,   v1.s[1]       \n\t"
+      "fmla     v29.4s,   v5.4s,   v1.s[1]       \n\t"
+
+      "subs       %[kc1], %[kc1], #1      \n\t"
+      "bge        loop_kc1_%=             \n\t"
+      "end_kc1_%=:                        \n\t"
+
+      "st1      {v6.4s,  v7.4s,  v8.4s,  v9.4s},    [%[c]],   %[step]   \n\t"
+      "st1      {v10.4s, v11.4s, v12.4s, v13.4s},   [%[c]],   %[step]   \n\t"
+      "st1      {v14.4s, v15.4s, v16.4s, v17.4s},   [%[c]],   %[step]   \n\t"
+      "st1      {v18.4s, v19.4s, v20.4s, v21.4s},   [%[c]],   %[step]   \n\t"
+      "st1      {v22.4s, v23.4s, v24.4s, v25.4s},   [%[c]],   %[step]   \n\t"
+      "st1      {v26.4s, v27.4s, v28.4s, v29.4s},   [%[c]],   %[step]   \n\t"
+      :
+      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
+        [step] "r"(step), [step1] "r"(step1)
+      : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+        "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+        "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29");
+}
+
+#endif  // __aarch64__
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h
index 2044c264ed1c0f8624690874ed248661a753804c..a9593b15ae73f46aa287028ba74efdb0d303fdde 100644
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -19,8 +19,13 @@ limitations under the License. */
 #define B(i, j) B[(i)*ldb + (j)]
 #define C(i, j) C[(i)*ldc + (j)]
 
+#if __aarch64__
+#define MR 6
+#define NR 16
+#else
 #define MR 6
 #define NR 8
+#endif
 
 #define s_min(i, j) ((i) < (j) ? (i) : (j))
 
@@ -43,10 +48,16 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
                     float *buffer);
 void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
                     float *buffer);
+void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer);
 
 // 将 B 矩阵分块复制到连续内存(RowMajor)
 void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
                     float *buffer);
+void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
+                     float *buffer);
+void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
+                     float *buffer);
 
 // 分块矩阵乘法
 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
@@ -70,6 +81,8 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
 void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
 void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
 void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
 
 // 分块矩阵乘法结果回写
 // C = A * B
@@ -114,10 +127,6 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                  const float *B, int ldb, float beta, float *C, int ldc,
                  bool relu, float *new_scale, float *new_bias);
 
-// 64位 double 矩阵乘法
-void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
-           const double *B, int ldb, float beta, double *C, int ldc);
-
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index a1c9baad79df159b1784ef0dd5d12ccf7ed7fe11..88c1886ad7ade5960d1d8175a1b46e12363ca849 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -262,11 +262,11 @@ class ElementwiseAddParam : OpParam {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::FpgaEWAddArgs fpga_EW_add_args;
+  fpga::EWAddArgs fpga_EW_add_args;
 
  public:
-  const fpga::FpgaEWAddArgs &FpgaArgs() const { return fpga_EW_add_args; }
-  void SetFpgaArgs(const fpga::FpgaEWAddArgs &args) { fpga_EW_add_args = args; }
+  const fpga::EWAddArgs &FpgaArgs() const { return fpga_EW_add_args; }
+  void SetFpgaArgs(const fpga::EWAddArgs &args) { fpga_EW_add_args = args; }
 #endif
 };
 
@@ -465,11 +465,11 @@ class PoolParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::FpgaPoolArgs fpga_pool_args;
+  fpga::PoolingArgs fpga_pool_args;
 
  public:
-  const fpga::FpgaPoolArgs &FpgaArgs() const { return fpga_pool_args; }
-  void SetFpgaArgs(const fpga::FpgaPoolArgs &args) { fpga_pool_args = args; }
+  const fpga::PoolingArgs &FpgaArgs() const { return fpga_pool_args; }
+  void SetFpgaArgs(const fpga::PoolingArgs &args) { fpga_pool_args = args; }
 #endif
 };
 #endif
@@ -651,10 +651,10 @@ class MultiClassNMSParam : public OpParam {
 class FeedParam : public OpParam {
  public:
   FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-            const AttributeMap &attrs, Scope const &scope) {
-    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    out_ = OutFrom<LoDTensor>(outputs, scope);
-    auto var = scope.Var("batch_size");
+            const AttributeMap &attrs, Scope *scope) {
+    input_x_ = InputXFrom<LoDTensor>(inputs, *scope);
+    out_ = OutFrom<LoDTensor>(outputs, *scope);
+    auto var = scope->Var("batch_size");
     batch_size = var->GetValue<int>();
   }
   const Tensor *InputX() const { return input_x_; }
@@ -933,11 +933,11 @@ class FusionFcParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::FpgaConvArgs fpga_conv_args;
+  fpga::ConvArgs fpga_conv_args;
 
  public:
-  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 
@@ -991,11 +991,11 @@ class FusionConvAddParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::FpgaConvArgs fpga_conv_args;
+  fpga::ConvArgs fpga_conv_args;
 
  public:
-  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 
@@ -1096,11 +1096,11 @@ class FusionConvAddBNReluParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::FpgaConvArgs fpga_conv_args;
+  fpga::ConvArgs fpga_conv_args;
 
  public:
-  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
@@ -1190,11 +1190,11 @@ class FusionConvAddBNParam : public OpParam {
 #ifdef PADDLE_MOBILE_FPGA
 
  private:
-  fpga::FpgaConvArgs fpga_conv_args;
+  fpga::ConvArgs fpga_conv_args;
 
  public:
-  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
-  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
 #endif
 };
 #endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 418ebff79161675e8b23a4cca8f4319121aa6002..8839079fecfdbefcdaff85354d3a6a8208af10ee 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -114,8 +114,12 @@ else ()
     target_link_libraries(test-softmax paddle-mobile)
 
     # gen test
-    ADD_EXECUTABLE(test-gemm common/test_gemm.cpp)
-    target_link_libraries(test-gemm paddle-mobile)
+    ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp)
+    target_link_libraries(test-gemm-accuracy paddle-mobile)
+
+    # gen test
+    ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp)
+    target_link_libraries(test-gemm-perf paddle-mobile)
 
     # gen test
     ADD_EXECUTABLE(test-enforce common/test_enforce.cpp)
diff --git a/test/common/test_gemm.cpp b/test/common/test_gemm_accuracy.cpp
similarity index 100%
rename from test/common/test_gemm.cpp
rename to test/common/test_gemm_accuracy.cpp
diff --git a/test/common/test_gemm_perf.cpp b/test/common/test_gemm_perf.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..260236e24ea44a6fc5708d4d0dac239252d28945
--- /dev/null
+++ b/test/common/test_gemm_perf.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include "../test_helper.h"
+#include "operators/math/gemm.h"
+#include "operators/math/math_function.h"
+
+#define a(i, j) a[(i)*lda + (j)]
+#define b(i, j) b[(i)*ldb + (j)]
+#define c1(i, j) c1[(i)*ldc + (j)]
+
+#define m 1024
+#define n 1024
+#define k 1024
+
+int main() {
+  Tensor aa, bb, cc, scale, bias;
+  auto aaptr = aa.mutable_data<float>({m, k});
+  auto bbptr = bb.mutable_data<float>({k, n});
+  auto ccptr = cc.mutable_data<float>({m, n});
+  auto scaleptr = scale.mutable_data<float>({m});
+  auto biasptr = bias.mutable_data<float>({m});
+
+  for (int i = 0; i < m * k; ++i) {
+    aaptr[i] = 2;
+  }
+  for (int i = 0; i < k * n; ++i) {
+    bbptr[i] = 2;
+  }
+  for (int i = 0; i < m * n; ++i) {
+    ccptr[i] = 2;
+  }
+  for (int i = 0; i < m; ++i) {
+    scaleptr[i] = 1;
+    biasptr[i] = 0;
+  }
+
+  auto time1 = time();
+  for (int j = 0; j < 10; ++j) {
+    paddle_mobile::operators::math::matmul<float>(aa, false, bb, false,
+                                                  static_cast<float>(1), &cc,
+                                                  static_cast<float>(0), false);
+
+    //    paddle_mobile::operators::math::matmulWithBn<float>(
+    //        aa, false, bb, false, static_cast<float>(1), &cc,
+    //        static_cast<float>(0), true, &scale, &bias, 0);
+  }
+  auto time2 = time();
+  std::cout << "gemm  cost :" << time_diff(time1, time2) / 10 << "ms\n";
+
+  return 0;
+}
diff --git a/test/fpga/test_tensor_quant.cpp b/test/fpga/test_tensor_quant.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1e30b9be551c608c5200460ebb80526270da5aed
--- /dev/null
+++ b/test/fpga/test_tensor_quant.cpp
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include "../test_helper.h"
+#include "../test_include.h"
+
+int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  bool optimize = false;
+  if (paddle_mobile.Load(g_googlenet, optimize)) {
+    auto time2 = time();
+    DLOG << "load cost: " << time_diff(time1, time1) << "ms";
+    std::vector<float> input;
+    std::vector<int64_t> dims{1, 3, 224, 224};
+    GetInput<float>(g_test_image_1x3x224x224, &input, dims);
+    auto time3 = time();
+    auto vec_result = paddle_mobile.Predict(input, dims);
+    auto time4 = time();
+    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+  }
+  return 0;
+}
diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp
index bea7d4ba7d2df1344f0819222fbdb389106fa77e..25cad4feaa706899122902dee2a8f0c915e78975 100644
--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <string>
+
 #include "../test_helper.h"
 #include "io/loader.h"
 
@@ -20,12 +22,10 @@ int main() {
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
   //  auto program = loader.Load(g_googlenet, true);
+  //  auto program = loader.Load(g_mobilenet_ssd, true);
 
-  auto program = loader.Load(g_mobilenet_ssd, true);
-  //  auto program = loader.Load(g_googlenet_combine + "/model",
-  //  g_googlenet_combine +
-  //    "/params", true);
-
+  auto program = loader.Load(std::string(g_ocr) + "/model",
+                             std::string(g_ocr) + "/params", false);
   //  program.originProgram->Description("program desc: ");
   return 0;
 }
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index d230b9469229946fc74f4dc9e1ee6100196ed9aa..02882bedb01df49b8032325e506c9118f3434a2f 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <fstream>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 
@@ -23,15 +23,20 @@ int main() {
   auto time1 = time();
   if (paddle_mobile.Load(g_googlenet, optimize)) {
     auto time2 = time();
-    DLOG << "load cost: " << time_diff(time1, time1) << "ms";
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
     std::vector<float> input;
     std::vector<int64_t> dims{1, 3, 224, 224};
     GetInput<float>(g_test_image_1x3x224x224, &input, dims);
-    auto time3 = time();
+    // 预热一次
     auto vec_result = paddle_mobile.Predict(input, dims);
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      auto vec_result = paddle_mobile.Predict(input, dims);
+    }
     auto time4 = time();
 
-    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
+              << std::endl;
   }
   return 0;
 }
diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp
index 9b4e5f2d3a431001e138977b78994f5dfedbe0a3..ae6c40961ca96ea032b1822f17a663baedc8f661 100644
--- a/test/net/test_mobilenet+ssd.cpp
+++ b/test/net/test_mobilenet+ssd.cpp
@@ -32,10 +32,14 @@ int main() {
     std::vector<int64_t> dims{1, 3, 300, 300};
     GetInput<float>(g_hand, &input, dims);
 
-    auto time3 = time();
+    // 预热一次
     auto output = paddle_mobile.Predict(input, dims);
+    auto time3 = time();
+    for (int i = 0; i < 10; ++i) {
+      auto output = paddle_mobile.Predict(input, dims);
+    }
     auto time4 = time();
-    std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
+    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
               << std::endl;
   }
   return 0;
diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp
index 9fc7226fc12fa7a0c631c9920487c0bd56c90816..d7793f729866024e2560ad13ac5613172eecc4dd 100644
--- a/test/net/test_mobilenet.cpp
+++ b/test/net/test_mobilenet.cpp
@@ -26,19 +26,22 @@ int main() {
 
     std::vector<float> input;
     std::vector<int64_t> dims{1, 3, 224, 224};
-    GetInput<float>(g_test_image_1x3x224x224, &input, dims);
+    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
 
+    // 预热一次
+    auto vec_result = paddle_mobile.Predict(input, dims);
+    std::vector<float>::iterator biggest =
+        std::max_element(std::begin(vec_result), std::end(vec_result));
+    std::cout << " Max element is " << *biggest << " at position "
+              << std::distance(std::begin(vec_result), biggest) << std::endl;
+
+    auto time3 = time();
     for (int i = 0; i < 10; ++i) {
-      auto time3 = time();
       auto vec_result = paddle_mobile.Predict(input, dims);
-      auto time4 = time();
-      std::vector<float>::iterator biggest =
-          std::max_element(std::begin(vec_result), std::end(vec_result));
-      std::cout << " Max element is " << *biggest << " at position "
-                << std::distance(std::begin(vec_result), biggest) << std::endl;
-      std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
-                << std::endl;
     }
+    auto time4 = time();
+    std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
+              << std::endl;
   }
 
   return 0;
diff --git a/test/test_helper.h b/test/test_helper.h
index 9a5c62c79c44fdf52657ea5facb5f0768810c440..658af447d6cfcd85c68ff350b104c2468d442e40 100644
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "framework/ddim.h"
 #include "framework/tensor.h"
 
+static const char *g_ocr = "../models/ocr";
 static const char *g_mobilenet_ssd = "../models/mobilenet+ssd";
 static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture";
 static const char *g_squeezenet = "../models/squeezenet";
diff --git a/test_gemm.cpp b/test_gemm.cpp
deleted file mode 100644
index 6a49193256d8293dc2cef559b1d1e73bc6dfc7bb..0000000000000000000000000000000000000000
--- a/test_gemm.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <iostream>
-#include <cstdlib>
-#include <ctime>
-#include "../test_helper.h"
-#include "common/log.h"
-#include "memory/t_malloc.h"
-#include "operators/math/gemm.h"
-
-#define a(i, j) a[(i)*lda + (j)]
-#define b(i, j) b[(i)*ldb + (j)]
-#define c(i, j) c[(i)*ldc + (j)]
-#define c1(i, j) c1[(i)*ldc + (j)]
-
-
-void print_matirx(int m, int n, int ldc, float *c) {
-    for (int i = 0; i < m; ++i) {
-        std::cout << c(i, 0);
-        for (int j = 1; j < n; ++j) {
-            std::cout << " | " << c(i, j);
-        }
-        std::cout << std::endl;
-    }
-    std::cout << std::endl;
-}
-
-int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
-  int lda = k;
-  int ldb = n;
-  int ldc = n;
-
-  float *a = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * k));
-  float *b = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * k * n));
-  float *c = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
-  float *c1 = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
-  float* scale = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
-  float* bias = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
-  
-  srand(unsigned(time(0)));
-  for (int i = 0; i < m * k; ++i) {
-    a[i] = t1 + rand() % t2; 
-  }
-  for (int i = 0; i < k * n; ++i) {
-    b[i] = t1 + rand() % t2; 
-  }
-  for (int i = 0; i < m; ++i) {
-    scale[i] = t1 + rand() % t2; 
-  }
-  for (int i = 0; i < m; ++i) {
-    bias[i] = t1 + rand() % t2; 
-  }
-  
-  for (int i = 0; i < m; ++i) {
-    for (int j = 0; j < n; ++j) {
-      float r = 0;
-      for (int p = 0; p < k; p++) {
-        r += a(i, p) * b(p, j);
-      }
-      r *= scale[i];
-      r += bias[i];
-      if (relu && (r < 0)) {
-        r = 0;
-      }
-      c1(i, j) = r;
-    }
-  }
-  
-  paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda,
-                 b, ldb, 0.3, c, ldc, relu, scale, bias);
-  int eq = 0;
-  int neq = 0;
-  for (int i = 0; i < m * n; ++i) {
-    if (static_cast<int>(c[i]) == static_cast<int>(c1[i])) {
-        ++eq;
-    } else {
-        ++neq;
-    }
-  }
-  
-  if (pr > 0) {
-    std::cout << "A:" << std::endl;
-    print_matirx(m, k, lda, a);
-    
-    std::cout << "B:" << std::endl;
-    print_matirx(k, n, ldb, b);
-    
-    std::cout << "C:" << std::endl;
-    print_matirx(m, n, ldc, c);
-
-    std::cout << "C1:" << std::endl;
-    print_matirx(m, n, ldc, c1);
-  }
-    
-  std::cout << "mnk=" << m << " " << n << " " << k << 
-    " relu=" << relu <<
-    "   eq=" << eq << " neq=" << neq << std::endl;
-
-  paddle_mobile::memory::Free(a);
-  paddle_mobile::memory::Free(b);
-  paddle_mobile::memory::Free(c);
-  paddle_mobile::memory::Free(c1);
-  paddle_mobile::memory::Free(scale);
-  paddle_mobile::memory::Free(bias);
-        
-  return 0;
-}
-
-int main() {
-    
-  do_sgemm(9, 9, 9, true, 10, 10, 10);
-  do_sgemm(10, 6, 12, false, 10, 10, 0);
-  do_sgemm(512, 256, 384, false, 10, 10, 0);
-  do_sgemm(1366, 768, 256, false, 10, 10, 0);
-  do_sgemm(1255, 755, 333, false, 10, 10, 0);
-  do_sgemm(555, 777, 999, false, 10, 10, 0);
-  
-  do_sgemm(10, 6, 12, true, -4, 10, 0);
-  do_sgemm(512, 256, 384, true, -4, 10, 0);
-  do_sgemm(1366, 768, 256, true, -4, 10, 0);
-  do_sgemm(1255, 755, 333, true, -4, 10, 0);
-  do_sgemm(555, 777, 999, true, -4, 10, 0);
-  return 0;
-}
diff --git a/tools/build.sh b/tools/build.sh
index ced18a180762826ffa2c45949e5aab9bfe5c8f88..bf3545ef162c86c16c0877f5f25f3a1e09de1fd4 100755
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -40,8 +40,8 @@ build_for_android() {
     fi
 
     if [ -z "$PLATFORM" ]; then
-#        PLATFORM="arm-v7a"  # Users could choose "arm-v8a" platform.
-        PLATFORM="arm-v8a"
+        PLATFORM="arm-v7a"  # Users could choose "arm-v8a" platform.
+#        PLATFORM="arm-v8a"
     fi
 
     if [ "${PLATFORM}" = "arm-v7a" ]; then
diff --git a/tools/quantification/convert.cpp b/tools/quantification/convert.cpp
index 88eef48b39ab8d2aeb1d4e3858ba97ef6360c9a9..282b22073fc96ddb2ed0d421f113604aadcc4afc 100644
--- a/tools/quantification/convert.cpp
+++ b/tools/quantification/convert.cpp
@@ -3,8 +3,8 @@
 #include "src/enforce.h"
 #include "src/var_desc.h"
 #include "src/program_desc.h"
+#include <cstring>
 #include <cstdlib>
-#include <string>
 #include <cmath>
 #include <iostream>
 #include <utility>
@@ -13,7 +13,7 @@
 #include "src/protobuf-c.h"
 #include <fstream>
 #include <iostream>
-
+#include <limits>
 
 const size_t kSize64 = sizeof(uint64_t);
 const size_t kSize32 = sizeof(uint32_t);
@@ -68,60 +68,60 @@ std::shared_ptr<ProgramDesc> loadParams(const std::string &model_path) {
 
 }
 
-void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char *dataP, FILE *out_file) {
+void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file) {
     // 1. version
-    uint32_t version = *reinterpret_cast<uint32_t *>(dataP);
+    uint32_t version = *reinterpret_cast<uint32_t *>(*dataP);
 
     // write version
     fwrite(&version, kSize32, 1, out_file);
 
-    dataP += kSize32;
+    *dataP += kSize32;
 
     // 2 Lod information
     auto *lod_level_ptr = new uint64_t();
-    memcpy(lod_level_ptr, dataP, kSize64);
+    memcpy(lod_level_ptr, *dataP, kSize64);
 
     uint64_t lod_level = 0;
     // write lod Information
     fwrite(&lod_level, kSize64, 1, out_file);
     delete lod_level_ptr;
 
-    dataP += kSize64;
+    *dataP += kSize64;
 
     for (uint64_t i = 0; i < lod_level; ++i) {
-        uint64_t size = *reinterpret_cast<uint64_t *>(dataP);
+        uint64_t size = *reinterpret_cast<uint64_t *>(*dataP);
         // write lod size
         fwrite(&size, kSize64, 1, out_file);
-        (dataP) += kSize64;
+        (*dataP) += kSize64;
 
         std::vector<size_t> tmp(size / sizeof(size_t));
         for (unsigned long &k : tmp) {
-            k = *reinterpret_cast<size_t *>(dataP);
-            (dataP) += sizeof(size_t);
+            k = *reinterpret_cast<size_t *>(*dataP);
+            (*dataP) += sizeof(size_t);
         }
         // write lod size vector
         fwrite(&tmp, sizeof(size_t), tmp.size(), out_file);
     }
 
     // 3. tensor version
-    uint32_t tensor_version = *reinterpret_cast<uint32_t *>(dataP);
+    uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*dataP);
     // write tensor version
     fwrite(&tensor_version, kSize32, 1, out_file);
-    (dataP) += kSize32;
+    (*dataP) += kSize32;
 
     // 4. tensor desc
-    int32_t size = *reinterpret_cast<int32_t *>(dataP);
+    int32_t size = *reinterpret_cast<int32_t *>(*dataP);
     // write tensor desc
     fwrite(&size, sizeof(int32_t), 1, out_file);
-    (dataP) += sizeof(int32_t);
+    (*dataP) += sizeof(int32_t);
 
     std::unique_ptr<char[]> buf(new char[size]);
     for (int m = 0; m < size; ++m) {
-        buf.get()[m] = (dataP)[m];
+        buf.get()[m] = (*dataP)[m];
     }
 
     fwrite(buf.get(), sizeof(char), static_cast<size_t>(size), out_file);
-    (dataP) += (sizeof(char) * size);
+    (*dataP) += (sizeof(char) * size);
 
     const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
     int memory_size = 1;
@@ -158,9 +158,9 @@ void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char *dataP
     memory = new char[tensorSize];
 
     for (int n = 0; n < tensorSize; ++n) {
-        static_cast<char *>(memory)[n] = (dataP)[n];
+        static_cast<char *>(memory)[n] = (*dataP)[n];
     }
-    dataP += tensorSize;
+    *dataP += tensorSize;
 
     // for float 32
     float min_value = std::numeric_limits<float>::max();
@@ -194,7 +194,7 @@ quantificate_combined(const std::string &model_path, const std::string &param_pa
                 if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
                     continue;
                 }
-                LoadWithDump(*var_desc, data, out_file);
+                LoadWithDump(*var_desc, &data, out_file);
             }
         }
     }
@@ -220,7 +220,7 @@ void quantificate_seperated(const std::string model_dir, const std::string param
                 FILE *out_file = fopen(file_name.c_str(), "wb");
                 char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
                 char *data = origin_data;
-                LoadWithDump(*var_desc, data, out_file);
+                LoadWithDump(*var_desc, &data, out_file);
                 delete origin_data;
                 fclose(out_file);
             }
diff --git a/tools/quantification/src/block_desc_local.h b/tools/quantification/src/block_desc_local.h
index 41c2dc0abbdf8bb006f4152674e92dd1f7d01500..2ee8132af7f21ed0e62678c8da510bfd7fba9dbd 100644
--- a/tools/quantification/src/block_desc_local.h
+++ b/tools/quantification/src/block_desc_local.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #ifndef TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
 #define TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
 
+#include <memory>
 #include <vector>
 #include "src/var_desc.h"