Commit cf86fb69 authored by qnqinan

Merge remote-tracking branch 'origin/develop' into develop

......@@ -9,7 +9,6 @@ option(LOG_PROFILE "log profile" ON)
option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" OFF)
option(FPGA "fpga" OFF)
option(QUANTI "quantification" OFF)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
......@@ -163,7 +162,4 @@ if(DEBUGING)
endif()
endif()
if (QUANTI)
add_subdirectory(tools/quantification)
endif ()
......@@ -27,7 +27,7 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms
- **ARM CPU**
![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_18.png)
![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_29.png)
ARM CPU is paddle-mobile's primary focus, and the generality of the CPU has always been its advantage. Embedded deep learning requires a large amount of hand-written CPU assembly; we are coding intensively to exploit every bit of acceleration the hardware offers.
ARM CPU optimization is still in progress; only conventional CPU optimizations are in place so far. On a Cortex-A73, paddle-mobile (armv7) currently runs a single-core MobileNet 1.0 inference in 110+ ms. This is clearly not our final goal: we are rewriting the hot paths in assembly, so there is still substantial headroom for improvement. Only armv7 is supported today; armv8 support will follow.
......
# Quantification: model quantization and dequantization
## Background
Models trained from some networks, such as AlexNet, are too large to be practical on mobile devices.
## How to shrink an oversized model
1. Choose a network architecture designed for mobile, such as MobileNet, GoogLeNet, YOLO, or SqueezeNet;
2. Use the quantization tool we provide, which shrinks a float32 model to roughly 1/4 of its original size with almost no loss of accuracy (see the sketch below).
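Under the hood the tool applies per-tensor linear min/max quantization: every float32 weight is mapped to one uint8 plus the tensor's min and max (this is the transform written by `tools/quantification/convert.cpp`). A minimal sketch of the forward and inverse mapping:

```c++
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Quantize one tensor to uint8 with its per-tensor min/max, mirroring the
// transform written by tools/quantification/convert.cpp.
std::vector<uint8_t> Quantize(const std::vector<float> &v, float *mn, float *mx) {
  *mn = *std::min_element(v.begin(), v.end());
  *mx = *std::max_element(v.begin(), v.end());
  float range = (*mx > *mn) ? (*mx - *mn) : 1.0f;  // guard constant tensors
  std::vector<uint8_t> q(v.size());
  for (size_t i = 0; i < v.size(); ++i)
    q[i] = static_cast<uint8_t>(std::round((v[i] - *mn) / range * 255));
  return q;
}

// Dequantize: the approximation the runtime reconstructs at load time.
inline float Dequantize(uint8_t factor, float mn, float mx) {
  return mn + factor / 255.0f * (mx - mn);
}
```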
- - - - -
## Quantization tool overview
### Model conversion tool locations:
- [Quantization tool directory](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/tools/quantification)
- [Model conversion tool](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/convert.cpp)
#### Usage
- [Tool usage](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/README.md)
## How to load a quantized model
A quantification parameter has been added to the load method, defaulting to false. To load a quantized model, simply pass true.
[Source code](https://github.com/PaddlePaddle/paddle-mobile/blob/55302b33ea3bd68c9797d8f65e527544792b8095/src/io/paddle_mobile.h)
```c++
bool Load(const std::string &dirname, bool optimize = false,
bool quantification = false, int batch_size = 1);
```
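For example (a sketch; the model directory is a placeholder):

```c++
#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  // optimize = true enables the fusion pass; quantification = true tells the
  // loader the params were produced by the quantification tool.
  bool ok = paddle_mobile.Load("../models/mobilenet_min", /*optimize=*/true,
                               /*quantification=*/true);
  return ok ? 0 : 1;
}
```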
- - - - -
......@@ -17,39 +17,39 @@ limitations under the License. */
namespace paddle_mobile {
const std::string G_OP_TYPE_CONV = "conv2d";
const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
const std::string G_OP_TYPE_BOX_CODER = "box_coder";
const std::string G_OP_TYPE_CONCAT = "concat";
const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const std::string G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const std::string G_OP_TYPE_FC = "fusion_fc";
const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const std::string G_OP_TYPE_LRN = "lrn";
const std::string G_OP_TYPE_MUL = "mul";
const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
const std::string G_OP_TYPE_POOL2D = "pool2d";
const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
const std::string G_OP_TYPE_RELU = "relu";
const std::string G_OP_TYPE_RESHAPE = "reshape";
const std::string G_OP_TYPE_SIGMOID = "sigmoid";
const std::string G_OP_TYPE_SOFTMAX = "softmax";
const std::string G_OP_TYPE_TRANSPOSE = "transpose";
const std::string G_OP_TYPE_SPLIT = "split";
const std::string G_OP_TYPE_FEED = "feed";
const std::string G_OP_TYPE_FETCH = "fetch";
const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
const std::string G_OP_TYPE_DROPOUT = "dropout";
const std::string G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
const std::string G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
const char *G_OP_TYPE_CONV = "conv2d";
const char *G_OP_TYPE_BATCHNORM = "batch_norm";
const char *G_OP_TYPE_BOX_CODER = "box_coder";
const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const char *G_OP_TYPE_FC = "fusion_fc";
const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul";
const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
const char *G_OP_TYPE_POOL2D = "pool2d";
const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
const char *G_OP_TYPE_RELU = "relu";
const char *G_OP_TYPE_RESHAPE = "reshape";
const char *G_OP_TYPE_SIGMOID = "sigmoid";
const char *G_OP_TYPE_SOFTMAX = "softmax";
const char *G_OP_TYPE_TRANSPOSE = "transpose";
const char *G_OP_TYPE_SPLIT = "split";
const char *G_OP_TYPE_FEED = "feed";
const char *G_OP_TYPE_FETCH = "fetch";
const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence";
const char *G_OP_TYPE_DROPOUT = "dropout";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
"fusion_elementwise_add_relu";
const std::string G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
const std::string G_OP_TYPE_REGION = "region";
const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
const char *G_OP_TYPE_REGION = "region";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......
......@@ -73,40 +73,40 @@ enum PMStatus {
PMWrongDevice = 0x08 /*!< incorrect device. */
};
extern const std::string G_OP_TYPE_CONV;
extern const std::string G_OP_TYPE_BATCHNORM;
extern const std::string G_OP_TYPE_BOX_CODER;
extern const std::string G_OP_TYPE_CONCAT;
extern const std::string G_OP_TYPE_ELEMENTWISE_ADD;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const std::string G_OP_TYPE_FC;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU;
extern const std::string G_OP_TYPE_FUSION_CONV_BN_RELU;
extern const std::string G_OP_TYPE_LRN;
extern const std::string G_OP_TYPE_MUL;
extern const std::string G_OP_TYPE_MULTICLASS_NMS;
extern const std::string G_OP_TYPE_POOL2D;
extern const std::string G_OP_TYPE_PRIOR_BOX;
extern const std::string G_OP_TYPE_RELU;
extern const std::string G_OP_TYPE_RESHAPE;
extern const std::string G_OP_TYPE_SIGMOID;
extern const std::string G_OP_TYPE_SOFTMAX;
extern const std::string G_OP_TYPE_TRANSPOSE;
extern const std::string G_OP_TYPE_SPLIT;
extern const std::string G_OP_TYPE_FEED;
extern const std::string G_OP_TYPE_FETCH;
extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
extern const std::string G_OP_TYPE_IM2SEQUENCE;
extern const std::string G_OP_TYPE_DROPOUT;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN;
extern const std::string G_OP_TYPE_FUSION_POOL_BN;
extern const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
extern const std::string G_OP_TYPE_FUSION_FC_RELU;
extern const std::string G_OP_TYPE_REGION;
extern const char *G_OP_TYPE_CONV;
extern const char *G_OP_TYPE_BATCHNORM;
extern const char *G_OP_TYPE_BOX_CODER;
extern const char *G_OP_TYPE_CONCAT;
extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const char *G_OP_TYPE_FC;
extern const char *G_OP_TYPE_FUSION_CONV_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
extern const char *G_OP_TYPE_LRN;
extern const char *G_OP_TYPE_MUL;
extern const char *G_OP_TYPE_MULTICLASS_NMS;
extern const char *G_OP_TYPE_POOL2D;
extern const char *G_OP_TYPE_PRIOR_BOX;
extern const char *G_OP_TYPE_RELU;
extern const char *G_OP_TYPE_RESHAPE;
extern const char *G_OP_TYPE_SIGMOID;
extern const char *G_OP_TYPE_SOFTMAX;
extern const char *G_OP_TYPE_TRANSPOSE;
extern const char *G_OP_TYPE_SPLIT;
extern const char *G_OP_TYPE_FEED;
extern const char *G_OP_TYPE_FETCH;
extern const char *G_OP_TYPE_DEPTHWISE_CONV;
extern const char *G_OP_TYPE_IM2SEQUENCE;
extern const char *G_OP_TYPE_DROPOUT;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN;
extern const char *G_OP_TYPE_FUSION_POOL_BN;
extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
extern const char *G_OP_TYPE_FUSION_FC_RELU;
extern const char *G_OP_TYPE_REGION;
extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......
......@@ -58,6 +58,10 @@ void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int ComputeFpgaConv(struct FpgaConvArgs) { return 0; }
int ComputeFpgaPool(struct FpgaPoolArgs) { return 0; }
int ComputeFpgaEWAdd(struct FpgaEWAddArgs) { return 0; }
} // namespace api
} // namespace fpga
} // namespace paddle_mobile
......
......@@ -32,24 +32,55 @@ void *fpga_malloc(size_t size);
void fpga_free(void *ptr);
void fpga_copy(void *dst, const void *src, size_t num);
struct CnnVersionArgs {
struct FpgaVersionArgs {
void *buf;
};
struct QuantArgs {
struct MemoryToPhysicalArgs {
const void *src;
uint64_t physical;
};
struct MemoryCopyArgs {
void *src;
void *dst;
size_t size;
};
struct FpgaQuantArgs {
float scale;
};
struct BatchNormalizationArgs {
bool enable;
struct FpgaBNArgs {};
struct FpgaConvArgs {
bool enable_BN = false;
bool enable_Relu = false;
struct FpgaBNParam bn_parm;
};
struct FpgaPoolArgs {
bool enable_BN = false;
struct FpgaBNParam bn_parm;
};
struct FpgaEWAddArgs { // only support X + Y
bool enable_Relu = false;
};
struct ScaleArgs {};
int ComputeFpgaConv(struct FpgaConvArgs);
int ComputeFpgaPool(struct FpgaPoolArgs);
int ComputeFpgaEWAdd(struct FpgaEWAddArgs);
#define IOCTL_CNN_MAGIC 'CNN'
#define IOCTL_VERSION _IOW(IOCTL_CNN_MAGIC, 1, struct CnnVersionArgs)
#define IOCTL_GET_QUANT _IOW(IOCTL_CNN_MAGIC, 2, struct QuantArgs)
#define IOCTL_SET_QUANT _IOW(IOCTL_CNN_MAGIC, 3, struct QuantArgs)
#define IOCTL_FPGA_MAGIC 'FPGA'
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs)
#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
#define IOCTL_MEM_TOPHY _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryToPhysicalArgs)
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)
} // namespace api
} // namespace fpga
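For orientation, a sketch of how a user-space caller might exercise these ioctls once this header is included. The device node name "/dev/fpga" and the exact namespace qualification are assumptions, not something this commit defines:

```c++
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

// Hypothetical probe: query the bitstream version, then read the current
// quantization scale. "/dev/fpga" is an assumed device node name.
void ProbeFpga() {
  int fd = open("/dev/fpga", O_RDWR);
  if (fd < 0) return;
  paddle_mobile::fpga::api::FpgaVersionArgs version = {};
  ioctl(fd, IOCTL_VERSION, &version);
  paddle_mobile::fpga::api::FpgaQuantArgs quant = {};
  ioctl(fd, IOCTL_GET_QUANT, &quant);
  close(fd);
}
```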
......
......@@ -92,8 +92,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
*/
// Pack blocks of matrix A into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
float *buffer) {
void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) {
const float *a0, *a1, *a2, *a3;
for (int i = 0; i < m - m_tail; i += MR) {
a0 = A + i * lda;
......@@ -131,9 +131,62 @@ void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
}
}
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) {
const float *a0, *a1, *a2, *a3, *a4, *a5;
for (int i = 0; i < m - m_tail; i += MR) {
a0 = A + i * lda;
a1 = A + (i + 1) * lda;
a2 = A + (i + 2) * lda;
a3 = A + (i + 3) * lda;
a4 = A + (i + 4) * lda;
a5 = A + (i + 5) * lda;
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
*buffer++ = *a4++;
*buffer++ = *a5++;
}
}
int i = m - m_tail;
a0 = &A(i, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
a4 = a0 + 4 * lda;
a5 = a0 + 5 * lda;
if (m_tail != 0) {
if (m_tail <= 5) {
a5 = zero;
}
if (m_tail <= 4) {
a4 = zero;
}
if (m_tail <= 3) {
a3 = zero;
}
if (m_tail <= 2) {
a2 = zero;
}
if (m_tail <= 1) {
a1 = zero;
}
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
*buffer++ = *a4++;
*buffer++ = *a5++;
}
}
}
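For clarity, the full-panel case of the packing above in plain loops (a reference sketch; the zero-padded tail handling is omitted):

```c++
// Interleave six rows of A at a time so AddDot6x8 can stream them linearly:
// element (i + r, j) of A lands at buffer[(i / 6) * 6 * k + j * 6 + r].
void PackA6rReference(int m, int k, const float *A, int lda, float *buffer) {
  for (int i = 0; i + 6 <= m; i += 6)   // one 6-row panel per iteration
    for (int j = 0; j < k; ++j)         // walk the panel column by column
      for (int r = 0; r < 6; ++r)       // interleave the six rows
        *buffer++ = A[(i + r) * lda + j];
}
```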
// Pack blocks of matrix B into contiguous memory (RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) {
void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) {
const float *b0;
for (int j = 0; j < n - n_tail; j += NR) {
for (int i = 0; i < k; ++i) {
......@@ -188,7 +241,8 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) {
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
// AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
}
}
......@@ -218,7 +272,8 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) {
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
// AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
}
}
......@@ -1868,22 +1923,22 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 30 * 1024;
int L2 = 1 * 1024 * 1024;
int L1 = 32 * 1024;
int L2 = 0.5 * 1024 * 1024;
KC = k;
MC = L2 / (2 * KC * sizeof(float));
NC = MC;
MC = L1 / (KC * sizeof(float));
NC = L2 / (KC * sizeof(float));
// make sure MC is multiple of 4, and NC is multiple of 8
// make sure MC is multiple of MR, and NC is multiple of NR
int mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + 4 - 1) / 4 * 4;
MC = (MC + MR - 1) / MR * MR;
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
int nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + 8 - 1) / 8 * 8;
NC = (NC + NR - 1) / NR * NR;
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA = static_cast<float *>(
......@@ -1901,10 +1956,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
int mc, nc;
for (int j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
for (int i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
relu);
}
......@@ -1921,22 +1976,22 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
bool relu, float *new_scale, float *new_bias) {
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int L1 = 30 * 1024;
int L2 = 1 * 1024 * 1024;
int L1 = 32 * 1024;
int L2 = 0.5 * 1024 * 1024;
KC = k;
MC = L2 / (2 * KC * sizeof(float));
NC = MC;
MC = L1 / (KC * sizeof(float));
NC = L2 / (KC * sizeof(float));
// make sure MC is multiple of 4, and NC is multiple of 8
// make sure MC is multiple of MR, and NC is multiple of NR
int mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + 4 - 1) / 4 * 4;
MC = (MC + MR - 1) / MR * MR;
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
int nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + 8 - 1) / 8 * 8;
NC = (NC + NR - 1) / NR * NR;
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA = static_cast<float *>(
......@@ -1954,10 +2009,10 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
int mc, nc;
for (int j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
for (int i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
&C(i, j), ldc, relu, new_scale + i, new_bias + i);
}
......@@ -1969,6 +2024,221 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile::memory::Free(zero);
}
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
#if __ARM_NEON
#if __aarch64__
// init C
float32x4_t cv0 = vdupq_n_f32(0.0);
float32x4_t cv1 = vdupq_n_f32(0.0);
float32x4_t cv2 = vdupq_n_f32(0.0);
float32x4_t cv3 = vdupq_n_f32(0.0);
float32x4_t cv4 = vdupq_n_f32(0.0);
float32x4_t cv5 = vdupq_n_f32(0.0);
float32x4_t cv6 = vdupq_n_f32(0.0);
float32x4_t cv7 = vdupq_n_f32(0.0);
float32x4_t cv8 = vdupq_n_f32(0.0);
float32x4_t cv9 = vdupq_n_f32(0.0);
float32x4_t cv10 = vdupq_n_f32(0.0);
float32x4_t cv11 = vdupq_n_f32(0.0);
float32x4_t av;
float32x4_t bv0;
float32x4_t bv1;
float32x2_t av01;
float32x2_t av23;
float32x2_t av45;
for (int p = 0; p < k; p += 1) {
av = vld1q_f32(a);
av01 = vget_low_f32(av);
av23 = vget_high_f32(av);
av45 = vld1_f32(a + 4);
bv0 = vld1q_f32(b);
bv1 = vld1q_f32(b + 4);
cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0);
cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0);
cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1);
cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1);
cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0);
cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0);
cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1);
cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1);
cv8 = vmlaq_lane_f32(cv8, bv0, av45, 0);
cv9 = vmlaq_lane_f32(cv9, bv1, av45, 0);
cv10 = vmlaq_lane_f32(cv10, bv0, av45, 1);
cv11 = vmlaq_lane_f32(cv11, bv1, av45, 1);
a += MR;
b += NR;
}
vst1q_f32(c, cv0);
vst1q_f32(c + 4, cv1);
vst1q_f32(c + ldc, cv2);
vst1q_f32(c + ldc + 4, cv3);
vst1q_f32(c + 2 * ldc, cv4);
vst1q_f32(c + 2 * ldc + 4, cv5);
vst1q_f32(c + 3 * ldc, cv6);
vst1q_f32(c + 3 * ldc + 4, cv7);
vst1q_f32(c + 4 * ldc, cv8);
vst1q_f32(c + 4 * ldc + 4, cv9);
vst1q_f32(c + 5 * ldc, cv10);
vst1q_f32(c + 5 * ldc + 4, cv11);
#else
const float *a_ptr, *b_ptr;
a_ptr = a;
b_ptr = b;
int kc1 = k / 4;
int kc2 = k % 4;
int step = 4 * ldc;
asm volatile(
"pld [%[a_ptr]] \n\t"
"pld [%[b_ptr]] \n\t"
"pld [%[a_ptr], #64] \n\t"
"pld [%[b_ptr], #64] \n\t"
"vmov.f32 q4, #0.0 \n\t"
"vmov.f32 q5, #0.0 \n\t"
"vmov.f32 q6, #0.0 \n\t"
"vmov.f32 q7, #0.0 \n\t"
"vmov.f32 q8, #0.0 \n\t"
"vmov.f32 q9, #0.0 \n\t"
"vmov.f32 q10, #0.0 \n\t"
"vmov.f32 q11, #0.0 \n\t"
"vmov.f32 q12, #0.0 \n\t"
"vmov.f32 q13, #0.0 \n\t"
"vmov.f32 q14, #0.0 \n\t"
"vmov.f32 q15, #0.0 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
// "pld [%[a_ptr], #128] \n\t"
// "pld [%[b_ptr], #128] \n\t"
// "pld [%[a_ptr], #192] \n\t"
// "pld [%[b_ptr], #192] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"blt end_kc2_%= \n\t"
"loop_kc2_%=: \n\t"
"vld1.32 {d0-d2}, [%[a_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[b_ptr]]! \n\t"
"vmla.f32 q4, q2, d0[0] \n\t"
"vmla.f32 q5, q3, d0[0] \n\t"
"vmla.f32 q6, q2, d0[1] \n\t"
"vmla.f32 q7, q3, d0[1] \n\t"
"vmla.f32 q8, q2, d1[0] \n\t"
"vmla.f32 q9, q3, d1[0] \n\t"
"vmla.f32 q10, q2, d1[1] \n\t"
"vmla.f32 q11, q3, d1[1] \n\t"
"vmla.f32 q12, q2, d2[0] \n\t"
"vmla.f32 q13, q3, d2[0] \n\t"
"vmla.f32 q14, q2, d2[1] \n\t"
"vmla.f32 q15, q3, d2[1] \n\t"
"subs %[kc2], %[kc2], #1 \n\t"
"bge loop_kc2_%= \n\t"
"end_kc2_%=: \n\t"
"mov r5, %[c] \n\t"
"mov r6, %[step] \n\t"
"vst1.32 {q4, q5}, [r5], r6 \n\t"
"vst1.32 {q6, q7}, [r5], r6 \n\t"
"vst1.32 {q8, q9}, [r5], r6 \n\t"
"vst1.32 {q10, q11}, [r5], r6 \n\t"
"vst1.32 {q12, q13}, [r5], r6 \n\t"
"vst1.32 {q14, q15}, [r5] \n\t"
:
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc2] "r"(kc2), [step] "r"(step)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else
#endif // __ARM_NEON
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
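As a plain-scalar reference, AddDot6x8 computes a 6x8 tile of C from the packed panels; the NEON path above keeps the whole tile in twelve q-registers. A sketch that overwrites C exactly as the kernel's stores do:

```c++
// c is a 6x8 tile with row stride ldc; a is packed 6-wide (PackMatrixA_6r),
// b is packed 8-wide (PackMatrixB_8c).
void AddDot6x8Reference(int k, const float *a, const float *b, float *c, int ldc) {
  float acc[6][8] = {};                    // accumulators, like q4..q15
  for (int p = 0; p < k; ++p)
    for (int i = 0; i < 6; ++i)
      for (int j = 0; j < 8; ++j)
        acc[i][j] += a[p * 6 + i] * b[p * 8 + j];
  for (int i = 0; i < 6; ++i)
    for (int j = 0; j < 8; ++j)
      c[i * ldc + j] = acc[i][j];
}
```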
......@@ -19,7 +19,7 @@ limitations under the License. */
#define B(i, j) B[(i)*ldb + (j)]
#define C(i, j) C[(i)*ldc + (j)]
#define MR 4
#define MR 6
#define NR 8
#define s_min(i, j) ((i) < (j) ? (i) : (j))
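A worked example of how the new MR = 6 / NR = 8 values interact with the cache blocking in Sgemm shown earlier (a sketch of the sizing step only, before the per-m/per-n balancing; KC = 256 is an assumed panel depth):

```c++
#include <cstdio>

int main() {
  const int MR = 6, NR = 8;
  const int L1 = 32 * 1024, L2 = 512 * 1024, KC = 256;
  int MC = static_cast<int>(L1 / (KC * sizeof(float)));  // 32768 / 1024 = 32
  int NC = static_cast<int>(L2 / (KC * sizeof(float)));  // 524288 / 1024 = 512
  MC = (MC + MR - 1) / MR * MR;  // round up to a multiple of MR -> 36
  NC = (NC + NR - 1) / NR * NR;  // already a multiple of NR     -> 512
  printf("MC = %d, NC = %d\n", MC, NC);
  return 0;
}
```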
......@@ -39,12 +39,14 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
*/
// Pack blocks of matrix A into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
// Pack blocks of matrix B into contiguous memory (RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
// Blocked matrix multiplication
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
......@@ -67,6 +69,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
// Compute a smaller block of the C matrix
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
// Write back the blocked matmul result
// C = A * B
......
......@@ -145,6 +145,10 @@ else ()
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif()
......@@ -43,7 +43,7 @@ template <typename DeviceType, typename OpType>
class Executor4Test : public Executor<DeviceType> {
public:
Executor4Test(Program<DeviceType> p, string op_type,
bool use_optimize = false)
bool use_optimize = false, int predict_op_count = 1)
: Executor<DeviceType>() {
this->use_optimize_ = use_optimize;
this->program_ = p;
......@@ -57,12 +57,14 @@ class Executor4Test : public Executor<DeviceType> {
LOG(paddle_mobile::LogLevel::kLOG_ERROR)
<< "to_predict_program_ == nullptr";
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
this->to_predict_program_->Blocks();
for (std::shared_ptr<BlockDesc> block_desc : blocks) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (std::shared_ptr<OpDesc> op : ops) {
if (op->Type() == op_type) {
for (int i = 0; i < ops.size(); ++i) {
auto op = ops[i];
if (op->Type() == op_type && i < predict_op_count) {
DLOG << "匹配到: " << op->Type();
/// test first meeting op in program
......@@ -72,11 +74,17 @@ class Executor4Test : public Executor<DeviceType> {
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), this->program_.scope);
this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
break;
}
}
}
this->InitMemory();
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
this->to_predict_program_->Block(0);
auto &ops = this->ops_of_block_[*to_predict_block.get()];
for (const auto &op : ops) {
op->Init();
}
}
template <typename T = LoDTensor>
......@@ -130,9 +138,6 @@ class Executor4Test : public Executor<DeviceType> {
auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>(dDim);
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
this->to_predict_program_->Block(0);
for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size();
......@@ -141,6 +146,7 @@ class Executor4Test : public Executor<DeviceType> {
op->Run();
}
return out_tensor;
return std::make_shared<paddle_mobile::framework::Tensor>(
paddle_mobile::framework::Tensor(*output_tensor));
}
};
......@@ -20,22 +20,20 @@ int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
auto isok = paddle_mobile.Load(g_mobilenet_ssd_gesture + "/model",
g_mobilenet_ssd_gesture + "/params", true);
auto isok = paddle_mobile.Load(
std::string(g_mobilenet_ssd_gesture) + "/model",
std::string(g_mobilenet_ssd_gesture) + "/params", true);
// auto isok = paddle_mobile.Load(g_mobilenet_ssd, false);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 300, 300};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 300, 300}, static_cast<float>(0),
static_cast<float>(1));
GetInput<float>(g_hand, &input, dims);
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time();
paddle_mobile.Predict(input, dims);
auto output = paddle_mobile.Predict(input, dims);
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
......
......@@ -24,19 +24,21 @@ int main() {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time();
auto vec_result = paddle_mobile.Predict(input, dims);
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
for (int i = 0; i < 10; ++i) {
auto time3 = time();
auto vec_result = paddle_mobile.Predict(input, dims);
auto time4 = time();
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
}
}
return 0;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/fusion_conv_add_bn_relu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_mobilenet, true);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::FusionConvAddBNReluOp<
paddle_mobile::CPU, float>>
executor(program, "fusion_conv_add_bn_relu", true);
std::cout << "executor 4 test: " << std::endl;
paddle_mobile::framework::Tensor input;
GetInput<float>(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224});
// // use SetupTensor if no local input image is available.
// SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
// static_cast<float>(1));
DLOG << " fuck: " << input;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112});
std::cout << "before predict: " << std::endl;
auto output =
executor.Predict(input, "data", "conv2_1_dw_bn.tmp_2", out_ddim);
std::cout << "after predict " << std::endl;
auto output_ptr = output->data<float>();
int stride = output->numel() / 100;
for (int i = 0; i < 100; i++) {
DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride];
}
// for (int i = 0; i < 100; i++) {
// DLOG << " index:" << i << " value: "<< output_ptr[i];
// }
// for (int j = 0; j < output->numel(); ++j) {
// std::cout << " (index: " << j << " value: " << output_ptr[j] << ") ";
// }
std::cout << std::endl;
return 0;
}
......@@ -24,18 +24,21 @@ limitations under the License. */
#include "framework/ddim.h"
#include "framework/tensor.h"
static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
static const std::string g_mobilenet_ssd_gesture =
"../models/mobilenet+ssd_gesture";
static const std::string g_squeezenet = "../models/squeezenet";
static const std::string g_googlenet = "../models/googlenet";
static const std::string g_mobilenet = "../models/mobilenet";
static const std::string g_resnet_50 = "../models/resnet_50";
static const std::string g_resnet = "../models/resnet";
static const std::string g_googlenet_combine = "../models/googlenet_combine";
static const std::string g_yolo = "../models/yolo";
static const std::string g_test_image_1x3x224x224 =
static const char *g_mobilenet_ssd = "../models/mobilenet+ssd";
static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture";
static const char *g_squeezenet = "../models/squeezenet";
static const char *g_googlenet = "../models/googlenet";
static const char *g_mobilenet = "../models/mobilenet";
static const char *g_resnet_50 = "../models/resnet_50";
static const char *g_resnet = "../models/resnet";
static const char *g_googlenet_combine = "../models/googlenet_combine";
static const char *g_yolo = "../models/yolo";
static const char *g_test_image_1x3x224x224 =
"../images/test_image_1x3x224x224_float";
static const char *g_test_image_1x3x224x224_banana =
"../images/input_3x224x224_banana";
static const char *g_hand = "../images/hand_image";
using paddle_mobile::framework::DDim;
using paddle_mobile::framework::Tensor;
......
set(dir ${CMAKE_CURRENT_SOURCE_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build")
cmake_minimum_required(VERSION 3.6)
project(quali)
add_definitions(-DENABLE_EXCEPTION)
ADD_EXECUTABLE(convert convert.cpp)
target_link_libraries(convert paddle-mobile)
\ No newline at end of file
set(CMAKE_CXX_STANDARD 11)
file(GLOB_RECURSE QULIFICATON_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE QULIFICATON_H src/*.h)
include_directories(. src/)
#add_library(paddle-mobile SHARED ${QULIFICATON_CC} ${QULIFICATON_H} convert.cpp)
add_executable(quantify convert.cpp ${QULIFICATON_CC} ${QULIFICATON_H})
\ No newline at end of file
# Model quantization script
#### Quantization script usage guide
1. Start from the paddle-mobile project directory (e.g. ~/PaddleProject/paddle-mobile)
2. cd into the tools/quantification/ directory
3. Build with cmake
``` sh
cmake .
make
```
4. Run the quantization script
```sh
./quantify (0: separated, 1: combined) (input path) (output path)
# quantize the separated googlenet model from /Users/xiebaiyuan/PaddleProject/quali/models/googlenet to ./googlenet_min
./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min
```
*Note:*
*In the quantization tool:*
*1. A separated model's model file is named "__model__" by default;*
*2. A combined model's model file is named "model" and its parameter file "params" by default.*
##### Putting it all together:
Taking non-combined googlenet as an example:
```sh
cd tools/quantification/
cmake .
make
./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min
```
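The "almost no loss of accuracy" claim follows from the storage format: each weight becomes one uint8 plus a shared per-tensor min/max, so the worst-case absolute reconstruction error per value is half a quantization step, (max - min) / 255 / 2. A quick sanity check (a sketch with an example range):

```c++
#include <cassert>
#include <cmath>

int main() {
  const float mn = -1.5f, mx = 2.5f;   // example per-tensor range
  const float step = (mx - mn) / 255.0f;
  const float x = 0.3f;                // any value in [mn, mx]
  auto q = static_cast<unsigned char>(std::round((x - mn) / (mx - mn) * 255));
  float x_hat = mn + q / 255.0f * (mx - mn);
  assert(std::fabs(x - x_hat) <= step / 2 + 1e-6f);  // within half a step
  return 0;
}
```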
#include "io/paddle_mobile.h"
#include "src/enforce.h"
#include "src/var_desc.h"
#include "src/program_desc.h"
#include <cstdlib>
using std::string;
static const std::string g_googlenet_combine = "../models/googlenet_combine";
static const std::string g_googlenet = "../models/googlenet";
using paddle_mobile::Executor;
using paddle_mobile::framework::Program;
char *Get_binary_data(std::string filename) {
FILE *file = fopen(filename.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
filename.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
char *data = new char[size];
size_t bytes_read = fread(data, 1, size, file);
PADDLE_MOBILE_ENFORCE(bytes_read == size,
"read binary file bytes do not match with fseek");
DLOG << "Get_binary_data end";
fclose(file);
return data;
#include <algorithm>
#include <cmath>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "src/framework.pb-c.h"
#include "src/protobuf-c.h"
const size_t kSize64 = sizeof(uint64_t);
const size_t kSize32 = sizeof(uint32_t);
char *Get_binary_data(const std::string &filename) {
FILE *file = fopen(filename.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
filename.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
auto *data = new char[size];
size_t bytes_read = fread(data, 1, static_cast<size_t>(size), file);
PADDLE_MOBILE_ENFORCE(bytes_read == size,
"read binary file bytes do not match with fseek");
fclose(file);
return data;
}
static size_t ReadBuffer(const char *file_name, uint8_t **out) {
FILE *fp;
fp = fopen(file_name, "rb");
PADDLE_MOBILE_ENFORCE(fp != nullptr, " %s open failed !", file_name);
fseek(fp, 0, SEEK_END);
auto size = static_cast<size_t>(ftell(fp));
rewind(fp);
*out = reinterpret_cast<uint8_t *>(malloc(size));
size_t cur_len = 0;
size_t nread;
while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
cur_len += nread;
}
fclose(fp);
return cur_len;
}
void LoadWithDump(const paddle_mobile::framework::VarDesc var_desc,
paddle_mobile::framework::LoDTensor *tensor, char **data, FILE *out_file) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(*data);
// write version
fwrite(&version, sizeof(uint32_t), 1, out_file );
(*data) += sizeof(uint32_t);
// 2 Lod information
uint64_t *lod_level_ptr = new uint64_t();
memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
uint64_t lod_level = 0;
// write lod Information
fwrite(&lod_level, sizeof(uint64_t), 1, out_file);
delete lod_level_ptr;
(*data) += sizeof(uint64_t);
auto &lod = *tensor->mutable_lod();
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(*data);
// write lod size
fwrite(&size, sizeof(uint64_t), 1, out_file);
(*data) += sizeof(uint64_t);
std::vector<size_t> tmp(size / sizeof(size_t));
for (int k = 0; k < tmp.size(); ++k) {
tmp[k] = *reinterpret_cast<size_t *>(*data);
(*data) += sizeof(size_t);
}
// write lod size vector
fwrite(&tmp, sizeof(size_t), tmp.size(), out_file );
std::shared_ptr<ProgramDesc> loadParams(const std::string &model_path) {
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = nullptr;
size_t read_size = ReadBuffer(model_path.c_str(), &buf);
PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null");
c_program = paddle_mobile__framework__proto__program_desc__unpack(
nullptr, read_size, buf);
PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null");
auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
return originProgramDesc;
lod[i] = tmp;
}
}
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
// write tensor version
fwrite(&tensor_version, sizeof(uint32_t), 1, out_file);
(*data) += sizeof(uint32_t);
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(*data);
// write tensor desc
fwrite(&size, sizeof(int32_t), 1, out_file);
(*data) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (*data)[m];
}
fwrite(buf.get(), sizeof(char), size, out_file);
(*data) += (sizeof(char) * size);
void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char *dataP, FILE *out_file) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(dataP);
const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
tensor->Resize(paddle_mobile::framework::make_ddim(desc.Dims()));
void *memory = tensor;
int type_size = 0;
switch (desc.DataType()) {
case paddle_mobile::framework::VARTYPE_TYPE_FP16:
type_size = 2;
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP32:
type_size = 4;
memory = tensor->mutable_data<float>();
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT32:
type_size = 4;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
type_size = 1;
break;
default:
break;
}
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
}
(*data) += (sizeof(char) * memory_size * type_size);
// for float 32
float min_value = std::numeric_limits<float>::max();
float max_value = std::numeric_limits<float>::min();
for (int k = 0; k < memory_size; ++k) {
min_value = std::min(min_value, static_cast<float *> (memory)[k]);
max_value = std::max(max_value, static_cast<float *> (memory)[k]);
}
fwrite(&min_value, sizeof(float), 1, out_file);
fwrite(&max_value, sizeof(float), 1, out_file);
for (int g = 0; g < memory_size; ++g) {
float value = static_cast<float *> (memory)[g];
uint8_t factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255);
fwrite(&factor, sizeof(uint8_t), 1, out_file);
// write version
fwrite(&version, kSize32, 1, out_file);
dataP += kSize32;
// 2 Lod information
auto *lod_level_ptr = new uint64_t();
memcpy(lod_level_ptr, dataP, kSize64);
uint64_t lod_level = 0;
// write lod Information
fwrite(&lod_level, kSize64, 1, out_file);
delete lod_level_ptr;
dataP += kSize64;
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(dataP);
// write lod size
fwrite(&size, kSize64, 1, out_file);
(dataP) += kSize64;
std::vector<size_t> tmp(size / sizeof(size_t));
for (unsigned long &k : tmp) {
k = *reinterpret_cast<size_t *>(dataP);
(dataP) += sizeof(size_t);
}
// write lod size vector
fwrite(&tmp, sizeof(size_t), tmp.size(), out_file);
}
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(dataP);
// write tensor version
fwrite(&tensor_version, kSize32, 1, out_file);
(dataP) += kSize32;
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(dataP);
// write tensor desc
fwrite(&size, sizeof(int32_t), 1, out_file);
(dataP) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (dataP)[m];
}
fwrite(buf.get(), sizeof(char), static_cast<size_t>(size), out_file);
(dataP) += (sizeof(char) * size);
const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
void *memory = nullptr;
int type_size = 0;
switch (desc.DataType()) {
case paddle_mobile::framework::VARTYPE_TYPE_FP16:
type_size = 2;
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP32:
type_size = 4;
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT32:
type_size = 4;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
type_size = 1;
break;
default:
break;
}
size_t tensorSize = sizeof(char) * memory_size * type_size;
memory = new char[tensorSize];
for (int n = 0; n < tensorSize; ++n) {
static_cast<char *>(memory)[n] = (dataP)[n];
}
dataP += tensorSize;
// for float 32
float min_value = std::numeric_limits<float>::max();
float max_value = std::numeric_limits<float>::lowest();  // lowest(), not min(): min() is the smallest positive float
for (int k = 0; k < memory_size; ++k) {
min_value = std::min(min_value, static_cast<float *> (memory)[k]);
max_value = std::max(max_value, static_cast<float *> (memory)[k]);
}
fwrite(&min_value, sizeof(float), 1, out_file);
fwrite(&max_value, sizeof(float), 1, out_file);
for (int g = 0; g < memory_size; ++g) {
float value = static_cast<float *> (memory)[g];
auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255);
fwrite(&factor, sizeof(uint8_t), 1, out_file);
}
delete[] static_cast<char *>(memory);  // memory was allocated with new char[]
}
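LoadWithDump emits, per persistable tensor: the original header fields, then min, max, and one uint8 factor per element. The consumer side is the inverse mapping (a sketch; the actual reader is the paddle-mobile loader when quantification = true):

```c++
#include <cstdint>

// Recover approximate float32 weights from (min, max, factors[]) as written
// by LoadWithDump above.
void DequantizeTensor(float min_value, float max_value,
                      const uint8_t *factors, int memory_size, float *out) {
  const float range = max_value - min_value;
  for (int g = 0; g < memory_size; ++g)
    out[g] = min_value + factors[g] / 255.0f * range;
}
```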
void quantificate_combined(std::string model_path, std::string param_path, std::string param_min_path){
paddle_mobile::Loader<paddle_mobile::CPU,paddle_mobile::Precision::FP32 > loader;
bool optimize = true;
auto program = loader.Load(model_path, param_path, optimize);
char *origin_data = Get_binary_data(program.para_path);
char *data = origin_data;
FILE *out_file = fopen(param_min_path.c_str(), "wb");
for (const auto &block : program.originProgram->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program.scope->Var(var_desc->Name());
if(var_desc ->Persistable()) {
auto tensor = var->template GetMutable<paddle_mobile::framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadWithDump(*var_desc, tensor, &data,out_file);
void
quantificate_combined(const std::string &model_path, const std::string &param_path, const std::string &param_min_path) {
auto program = loadParams(model_path);
char *origin_data = Get_binary_data(param_path);
char *data = origin_data;
FILE *out_file = fopen(param_min_path.c_str(), "wb");
for (const auto &block : program->Blocks()) {
for (const auto &var_desc : block->Vars()) {
if (var_desc->Persistable()) {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadWithDump(*var_desc, data, out_file);
}
}
fclose(out_file);
delete[] origin_data;
}
void quantificate_seperated(std::string model_dir, std::string param_min_path) {
paddle_mobile::Loader<paddle_mobile::CPU,paddle_mobile::Precision::FP32 > loader;
bool optimize = true;
auto program = loader.Load(model_dir, optimize);
std::string shell_command = "mkdir "+param_min_path;
system(shell_command.c_str());
for (const auto &block : program.originProgram->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program.scope->Var(var_desc->Name());
if(var_desc ->Persistable()) {
auto tensor = var->template GetMutable<paddle_mobile::framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
std::string file_name = param_min_path +"/"+ var_desc->Name();
FILE *out_file = fopen(file_name.c_str(), "wb");
char *origin_data =
Get_binary_data(program.model_path + "/" + var_desc->Name());
char *data = origin_data;
LoadWithDump(*var_desc, tensor, &data,out_file);
delete origin_data;
fclose(out_file);
fclose(out_file);
delete origin_data;
}
void quantificate_seperated(const std::string model_dir, const std::string param_min_path) {
auto program = loadParams(model_dir + "/__model__");
std::string shell_command = "mkdir " + param_min_path;
system(shell_command.c_str());
for (const auto &block : program->Blocks()) {
for (const auto &var_desc : block->Vars()) {
if (var_desc->Persistable()) {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
std::string file_name = param_min_path + "/" + var_desc->Name();
FILE *out_file = fopen(file_name.c_str(), "wb");
char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
char *data = origin_data;
LoadWithDump(*var_desc, data, out_file);
delete[] origin_data;
fclose(out_file);
}
}
}
}
int main(int argc, char **argv) {
const std::string kNoteEg = "( eg: ./quantify 1 your_combined_model_path output_path or ./quantify 0 your_separated_model_path output_path)";
PADDLE_MOBILE_ENFORCE(argc > 1, "wee need params.%s ", kNoteEg.c_str());
std::string action_type = argv[1];
PADDLE_MOBILE_ENFORCE(argc > 1 && (action_type == "1" || action_type == "0"),
"only 0 or 1 supported, current is %s %s ",
action_type.c_str(),
kNoteEg.c_str());
PADDLE_MOBILE_ENFORCE(argc > 2, "we need your model path. %s ", kNoteEg.c_str());
std::string base_path = argv[2];
PADDLE_MOBILE_ENFORCE(argc > 3, "we need your output path. %s ", kNoteEg.c_str());
std::string output_path = argv[3];
if (action_type == "0") {
// for separated
const std::string &seperated_min_dir = output_path;
quantificate_seperated(base_path, seperated_min_dir);
return 0;
}
int main() {
std::string filename = "params_min";
std::string model_path = g_googlenet_combine + "/model";
std::string param_path = g_googlenet_combine + "/params";
std::string dirname = "param_min_dir";
std::string model_dir = g_googlenet;
// quantificate_combined(model_path, param_path,filename);
quantificate_seperated(model_dir, dirname);
if (action_type == "1") {
// for combined
const std::string &combined_min_dir = output_path;
std::string model_path = base_path + "/model";
std::string param_path = base_path + "/params";
quantificate_combined(model_path, param_path, combined_min_dir);
return 0;
}
return -1;
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
//
// Created by 谢柏渊 on 2018/7/25.
//
#include "src/block_desc_local.h"
#include <algorithm>
#include <memory>
#include <vector>
#include "src/framework.pb-c.h"
std::vector<std::shared_ptr<paddle_mobile::framework::VarDesc>>
BlockDesc::Vars() const {
return vars_;
}
BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc)
: index_(desc->idx), parent_index_(desc->idx) {
for (int i = 0; i < desc->n_vars; ++i) {
PaddleMobile__Framework__Proto__VarDesc *var_desc = desc->vars[i];
vars_.emplace_back(std::shared_ptr<paddle_mobile::framework::VarDesc>(
new paddle_mobile::framework::VarDesc(var_desc)));
}
std::sort(vars_.begin(), vars_.end(),
[](std::shared_ptr<paddle_mobile::framework::VarDesc> left,
std::shared_ptr<paddle_mobile::framework::VarDesc> right) {
return left->Name() < right->Name();
});
// for (int j = 0; j < desc->n_ops; ++j) {
// PaddleMobile__Framework__Proto__OpDesc *op_desc = desc->ops[j];
// ops_.emplace_back(new OpDesc(op_desc));
// }
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
//
// Created by 谢柏渊 on 2018/7/25.
//
#ifndef TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
#define TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
#include <vector>
#include "src/var_desc.h"
class BlockDesc {
public:
friend class Node;
friend class ProgramOptimize;
BlockDesc() {}
explicit BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc);
const int &ID() const { return index_; }
const bool &MultiThread() const { return multi_thread_; }
const int &Parent() const { return parent_index_; }
bool operator==(const BlockDesc &in_block) const {
return this->ID() == in_block.ID() && this->Parent() == in_block.Parent();
}
bool operator<(const BlockDesc &in_block) const {
return this->ID() < in_block.ID() && this->Parent() < in_block.Parent();
}
std::vector<std::shared_ptr<paddle_mobile::framework::VarDesc>> Vars() const;
private:
int index_;
bool multi_thread_;
int parent_index_;
std::vector<std::shared_ptr<paddle_mobile::framework::VarDesc>> vars_;
};
#endif // TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef ENABLE_EXCEPTION
#include <stdio.h>
#include <exception>
#include <string>
#endif
namespace paddle_mobile {
#ifdef ENABLE_EXCEPTION
struct PaddleMobileException : public std::exception {
const std::string exception_prefix = "paddle mobile C++ Exception: \n";
std::string message;
PaddleMobileException(const char *header, const char *detail,
const char *file, const int line) {
char buffer[1500];
snprintf(buffer, sizeof(buffer),
"%s| %s \n| [in file] : %s\n| [on line] : %d\n| [detail] : %s\n",
exception_prefix.c_str(), header, file, line, detail);
message = std::string(buffer);
}
const char *what() const noexcept { return message.c_str(); }
};
#define PADDLE_MOBILE_THROW_EXCEPTION(...) \
{ \
char buffer[1000]; \
snprintf(buffer, sizeof(buffer), __VA_ARGS__); \
std::string detail(buffer); \
throw paddle_mobile::PaddleMobileException("Custom Exception", buffer, \
__FILE__, __LINE__); \
}
#define PADDLE_MOBILE_ENFORCE(stat, ...) \
{ \
if (stat) { \
} else { \
char buffer[1000]; \
snprintf(buffer, sizeof(buffer), __VA_ARGS__); \
std::string detail(buffer); \
throw paddle_mobile::PaddleMobileException("paddle-mobile enforce", \
buffer, __FILE__, __LINE__); \
} \
}
#else
#define PADDLE_MOBILE_THROW_EXCEPTION(...)
#define PADDLE_MOBILE_ENFORCE(stat, ...)
#endif
} // namespace paddle_mobile
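Usage mirrors the call sites in convert.cpp above: a condition followed by a printf-style message, throwing PaddleMobileException with file and line context on failure (a minimal sketch, assuming ENABLE_EXCEPTION is defined):

```c++
#include <cstdio>
#include <string>

// Open a file or throw a paddle_mobile::PaddleMobileException that records
// __FILE__ and __LINE__ alongside the formatted message.
FILE *OpenOrThrow(const std::string &filename) {
  FILE *file = fopen(filename.c_str(), "rb");
  PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s", filename.c_str());
  return file;
}
```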
This diff is collapsed.
This diff is collapsed.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
//
// Created by 谢柏渊 on 2018/7/25.
//
#include "src/program_desc.h"
#include <vector>
ProgramDesc::ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc) {
for (int i = 0; i < desc->n_blocks; ++i) {
blocks_.emplace_back(std::make_shared<BlockDesc>(desc->blocks[i]));
}
}
const std::vector<std::shared_ptr<BlockDesc>> ProgramDesc::Blocks() {
return blocks_;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
//
// Created by 谢柏渊 on 2018/7/25.
//
#ifndef TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_
#define TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_
#include <memory>
#include <vector>
#include "src/block_desc_local.h"
#include "src/framework.pb-c.h"
class ProgramDesc {
public:
// friend class Node;
//
// friend class ProgramOptimize;
explicit ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc);
const std::vector<std::shared_ptr<BlockDesc>> Blocks();
private:
std::vector<std::shared_ptr<BlockDesc>> blocks_;
};
#endif // TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_
This diff is collapsed.
This diff is collapsed.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "src/framework.pb-c.h"
namespace paddle_mobile {
namespace framework {
enum VarType_Type {
VARTYPE_TYPE_BOOL = 0,
VARTYPE_TYPE_INT16 = 1,
VARTYPE_TYPE_INT32 = 2,
VARTYPE_TYPE_INT64 = 3,
VARTYPE_TYPE_FP16 = 4,
VARTYPE_TYPE_FP32 = 5,
VARTYPE_TYPE_FP64 = 6,
VARTYPE_TYPE_LOD_TENSOR = 7,
VARTYPE_TYPE_SELECTED_ROWS = 8,
VARTYPE_TYPE_FEED_MINIBATCH = 9,
VARTYPE_TYPE_FETCH_LIST = 10,
VARTYPE_TYPE_STEP_SCOPES = 11,
VARTYPE_TYPE_STEP_LOD_RANK_TABLE = 12,
VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY = 13,
VARTYPE_TYPE_STEP_PLACE_LIST = 14,
VARTYPE_TYPE_READER = 15,
VARTYPE_TYPE_CHANNEL = 16,
VARTYPE_TYPE_RAW = 17,
VARTYPE_TYPE_TUPLE = 18
};
class TensorDesc {
public:
TensorDesc() = default;
TensorDesc(const TensorDesc &desc) {
this->dims_ = desc.dims_;
this->data_type_ = desc.data_type_;
}
explicit TensorDesc(
PaddleMobile__Framework__Proto__VarType__TensorDesc *desc) {
for (int i = 0; i < desc->n_dims; ++i) {
int64_t d = desc->dims[i];
dims_.emplace_back(d);
}
data_type_ = (VarType_Type)desc->data_type;
}
std::vector<int64_t> Dims() const { return dims_; }
VarType_Type DataType() const { return data_type_; }
private:
std::vector<int64_t> dims_;
VarType_Type data_type_;
};
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "src/framework.pb-c.h"
#include "src/tensor_desc.h"
namespace paddle_mobile {
namespace framework {
class VarDesc {
public:
VarDesc(const VarDesc &var_desc) {
this->data_type_ = var_desc.data_type_;
this->name_ = var_desc.name_;
this->persistable_ = var_desc.persistable_;
this->tensor_desc_ = var_desc.tensor_desc_;
this->type_ = var_desc.type_;
}
explicit VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) {
type_ = (VarType_Type)desc->type->type;
name_ = std::string(desc->name);
persistable_ = static_cast<bool>(desc->persistable);
switch (type_) {
case VARTYPE_TYPE_SELECTED_ROWS:
tensor_desc_ = TensorDesc(desc->type->selected_rows);
break;
case VARTYPE_TYPE_LOD_TENSOR:
tensor_desc_ = TensorDesc(desc->type->lod_tensor->tensor);
break;
case VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY:
// desc->type->tensor_array->tensor->data_type;
tensor_desc_ = TensorDesc(desc->type->tensor_array->tensor);
break;
default:
break;
}
switch (type_) {
case VARTYPE_TYPE_CHANNEL:
data_type_ = (VarType_Type)desc->type->channel->data_type;
break;
default:
data_type_ = tensor_desc_.DataType();
break;
}
}
std::string Name() const { return name_; }
VarType_Type Type() const { return type_; }
bool Persistable() const { return persistable_; }
const TensorDesc &Tensor_desc() const { return tensor_desc_; }
private:
std::string name_;
bool persistable_;
TensorDesc tensor_desc_;
VarType_Type type_;
VarType_Type data_type_;
};
} // namespace framework
} // namespace paddle_mobile