diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2532ecf24367c0efd8cc6bda90209e77008a4a54..4ccf73763c08a748b53027d7f4a0f254774a1843 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,8 +1,8 @@
-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.6)
 project(paddle-mobile)
 
 option(DEBUGING "enable debug mode" ON)
-option(USE_OPENMP "openmp support" ON)
+option(USE_OPENMP "openmp support" OFF)
 option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build
@@ -15,7 +15,7 @@ file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
 include_directories(src/)
 
 if(IS_IOS)
-    set(CMAKE_CXX_FLAGS "-fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
+    set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
 else()
     set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
 endif()
@@ -43,7 +43,7 @@ if (LOG_PROFILE)
     add_definitions(-DPADDLE_MOBILE_PROFILE)
 endif()
 
-if(USE_OPENMP)
+if(USE_OPENMP AND NOT IS_IOS)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
     add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
 endif()
@@ -104,12 +104,21 @@ else()
     foreach(f ${_tmp_list_h})
         list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
     endforeach()
-endif()
 
+    file(GLOB_RECURSE _tmp_list src/fpga/*.cpp src/fpga/*.cc)
+    foreach(f ${_tmp_list})
+        list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+    endforeach()
+
+    file(GLOB_RECURSE _tmp_list_h src/fpga/*.h)
+    foreach(f ${_tmp_list_h})
+        list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+    endforeach()
+endif()
+
 if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
-    add_definitions(-DARMV7)
 else()
     list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
     list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
@@ -130,8 +139,8 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
 
 # NET default
-set(NET "defult" CACHE STRING "select net type")
-set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet")
+set(NET "default" CACHE STRING "select net type")
+set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets")
 
 include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
 
@@ -153,3 +162,4 @@ if(DEBUGING)
     endif()
 endif()
 
+
diff --git a/README.md b/README.md
index 69362734116fd8af78442a07dd31600aa46b7935..59ef597dd749ea16658977cd6d548cedaa90d166 100644
--- a/README.md
+++ b/README.md
@@ -27,10 +27,10 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms.
 
 - **ARM CPU**
 
-![](http://7xop3k.com1.z0.glb.clouddn.com/15312108766575.jpg)
+![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_29.png)
 
   The ARM CPU is paddle-mobile's primary focus, and the CPU's generality has always been its strength. Embedded deep learning requires a large amount of hand-written CPU assembly. We are coding at full speed to squeeze every bit of acceleration out of the hardware.
 
-  ARM CPU optimization is still in progress; so far only conventional CPU optimizations are applied. On an ARM A73, paddle-mobile arm-v7 currently runs one single-core pass of MobileNet 1.0 in 120+ ms. This is clearly not our final goal: we are rewriting hot paths in assembly, so there is still large headroom. Only armv7 is supported for now; armv8 support will follow.
+  ARM CPU optimization is still in progress; so far only conventional CPU optimizations are applied. On an ARM A73, paddle-mobile arm-v7 currently runs one single-core pass of MobileNet 1.0 in 110+ ms. This is clearly not our final goal: we are rewriting hot paths in assembly, so there is still large headroom. Only armv7 is supported for now; armv8 support will follow.
 
 - **Mali GPU**
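With USE_OPENMP now defaulting to OFF and explicitly disabled on iOS, threading code must tolerate builds without OpenMP. A minimal sketch of the guard this implies; PADDLE_MOBILE_USE_OPENMP is the macro defined by the CMake block above, while the function name here is purely illustrative:

```c++
#ifdef PADDLE_MOBILE_USE_OPENMP
#include <omp.h>
#endif

// Forward the requested thread count to OpenMP when the library was built
// with USE_OPENMP; otherwise silently stay single-threaded.
void SetThreadNumSketch(int num) {
#ifdef PADDLE_MOBILE_USE_OPENMP
  omp_set_num_threads(num);
#else
  (void)num;  // no OpenMP runtime: the request is a no-op
#endif
}
```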
diff --git a/doc/quantification.md b/doc/quantification.md
new file mode 100644
index 0000000000000000000000000000000000000000..04a93116a08c094ef71861cec1bb3262304c4cb7
--- /dev/null
+++ b/doc/quantification.md
@@ -0,0 +1,39 @@
+# Quantification: model quantization and dequantization
+
+## Background
+Models trained from some networks, such as AlexNet, are large and therefore ill-suited to mobile devices.
+
+
+## Ways to shrink an oversized model
+1. Pick a model architecture designed for mobile, such as mobilenet, googlenet, yolo or squeezenet;
+2. Use the quantization tool we provide, which shrinks a float32 model to about 1/4 of its original size with almost no loss of accuracy;
+
+- - - - -
+## The quantization tool
+
+### Where the conversion tool lives:
+
+- [Quantization tool directory](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/tools/quantification)
+
+- [Model conversion tool](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/convert.cpp)
+
+#### Usage
+- [Tool usage](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/README.md)
+
+## How to load a quantized model
+A quantification parameter (default false) was added to the Load method; pass true when loading a quantized model.
+
+[Source code](https://github.com/PaddlePaddle/paddle-mobile/blob/55302b33ea3bd68c9797d8f65e527544792b8095/src/io/paddle_mobile.h)
+
+```c++
+bool Load(const std::string &dirname, bool optimize = false,
+          bool quantification = false, int batch_size = 1);
+```
+
+- - - - -
+
+
+
+
+
+
diff --git a/src/common/types.cpp b/src/common/types.cpp
index 9bc594c7533b980626d8d07e89fc3ccf649a127f..2f366eb9e5a10ea11e3153e6e32b18204c6dd9cd 100644
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -17,38 +17,46 @@ limitations under the License. */
 
 namespace paddle_mobile {
 
-const std::string G_OP_TYPE_CONV = "conv2d";
-const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
-const std::string G_OP_TYPE_BOX_CODER = "box_coder";
-const std::string G_OP_TYPE_CONCAT = "concat";
-const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
-const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
-
-const std::string G_OP_TYPE_FC = "fusion_fc";
-const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
-const std::string G_OP_TYPE_LRN = "lrn";
-const std::string G_OP_TYPE_MUL = "mul";
-const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
-const std::string G_OP_TYPE_POOL2D = "pool2d";
-const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
-const std::string G_OP_TYPE_RELU = "relu";
-const std::string G_OP_TYPE_RESHAPE = "reshape";
-const std::string G_OP_TYPE_SIGMOID = "sigmoid";
-const std::string G_OP_TYPE_SOFTMAX = "softmax";
-const std::string G_OP_TYPE_TRANSPOSE = "transpose";
-const std::string G_OP_TYPE_SPLIT = "split";
-const std::string G_OP_TYPE_FEED = "feed";
-const std::string G_OP_TYPE_FETCH = "fetch";
-const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
-const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
-const std::string G_OP_TYPE_DROPOUT = "dropout";
+const char *G_OP_TYPE_CONV = "conv2d";
+const char *G_OP_TYPE_BATCHNORM = "batch_norm";
+const char *G_OP_TYPE_BOX_CODER = "box_coder";
+const char *G_OP_TYPE_CONCAT = "concat";
+const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
+const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
+const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
+const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
+const char *G_OP_TYPE_FC = "fusion_fc";
+const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
+const char *G_OP_TYPE_LRN = "lrn";
+const char *G_OP_TYPE_MUL = "mul";
+const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
+const char *G_OP_TYPE_POOL2D = "pool2d";
+const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
+const char *G_OP_TYPE_RELU = "relu";
+const char *G_OP_TYPE_RESHAPE = "reshape";
+const char 
*G_OP_TYPE_SIGMOID = "sigmoid";
+const char *G_OP_TYPE_SOFTMAX = "softmax";
+const char *G_OP_TYPE_TRANSPOSE = "transpose";
+const char *G_OP_TYPE_SPLIT = "split";
+const char *G_OP_TYPE_FEED = "feed";
+const char *G_OP_TYPE_FETCH = "fetch";
+const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
+const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence";
+const char *G_OP_TYPE_DROPOUT = "dropout";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
+const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
+const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
+    "fusion_elementwise_add_relu";
+const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
+const char *G_OP_TYPE_REGION = "region";
 
 std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
     op_input_output_key = {
         {G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
+        {G_OP_TYPE_FUSION_DWCONV_BN_RELU, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_BN_RELU, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
         {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
@@ -72,6 +80,11 @@ std::unordered_map<
         {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
         {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
-        {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}};
+        {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_ADD_BN, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_POOL_BN, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU, {{"X", "Y"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_FC_RELU, {{"X", "Y", "Z"}, {"Out"}}},
+        {G_OP_TYPE_REGION, {{"X"}, {"Out"}}}};
 
 }  // namespace paddle_mobile
diff --git a/src/common/types.h b/src/common/types.h
index 1daf9c9b7bccfc8bcb584e5a37f920539736a911..7745f80a9ca2ef6f0258f6f2eacf45761d29a00e 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 namespace paddle_mobile {
@@ -72,33 +73,40 @@ enum PMStatus {
   PMWrongDevice = 0x08 /*!< un-correct device. 
*/ }; -extern const std::string G_OP_TYPE_CONV; -extern const std::string G_OP_TYPE_BATCHNORM; -extern const std::string G_OP_TYPE_BOX_CODER; -extern const std::string G_OP_TYPE_CONCAT; -extern const std::string G_OP_TYPE_ELEMENTWISE_ADD; -extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU; -extern const std::string G_OP_TYPE_FC; -extern const std::string G_OP_TYPE_FUSION_CONV_ADD; -extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; -extern const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU; - -extern const std::string G_OP_TYPE_LRN; -extern const std::string G_OP_TYPE_MUL; -extern const std::string G_OP_TYPE_MULTICLASS_NMS; -extern const std::string G_OP_TYPE_POOL2D; -extern const std::string G_OP_TYPE_PRIOR_BOX; -extern const std::string G_OP_TYPE_RELU; -extern const std::string G_OP_TYPE_RESHAPE; -extern const std::string G_OP_TYPE_SIGMOID; -extern const std::string G_OP_TYPE_SOFTMAX; -extern const std::string G_OP_TYPE_TRANSPOSE; -extern const std::string G_OP_TYPE_SPLIT; -extern const std::string G_OP_TYPE_FEED; -extern const std::string G_OP_TYPE_FETCH; -extern const std::string G_OP_TYPE_DEPTHWISE_CONV; -extern const std::string G_OP_TYPE_IM2SEQUENCE; -extern const std::string G_OP_TYPE_DROPOUT; +extern const char *G_OP_TYPE_CONV; +extern const char *G_OP_TYPE_BATCHNORM; +extern const char *G_OP_TYPE_BOX_CODER; +extern const char *G_OP_TYPE_CONCAT; +extern const char *G_OP_TYPE_ELEMENTWISE_ADD; +extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU; +extern const char *G_OP_TYPE_FC; +extern const char *G_OP_TYPE_FUSION_CONV_ADD; +extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU; +extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU; +extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU; + +extern const char *G_OP_TYPE_LRN; +extern const char *G_OP_TYPE_MUL; +extern const char *G_OP_TYPE_MULTICLASS_NMS; +extern const char *G_OP_TYPE_POOL2D; +extern const char *G_OP_TYPE_PRIOR_BOX; +extern const char *G_OP_TYPE_RELU; +extern const char *G_OP_TYPE_RESHAPE; +extern const char *G_OP_TYPE_SIGMOID; +extern const char *G_OP_TYPE_SOFTMAX; +extern const char *G_OP_TYPE_TRANSPOSE; +extern const char *G_OP_TYPE_SPLIT; +extern const char *G_OP_TYPE_FEED; +extern const char *G_OP_TYPE_FETCH; +extern const char *G_OP_TYPE_DEPTHWISE_CONV; +extern const char *G_OP_TYPE_IM2SEQUENCE; +extern const char *G_OP_TYPE_DROPOUT; + +extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN; +extern const char *G_OP_TYPE_FUSION_POOL_BN; +extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU; +extern const char *G_OP_TYPE_FUSION_FC_RELU; +extern const char *G_OP_TYPE_REGION; extern std::unordered_map< std::string, std::pair, std::vector>> diff --git a/src/common/variant.h b/src/common/variant.h index 9d0aa3019fbfdd5acbaed8a1140bc58c33f7f438..00b8eb985d8f7fc22bb93a3e229aa387c358e257 100644 --- a/src/common/variant.h +++ b/src/common/variant.h @@ -84,7 +84,7 @@ struct Variant { if (type_id == typeid(T).hash_code()) { return *const_cast(reinterpret_cast(&data)); } else { - PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant "); + PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant"); exit(0); } } diff --git a/src/fpga/api/fpga_api.cpp b/src/fpga/api/fpga_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a913d6e39cddda97b347c0675717c265dfa89d18 --- /dev/null +++ b/src/fpga/api/fpga_api.cpp @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "fpga/api/fpga_api.h"
+
+namespace paddle {
+namespace mobile {
+namespace fpga {
+namespace api {
+
+static int fd = -1;
+static const char *device_path = "/dev/fpgadrv0";
+
+static inline int do_ioctl(int req, void *arg) { return ioctl(fd, req, arg); }
+
+int open_device() {
+  if (fd == -1) {
+    fd = open(device_path, O_RDWR);
+  }
+  return fd;
+}
+
+// memory management;
+void *fpga_malloc(size_t size) {
+  return reinterpret_cast<void *>(mmap64(NULL, size, PROT_READ | PROT_WRITE,
+                                         MAP_SHARED, fd, 0));
+}
+
+// NOTE: munmap expects the real mapping length; 0 is a placeholder until
+// allocation sizes are tracked.
+void fpga_free(void *ptr) { munmap(ptr, 0); }
+
+void fpga_copy(void *dest, const void *src, size_t num) {
+  memcpy(dest, src, num);
+}
+
+int ComputeFpgaConv(struct FpgaConvArgs) { return 0; }
+int ComputeFpgaPool(struct FpgaPoolArgs) { return 0; }
+int ComputeFpgaEWAdd(struct FpgaEWAddArgs) { return 0; }
+
+}  // namespace api
+}  // namespace fpga
+}  // namespace mobile
+}  // namespace paddle
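A sketch of how a caller is expected to drive this API: open the device once, stage buffers through the mmap-backed allocator, then copy data in and out. The buffer size and payload below are made up for illustration, and error handling is mostly elided:

```c++
#include "fpga/api/fpga_api.h"

int main() {
  namespace api = paddle::mobile::fpga::api;
  if (api::open_device() < 0) return 1;  // requires /dev/fpgadrv0

  // Shared CPU/FPGA buffer backed by the driver's mmap region.
  const size_t kBytes = 1024;            // illustrative size
  void *buf = api::fpga_malloc(kBytes);

  float weights[256] = {0};              // illustrative payload
  api::fpga_copy(buf, weights, sizeof(weights));

  api::fpga_free(buf);
  return 0;
}
```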
diff --git a/src/fpga/api/fpga_api.h b/src/fpga/api/fpga_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..2dfc285af4506c055f6780d7b3d393433c0904a8
--- /dev/null
+++ b/src/fpga/api/fpga_api.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+// memory management;
+
+namespace paddle {
+namespace mobile {
+namespace fpga {
+namespace api {
+
+int open_device();
+int close_device();
+
+void *fpga_malloc(size_t size);
+void fpga_free(void *ptr);
+void fpga_copy(void *dst, const void *src, size_t num);
+
+struct FpgaVersionArgs {
+  void *buf;
+};
+
+struct MemoryToPhysicalArgs {
+  const void *src;
+  uint64_t physical;
+};
+
+struct MemoryCopyArgs {
+  void *src;
+  void *dst;
+  size_t size;
+};
+
+struct FpgaQuantArgs {
+  float scale;
+};
+
+struct FpgaBNArgs {};
+
+struct FpgaConvArgs {
+  bool enable_BN = false;
+  bool enable_Relu = false;
+  struct FpgaBNArgs bn_parm;
+};
+
+struct FpgaPoolArgs {
+  bool enable_BN = false;
+  struct FpgaBNArgs bn_parm;
+};
+
+struct FpgaEWAddArgs {  // only support X + Y
+  bool enable_Relu = false;
+};
+
+int ComputeFpgaConv(struct FpgaConvArgs);
+int ComputeFpgaPool(struct FpgaPoolArgs);
+int ComputeFpgaEWAdd(struct FpgaEWAddArgs);
+
+#define IOCTL_FPGA_MAGIC 'FPGA'
+#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
+#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
+#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs)
+#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
+#define IOCTL_MEM_TOPHY _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryToPhysicalArgs)
+#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
+#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
+#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)
+
+}  // namespace api
+}  // namespace fpga
+}  // namespace mobile
+}  // namespace paddle
diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp
index 36b4663cb603d29bb60cfc297899d1c300e8ca91..765103c241a82ac224d707340f8b66ace827e335 100644
--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -28,6 +28,16 @@ vector<string> OperatorBase<Dtype>::GetOutKeys() const {
   return it->second.second;
 }
 
+template <typename Dtype>
+vector<string> OperatorBase<Dtype>::GetInputKeys() const {
+  auto it = op_input_output_key.find(type_);
+  if (it == op_input_output_key.end()) {
+    DLOG << type_ << " has no inputs";
+    return {};
+  }
+  return it->second.first;
+}
+
 template <typename Dtype>
 OperatorBase<Dtype>::OperatorBase(const std::string &type,
                                   const VariableNameMap &inputs,
@@ -49,6 +59,11 @@ template <typename Dtype>
 void OperatorBase<Dtype>::Run() const {
   RunImpl();
 #ifdef PADDLE_MOBILE_DEBUG
+  vector<string> input_keys = GetInputKeys();
+  for (const auto key : input_keys) {
+    Tensor *input = GetVarValue<framework::LoDTensor>(key, inputs_, *scope_);
+    DLOG << type_ << " input- " << key << "=" << *input;
+  }
   vector<string> output_keys = GetOutKeys();
   for (const auto key : output_keys) {
     Tensor *out_ = GetVarValue<framework::LoDTensor>(key, outputs_, *scope_);
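The new input-side logging resolves an op's variable names through op_input_output_key (see src/common/types.cpp above), which maps each op type to its canonical input and output keys. A self-contained sketch of that lookup, with a stub table standing in for the real one:

```c++
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using KeyTable = std::unordered_map<
    std::string,
    std::pair<std::vector<std::string>, std::vector<std::string>>>;

int main() {
  // Stub of op_input_output_key; the real table lives in src/common/types.cpp.
  KeyTable table = {{"conv2d", {{"Input"}, {"Output"}}},
                    {"fusion_fc", {{"X", "Y", "Z"}, {"Out"}}}};

  auto it = table.find("fusion_fc");
  if (it == table.end()) return 1;  // unknown op type: nothing to log
  for (const auto &key : it->second.first) {
    std::cout << "input key: " << key << "\n";  // prints X, Y, Z
  }
  return 0;
}
```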
diff --git a/src/framework/operator.h b/src/framework/operator.h
index 793551b0cd3eea290243c156c27616a34c37a3d2..084ac3c81185fe489fe1ca67589c1e8edb1d4fdf 100644
--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -61,6 +61,7 @@ class OperatorBase {
   virtual ~OperatorBase() {}
   void Run() const;
   std::vector<string> GetOutKeys() const;
+  std::vector<string> GetInputKeys() const;
   virtual void RunImpl() const = 0;
 
   virtual void Init() = 0;
@@ -118,6 +119,10 @@ class OperatorWithKernel : public OperatorBase<DeviceType> {
   virtual void InferShape() const = 0;
 
   void Init() {
+    //    for (auto i : this->inputs_) {
+    //      DLOG << i.first;
+    //      DLOG << i.second;
+    //    }
     PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), "  %s kernel init failed",
                           this->type_.c_str());
   }
@@ -146,7 +151,7 @@ class OpKernelBase {
   }
 #endif
   virtual void Compute(const P &para) const = 0;
-  virtual bool Init(P *para) { return true; };
+  virtual bool Init(P *para) { return true; }
   virtual ~OpKernelBase() = default;
 
  private:
diff --git a/src/framework/program/program-optimize/fusion_op_register.h b/src/framework/program/program-optimize/fusion_op_register.h
index 1cd6b1dd779f9bc9ff0f5be5513c4fa716d80b10..f16a65c28fb47e1cf4139588742ebe1073c3f3e6 100644
--- a/src/framework/program/program-optimize/fusion_op_register.h
+++ b/src/framework/program/program-optimize/fusion_op_register.h
@@ -42,8 +42,17 @@ class FusionOpRegister {
     matchers_[matcher->Type()] = shared_matcher;
   }
 
-  const std::map<std::string, std::shared_ptr<FusionOpMatcher>> Matchers() {
-    return matchers_;
+  const std::vector<std::shared_ptr<FusionOpMatcher>> Matchers() {
+    std::vector<std::shared_ptr<FusionOpMatcher>> matchers;
+    for (const auto& match : matchers_) {
+      matchers.push_back(match.second);
+    }
+    std::sort(matchers.begin(), matchers.end(),
+              [](std::shared_ptr<FusionOpMatcher> first,
+                 std::shared_ptr<FusionOpMatcher> second) {
+                return first->BeginNode().Depth() > second->BeginNode().Depth();
+              });
+    return matchers;
   }
 
  private:
diff --git a/src/framework/program/program-optimize/node.cpp b/src/framework/program/program-optimize/node.cpp
index e635e07eaf4484c3e390101c3b43fdaf24bbd2c6..a4e1db506da362df4fb61b39827d5e77ebc425eb 100644
--- a/src/framework/program/program-optimize/node.cpp
+++ b/src/framework/program/program-optimize/node.cpp
@@ -44,23 +44,6 @@ bool Node::operator==(const Node &in) {
   return true;
 }
 
-std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(int size) {
-  std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
-  OpDescs(size - 1, &op_descs);
-  return op_descs;
-}
-
-void Node::OpDescs(int index,
-                   std::vector<std::shared_ptr<framework::OpDesc>> *op_desc) {
-  if (index == 0) {
-    return;
-  }
-  op_desc->push_back(this->op_desc_);
-  for (auto &output : outputs_) {
-    output->OpDescs(index, op_desc);
-  }
-}
-
 std::shared_ptr<Node> Node::To(int size) {
   std::shared_ptr<Node> node = std::make_shared<Node>();
   this->To(size - 1, node);
diff --git a/src/framework/program/program-optimize/node.h b/src/framework/program/program-optimize/node.h
index 88bf1e16ed2a5fb3a038eadd546d63ffb3916f68..7eb179c243c28fe2668c3cf2f8f28f81312c0988 100644
--- a/src/framework/program/program-optimize/node.h
+++ b/src/framework/program/program-optimize/node.h
@@ -47,13 +47,10 @@ class Node {
       std::map<std::string, std::vector<std::pair<std::string, std::string>>>
           change,
       std::vector<std::shared_ptr<Node>> *removed_nodes);
-  std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(int size);
   std::shared_ptr<framework::OpDesc> OpDescOfNode() { return op_desc_; }
   std::string Type() { return type_; }
 
  private:
-  void OpDescs(int size,
-               std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
   void To(int index, std::shared_ptr<Node>);
   void Folder(
       std::shared_ptr<framework::OpDesc> op_desc,
diff --git a/src/framework/program/program-optimize/program_optimize.cpp b/src/framework/program/program-optimize/program_optimize.cpp
index 3619bc79f576651245aa322992df9d318c810cd4..82d33bc65d864e010fbe41b270b71ed98a21b33e 100644
--- a/src/framework/program/program-optimize/program_optimize.cpp
+++ b/src/framework/program/program-optimize/program_optimize.cpp
@@ -78,9 +78,8 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
   }
 
   for (auto &registed : FusionOpRegister::Instance()->Matchers()) {
-    std::string fusion_type = registed.first;
-    std::shared_ptr<FusionOpMatcher> matcher = registed.second;
-    //  DLOG << " registed node \n " << matcher->BeginNode();
+    std::string fusion_type = registed->Type();
+    std::shared_ptr<FusionOpMatcher> matcher = registed;
 
     auto match_vector = type_map[matcher->BeginType()];
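Matchers() now hands back the registered matchers sorted by descending begin-node depth, so the longest fusion pattern is tried first (e.g. conv+add+bn+relu before plain conv+add on the same subgraph). A toy illustration of that ordering, with a plain int standing in for BeginNode().Depth():

```c++
#include <algorithm>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

// Stand-in for FusionOpMatcher: only the fields the sort cares about.
struct ToyMatcher {
  std::string type;
  int depth;  // stand-in for BeginNode().Depth()
};

int main() {
  std::vector<std::shared_ptr<ToyMatcher>> matchers = {
      std::make_shared<ToyMatcher>(ToyMatcher{"fusion_conv_add", 2}),
      std::make_shared<ToyMatcher>(ToyMatcher{"fusion_conv_add_bn_relu", 4}),
  };
  // Deeper (longer) patterns first, mirroring FusionOpRegister::Matchers().
  std::sort(matchers.begin(), matchers.end(),
            [](const std::shared_ptr<ToyMatcher> &a,
               const std::shared_ptr<ToyMatcher> &b) {
              return a->depth > b->depth;
            });
  for (const auto &m : matchers) std::cout << m->type << "\n";
  return 0;  // prints fusion_conv_add_bn_relu before fusion_conv_add
}
```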
diff --git a/src/framework/program/program.h b/src/framework/program/program.h
index 5760efc826667d805695118b12e41efa0305553b..e500d500344d83204bf388401541259b90ea2f78 100644
--- a/src/framework/program/program.h
+++ b/src/framework/program/program.h
@@ -30,6 +30,7 @@ class Program {
   std::string model_path;
   std::string para_path;
   bool combined = false;
+  bool quantification = false;
 
  private:
 };
diff --git a/src/io/executor.cpp b/src/io/executor.cpp
index 480f48290cc1bbf4888832d76187a13a4915ec40..65f019d1e3c3f6f6bdb8a18a9ff99bb7ecb2012c 100644
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -154,7 +154,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
 
   tensor->Resize(framework::make_ddim(desc.Dims()));
 
-  void *memory = tensor;
+  void *memory = nullptr;
   int type_size = 0;
   switch (desc.DataType()) {
     case framework::VARTYPE_TYPE_FP16:
@@ -179,11 +179,25 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
     default:
       break;
   }
-
-  for (int n = 0; n < memory_size * type_size; ++n) {
-    static_cast<char *>(memory)[n] = (*data)[n];
+  if (program_.quantification) {
+    float min_value;
+    float max_value;
+
+    memcpy(&min_value, *data, sizeof(float));
+    memcpy(&max_value, *data + sizeof(float), sizeof(float));
+    *data += 2 * sizeof(float);
+    const float factor = (max_value - min_value) / 255.0;
+    uint8_t *uint8_data = (uint8_t *)(*data);
+    for (int k = 0; k < memory_size; ++k) {
+      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
+    }
+    *data += (memory_size * sizeof(uint8_t));
+  } else {
+    for (int n = 0; n < memory_size * type_size; ++n) {
+      static_cast<char *>(memory)[n] = (*data)[n];
+    }
+    (*data) += (sizeof(char) * memory_size * type_size);
   }
-  (*data) += (sizeof(char) * memory_size * type_size);
 }
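LoadMemory above implies the on-disk layout of a quantized parameter: two float32 values (min, max) followed by memory_size uint8 codes, each decoded as code * (max - min) / 255 + min. A standalone sketch of the round trip, with made-up weights:

```c++
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Encode: map [min, max] linearly onto the 256 uint8 codes.
  std::vector<float> weights = {-1.5f, 0.0f, 0.25f, 2.0f};  // made-up values
  const float min_v = -1.5f, max_v = 2.0f;
  const float factor = (max_v - min_v) / 255.0f;
  std::vector<uint8_t> codes;
  for (float w : weights) {
    codes.push_back(static_cast<uint8_t>((w - min_v) / factor + 0.5f));
  }
  // Decode, mirroring Executor::LoadMemory: code * factor + min.
  for (uint8_t c : codes) {
    std::printf("%f\n", c * factor + min_v);  // within one step of the original
  }
  return 0;
}
```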
diff --git a/src/io/loader.cpp b/src/io/loader.cpp
index 51e007a6ab4bce415628649a40f711903bceee92..9ed877d05d51dfbe7139ea2289fdb6480c62f88f 100644
--- a/src/io/loader.cpp
+++ b/src/io/loader.cpp
@@ -44,26 +44,29 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
 
 template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
-    const std::string &dirname, bool optimize, bool can_add_split) {
-  auto program =
-      this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
+    const std::string &dirname, bool optimize, bool quantification,
+    bool can_add_split) {
+  auto program = this->LoadProgram(dirname + "/__model__", optimize,
+                                   quantification, can_add_split);
   program.model_path = dirname;
   return program;
 }
 
 template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
-    const std::string &model_path, const std::string &para_path,
-    bool optimize) {
+    const std::string &model_path, const std::string &para_path, bool optimize,
+    bool quantification) {
   auto program = this->LoadProgram(model_path, optimize);
 
   program.para_path = para_path;
   program.combined = true;
+  program.quantification = quantification;
   return program;
 }
 
 template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
-    const std::string &model_path, bool optimize, bool can_add_split) {
+    const std::string &model_path, bool optimize, bool quantification,
+    bool can_add_split) {
   std::string model_filename = model_path;
   PaddleMobile__Framework__Proto__ProgramDesc *c_program;
   uint8_t *buf = NULL;
@@ -82,6 +85,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
 
   framework::Program<Dtype, P> program;
   program.originProgram = originProgramDesc;
+  program.quantification = quantification;
 
   auto scope = std::make_shared<framework::Scope>();
   program.scope = scope;
diff --git a/src/io/loader.h b/src/io/loader.h
index 5e3c53dc9db858f506a13d2105339038340344a6..512cee831f0a09f8223c07c531eb9d1c74e75d92 100644
--- a/src/io/loader.h
+++ b/src/io/loader.h
@@ -30,6 +30,7 @@ class Loader {
    * */
   const framework::Program<Dtype, P> Load(const std::string &dirname,
                                           bool optimize = false,
+                                          bool quantification = false,
                                           bool can_add_split = false);
 
   /*
@@ -38,11 +39,13 @@ class Loader {
    * */
   const framework::Program<Dtype, P> Load(const std::string &model_path,
                                           const std::string &para_path,
-                                          bool optimize = false);
+                                          bool optimize = false,
+                                          bool quantification = false);
 
  private:
   const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
                                                  bool optimize = false,
+                                                 bool quantification = false,
                                                  bool can_add_split = false);
 };
diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp
index cabdd799a0e7d561d8bc56c0913f1389c38f8907..5e2e209d64aa7a00b56a5bdbbff88cb3097b7b94 100644
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -26,7 +26,7 @@ void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
 
 template <typename Dtype, Precision P>
 bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
-                                  int batch_size) {
+                                  bool quantification, int batch_size) {
   if (loader_.get() == nullptr) {
     loader_ = std::make_shared<Loader<Dtype, P>>();
   } else {
@@ -35,7 +35,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
 
   if (executor_.get() == nullptr) {
     executor_ = std::make_shared<Executor<Dtype, P>>(
-        loader_->Load(dirname, optimize), batch_size, optimize);
+        loader_->Load(dirname, optimize, quantification), batch_size, optimize);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
@@ -46,7 +46,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
 template <typename Dtype, Precision P>
 bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
                                   const std::string &para_path, bool optimize,
-                                  int batch_size) {
+                                  bool quantification, int batch_size) {
   if (loader_.get() == nullptr) {
     loader_ = std::make_shared<Loader<Dtype, P>>();
   } else {
@@ -55,7 +55,8 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
 
   if (executor_.get() == nullptr) {
     executor_ = std::make_shared<Executor<Dtype, P>>(
-        loader_->Load(model_path, para_path, optimize), batch_size, optimize);
+        loader_->Load(model_path, para_path, optimize, quantification),
+        batch_size, optimize);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h
index 74c11471566c3db8a37ea2d62e0496e5d40cb3b7..5dc3ccb21dd7e67fbe9b5032d01046b12728dc64 100644
--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -39,14 +39,18 @@ class PaddleMobile {
    * @b load a fluid model stored as separate files
    * */
   bool Load(const std::string &dirname, bool optimize = false,
-            int batch_size = 1);
+            bool quantification = false, int batch_size = 1);
 
   /*
    * @b load combine format fluid mode
    * @b load a model stored in the combined format
    * */
   bool Load(const std::string &model_path, const std::string &para_path,
-            bool optimize = false, int batch_size = 1);
+            bool optimize = false, bool quantification = false,
+            int batch_size = 1);
+  /*
+   * @b set the thread count; takes effect when openmp is enabled in cmake
+   * */
   void SetThreadNum(int num);
 
   /*
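Putting the new flags together, loading a quantized combined-format model could look like the following; the paths are placeholders and the default template arguments of PaddleMobile are assumed:

```c++
#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> pm;
  pm.SetThreadNum(4);  // a no-op unless built with USE_OPENMP

  // optimize=true enables op fusion; quantification=true expects the
  // min/max + uint8 parameter layout produced by tools/quantification.
  bool ok = pm.Load("/sdcard/model/__model__", "/sdcard/model/params",
                    /*optimize=*/true, /*quantification=*/true);
  return ok ? 0 : 1;
}
```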
diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp
index 0252f3c07c06487720586b0f650e2179d247234f..178541953323b6ffd1a3339f8209c2839b37a784 100644
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -16,10 +16,32 @@ limitations under the License. */
 #include "memory/t_malloc.h"
 #include <cstring>
 
+#ifdef PADDLE_MOBILE_FPGA
+
+#include "fpga/api/fpga_api.h"
+
+#endif
+
 namespace paddle_mobile {
 namespace memory {
 const int MALLOC_ALIGN = 64;
 
+#ifdef PADDLE_MOBILE_FPGA
+namespace api = paddle::mobile::fpga::api;
+
+void Copy(void *dst, const void *src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+void *Alloc(size_t size) { return api::fpga_malloc(size); }
+
+void Free(void *ptr) {
+  if (ptr) {
+    api::fpga_free(ptr);
+  }
+}
+
+#else
 
 void Copy(void *dst, const void *src, size_t num) {
   std::memcpy(dst, src, num);
 }
@@ -42,5 +64,7 @@ void Free(void *ptr) {
   }
 }
 
+#endif
+
 }  // namespace memory
 }  // namespace paddle_mobile
diff --git a/src/operators/batchnorm_op.cpp b/src/operators/batchnorm_op.cpp
index 644a27c586375bc66d327e18ac5182e8fce2893b..f820908404ea637d9680c32d5c4b5568e191dd7e 100644
--- a/src/operators/batchnorm_op.cpp
+++ b/src/operators/batchnorm_op.cpp
@@ -26,7 +26,7 @@ void BatchNormOp<Dtype, T>::InferShape() const {
   auto x_dims = this->param_.InputX()->dims();
   this->param_.OutputY()->Resize(x_dims);
 }
-template class BatchNormOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/box_coder_op.cpp b/src/operators/box_coder_op.cpp
index dece07d5efcfae9629842aead04d0274b9d82c93..9e57c9021dac1b6857752989727c1c86051e33f7 100644
--- a/src/operators/box_coder_op.cpp
+++ b/src/operators/box_coder_op.cpp
@@ -47,7 +47,7 @@ void BoxCoderOp<Dtype, T>::InferShape() const {
   this->param_.OutputBox()->Resize(framework::make_ddim(
       {input_targetbox_dims[0], input_priorbox_dims[0], 4}));
 }
-template class BoxCoderOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/concat_op.cpp b/src/operators/concat_op.cpp
index 9c524df351549fd0141294be805d77b3f1057362..19d771ddd5884412624a0720368ecc80f92678ea 100644
--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
@@ -56,7 +56,6 @@ void ConcatOp<Dtype, T>::InferShape() const {
   this->param_.Out()->Resize(out_dims);
 }
 
-template class ConcatOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/conv_op.cpp b/src/operators/conv_op.cpp
index 1b00ed06eee2b1676667b9c54b8601c8872b6699..c4601995219b32db75f22c7c2ed959e18af85f36 100644
--- a/src/operators/conv_op.cpp
+++ b/src/operators/conv_op.cpp
@@ -48,8 +48,6 @@ void ConvOp<Dtype, T>::InferShape() const {
   this->param_.Output()->Resize(ddim);
 }
 
-template class ConvOp<CPU, float>;
-
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/depthwise_conv_op.cpp b/src/operators/depthwise_conv_op.cpp
index bee90781cd2de9d65bbbee3193cc922e743706de..8d6b6a143c37537be6de1e60cc095f1052136e26 100644
--- a/src/operators/depthwise_conv_op.cpp
+++ b/src/operators/depthwise_conv_op.cpp
@@ -49,8 +49,6 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
   this->param_.Output()->Resize(ddim);
 }
 
-template class DepthwiseConvOp<CPU, float>;
-
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/dropout_op.cpp b/src/operators/dropout_op.cpp
index f7f5ca2475171f5756ee8cf4f13754d07df8fe01..a632aa0c52b19c591467f94afb216245a596680b 100644
--- a/src/operators/dropout_op.cpp
+++ b/src/operators/dropout_op.cpp
@@ -22,7 +22,7 @@ void DropoutOp<Dtype, T>::InferShape() const {
   auto input_dims = this->param_.InputX()->dims();
   this->param_.Out()->Resize(input_dims);
 }
-template class DropoutOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/elementwise_add_op.cpp b/src/operators/elementwise_add_op.cpp
index 369589574139c7bc68debb7c55836926a3d5f6b2..49885f783417d61c6348fc4563e7306036994f17 100644
--- 
a/src/operators/elementwise_add_op.cpp +++ b/src/operators/elementwise_add_op.cpp @@ -24,7 +24,7 @@ void ElementwiseAddOp::InferShape() const { auto x_dim = this->param_.InputX()->dims(); this->param_.Out()->Resize(x_dim); } -template class ElementwiseAddOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp index c4357d7993cd91a306fec5856eaa6839e9ab6a6e..4447f2c699fc929805f15a265440803e6ff34b56 100644 --- a/src/operators/feed_op.cpp +++ b/src/operators/feed_op.cpp @@ -14,10 +14,7 @@ limitations under the License. */ #include "feed_op.h" namespace paddle_mobile { -namespace operators { - -template class FeedOp; -} +namespace operators {} } // namespace paddle_mobile namespace ops = paddle_mobile::operators; diff --git a/src/operators/fetch_op.cpp b/src/operators/fetch_op.cpp index cdbe413c955b931a16e716aa2e18d2a018a53bab..adbd61d5ec364a40b565059ceb5d5d49999c8436 100644 --- a/src/operators/fetch_op.cpp +++ b/src/operators/fetch_op.cpp @@ -14,10 +14,7 @@ limitations under the License. */ #include "fetch_op.h" namespace paddle_mobile { -namespace operators { - -template class FetchOp; -} +namespace operators {} } // namespace paddle_mobile namespace ops = paddle_mobile::operators; diff --git a/src/operators/fusion_conv_add.cpp b/src/operators/fusion_conv_add.cpp index b1dba23be0d8ea010b38844b1897381fbf578617..cdd6a6db2bb11ebf8dce2aca85630aa8805adf3e 100644 --- a/src/operators/fusion_conv_add.cpp +++ b/src/operators/fusion_conv_add.cpp @@ -45,7 +45,6 @@ void FusionConvAddOp::InferShape() const { this->param_.Output()->Resize(ddim); } -template class FusionConvAddOp; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/fusion_conv_add.h b/src/operators/fusion_conv_add.h index d23c99e5cd5e9cc50dc77672d9aaaf54dfdc545c..170df9ce33e4ab90297664fbc81d723e7c246f83 100644 --- a/src/operators/fusion_conv_add.h +++ b/src/operators/fusion_conv_add.h @@ -36,8 +36,6 @@ class FusionConvAddMatcher : public framework::FusionOpMatcher { void FolderNodes( framework::Node *node, std::vector> *removed_nodes) { - vector> origin_descs = - node->OpDescs(node_.Depth()); node->Folder(node_.Depth(), Type(), {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes); } @@ -68,11 +66,11 @@ class FusionConvAddOp : public framework::OperatorWithKernel< #ifdef PADDLE_MOBILE_CPU -//#ifndef CONV_ADD_REGISTER -// static framework::FusionOpRegistrar convadd_registrar( -// new FusionConvAddMatcher()); -//#define CONV_ADD_REGISTER -//#endif +#ifndef CONV_ADD_REGISTER +static framework::FusionOpRegistrar convadd_registrar( + new FusionConvAddMatcher()); +#define CONV_ADD_REGISTER +#endif #endif diff --git a/src/operators/fusion_conv_add_bn_relu_op.cpp b/src/operators/fusion_conv_add_bn_relu_op.cpp index 62839c1a5acaf89a3efef39bbe4a67c675da393b..16f4650a64ec0c363d5fa94ee27c15c73cf58a70 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.cpp +++ b/src/operators/fusion_conv_add_bn_relu_op.cpp @@ -44,7 +44,7 @@ void FusionConvAddBNReluOp::InferShape() const { framework::DDim ddim = framework::make_ddim(output_shape); this->param_.Output()->Resize(ddim); } -template class FusionConvAddBNReluOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/fusion_conv_add_bn_relu_op.h b/src/operators/fusion_conv_add_bn_relu_op.h index 389c76cc83a532fe706d911903a8412bb8bfb4ca..19e33465c06921e9a6a7beb77053f05a03a6c760 100644 --- a/src/operators/fusion_conv_add_bn_relu_op.h +++ 
b/src/operators/fusion_conv_add_bn_relu_op.h @@ -39,8 +39,6 @@ class FusionConvAddBNReluMatcher : public framework::FusionOpMatcher { void FolderNodes( framework::Node *node, std::vector> *removed_nodes) { - vector> origin_descs = - node->OpDescs(node_.Depth()); node->Folder(node_.Depth(), Type(), {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, {G_OP_TYPE_BATCHNORM, diff --git a/src/operators/fusion_conv_add_relu_op.cpp b/src/operators/fusion_conv_add_relu_op.cpp index 5575b52ce9866901a13c630a7509c7e5ec5401cb..18618886cccba08c7502b3e1d75fbba9b6916f56 100644 --- a/src/operators/fusion_conv_add_relu_op.cpp +++ b/src/operators/fusion_conv_add_relu_op.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef CONVADDRELU_OP +#ifdef FUSION_CONVADDRELU_OP #include "fusion_conv_add_relu_op.h" #include "operators/math/conv_func.h" diff --git a/src/operators/fusion_conv_add_relu_op.h b/src/operators/fusion_conv_add_relu_op.h index cf68fac8cf6dad4eb8469a543656311e5cedc9e7..50a4a2c7c64526c9a5dc1057829ed14f09357780 100644 --- a/src/operators/fusion_conv_add_relu_op.h +++ b/src/operators/fusion_conv_add_relu_op.h @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef CONVADDRELU_OP +#ifdef FUSION_CONVADDRELU_OP #pragma once diff --git a/src/operators/fusion_conv_bn_relu_op.cpp b/src/operators/fusion_conv_bn_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..49fe9c933a5a9695f2c18bd0921c2d36063dc065 --- /dev/null +++ b/src/operators/fusion_conv_bn_relu_op.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#ifdef FUSION_CONVBNRELU_OP
+
+#include "operators/fusion_conv_bn_relu_op.h"
+#include "operators/math/conv_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void FusionConvBNReluOp<Dtype, T>::InferShape() const {
+  auto in_dims = this->param_.Input()->dims();
+  auto filter_dims = this->param_.Filter()->dims();
+  const std::vector<int> &strides = this->param_.Strides();
+  std::vector<int> paddings = this->param_.Paddings();
+  int groups = this->param_.Groups();
+  std::vector<int> dilations = this->param_.Dilations();
+
+  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
+                         dilations.size() == paddings.size() &&
+                         paddings.size() == strides.size()),
+                        "ConvParam is not suitable");
+
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  for (size_t i = 0; i < strides.size(); ++i) {
+    output_shape.push_back(
+        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
+                             paddings[i], strides[i]));
+  }
+
+  framework::DDim ddim = framework::make_ddim(output_shape);
+  this->param_.Output()->Resize(ddim);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
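InferShape above delegates the size arithmetic to math::ConvOutputSize. A hedged restatement of that formula, assuming the conventional dilated-convolution definition:

```c++
#include <cstdio>

// Standard conv output-size arithmetic: the effective kernel spans
// dilation * (kernel - 1) + 1 input elements once dilation is applied.
int ConvOutputSizeSketch(int input, int kernel, int dilation, int padding,
                         int stride) {
  int dkernel = dilation * (kernel - 1) + 1;
  return (input + 2 * padding - dkernel) / stride + 1;
}

int main() {
  // A 3x3, stride-2, pad-1 convolution over a 224x224 input -> 112x112.
  std::printf("%d\n", ConvOutputSizeSketch(224, 3, 1, 1, 2));
  return 0;
}
```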
diff --git a/src/operators/fusion_conv_bn_relu_op.h b/src/operators/fusion_conv_bn_relu_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c2c1033ac0a4d6c8e3bc3f188a66884dd9e0642
--- /dev/null
+++ b/src/operators/fusion_conv_bn_relu_op.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVBNRELU_OP
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "framework/operator.h"
+#include "framework/program/program-optimize/fusion_op_register.h"
+#include "operators/kernel/conv_bn_relu_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+using std::string;
+using std::vector;
+class FusionConvBNReluMatcher : public framework::FusionOpMatcher {
+ public:
+  FusionConvBNReluMatcher() {
+    node_ = framework::Node(G_OP_TYPE_CONV);
+    node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
+        std::make_shared<framework::Node>(G_OP_TYPE_RELU);
+  }
+
+  void FolderNodes(
+      framework::Node *node,
+      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
+    node->Folder(node_.Depth(), Type(),
+                 {{G_OP_TYPE_BATCHNORM,
+                   {{"Scale", "Scale"},
+                    {"Mean", "Mean"},
+                    {"Bias", "Bias"},
+                    {"Variance", "Variance"}}}},
+                 removed_nodes);
+  }
+
+  std::string Type() { return G_OP_TYPE_FUSION_CONV_BN_RELU; }
+};
+
+template <typename DeviceType, typename T>
+class FusionConvBNReluOp : public framework::OperatorWithKernel<
+                               DeviceType, FusionConvBNReluParam,
+                               operators::ConvBNReluKernel<DeviceType, T>> {
+ public:
+  FusionConvBNReluOp(const string &type, const VariableNameMap &inputs,
+                     const VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs,
+                     std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, FusionConvBNReluParam,
+            operators::ConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
+                                                        attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, FusionConvBNReluParam,
+      operators::ConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+#ifdef PADDLE_MOBILE_CPU
+
+#ifndef FUSION_CONV_BN_RELU_REGISTER
+static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar(
+    new FusionConvBNReluMatcher());
+#define FUSION_CONV_BN_RELU_REGISTER
+#endif
+
+#endif
+
+#ifdef PADDLE_MOBILE_MALI_GPU
+
+#endif
+
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#ifdef PADDLE_MOBILE_CPU
+USE_OP_CPU(fusion_conv_bn_relu);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+#endif
+
+#endif
diff --git a/src/operators/fusion_dwconv_bn_relu_op.cpp b/src/operators/fusion_dwconv_bn_relu_op.cpp
index ba03a436c37cc8f1dcba94036fd6a3fbbd8fcaf3..e55295830e19b5b39a5ae2501e30170ffb1a7854 100644
--- a/src/operators/fusion_dwconv_bn_relu_op.cpp
+++ b/src/operators/fusion_dwconv_bn_relu_op.cpp
@@ -44,7 +44,7 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
   framework::DDim ddim = framework::make_ddim(output_shape);
   this->param_.Output()->Resize(ddim);
 }
-template class FusionDWConvBNReluOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/fusion_dwconv_bn_relu_op.h b/src/operators/fusion_dwconv_bn_relu_op.h
index bf95b51da43b8e9c0cec102876d48828b3749575..6f9f03e4936e082de802ced385060fecb9cc27a9 100644
--- a/src/operators/fusion_dwconv_bn_relu_op.h
+++ b/src/operators/fusion_dwconv_bn_relu_op.h
@@ -38,8 +38,6 @@ class FusionDWConvBNReluMatcher : public framework::FusionOpMatcher {
   void FolderNodes(
       framework::Node *node,
       std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
-    vector<std::shared_ptr<framework::OpDesc>> origin_descs =
-        node->OpDescs(node_.Depth());
     node->Folder(node_.Depth(), Type(),
                  {{G_OP_TYPE_BATCHNORM,
                    {{"Scale", "Scale"},
diff --git a/src/operators/fusion_fc_op.cpp b/src/operators/fusion_fc_op.cpp
index 57a8b1b53f2f98b3218ee8fc40c6c9774ec5a5c7..d564d4d88c16ee09382a9b2dae275807ec4bdb4b 100644
--- a/src/operators/fusion_fc_op.cpp
+++ b/src/operators/fusion_fc_op.cpp
@@ -50,7 +50,6 @@ void FusionFcOp<Dtype, T>::InferShape() const {
   this->param_.Out()->Resize(ddim);
 }
-template class FusionFcOp<CPU, float>;
+
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/im2sequence_op.cpp b/src/operators/im2sequence_op.cpp
index 273ce462d0aa423a6bf023186c6a579e975dfb11..3c929af9cf0a8a1550f197ffdb42ee590cd43235 100644
--- a/src/operators/im2sequence_op.cpp
+++ b/src/operators/im2sequence_op.cpp
@@ -47,8 +47,6 @@ void Im2SequenceOp<Dtype, T>::InferShape() const {
   this->param_.Output()->Resize(ddim);
 }
 
-template class Im2SequenceOp<CPU, float>;
-
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/arm/conv_add_relu_kernel.cpp b/src/operators/kernel/arm/conv_add_relu_kernel.cpp
index 356dd191e761afc5d5b6bfacd250f90ae31017b2..8414b7374dd0ed2b10784563dbac9c1565d66f4c 100644
--- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_relu_kernel.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef FUSION_CONVADD_RELU_OP
+#ifdef FUSION_CONVADDRELU_OP
 
 #include "operators/kernel/conv_add_relu_kernel.h"
 #include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h"
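The ConvBNReluKernel added in the next diff folds batch norm into the convolution at Init time: new_scale = gamma / sqrt(variance + epsilon) and new_bias = beta - mean * new_scale, so inference needs only one multiply-add per output element. A standalone sketch of that folding, with made-up channel statistics:

```c++
#include <cmath>
#include <cstdio>

int main() {
  const int C = 2;                 // channel count (illustrative)
  float mean[C] = {0.1f, -0.2f};
  float variance[C] = {1.0f, 4.0f};
  float scale[C] = {1.5f, 0.5f};   // batch-norm gamma
  float bias[C] = {0.0f, 0.3f};    // batch-norm beta
  const float epsilon = 1e-5f;

  float new_scale[C], new_bias[C];
  for (int i = 0; i < C; ++i) {
    float inv_std = 1.0f / std::sqrt(variance[i] + epsilon);
    new_scale[i] = inv_std * scale[i];
    new_bias[i] = bias[i] - mean[i] * new_scale[i];
  }
  // y = conv_out * new_scale[c] + new_bias[c] now equals bn(conv_out).
  for (int i = 0; i < C; ++i) std::printf("%f %f\n", new_scale[i], new_bias[i]);
  return 0;
}
```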
diff --git a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..23f06c1f0b8a0ed3f22ca9d23d24ae44c59f3618
--- /dev/null
+++ b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
@@ -0,0 +1,68 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_CONVBNRELU_OP
+
+#include "operators/kernel/conv_bn_relu_kernel.h"
+#include "operators/kernel/central-arm-func/conv_bn_relu_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvBNReluKernel<CPU, float>::Init(FusionConvBNReluParam *param) {
+  const Tensor *mean = param->InputMean();
+  const Tensor *variance = param->InputVariance();
+  const Tensor *scale = param->InputScale();
+  const Tensor *bias = param->InputBias();
+  const float epsilon = param->Epsilon();
+
+  //  DLOG << "variance: " << *variance;
+
+  auto mean_ptr = mean->data<float>();
+  auto variance_ptr = variance->data<float>();
+  auto scale_ptr = scale->data<float>();
+  auto bias_ptr = bias->data<float>();
+
+  const int C = mean->numel();
+  float inv_std_ptr[C];
+  for (int i = 0; i < C; i++) {
+    inv_std_ptr[i] =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+  }
+  Tensor *new_scale = new Tensor();
+  Tensor *new_bias = new Tensor();
+  auto new_scale_ptr = new_scale->mutable_data<float>({C});
+  auto new_bias_ptr = new_bias->mutable_data<float>({C});
+  for (int i = 0; i < C; i++) {
+    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+  }
+
+  param->SetNewScale(new_scale);
+  param->SetNewBias(new_bias);
+  return true;
+}
+
+template <>
+void ConvBNReluKernel<CPU, float>::Compute(
+    const FusionConvBNReluParam &param) const {
+  ConvBNReluCompute<float>(param);
+}
+template class ConvBNReluKernel<CPU, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/batchnorm_arm_func.h b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h
index b2af17eb4aaf0a7ef98442f589162a3b6f371a3b..cc591035065e4cbbe71ff8f6bd6cbab9c6fe9e79 100644
--- a/src/operators/kernel/central-arm-func/batchnorm_arm_func.h
+++ b/src/operators/kernel/central-arm-func/batchnorm_arm_func.h
@@ -54,7 +54,40 @@ void BatchnormCompute(const BatchNormParam &param) {
 
   int HXW = H * W;
 
-#ifdef ARMV7
+#if __ARM_NEON
+#if __aarch64__
+  float *inv_std_ptr = new float[C];
+  for (int i = 0; i < C; i++) {
+    inv_std_ptr[i] =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+  }
+
+  Tensor new_scale;
+  auto new_scale_ptr = new_scale.mutable_data<float>(framework::make_ddim({C}));
+  Tensor new_bias;
+  auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
+
+  /// ((x - est_mean) * (inv_var) * scale + bias equal to
+  /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
+  for (int i = 0; i < C; i++) {
+    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+    {
+      for (int n = 0; n < N; n++) {
+        for (int h = 0; h < H; h++) {
+          int tmp_index = n * stride0 + i * stride1 + h * stride2;
+          for (int w = 0; w < W; w++) {
+            int index = tmp_index + w;
+            out_ptr[index] =
+                input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
+          }
+        }
+      }
+    }
+  }
+  delete[] inv_std_ptr;
+#else
+
   if (HXW > 32) {
     int NXC = N * C;
     float *inv_std_ptr = new float[NXC * 4];
@@ -229,6 +262,7 @@ void BatchnormCompute(const BatchNormParam &param) {
 
     delete[] inv_std_ptr;
   }
+#endif
 #else
   float *inv_std_ptr = new float[C];
   for (int i = 0; i < C; i++) {
diff --git a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
index 6aadbab95c591d4286fdbb3c3f01a291cdd90429..e8929e3e94073d384d24f63b5aa73e51e353fa26 100644
--- a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
+++ 
b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef FUSION_CONVADD_RELU_OP +#ifdef FUSION_CONVADDRELU_OP #pragma once #include diff --git a/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..f18d67749b96cd0ee2d84c2731af8a2c3e136db1 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h @@ -0,0 +1,139 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVBNRELU_OP + +#pragma once +#include +#include "operators/math/depthwise_conv_3x3.h" +#include "operators/op_param.h" +namespace paddle_mobile { +namespace operators { +void ConvBNReluBasic(const FusionConvBNReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + + Tensor *output = param.Output(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 
1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + math::matmulWithBn( + filter_slice, false, col_matrix, false, static_cast(1), + &out_slice, static_cast(0), true, &new_scale, &new_bias, g); + } + } +} + +template +void ConvBNReluCompute(const FusionConvBNReluParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + // param.Output(), param.NewScale(), + // param.NewBias(), 1); + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else { + ConvBNReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/pool_arm_func.h b/src/operators/kernel/central-arm-func/pool_arm_func.h index 892dca2ea40d40484b4c32a57f8633849cc9d038..6179df5b0c11ad2a2e19384989029696e9d6c266 100644 --- a/src/operators/kernel/central-arm-func/pool_arm_func.h +++ b/src/operators/kernel/central-arm-func/pool_arm_func.h @@ -76,15 +76,20 @@ void PoolCompute(const PoolParam ¶m) { } } else if (ksize[0] == 2 && ksize[0] == ksize[1]) { -#ifndef IOS +#if __ARM_NEON +#if __aarch64__ + PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); +#else if (pooling_type == "max") { math::Pool2x2Max(strides, paddings, in_x, out); } else if (pooling_type == "avg") { math::Pool2x2Avg(strides, paddings, in_x, out); } +#endif #else PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); -#endif +#endif // __ARM_NEON + } else { PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); } diff --git a/src/operators/kernel/central-arm-func/sigmoid_arm_func.h b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h index daf6ad0e472515c8034a400dfc73de608f5b12d2..c612c4b092143ef8925f81a6d6fefe9cd9dff25b 100644 --- a/src/operators/kernel/central-arm-func/sigmoid_arm_func.h +++ b/src/operators/kernel/central-arm-func/sigmoid_arm_func.h @@ -68,6 +68,7 @@ void sigmoid(const Tensor *X, Tensor *Y) { input_outer_ptr++; } } +#else #endif } diff --git a/src/operators/kernel/conv_add_relu_kernel.h b/src/operators/kernel/conv_add_relu_kernel.h index 3f36d80c4781aebea756b04e340d056a79cfd7d7..931313273d150fa1ad159e7069fbc3812d6e6657 100644 --- a/src/operators/kernel/conv_add_relu_kernel.h +++ 
b/src/operators/kernel/conv_add_relu_kernel.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef FUSION_CONVADD_RELU_OP +#ifdef FUSION_CONVADDRELU_OP #include #include "framework/ddim.h" diff --git a/src/operators/kernel/conv_bn_relu_kernel.h b/src/operators/kernel/conv_bn_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c9d4df5d8f597deebaf2b53491851b7ce03fc7aa --- /dev/null +++ b/src/operators/kernel/conv_bn_relu_kernel.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef FUSION_CONVBNRELU_OP + +#include +#include "framework/ddim.h" +#include "framework/operator.h" +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::DDim; +using framework::OpKernelBase; + +template +class ConvBNReluKernel + : public OpKernelBase { + public: + void Compute(const FusionConvBNReluParam ¶m) const; + bool Init(FusionConvBNReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/lrn_op.cpp b/src/operators/lrn_op.cpp index 1a5a8eccc1fc314d27517db8bc286035e573c9be..dde9123edf3568020f933bb7375be99e40f2367b 100644 --- a/src/operators/lrn_op.cpp +++ b/src/operators/lrn_op.cpp @@ -24,7 +24,7 @@ void LrnOp::InferShape() const { auto x_dims = this->param_.InputX()->dims(); this->param_.Out()->Resize(x_dims); } -template class LrnOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp index 5db676564e190bf40e8af437ba68aee80b5a5af3..7e353c29b80279f895ad6d0150b31eb1703d97d4 100644 --- a/src/operators/math/depthwise_conv_3x3.cpp +++ b/src/operators/math/depthwise_conv_3x3.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "operators/math/depthwise_conv_3x3.h" -#ifdef __ARM_NEON +#if __ARM_NEON #include #endif #include @@ -23,7 +23,6 @@ namespace math { void DepthwiseConv3x3(const Tensor *input, vector strides, vector paddings, const Tensor *filter, Tensor *bias, Tensor *output, bool if_bias) { -#ifdef __ARM_NEON const int batch_size = input->dims()[0]; const int input_height = input->dims()[2]; @@ -181,7 +180,27 @@ void DepthwiseConv3x3(const Tensor *input, vector strides, } } else { -#if defined(ARMV17) +#if __ARM_NEON +#if __aarch64__ + const float32x4_t data1 = vld1q_f32(pos1); + const float32x4_t data2 = vld1q_f32(pos2); + const float32x4_t data3 = vld1q_f32(pos3); + + const float32x4_t v_filter1 = vld1q_f32(filter1); + const float32x4_t v_filter2 = vld1q_f32(filter2); + const float32x4_t v_filter3 = vld1q_f32(filter3); + float32x4_t mula = vmulq_f32(data1, v_filter1); + mula = vmlaq_f32(mula, data2, v_filter2); + mula = vmlaq_f32(mula, data3, v_filter3); + float32x2_t res = vpadd_f32( + vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula)); + res = vpadd_f32(res, res); + if (if_bias) { + output_data[ph * output_width + pw] += vget_lane_f32(res, 0); + } else { + output_data[ph * output_width + pw] = vget_lane_f32(res, 0); + } +#else asm volatile( "vld1.32 {q1}, [%[pos1]] \n\t" @@ -209,26 +228,10 @@ void DepthwiseConv3x3(const Tensor *input, vector strides, [filter2] "r"(filter2), [filter3] "r"(filter3), [output_ptr] "r"(output_ptr), [zero] "r"(zero) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); +#endif // __aarch64__ #else - const float32x4_t data1 = vld1q_f32(pos1); - const float32x4_t data2 = vld1q_f32(pos2); - const float32x4_t data3 = vld1q_f32(pos3); - const float32x4_t v_filter1 = vld1q_f32(filter1); - const float32x4_t v_filter2 = vld1q_f32(filter2); - const float32x4_t v_filter3 = vld1q_f32(filter3); - float32x4_t mula = vmulq_f32(data1, v_filter1); - mula = vmlaq_f32(mula, data2, v_filter2); - mula = vmlaq_f32(mula, data3, v_filter3); - float32x2_t res = vpadd_f32( - vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula)); - res = vpadd_f32(res, res); - if (if_bias) { - output_data[ph * output_width + pw] += vget_lane_f32(res, 0); - } else { - output_data[ph * output_width + pw] = vget_lane_f32(res, 0); - } -#endif +#endif // __ARM_NEON } } } @@ -239,12 +242,11 @@ void DepthwiseConv3x3(const Tensor *input, vector strides, input_data += input_batch_stride; output_data += output_batch_stride; } -#endif } void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, Tensor *output, Tensor *bias, bool if_bias) { -#ifdef __ARM_NEON +#if __ARM_NEON const float *input_data = input->data(); const float *filter_data = filter->data(); float *output_data = output->data(); @@ -520,7 +522,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, Tensor *output, const Tensor *new_scale, const Tensor *new_bias, bool if_relu) { -#ifdef __ARM_NEON +#if __ARM_NEON const float *input_data = input->data(); const float *filter_data = filter->data(); float *output_data = output->data(); @@ -824,7 +826,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter, Tensor *output, const Tensor *new_scale, const Tensor *new_bias, bool if_relu) { -#ifdef __ARM_NEON +#if __ARM_NEON const int batch_size = input->dims()[0]; @@ -1022,7 +1024,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const 
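The aarch64 branch added above computes the same 3x3 dot product as the 32-bit inline assembly it sits beside: three vmla accumulations, the unused fourth lane zeroed with vsetq_lane_f32, then two pairwise adds to collapse the vector to a single float. A scalar reference of what one output element works out to, as a sketch:

```c++
// Scalar reference for the NEON reduction above (a sketch): one output
// element of a 3x3 depthwise convolution is the dot product of three input
// rows with three filter rows. The intrinsics form the same sum four lanes
// at a time, zero the spare lane, then vpadd down to a single float.
float Dot3x3(const float *row1, const float *row2, const float *row3,
             const float *f1, const float *f2, const float *f3) {
  float sum = 0.0f;
  for (int i = 0; i < 3; ++i) {
    sum += row1[i] * f1[i] + row2[i] * f2[i] + row3[i] * f3[i];
  }
  return sum;
}
```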
Tensor *input, const Tensor *filter, void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, Tensor *output, Tensor bias, bool if_bias) { -#ifdef __ARM_NEON +#if __ARM_NEON const float *input_data = input->data(); const float *filter_data = filter->data(); float *output_data = output->data(); @@ -1225,7 +1227,7 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, Tensor *output, const Tensor *new_scale, const Tensor *new_bias, bool if_relu) { -#ifdef __ARM_NEON +#if __ARM_NEON const float *input_data = input->data(); const float *filter_data = filter->data(); float *output_data = output->data(); diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index bb91adcc4db412db137fdc12831bad75e069e38c..b9b61f4d1c59a0e2c8e7822742c54472ad540981 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -15,7 +15,7 @@ limitations under the License. */ #include "operators/math/gemm.h" #include "common/log.h" #include "memory/t_malloc.h" -#ifndef X86 +#if __ARM_NEON #include #endif #ifdef _OPENMP @@ -33,6 +33,7 @@ float *packedA; float *packedB; float *packedC; float *zero; +/* // 将A矩阵分块复制到连续内存(ColMajor) void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, float *buffer) { @@ -60,9 +61,39 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, } } +// 将B矩阵分块复制到连续内存(ColMajor) +void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { + int i, j; + const float *Bj, *Bj1, *Bj2, *Bj3; + for (j = 0; j < n - n_tail; j += NR) { + Bj = &B(0, j); + Bj1 = &B(0, j + 1); + Bj2 = &B(0, j + 2); + Bj3 = &B(0, j + 3); + for (i = 0; i < k; ++i) { + *buffer++ = *Bj++; + *buffer++ = *Bj1++; + *buffer++ = *Bj2++; + *buffer++ = *Bj3++; + } + } + if (n_tail != 0) { + for (i = 0; i < k; ++i) { + for (int j = n - n_tail; j < n; ++j) { + *buffer++ = B(i, j); + } + for (int j = n; j < n + (NR - n_tail); ++j) { + *buffer++ = 0; + } + } + } +} +*/ + // 将A矩阵分块复制到连续内存(RowMajor) -void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, - float *buffer) { +void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { const float *a0, *a1, *a2, *a3; for (int i = 0; i < m - m_tail; i += MR) { a0 = A + i * lda; @@ -100,49 +131,94 @@ void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, } } -// 将B矩阵分块复制到连续内存(ColMajor) -void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, - float *buffer) { - int i, j; - const float *Bj, *Bj1, *Bj2, *Bj3; - for (j = 0; j < n - n_tail; j += NR) { - Bj = &B(0, j); - Bj1 = &B(0, j + 1); - Bj2 = &B(0, j + 2); - Bj3 = &B(0, j + 3); - for (i = 0; i < k; ++i) { - *buffer++ = *Bj++; - *buffer++ = *Bj1++; - *buffer++ = *Bj2++; - *buffer++ = *Bj3++; +void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer) { + const float *a0, *a1, *a2, *a3, *a4, *a5; + for (int i = 0; i < m - m_tail; i += MR) { + a0 = A + i * lda; + a1 = A + (i + 1) * lda; + a2 = A + (i + 2) * lda; + a3 = A + (i + 3) * lda; + a4 = A + (i + 4) * lda; + a5 = A + (i + 5) * lda; + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; + *buffer++ = *a4++; + *buffer++ = *a5++; } } - if (n_tail != 0) { - for (i = 0; i < k; ++i) { - for (int j = n - n_tail; j < n; ++j) { - *buffer++ = B(i, j); - } - for (int j = n; j < n + (NR - n_tail); ++j) { - *buffer++ = 0; - } + int i = m - m_tail; + 
a0 = &A(i, 0); + a1 = a0 + lda; + a2 = a0 + 2 * lda; + a3 = a0 + 3 * lda; + a4 = a0 + 4 * lda; + a5 = a0 + 5 * lda; + if (m_tail != 0) { + if (m_tail <= 5) { + a5 = zero; + } + if (m_tail <= 4) { + a4 = zero; + } + if (m_tail <= 3) { + a3 = zero; + } + if (m_tail <= 2) { + a2 = zero; + } + if (m_tail <= 1) { + a1 = zero; + } + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; + *buffer++ = *a4++; + *buffer++ = *a5++; } } } // 将B矩阵分块复制到连续内存(RowMajor) -void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, - float *buffer) { +void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer) { const float *b0; for (int j = 0; j < n - n_tail; j += NR) { for (int i = 0; i < k; ++i) { b0 = &B(i, j); +#if __ARM_NEON +#if __aarch64__ + asm volatile( + "prfm pldl1keep, [%[b0]] \n\t" + "ld1 {v0.4s, v1.4s}, [%[b0]] \n\t" + "st1 {v0.4s, v1.4s}, [%[buffer]], #32 \n\t" + : [buffer] "+r"(buffer) + : [b0] "r"(b0) + : "memory", "v0", "v1"); +#else asm volatile( - "pld [%[b0]] \n\t" - "vld1.32 {q0, q1}, [%[b0]] \n\t" - "vst1.32 {q0, q1}, [%[buffer]]! \n\t" + "pld [%[b0]] \n\t" + "vld1.32 {q0, q1}, [%[b0]] \n\t" + "vst1.32 {q0, q1}, [%[buffer]]! \n\t" : [buffer] "+r"(buffer) : [b0] "r"(b0) - : "memory", "q0", "q0"); + : "memory", "q0", "q1"); +#endif // __aarch64__ +#else + *buffer++ = *b0++; + *buffer++ = *b0++; + *buffer++ = *b0++; + *buffer++ = *b0++; + *buffer++ = *b0++; + *buffer++ = *b0++; + *buffer++ = *b0++; + *buffer++ = *b0++; +#endif // __ARM_NEON } } if (n_tail != 0) { @@ -165,7 +241,8 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); } } @@ -195,7 +272,8 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, for (int j = 0; j < nc; j += NR) { for (int i = 0; i < mc; i += MR) { // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); } } @@ -206,8 +284,10 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, } } -#if defined(IOS) -void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) { +#if __ARM_NEON +#if __aarch64__ + +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { // init C float32x4_t cv0 = vdupq_n_f32(0.0); float32x4_t cv1 = vdupq_n_f32(0.0); @@ -234,30 +314,271 @@ void AddDot4x4(int k, const float *a, const float *b, float *C, int ldc) { a += MR; b += NR; } - float32x4x4_t cv = {cv0, cv1, cv2, cv3}; - int i, j; - for (i = 0; i < mc; ++i) { - for (j = 0; j < nc; ++j) { - if (beta == 0.0) { - C(i, j) = 0.0; - } else if (beta != 1.0) { - C(i, j) *= beta; + + vst1q_f32(c, cv0); + vst1q_f32(c + ldc, cv1); + vst1q_f32(c + 2 * ldc, cv2); + vst1q_f32(c + 3 * ldc, cv3); + // float32x4x4_t cv = {cv0, cv1, cv2, cv3}; +} + +void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { + // init C + float32x4_t cv0 = vdupq_n_f32(0.0); + float32x4_t cv1 = vdupq_n_f32(0.0); + float32x4_t cv2 = vdupq_n_f32(0.0); + float32x4_t cv3 = vdupq_n_f32(0.0); + float32x4_t cv4 = vdupq_n_f32(0.0); 
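PackMatrixB_8c above fixes the layout contract the micro-kernel relies on: each k-step of an 8-wide column panel of B is stored contiguously, so the inner loop can stream two q registers per step with unit stride. A portable scalar equivalent of the full-panel path, as a sketch:

```c++
#include <cstring>

// Scalar equivalent of the NEON full-panel copy in PackMatrixB_8c (a
// sketch): for the column panel starting at j, emit the 8 consecutive
// elements B(i, j..j+7) for every row i, so the packed buffer is read with
// unit stride by the micro-kernel. NR == 8 here.
void PackPanelB8(int k, const float *B, int ldb, int j, float *buffer) {
  for (int i = 0; i < k; ++i) {
    std::memcpy(buffer, B + i * ldb + j, 8 * sizeof(float));
    buffer += 8;
  }
}
```

The n_tail branch pads the last partial panel with zeros so the micro-kernel never needs a bounds check.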
+ float32x4_t cv5 = vdupq_n_f32(0.0); + float32x4_t cv6 = vdupq_n_f32(0.0); + float32x4_t cv7 = vdupq_n_f32(0.0); + + float32x4_t av; + float32x4_t bv0; + float32x4_t bv1; + + float32x2_t av01; + float32x2_t av23; + + for (int p = 0; p < k; p += 1) { + av = vld1q_f32(a); + bv0 = vld1q_f32(b); + bv1 = vld1q_f32(b + 4); + + av01 = vget_low_f32(av); + cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0); + cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0); + cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1); + cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1); + av23 = vget_high_f32(av); + cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0); + cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0); + cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1); + cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1); + + a += MR; + b += NR; + } + + vst1q_f32(c, cv0); + vst1q_f32(c + 4, cv1); + vst1q_f32(c + ldc, cv2); + vst1q_f32(c + ldc + 4, cv3); + vst1q_f32(c + 2 * ldc, cv4); + vst1q_f32(c + 2 * ldc + 4, cv5); + vst1q_f32(c + 3 * ldc, cv6); + vst1q_f32(c + 3 * ldc + 4, cv7); +} + +// 分块矩阵乘法结果回写 +// C = A * B +void WriteBasic(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; } - if (j == 0) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 0); - } else if (j == 1) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 1); - } else if (j == 2) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 2); - } else if (j == 3) { - C(i, j) += alpha * vgetq_lane_f32(cv.val[i], 3); + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); } } } } -} // namespace math -#elif defined(ARMV7) +// C = alpha * A * B + beta * C +void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {} + +// C = A * B + C +void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } + } +} + +// C = A * B + C, relu(C) +void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv1 = vld1q_f32(C_ptr); + cv = vaddq_f32(cv, cv1); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) 
{ + vst1q_lane_f32(C_ptr, cv, 2); + } + } + } +} + +// C = A * B, batchnorm(C) +void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, + float *new_bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t cv1; + float32x4_t bias; + float32x2_t scale; + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + bias = vld1q_dup_f32(new_bias); + scale = vld1_dup_f32(new_scale); + new_bias++; + new_scale++; + float scale0 = vget_lane_f32(scale, 0); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + C_ptr++; + } + } + } +} + +// C = A * B, batchnorm(C), relu(C) +void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias) { + int nc1 = nc / 4; + int _nc1 = nc % 4; + + float *c_ptr, *C_ptr; + float32x4_t cv; + float32x4_t bias; + float32x2_t scale; + float32x4_t zero = vdupq_n_f32(0.0); + for (int i = 0; i < mc; ++i) { + c_ptr = c + i * NC; + C_ptr = C + i * ldc; + bias = vld1q_dup_f32(new_bias); + scale = vld1_dup_f32(new_scale); + new_bias++; + new_scale++; + float scale0 = vget_lane_f32(scale, 0); + for (int j = 0; j < nc1; ++j) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + cv = vmaxq_f32(cv, zero); + vst1q_f32(C_ptr, cv); + c_ptr += 4; + C_ptr += 4; + } + if (_nc1 != 0) { + cv = vld1q_f32(c_ptr); + cv = vmlaq_n_f32(bias, cv, scale0); + cv = vmaxq_f32(cv, zero); + if (_nc1 >= 1) { + vst1q_lane_f32(C_ptr, cv, 0); + C_ptr++; + } + if (_nc1 >= 2) { + vst1q_lane_f32(C_ptr, cv, 1); + C_ptr++; + } + if (_nc1 >= 3) { + vst1q_lane_f32(C_ptr, cv, 2); + } + } + } +} + +#else + void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { const float *a_ptr, *b_ptr; a_ptr = a; @@ -328,205 +649,61 @@ void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { "q10", "q11", "q12", "q13"); } -#else -void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { - float *c0, *c1, *c2, *c3; - c0 = c; - c1 = c + ldc; - c2 = c + 2 * ldc; - c3 = c + 3 * ldc; - for (int p = 0; p < k; p += 1) { - // first row - c0[0] += a[0] * b[0]; - c0[1] += a[0] * b[1]; - c0[2] += a[0] * b[2]; - c0[3] += a[0] * b[3]; +/* +void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu) { + float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); - // second row - c1[0] += a[1] * b[0]; - c1[1] += a[1] * b[1]; - c1[2] += a[1] * b[2]; - c1[3] += a[1] * b[3]; + const float *a0, *b0, *b1, *b2, *b3; + float *c0, *C0; - // third row - c2[0] += a[2] * b[0]; - c2[1] += a[2] * b[1]; - c2[2] += a[2] * b[2]; - c2[3] += a[2] * b[3]; + int volatile kc1 = k / 4; + int volatile kc2 = k % 4; + int volatile nc1 = n / 16; + int _nc1 = n % 16; + int volatile nc2 = _nc1 / 4; + int volatile nc3 = _nc1 % 4; + for (int i = 0; i < kc1; i++) { + a0 = A + i * 4; + b0 = B + i * 4 * ldb; + b1 = b0 + ldb; + b2 = b1 + ldb; + b3 = b2 + ldb; + c0 = bufferC; + asm volatile( + "pld [%[a0], #16] \n\t" + "vld1.32 {q0}, [%[a0]] \n\t" - // fourth row - c3[0] += a[3] * b[0]; - c3[1] += a[3] * b[1]; - c3[2] += a[3] * b[2]; 
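WriteWithBn and WriteWithBnRelu above fold batch norm into the write-back: one scale/bias pair applies to a whole output row (a channel), and the relu variant clamps at zero before storing. Their per-element semantics in scalar form, as a sketch (buf_stride stands in for the packed buffer's row stride, NC in the code above):

```c++
#include <algorithm>

// Scalar semantics of WriteWithBnRelu (a sketch): every element of row i is
// scaled and shifted by that row's folded batch-norm parameters, then
// clamped at zero. The NEON version does this four lanes at a time via
// vmlaq_n_f32 and vmaxq_f32, with lane-wise stores for the nc % 4 tail.
void WriteBnReluRef(int mc, int nc, const float *c, int buf_stride, float *C,
                    int ldc, const float *new_scale, const float *new_bias) {
  for (int i = 0; i < mc; ++i) {
    for (int j = 0; j < nc; ++j) {
      C[i * ldc + j] =
          std::max(0.0f, c[i * buf_stride + j] * new_scale[i] + new_bias[i]);
    }
  }
}
```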
- c3[3] += a[3] * b[3]; + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" - a += 4; - b += 4; - } -} + "cmp %[i], #0 \n\t" + "beq i_eq0_%= \n\t" + "bne i_ne0_%= \n\t" -#endif + "i_eq0_%=: \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "b gemm_nc1_%= \n\t" -// 32位 float 矩阵乘法 -void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, bool relu) { - // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) - // L2 cache is 0.5~4 Mib (Contex-A72 cluster) - int L1 = 30 * 1024; - int L2 = 1 * 1024 * 1024; + "i_ne0_%=: \n\t" + "pld [%[c0], #64] \n\t" + "vld1.32 {q10, q11}, [%[c0]]! \n\t" + "vld1.32 {q12, q13}, [%[c0]] \n\t" + "sub %[c0], %[c0], #32 \n\t" - KC = k; - MC = L2 / (2 * KC * sizeof(float)); - NC = MC; - - // make sure MC is multiple of 4, and NC is multiple of 8 - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + 4 - 1) / 4 * 4; - // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; - - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + 8 - 1) / 8 * 8; - // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; - - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - packedC = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); - zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - - for (int l = 0; l < KC; ++l) { - zero[l] = 0; - } - - int mc, nc; - for (int j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); - PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB); - for (int i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); - InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc, - relu); - } - } - - paddle_mobile::memory::Free(packedA); - paddle_mobile::memory::Free(packedB); - paddle_mobile::memory::Free(packedC); - paddle_mobile::memory::Free(zero); -} - -void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu, float *new_scale, float *new_bias) { - // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) - // L2 cache is 0.5~4 Mib (Contex-A72 cluster) - int L1 = 30 * 1024; - int L2 = 1 * 1024 * 1024; - - KC = k; - MC = L2 / (2 * KC * sizeof(float)); - NC = MC; - - // make sure MC is multiple of 4, and NC is multiple of 8 - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + 4 - 1) / 4 * 4; - // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; - - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + 8 - 1) / 8 * 8; - // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; - - packedA = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); - packedB = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); - packedC = static_cast( - paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); - zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - - for (int l = 0; l < KC; ++l) { - zero[l] = 0; - } - - int mc, nc; - for (int j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); - PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, 
packedB); - for (int i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA); - InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC, - &C(i, j), ldc, relu, new_scale + i, new_bias + i); - } - } - - paddle_mobile::memory::Free(packedA); - paddle_mobile::memory::Free(packedB); - paddle_mobile::memory::Free(packedC); - paddle_mobile::memory::Free(zero); -} - -void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc, - bool relu) { - float *bufferC = static_cast(memory::Alloc(sizeof(float) * n)); - - const float *a0, *b0, *b1, *b2, *b3; - float *c0, *C0; - - int volatile kc1 = k / 4; - int volatile kc2 = k % 4; - int volatile nc1 = n / 16; - int _nc1 = n % 16; - int volatile nc2 = _nc1 / 4; - int volatile nc3 = _nc1 % 4; - for (int i = 0; i < kc1; i++) { - a0 = A + i * 4; - b0 = B + i * 4 * ldb; - b1 = b0 + ldb; - b2 = b1 + ldb; - b3 = b2 + ldb; - c0 = bufferC; - asm volatile( - "pld [%[a0], #16] \n\t" - "vld1.32 {q0}, [%[a0]] \n\t" - - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - - "cmp %[i], #0 \n\t" - "beq i_eq0_%= \n\t" - "bne i_ne0_%= \n\t" - - "i_eq0_%=: \n\t" - "vmov.f32 q10, #0.0 \n\t" - "vmov.f32 q11, #0.0 \n\t" - "vmov.f32 q12, #0.0 \n\t" - "vmov.f32 q13, #0.0 \n\t" - "b gemm_nc1_%= \n\t" - - "i_ne0_%=: \n\t" - "pld [%[c0], #64] \n\t" - "vld1.32 {q10, q11}, [%[c0]]! \n\t" - "vld1.32 {q12, q13}, [%[c0]] \n\t" - "sub %[c0], %[c0], #32 \n\t" - - "gemm_nc1_%=: \n\t" - "pld [%[b0], #64] \n\t" - "vld1.32 {q2, q3}, [%[b0]]! \n\t" - "vld1.32 {q4, q5}, [%[b0]]! \n\t" - "vmla.f32 q10, q2, d0[0] \n\t" - "vmla.f32 q11, q3, d0[0] \n\t" - "vmla.f32 q12, q4, d0[0] \n\t" - "vmla.f32 q13, q5, d0[0] \n\t" + "gemm_nc1_%=: \n\t" + "pld [%[b0], #64] \n\t" + "vld1.32 {q2, q3}, [%[b0]]! \n\t" + "vld1.32 {q4, q5}, [%[b0]]! \n\t" + "vmla.f32 q10, q2, d0[0] \n\t" + "vmla.f32 q11, q3, d0[0] \n\t" + "vmla.f32 q12, q4, d0[0] \n\t" + "vmla.f32 q13, q5, d0[0] \n\t" "pld [%[b1], #64] \n\t" "vld1.32 {q2, q3}, [%[b1]]! 
\n\t" @@ -905,6 +1082,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias); } } +*/ void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) { const float *a_ptr, *b_ptr; @@ -1214,6 +1392,21 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { // C = A * B, batchnorm(C) void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, float *bias) { + if (nc < 4) { + for (int i = 0; i < mc; ++i) { + for (int j = 0; j < nc; ++j) { + *C = (*c) * (*scale) + (*bias); + C++; + c++; + } + C += (ldc - nc); + c += (NC - nc); + scale++; + bias++; + } + return; + } + int volatile nc1 = nc / 16; int _nc1 = nc % 16; int volatile nc2 = _nc1 / 4; @@ -1300,6 +1493,24 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *scale, // C = A * B, batchnorm(C), relu(C) void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, float *bias) { + if (nc < 4) { + for (int i = 0; i < mc; ++i) { + for (int j = 0; j < nc; ++j) { + *C = (*c) * (*scale) + (*bias); + if (*C < 0) { + *C = 0; + } + C++; + c++; + } + C += (ldc - nc); + c += (NC - nc); + scale++; + bias++; + } + return; + } + int nc1 = nc / 16; int _nc1 = nc % 16; int nc2 = _nc1 / 4; @@ -1390,282 +1601,644 @@ void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *scale, "q8", "q10", "q11", "q12", "q13", "q14"); } -// C = A * B -void VecWriteBasic(int n, float *c, float *C, int ldc) { - int nc1 = n / 16; - int _nc1 = n % 16; - int nc2 = _nc1 / 4; - int nc3 = 16 - 4 * (_nc1 % 4); + /* + // C = A * B + void VecWriteBasic(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); - asm volatile( - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vst1.32 {q0, q1}, [%[C]]! \n\t" + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vst1.32 {q0, q1}, [%[C]]! \n\t" - "vld1.32 {q2, q3}, [%[c]]! \n\t" - "vst1.32 {q2, q3}, [%[C]]! \n\t" + "vld1.32 {q2, q3}, [%[c]]! \n\t" + "vst1.32 {q2, q3}, [%[C]]! \n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" - "vld1.32 {q4}, [%[c]]! \n\t" - "vst1.32 {q4}, [%[C]]! \n\t" + "vld1.32 {q4}, [%[c]]! \n\t" + "vst1.32 {q4}, [%[C]]! \n\t" - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" - "sub %[c], %[c], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" - "vld1.32 {q5}, [%[c]]! \n\t" - "vst1.32 {q5}, [%[C]]! \n\t" - "end_nc3_%=: \n\t" + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + "sub %[c], %[c], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + "vld1.32 {q5}, [%[c]]! \n\t" + "vst1.32 {q5}, [%[C]]! 
\n\t" + "end_nc3_%=: \n\t" - : - : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5"); -} + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5"); + } -// C = alpha * A * B + beta * C -void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} + // C = alpha * A * B + beta * C + void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {} -// C = A * B + C -void VecWriteWithAdd(int n, float *c, float *C, int ldc) { - int nc1 = n / 16; - int _nc1 = n % 16; + // C = A * B + C + void VecWriteWithAdd(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; - asm volatile( - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vld1.32 {q2, q3}, [%[C]] \n\t" - "vadd.f32 q10, q0, q2 \n\t" - "vadd.f32 q11, q1, q3 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[C]] \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" - "vld1.32 {q4, q5}, [%[c]]! \n\t" - "vld1.32 {q6, q7}, [%[C]] \n\t" - "vadd.f32 q12, q4, q6 \n\t" - "vadd.f32 q13, q5, q7 \n\t" - "vst1.32 {q12, q13}, [%[C]]! \n\t" + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[C]] \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" - : [C] "+r"(C), [c] "+r"(c) - : [nc1] "r"(nc1) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", - "q12", "q13"); + : [C] "+r"(C), [c] "+r"(c) + : [nc1] "r"(nc1) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", + "q11", "q12", "q13"); - if (_nc1 != 0) { - for (int j = 0; j < _nc1; j++) { - *C++ += *c++; + if (_nc1 != 0) { + for (int j = 0; j < _nc1; j++) { + *C++ += *c++; + } } } -} -// C = A * B + C, relu(C) -void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { - int nc1 = n / 16; - int _nc1 = n % 16; + // C = A * B + C, relu(C) + void VecWriteWithAddRelu(int n, float *c, float *C, int ldc) { + int nc1 = n / 16; + int _nc1 = n % 16; - asm volatile( - "vmov.f32 q14, #0.0 \n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vld1.32 {q2, q3}, [%[C]] \n\t" - "vadd.f32 q10, q0, q2 \n\t" - "vadd.f32 q11, q1, q3 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[C]] \n\t" + "vadd.f32 q10, q0, q2 \n\t" + "vadd.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" - "vld1.32 {q4, q5}, [%[c]]! \n\t" - "vld1.32 {q6, q7}, [%[C]] \n\t" - "vadd.f32 q12, q4, q6 \n\t" - "vadd.f32 q13, q5, q7 \n\t" - "vmax.f32 q12, q12, q14 \n\t" - "vmax.f32 q13, q13, q14 \n\t" - "vst1.32 {q12, q13}, [%[C]]! \n\t" + "vld1.32 {q4, q5}, [%[c]]! 
\n\t" + "vld1.32 {q6, q7}, [%[C]] \n\t" + "vadd.f32 q12, q4, q6 \n\t" + "vadd.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" - : [C] "+r"(C), [c] "+r"(c) - : [nc1] "r"(nc1) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", - "q12", "q13"); + : [C] "+r"(C), [c] "+r"(c) + : [nc1] "r"(nc1) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", + "q11", "q12", "q13"); - if (_nc1 != 0) { - for (int j = 0; j < _nc1; j++) { - *C += *c; - if (*C < 0) { - *C = 0; + if (_nc1 != 0) { + for (int j = 0; j < _nc1; j++) { + *C += *c; + if (*C < 0) { + *C = 0; + } + C++; + c++; } - C++; - c++; } } + + // C = A * B, batchnorm(C) + void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! \n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vst1.32 {q12, q13}, [%[C]]! \n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] + "r"(nc3), [scale] "r"(scale), [bias] "r"(bias) : "memory", "q0", "q1", "q2", + "q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13"); + } + + // C = A * B, batchnorm(C), relu(C) + void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale, + float *bias) { + int nc1 = n / 16; + int _nc1 = n % 16; + int nc2 = _nc1 / 4; + int nc3 = 16 - 4 * (_nc1 % 4); + + asm volatile( + "vmov.f32 q14, #0.0 \n\t" + "subs %[nc1], %[nc1], #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c]]! \n\t" + "vld1.32 {q2, q3}, [%[scale]]! \n\t" + "vld1.32 {q10, q11}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q2 \n\t" + "vmla.f32 q11, q1, q3 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vmax.f32 q11, q11, q14 \n\t" + "vst1.32 {q10, q11}, [%[C]]! \n\t" + + "vld1.32 {q4, q5}, [%[c]]! \n\t" + "vld1.32 {q6, q7}, [%[scale]]! \n\t" + "vld1.32 {q12, q13}, [%[bias]]! \n\t" + "vmla.f32 q12, q4, q6 \n\t" + "vmla.f32 q13, q5, q7 \n\t" + "vmax.f32 q12, q12, q14 \n\t" + "vmax.f32 q13, q13, q14 \n\t" + "vst1.32 {q12, q13}, [%[C]]! 
\n\t" + + "subs %[nc1], %[nc1], #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "blt end_nc2_%= \n\t" + "loop_nc2_%=: \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + + "subs %[nc2], %[nc2], #1 \n\t" + "bge loop_nc2_%= \n\t" + "end_nc2_%=: \n\t" + + "cmp %[nc3], #16 \n\t" + "beq end_nc3_%= \n\t" + + "sub %[c], %[c], %[nc3] \n\t" + "sub %[scale], %[scale], %[nc3] \n\t" + "sub %[bias], %[bias], %[nc3] \n\t" + "sub %[C], %[C], %[nc3] \n\t" + + "vld1.32 {q0}, [%[c]]! \n\t" + "vld1.32 {q1}, [%[scale]]! \n\t" + "vld1.32 {q10}, [%[bias]]! \n\t" + "vmla.f32 q10, q0, q1 \n\t" + "vmax.f32 q10, q10, q14 \n\t" + "vst1.32 {q10}, [%[C]]! \n\t" + "end_nc3_%=: \n\t" + + : + : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] + "r"(nc3), [scale] "r"(scale), [bias] "r"(bias) : "memory", "q0", "q1", "q2", + "q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13", "q14"); + } + */ + +#endif // __aarch64__ +#else + +void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) { + float *c0, *c1, *c2, *c3; + c0 = c; + c1 = c + ldc; + c2 = c + 2 * ldc; + c3 = c + 3 * ldc; + for (int p = 0; p < k; p += 1) { + // first row + c0[0] += a[0] * b[0]; + c0[1] += a[0] * b[1]; + c0[2] += a[0] * b[2]; + c0[3] += a[0] * b[3]; + + // second row + c1[0] += a[1] * b[0]; + c1[1] += a[1] * b[1]; + c1[2] += a[1] * b[2]; + c1[3] += a[1] * b[3]; + + // third row + c2[0] += a[2] * b[0]; + c2[1] += a[2] * b[1]; + c2[2] += a[2] * b[2]; + c2[3] += a[2] * b[3]; + + // fourth row + c3[0] += a[3] * b[0]; + c3[1] += a[3] * b[1]; + c3[2] += a[3] * b[2]; + c3[3] += a[3] * b[3]; + + a += 4; + b += 4; + } } -// C = A * B, batchnorm(C) -void VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale, - float *bias) { - int nc1 = n / 16; - int _nc1 = n % 16; - int nc2 = _nc1 / 4; - int nc3 = 16 - 4 * (_nc1 % 4); +#endif // __ARM_NEON - asm volatile( - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" +// 32位 float 矩阵乘法 +void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, bool relu) { + // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) + // L2 cache is 0.5~4 Mib (Contex-A72 cluster) + int L1 = 32 * 1024; + int L2 = 0.5 * 1024 * 1024; - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vld1.32 {q2, q3}, [%[scale]]! \n\t" - "vld1.32 {q10, q11}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q2 \n\t" - "vmla.f32 q11, q1, q3 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" + KC = k; + MC = L1 / (KC * sizeof(float)); + NC = L2 / (KC * sizeof(float)); - "vld1.32 {q4, q5}, [%[c]]! \n\t" - "vld1.32 {q6, q7}, [%[scale]]! \n\t" - "vld1.32 {q12, q13}, [%[bias]]! \n\t" - "vmla.f32 q12, q4, q6 \n\t" - "vmla.f32 q13, q5, q7 \n\t" - "vst1.32 {q12, q13}, [%[C]]! 
\n\t" + // make sure MC is multiple of MR, and NC is multiple of NR + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + MR - 1) / MR * MR; + // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + NR - 1) / NR * NR; + // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" + packedA = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); + packedB = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); + packedC = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); - "vld1.32 {q0}, [%[c]]! \n\t" - "vld1.32 {q1}, [%[scale]]! \n\t" - "vld1.32 {q10}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q1 \n\t" - "vst1.32 {q10}, [%[C]]! \n\t" + for (int l = 0; l < KC; ++l) { + zero[l] = 0; + } - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" + int mc, nc; + for (int j = 0; j < n; j += NC) { + nc = s_min(n - j, NC); + PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB); + for (int i = 0; i < m; i += MC) { + mc = s_min(m - i, MC); + PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); + InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc, + relu); + } + } - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" + paddle_mobile::memory::Free(packedA); + paddle_mobile::memory::Free(packedB); + paddle_mobile::memory::Free(packedC); + paddle_mobile::memory::Free(zero); +} - "sub %[c], %[c], %[nc3] \n\t" - "sub %[scale], %[scale], %[nc3] \n\t" - "sub %[bias], %[bias], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" +void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, int ldc, + bool relu, float *new_scale, float *new_bias) { + // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) + // L2 cache is 0.5~4 Mib (Contex-A72 cluster) + int L1 = 32 * 1024; + int L2 = 0.5 * 1024 * 1024; - "vld1.32 {q0}, [%[c]]! \n\t" - "vld1.32 {q1}, [%[scale]]! \n\t" - "vld1.32 {q10}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q1 \n\t" - "vst1.32 {q10}, [%[C]]! 
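The rewritten Sgemm above derives its block sizes from the cache hierarchy rather than a fixed ratio: KC spans the whole k dimension, MC is sized so an MC x KC panel of packed A fits in L1, and NC so a KC x NC panel of packed B fits in L2, each rounded to whole register tiles. A sketch of that arithmetic under the same cache-size assumptions the comments name (the real code additionally rebalances the block sizes across the actual m and n):

```c++
#include <algorithm>

constexpr int MR = 6;  // rows per register tile, matching the new gemm.h
constexpr int NR = 8;  // columns per register tile

int RoundUp(int x, int m) { return (x + m - 1) / m * m; }

// Sketch of the cache-blocking arithmetic in Sgemm/SgemmWithBn above,
// assuming k > 0 and the 32 KiB L1 / 0.5 MiB L2 figures from the comments.
void ChooseBlockSizes(int k, int *KC, int *MC, int *NC) {
  const int L1 = 32 * 1024;   // per-core L1 data cache, bytes
  const int L2 = 512 * 1024;  // conservative L2 size, bytes
  *KC = k;
  *MC = std::max(
      MR, RoundUp(L1 / (*KC * static_cast<int>(sizeof(float))), MR));
  *NC = std::max(
      NR, RoundUp(L2 / (*KC * static_cast<int>(sizeof(float))), NR));
}
```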
\n\t" - "end_nc3_%=: \n\t" + KC = k; + MC = L1 / (KC * sizeof(float)); + NC = L2 / (KC * sizeof(float)); - : - : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3), - [scale] "r"(scale), [bias] "r"(bias) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", - "q12", "q13"); + // make sure MC is multiple of MR, and NC is multiple of NR + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + MR - 1) / MR * MR; + // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; + + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + NR - 1) / NR * NR; + // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; + + packedA = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * KC)); + packedB = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * KC * NC)); + packedC = static_cast( + paddle_mobile::memory::Alloc(sizeof(float) * MC * NC)); + zero = static_cast(paddle_mobile::memory::Alloc(sizeof(float) * KC)); + + for (int l = 0; l < KC; ++l) { + zero[l] = 0; + } + + int mc, nc; + for (int j = 0; j < n; j += NC) { + nc = s_min(n - j, NC); + PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB); + for (int i = 0; i < m; i += MC) { + mc = s_min(m - i, MC); + PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA); + InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC, + &C(i, j), ldc, relu, new_scale + i, new_bias + i); + } + } + + paddle_mobile::memory::Free(packedA); + paddle_mobile::memory::Free(packedB); + paddle_mobile::memory::Free(packedC); + paddle_mobile::memory::Free(zero); } -// C = A * B, batchnorm(C), relu(C) -void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *scale, - float *bias) { - int nc1 = n / 16; - int _nc1 = n % 16; - int nc2 = _nc1 / 4; - int nc3 = 16 - 4 * (_nc1 % 4); +void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { +#if __ARM_NEON +#if __aarch64__ - asm volatile( - "vmov.f32 q14, #0.0 \n\t" - "subs %[nc1], %[nc1], #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" + // init C + float32x4_t cv0 = vdupq_n_f32(0.0); + float32x4_t cv1 = vdupq_n_f32(0.0); + float32x4_t cv2 = vdupq_n_f32(0.0); + float32x4_t cv3 = vdupq_n_f32(0.0); + float32x4_t cv4 = vdupq_n_f32(0.0); + float32x4_t cv5 = vdupq_n_f32(0.0); + float32x4_t cv6 = vdupq_n_f32(0.0); + float32x4_t cv7 = vdupq_n_f32(0.0); + float32x4_t cv8 = vdupq_n_f32(0.0); + float32x4_t cv9 = vdupq_n_f32(0.0); + float32x4_t cv10 = vdupq_n_f32(0.0); + float32x4_t cv11 = vdupq_n_f32(0.0); - "vld1.32 {q0, q1}, [%[c]]! \n\t" - "vld1.32 {q2, q3}, [%[scale]]! \n\t" - "vld1.32 {q10, q11}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q2 \n\t" - "vmla.f32 q11, q1, q3 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vmax.f32 q11, q11, q14 \n\t" - "vst1.32 {q10, q11}, [%[C]]! \n\t" + float32x4_t av; + float32x4_t bv0; + float32x4_t bv1; - "vld1.32 {q4, q5}, [%[c]]! \n\t" - "vld1.32 {q6, q7}, [%[scale]]! \n\t" - "vld1.32 {q12, q13}, [%[bias]]! \n\t" - "vmla.f32 q12, q4, q6 \n\t" - "vmla.f32 q13, q5, q7 \n\t" - "vmax.f32 q12, q12, q14 \n\t" - "vmax.f32 q13, q13, q14 \n\t" - "vst1.32 {q12, q13}, [%[C]]! 
\n\t" + float32x2_t av01; + float32x2_t av23; + float32x2_t av45; - "subs %[nc1], %[nc1], #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" + for (int p = 0; p < k; p += 1) { + av = vld1q_f32(a); + av01 = vget_low_f32(av); + av23 = vget_high_f32(av); + av45 = vld1_f32(a + 4); + bv0 = vld1q_f32(b); + bv1 = vld1q_f32(b + 4); - "subs %[nc2], %[nc2], #1 \n\t" - "blt end_nc2_%= \n\t" - "loop_nc2_%=: \n\t" + cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0); + cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0); + cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1); + cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1); - "vld1.32 {q0}, [%[c]]! \n\t" - "vld1.32 {q1}, [%[scale]]! \n\t" - "vld1.32 {q10}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q1 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vst1.32 {q10}, [%[C]]! \n\t" + cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0); + cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0); + cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1); + cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1); - "subs %[nc2], %[nc2], #1 \n\t" - "bge loop_nc2_%= \n\t" - "end_nc2_%=: \n\t" + cv8 = vmlaq_lane_f32(cv8, bv0, av45, 0); + cv9 = vmlaq_lane_f32(cv9, bv1, av45, 0); + cv10 = vmlaq_lane_f32(cv10, bv0, av45, 1); + cv11 = vmlaq_lane_f32(cv11, bv1, av45, 1); - "cmp %[nc3], #16 \n\t" - "beq end_nc3_%= \n\t" + a += MR; + b += NR; + } - "sub %[c], %[c], %[nc3] \n\t" - "sub %[scale], %[scale], %[nc3] \n\t" - "sub %[bias], %[bias], %[nc3] \n\t" - "sub %[C], %[C], %[nc3] \n\t" + vst1q_f32(c, cv0); + vst1q_f32(c + 4, cv1); + vst1q_f32(c + ldc, cv2); + vst1q_f32(c + ldc + 4, cv3); + vst1q_f32(c + 2 * ldc, cv4); + vst1q_f32(c + 2 * ldc + 4, cv5); + vst1q_f32(c + 3 * ldc, cv6); + vst1q_f32(c + 3 * ldc + 4, cv7); + vst1q_f32(c + 4 * ldc, cv8); + vst1q_f32(c + 4 * ldc + 4, cv9); + vst1q_f32(c + 5 * ldc, cv10); + vst1q_f32(c + 5 * ldc + 4, cv11); - "vld1.32 {q0}, [%[c]]! \n\t" - "vld1.32 {q1}, [%[scale]]! \n\t" - "vld1.32 {q10}, [%[bias]]! \n\t" - "vmla.f32 q10, q0, q1 \n\t" - "vmax.f32 q10, q10, q14 \n\t" - "vst1.32 {q10}, [%[C]]! \n\t" - "end_nc3_%=: \n\t" +#else + + const float *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int kc1 = k / 4; + int kc2 = k % 4; + int step = 4 * ldc; + asm volatile( + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #64] \n\t" + + "vmov.f32 q4, #0.0 \n\t" + "vmov.f32 q5, #0.0 \n\t" + "vmov.f32 q6, #0.0 \n\t" + "vmov.f32 q7, #0.0 \n\t" + "vmov.f32 q8, #0.0 \n\t" + "vmov.f32 q9, #0.0 \n\t" + "vmov.f32 q10, #0.0 \n\t" + "vmov.f32 q11, #0.0 \n\t" + "vmov.f32 q12, #0.0 \n\t" + "vmov.f32 q13, #0.0 \n\t" + "vmov.f32 q14, #0.0 \n\t" + "vmov.f32 q15, #0.0 \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "blt end_kc1_%= \n\t" + "loop_kc1_%=: \n\t" + + // "pld [%[a_ptr], #128] \n\t" + // "pld [%[b_ptr], #128] \n\t" + // "pld [%[a_ptr], #192] \n\t" + // "pld [%[b_ptr], #192] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! 
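AddDot6x8 above is the new 6x8 micro-kernel that replaces the 4x8 one: each step of the k loop consumes 6 packed values of A and 8 packed values of B and accumulates the full 6x8 outer product, so all 48 partial sums live in registers (cv0..cv11 on aarch64, q4..q15 in the 32-bit assembly, which also unrolls k by four). A scalar reference, as a sketch:

```c++
// Scalar reference for the 6x8 micro-kernel above (a sketch): accumulate
// k rank-1 updates of the 6x8 tile from packed panels, then store the tile
// into C with row stride ldc. Like the NEON version, it overwrites C rather
// than accumulating into it; any add/bn/relu combining happens at
// write-back.
void AddDot6x8Ref(int k, const float *a, const float *b, float *c, int ldc) {
  float acc[6][8] = {};
  for (int p = 0; p < k; ++p) {
    for (int i = 0; i < 6; ++i) {
      for (int j = 0; j < 8; ++j) {
        acc[i][j] += a[i] * b[j];
      }
    }
    a += 6;  // MR: packed A advances one 6-row column per step
    b += 8;  // NR: packed B advances one 8-column row per step
  }
  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 8; ++j) {
      c[i * ldc + j] = acc[i][j];
    }
  }
}
```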
\n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge loop_kc1_%= \n\t" + "end_kc1_%=: \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "blt end_kc2_%= \n\t" + "loop_kc2_%=: \n\t" + + "vld1.32 {d0-d2}, [%[a_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[b_ptr]]! \n\t" + + "vmla.f32 q4, q2, d0[0] \n\t" + "vmla.f32 q5, q3, d0[0] \n\t" + "vmla.f32 q6, q2, d0[1] \n\t" + "vmla.f32 q7, q3, d0[1] \n\t" + "vmla.f32 q8, q2, d1[0] \n\t" + "vmla.f32 q9, q3, d1[0] \n\t" + "vmla.f32 q10, q2, d1[1] \n\t" + "vmla.f32 q11, q3, d1[1] \n\t" + "vmla.f32 q12, q2, d2[0] \n\t" + "vmla.f32 q13, q3, d2[0] \n\t" + "vmla.f32 q14, q2, d2[1] \n\t" + "vmla.f32 q15, q3, d2[1] \n\t" + + "subs %[kc2], %[kc2], #1 \n\t" + "bge loop_kc2_%= \n\t" + "end_kc2_%=: \n\t" + + "mov r5, %[c] \n\t" + "mov r6, %[step] \n\t" + "vst1.32 {q4, q5}, [r5], r6 \n\t" + "vst1.32 {q6, q7}, [r5], r6 \n\t" + "vst1.32 {q8, q9}, [r5], r6 \n\t" + "vst1.32 {q10, q11}, [r5], r6 \n\t" + "vst1.32 {q12, q13}, [r5], r6 \n\t" + "vst1.32 {q14, q15}, [r5] \n\t" : - : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3), - [scale] "r"(scale), [bias] "r"(bias) - : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", - "q12", "q13", "q14"); + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc2] "r"(kc2), [step] "r"(step) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + +#endif // __aarch64__ +#else + +#endif // __ARM_NEON } +} // namespace math } // namespace operators } // namespace paddle_mobile -} // namespace paddle_mobile diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index b4bce43c7a29fba09ade7512cbc660f0ac2888ab..2044c264ed1c0f8624690874ed248661a753804c 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -19,7 +19,7 @@ limitations under the License. */ #define B(i, j) B[(i)*ldb + (j)] #define C(i, j) C[(i)*ldc + (j)] -#define MR 4 +#define MR 6 #define NR 8 #define s_min(i, j) ((i) < (j) ? 
(i) : (j)) @@ -28,6 +28,7 @@ namespace paddle_mobile { namespace operators { namespace math { +/* // 将 A 矩阵分块复制到连续内存(ColMajor) void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, float *buffer); @@ -35,14 +36,17 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, // 将 B 矩阵分块复制到连续内存(ColMajor) void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, float *buffer); +*/ // 将 A 矩阵分块复制到连续内存(RowMajor) -void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, - float *buffer); +void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda, + float *buffer); +void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda, + float *buffer); // 将 B 矩阵分块复制到连续内存(RowMajor) -void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, - float *buffer); +void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb, + float *buffer); // 分块矩阵乘法 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, @@ -51,7 +55,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, const float *b, float beta, float *c, float *C, int ldc, bool relu, float *new_scale, float *new_bias); - +/* // 向量矩阵乘法 (M = 1) void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, @@ -60,10 +64,12 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu, float *new_scale, float *new_bias); +*/ // 计算一个更小的 C 矩阵分块 void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); +void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc); // 分块矩阵乘法结果回写 // C = A * B @@ -81,6 +87,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *new_scale, float *new_bias); +/* // 向量矩阵乘法结果回写 // C = A * B void VecWriteBasic(int n, float *c, float *C, int ldc); @@ -96,6 +103,7 @@ void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale, // C = A * B, batchnorm(C), relu(C) void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale, float *new_bias); +*/ // 32位 float 矩阵乘法 void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp index 625d120705aab8fcc3ea8d232b4077e213941ec4..7b0b974b542a83d381727128887bef8a48ce937f 100644 --- a/src/operators/math/im2col.cpp +++ b/src/operators/math/im2col.cpp @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "operators/math/im2col.h" #include #ifdef __ARM_NEON -#include "arm_neon.h" +#include #endif #include "common/types.h" namespace paddle_mobile { @@ -69,7 +69,7 @@ class Im2ColFunctor { int channels_col = im_channels * filter_height * filter_width; const T *im_data = im.data(); T *col_data = col->data(); -#ifdef __ARM_NEON +#if __ARM_NEON const int osize = col_height; const int isize = im_height; bool pad1 = padding[0] > 0; diff --git a/src/operators/math/pool_2x2.cpp b/src/operators/math/pool_2x2.cpp index c86003f6f96b632efd50bbb156293510e3d8521c..0a2d96d4d065d7938e6872b4f073e080d7be8c3a 100644 --- a/src/operators/math/pool_2x2.cpp +++ b/src/operators/math/pool_2x2.cpp @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef POOL_OP -#include "pool_2x2.h" +#include "operators/math/pool_2x2.h" +#include +#include namespace paddle_mobile { namespace operators { @@ -21,10 +23,10 @@ namespace math { void Pool2x2Max(vector strides, vector paddings, const Tensor *input, Tensor *output) { -#ifdef __ARM_NEON - -#ifdef ARMV7 +#if __ARM_NEON +#if __aarch64__ +#else const int batch_size = input->dims()[0]; const int input_height = input->dims()[2]; @@ -93,15 +95,16 @@ void Pool2x2Max(vector strides, vector paddings, const Tensor *input, output_data += output_batch_stride; } #endif - +#else #endif } void Pool2x2Avg(vector strides, vector paddings, const Tensor *input, Tensor *output) { -#ifdef __ARM_NEON +#if __ARM_NEON -#ifdef ARMV7 +#if __aarch64__ +#else const int batch_size = input->dims()[0]; const int input_height = input->dims()[2]; @@ -171,12 +174,9 @@ void Pool2x2Avg(vector strides, vector paddings, const Tensor *input, input_data += input_batch_stride; output_data += output_batch_stride; } -#else - -// TODO(): to imp other asm #endif - +#else #endif } diff --git a/src/operators/math/pool_3x3.cpp b/src/operators/math/pool_3x3.cpp index 28a8877355b2c2cc1221512884b5be1497bc4243..28547b71fca6caea2ff4341b3f832c0035436a72 100644 --- a/src/operators/math/pool_3x3.cpp +++ b/src/operators/math/pool_3x3.cpp @@ -17,7 +17,7 @@ limitations under the License. 
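The pooling hunks above converge on one guard layout: NEON fast paths compile only for 32-bit ARM for now, the aarch64 branches are left empty (the callers fall back to the generic PoolBasic loop), and non-NEON builds take a plain C++ path. The skeleton, as a sketch:

```c++
// Guard skeleton shared by the Pool2x2/Pool3x3 hunks above (a sketch).
void PoolDispatchSkeleton() {
#if __ARM_NEON
#if __aarch64__
  // Intentionally empty: no aarch64 intrinsics yet, so the caller's
  // generic path handles this build.
#else
  // 32-bit ARM NEON implementation lives here.
#endif
#else
  // Plain C++ fallback for non-ARM builds.
#endif
}
```

Switching from `#ifdef ARMV7` to `#if __aarch64__` keys the split off a compiler-defined macro instead of a build flag that was easy to forget.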
*/ #include #endif #include "framework/tensor.h" -#include "pool_3x3.h" +#include "operators/math/pool_3x3.h" #if __ARM_NEON #include #endif // __ARM_NEON @@ -518,6 +518,8 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) { input_data += input_batch_stride; out_data += output_batch_stride; } +#else + #endif } @@ -582,7 +584,18 @@ void Pool3x3Max(vector strides, vector paddings, const Tensor *input, } output_seg[ph * output_width + pw] = max_value; } else { -#if defined(ARMV7) +#if __aarch64__ + const float32x4_t data1 = vld1q_f32(pos1); + const float32x4_t data2 = vld1q_f32(pos1 + input_width); + const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width); + const float32x4_t max_data = + vmaxq_f32(vmaxq_f32(data1, data2), data3); + float32x2_t res = + vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)), + vget_low_f32(max_data)); + res = vpmax_f32(res, res); + output_seg[ph * output_width + pw] = vget_lane_f32(res, 0); +#else asm volatile( "vld1.32 {q1}, [%[pos1]] \n\t" "vld1.32 {q2}, [%[pos2]] \n\t" @@ -598,17 +611,6 @@ void Pool3x3Max(vector strides, vector paddings, const Tensor *input, [pos2] "r"(pos2), [pos3] "r"(pos3), [output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max) : "memory", "q1", "q2", "q3", "q4"); -#else - const float32x4_t data1 = vld1q_f32(pos1); - const float32x4_t data2 = vld1q_f32(pos1 + input_width); - const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width); - const float32x4_t max_data = - vmaxq_f32(vmaxq_f32(data1, data2), data3); - float32x2_t res = - vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)), - vget_low_f32(max_data)); - res = vpmax_f32(res, res); - output_seg[ph * output_width + pw] = vget_lane_f32(res, 0); #endif } } @@ -676,8 +678,8 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, } output_seg[ph * output_width + pw] = sum / 9.0; } else { -#if defined(ARMV7) - +#if __aarch64__ +#else asm volatile( "vld1.32 {q1}, [%[pos1]] \n\t" "vld1.32 {q2}, [%[pos2]] \n\t" @@ -696,7 +698,7 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, [output_ptr] "r"(output_ptr), [zero] "r"(zero), [nine_ptr] "r"(nine_ptr) : "memory", "r6", "q1", "q2", "q3", "q4"); -#else +#endif const float32x4_t data1 = vld1q_f32(pos1); const float32x4_t data2 = vld1q_f32(pos2); const float32x4_t data3 = vld1q_f32(pos3); @@ -707,7 +709,6 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, vget_low_f32(sum_data)); res = vpadd_f32(res, res); output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0; -#endif } } } @@ -715,6 +716,7 @@ void Pool3x3Avg(vector strides, vector paddings, const Tensor *input, input_data += input_batch_stride; output_data += output_batch_stride; } +#else #endif } } // namespace math diff --git a/src/operators/math/softmax.cpp b/src/operators/math/softmax.cpp index 968915f21e08fce9f25ceb63831ee40ecba9cee6..dba88c93969014f2ad0d2636b4141c734dbc2ed5 100644 --- a/src/operators/math/softmax.cpp +++ b/src/operators/math/softmax.cpp @@ -135,6 +135,7 @@ class SoftmaxFuntor { } } } +#else #endif // ARM_NEON public: diff --git a/src/operators/mul_op.cpp b/src/operators/mul_op.cpp index 60e0c087383388c83ca1711c057af822a6e2a730..044da7012eccde57a87d417f4f3c00b82e01da42 100644 --- a/src/operators/mul_op.cpp +++ b/src/operators/mul_op.cpp @@ -50,7 +50,7 @@ void MulOp::InferShape() const { framework::DDim ddim = framework::make_ddim(output_dims); this->param_.Out()->Resize(ddim); } -template class MulOp; + } // namespace operators } // namespace paddle_mobile diff 
--git a/src/operators/multiclass_nms_op.cpp b/src/operators/multiclass_nms_op.cpp index eea625469ec030e0c7d62baea8312e11f1308ce2..4324cab35298a45ece7e375299909994648a27a4 100644 --- a/src/operators/multiclass_nms_op.cpp +++ b/src/operators/multiclass_nms_op.cpp @@ -34,7 +34,7 @@ void MultiClassNMSOp::InferShape() const { // pre size, will change in Compute. this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6})); } -template class MultiClassNMSOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 24e3699f2fc8947ee23341ddcefcb219c6f8df03..e2795b3aefe3c67df9b51c882298a717a388ae15 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -232,7 +232,6 @@ class ConvParam : OpParam { Print &operator<<(Print &printer, const ConvParam &conv_param); #endif -#ifdef ELEMENTWISEADD_OP class ElementwiseAddParam : OpParam { public: ElementwiseAddParam(const VariableNameMap &inputs, @@ -259,6 +258,8 @@ class ElementwiseAddParam : OpParam { int axis_; }; +#ifdef FUSION_ELEMENTWISEADDRELU_OP +using ElementwiseAddReluParam = ElementwiseAddParam; #endif #ifdef MUL_OP @@ -371,7 +372,7 @@ class BatchNormParam : OpParam { input_variance_ = InputVarianceFrom(inputs, scope); epsilon_ = GetAttr("epsilon", attrs); momentum_ = GetAttr("momentum", attrs); - is_test_ = GetAttr("is_test", attrs); + // is_test_ = GetAttr("is_test", attrs); } const Tensor *InputX() const { return input_x_; } @@ -421,7 +422,7 @@ class PoolParam : public OpParam { strides_ = GetAttr>("strides", attrs); paddings_ = GetAttr>("paddings", attrs); ceil_mode_ = GetAttr("ceil_mode", attrs); - gloabal_pooling_ = GetAttr("global_pooling", attrs); + global_pooling_ = GetAttr("global_pooling", attrs); } const Tensor *Input() const { return input_; } @@ -438,7 +439,7 @@ class PoolParam : public OpParam { bool isCeilMode() const { return ceil_mode_; } - bool isGlobalPooling() const { return gloabal_pooling_; } + bool isGlobalPooling() const { return global_pooling_; } private: Tensor *input_; @@ -448,9 +449,82 @@ class PoolParam : public OpParam { vector strides_; vector paddings_; bool ceil_mode_; - bool gloabal_pooling_ = false; + bool global_pooling_ = false; }; +#endif + +#ifdef FUSION_POOLBN_OP +class FusionPoolBNParam : OpParam { + public: + FusionPoolBNParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + input_ = InputXFrom(inputs, scope); + pooling_type_ = GetAttr("pooling_type", attrs); + ksize_ = GetAttr>("ksize", attrs); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + ceil_mode_ = GetAttr("ceil_mode", attrs); + global_pooling_ = GetAttr("global_pooling", attrs); + output_y_ = OutputYFrom(outputs, scope); + input_bias_ = InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); + epsilon_ = GetAttr("epsilon", attrs); + momentum_ = GetAttr("momentum", attrs); + // is_test_ = GetAttr("is_test", attrs); + } + const Tensor *Input() const { return input_; } + + const string &PoolingType() const { return pooling_type_; } + + const vector &Ksize() const { return ksize_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + bool isCeilMode() const { return ceil_mode_; } + bool isGlobalPooling() const { return global_pooling_; } + + Tensor *OutputY() 
const { return output_y_; } + + const Tensor *InputBias() const { return input_bias_; } + + const Tensor *InputMean() const { return input_mean_; } + + const Tensor *InputScale() const { return input_scale_; } + + const Tensor *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + const string &DataFormat() const { return data_format_; } + + private: + Tensor *input_; + string pooling_type_; + vector ksize_; + vector strides_; + vector paddings_; + bool ceil_mode_; + bool global_pooling_ = false; + Tensor *output_y_; + Tensor *input_bias_; + Tensor *input_mean_; + Tensor *input_scale_; + Tensor *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + string data_format_; +}; #endif #ifdef PRIORBOX_OP @@ -875,7 +949,6 @@ class PReluParam : public OpParam { }; #endif -#ifdef FUSION_FC_OP class FusionFcParam : public OpParam { public: FusionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs, @@ -911,9 +984,11 @@ class FusionFcParam : public OpParam { int y_num_col_dims_; int axis_; }; + +#ifdef FUSION_FCRELU_OP +using FusionFcReluParam = FusionFcParam; #endif -#ifdef FUSION_CONVADD_OP class FusionConvAddParam : public OpParam { public: FusionConvAddParam(const VariableNameMap &inputs, @@ -960,9 +1035,8 @@ class FusionConvAddParam : public OpParam { }; Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); -#endif -#ifdef FUSION_CONVADD_RELU_OP +#ifdef FUSION_CONVADDRELU_OP class FusionConvAddReluParam : public FusionConvAddParam { public: FusionConvAddReluParam(const VariableNameMap &inputs, @@ -993,7 +1067,7 @@ class FusionConvAddBNReluParam : public OpParam { input_variance_ = InputVarianceFrom(inputs, scope); epsilon_ = GetAttr("epsilon", attrs); momentum_ = GetAttr("momentum", attrs); - is_test_ = GetAttr("is_test", attrs); + // is_test_ = GetAttr("is_test", attrs); } Tensor *Bias() const { return bias_; } @@ -1055,8 +1129,91 @@ class FusionConvAddBNReluParam : public OpParam { Tensor *new_bias_; Tensor *new_scale_; }; +#endif -Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); +#ifdef FUSION_CONVADDBN_OP +class FusionConvAddBNParam : public OpParam { + public: + FusionConvAddBNParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + bias_ = InputYFrom(inputs, scope); + axis_ = GetAttr("axis", attrs); + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_y_ = OutputYFrom(outputs, scope); + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + input_bias_ = InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); + epsilon_ = GetAttr("epsilon", attrs); + momentum_ = GetAttr("momentum", attrs); + // is_test_ = GetAttr("is_test", attrs); + } + Tensor *Bias() const { return bias_; } + + const int &Axis() const { return axis_; } + + const Tensor *Input() const { return input_; } + + const Tensor *Filter() const { return filter_; } + + Tensor *OutputY() const { return output_y_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return 
dilations_; } + + const int &Groups() const { return groups; } + + const Tensor *InputBias() const { return input_bias_; } + + const Tensor *InputMean() const { return input_mean_; } + + const Tensor *InputScale() const { return input_scale_; } + + const Tensor *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; } + + const Tensor *NewScale() const { return new_scale_; } + + const Tensor *NewBias() const { return new_bias_; } + + protected: + Tensor *bias_; + int axis_; + Tensor *input_; + Tensor *output_y_; + Tensor *filter_; + vector strides_; + vector paddings_; + vector dilations_; + int groups; + Tensor *input_bias_; + Tensor *input_mean_; + Tensor *input_scale_; + Tensor *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + Tensor *new_bias_; + Tensor *new_scale_; +}; #endif #ifdef FUSION_DWCONVBNRELU_OP @@ -1078,7 +1235,7 @@ class FusionDWConvBNReluParam : public OpParam { input_variance_ = InputVarianceFrom(inputs, scope); epsilon_ = GetAttr("epsilon", attrs); momentum_ = GetAttr("momentum", attrs); - is_test_ = GetAttr("is_test", attrs); + // is_test_ = GetAttr("is_test", attrs); } const Tensor *Input() const { return input_; } @@ -1139,6 +1296,85 @@ class FusionDWConvBNReluParam : public OpParam { Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); #endif +#ifdef FUSION_CONVBNRELU_OP +class FusionConvBNReluParam : public OpParam { + public: + FusionConvBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + filter_ = FilterFrom(inputs, scope); + input_ = InputFrom(inputs, scope); + output_ = OutFrom(outputs, scope); + + strides_ = GetAttr>("strides", attrs); + paddings_ = GetAttr>("paddings", attrs); + dilations_ = GetAttr>("dilations", attrs); + groups = GetAttr("groups", attrs); + input_bias_ = InputBiasFrom(inputs, scope); + input_mean_ = InputMeanFrom(inputs, scope); + input_scale_ = InputScaleFrom(inputs, scope); + input_variance_ = InputVarianceFrom(inputs, scope); + epsilon_ = GetAttr("epsilon", attrs); + momentum_ = GetAttr("momentum", attrs); + // is_test_ = GetAttr("is_test", attrs); + } + + const Tensor *Input() const { return input_; } + + const Tensor *Filter() const { return filter_; } + + Tensor *Output() const { return output_; } + + const vector &Strides() const { return strides_; } + + const vector &Paddings() const { return paddings_; } + + const vector &Dilations() const { return dilations_; } + + const int &Groups() const { return groups; } + + const Tensor *InputBias() const { return input_bias_; } + + const Tensor *InputMean() const { return input_mean_; } + + const Tensor *InputScale() const { return input_scale_; } + + const Tensor *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; } + + const Tensor *NewScale() const { return new_scale_; } + + const Tensor *NewBias() const { return new_bias_; } + + protected: + Tensor *input_; + Tensor *output_; + Tensor *filter_; + 
vector strides_; + vector paddings_; + vector dilations_; + int groups; + Tensor *input_bias_; + Tensor *input_mean_; + Tensor *input_scale_; + Tensor *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + Tensor *new_bias_; + Tensor *new_scale_; +}; +#endif + #ifdef IM2SEQUENCE_OP class Im2SequenceParam : public OpParam { public: @@ -1190,5 +1426,9 @@ class DropoutParam : public OpParam { }; #endif +#ifdef REGION_OP +class RegionParam : public OpParam {}; +#endif + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/pool_op.cpp b/src/operators/pool_op.cpp index 41016d74deb5bcd7d3679b1c762467e2dc65de34..0477c88cf84054090b4c46524284fb0cdf525c0e 100644 --- a/src/operators/pool_op.cpp +++ b/src/operators/pool_op.cpp @@ -54,7 +54,7 @@ void PoolOp::InferShape() const { } this->param_.Output()->Resize(framework::make_ddim(output_shape)); } -template class PoolOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/prelu_op.cpp b/src/operators/prelu_op.cpp index e78f6b0374336a3d891a1f3e73f63c706b321ccc..245154ca5ea6971dee33e14550bf1e090fa0ec71 100644 --- a/src/operators/prelu_op.cpp +++ b/src/operators/prelu_op.cpp @@ -23,7 +23,7 @@ void PReluOp::InferShape() const { auto input_dims = this->param_.InputX()->dims(); this->param_.Out()->Resize(input_dims); } -template class PReluOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/prior_box_op.cpp b/src/operators/prior_box_op.cpp index 81ba045a209a48105ab895f7687e56ed3db44305..a05a0ddcec5ba9d442b58846468a121e9b655a6a 100644 --- a/src/operators/prior_box_op.cpp +++ b/src/operators/prior_box_op.cpp @@ -44,7 +44,7 @@ void PriorBoxOp::InferShape() const { this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec)); this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec)); } -template class PriorBoxOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/relu_op.cpp b/src/operators/relu_op.cpp index b80a56f38aec4bf1bf625d54f4115626447a654a..2a771e81e7a5a0e869984990b52b98d15036543a 100644 --- a/src/operators/relu_op.cpp +++ b/src/operators/relu_op.cpp @@ -23,7 +23,7 @@ void ReluOp::InferShape() const { auto input_dims = this->param_.InputX()->dims(); this->param_.Out()->Resize(input_dims); } -template class ReluOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/reshape_op.cpp b/src/operators/reshape_op.cpp index 193678613cc8dd2b8f9b8ae1654b0adacea09505..dcc15009af2b23129552d58b3fa22c3c67684dce 100644 --- a/src/operators/reshape_op.cpp +++ b/src/operators/reshape_op.cpp @@ -27,7 +27,7 @@ void ReshapeOp::InferShape() const { auto out_dims = ValidateShape(shape, input_x_dims); this->param_.Out()->Resize(out_dims); } -template class ReshapeOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/resize_op.cpp b/src/operators/resize_op.cpp index f378ff53f513ccf7cfb986f606378895b5af4b9f..02c50b662665fc9bd2f662922cb88dbce9fc5d53 100644 --- a/src/operators/resize_op.cpp +++ b/src/operators/resize_op.cpp @@ -24,7 +24,7 @@ void ResizeOp::InferShape() const { auto out_dims = CalOutputShape(this->param_); this->param_.Out()->Resize(out_dims); } -template class ResizeOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/scale_op.cpp b/src/operators/scale_op.cpp index c1931ed4fdc4c058c979fdceba11ea25f7d752f4..968fcd4098e92a47899c9a733c0261d91c314c29 100644 --- a/src/operators/scale_op.cpp +++ 
b/src/operators/scale_op.cpp @@ -24,7 +24,7 @@ void ScaleOp::InferShape() const { auto input_dims = this->param_.InputX()->dims(); this->param_.Out()->Resize(input_dims); } -template class ScaleOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/sigmoid_op.cpp b/src/operators/sigmoid_op.cpp index c83738b2c88c3c51ebc0d649fe134da9e44f30ea..8ea4c98942e0630f5b69133991583ee1192c8153 100644 --- a/src/operators/sigmoid_op.cpp +++ b/src/operators/sigmoid_op.cpp @@ -22,7 +22,7 @@ template void SigmoidOp::InferShape() const { this->param_.Out()->Resize(this->param_.InputX()->dims()); } -template class SigmoidOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/slice_op.cpp b/src/operators/slice_op.cpp index 6d70895fcc5edf75f73368813212f7d9177c760b..b77a675e10ed030443e1d4074239a715ddedf772 100644 --- a/src/operators/slice_op.cpp +++ b/src/operators/slice_op.cpp @@ -23,7 +23,7 @@ template void SliceOp::InferShape() const { /// todo: add InputShape() detection. } -template class SliceOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp index db8fe1d94363c1db578a369d9eca00dde17d30af..c9edfccf4ff08e5a12d735526c3d63c689711357 100644 --- a/src/operators/softmax_op.cpp +++ b/src/operators/softmax_op.cpp @@ -22,7 +22,7 @@ template void SoftmaxOp::InferShape() const { this->param_.Out()->Resize(this->param_.InputX()->dims()); } -template class SoftmaxOp; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/transpose_op.cpp b/src/operators/transpose_op.cpp index 7e578b290174734ba8c210a354c9e56fde364858..5f193f96396c8d4d7cb58143573015384e7a7c28 100644 --- a/src/operators/transpose_op.cpp +++ b/src/operators/transpose_op.cpp @@ -47,7 +47,7 @@ void TransposeOp::InferShape() const { } this->param_.Out()->Resize(out_dims); } -template class TransposeOp; + } // namespace operators } // namespace paddle_mobile diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 09d1ff031f2d29eb64c83d43724b1039fce9385f..418ebff79161675e8b23a4cca8f4319121aa6002 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,23 +1,23 @@ set(dir ${CMAKE_CURRENT_SOURCE_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build") -if (NET STREQUAL "googlenet") +if ("googlenet" IN_LIST NET) # gen test ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-googlenet paddle-mobile) -elseif (NET STREQUAL "mobilenet") +elseif ("mobilenet" IN_LIST NET) # gen test ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-mobilenet paddle-mobile) -elseif (NET STREQUAL "yolo") +elseif ("yolo" IN_LIST NET) # gen test ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-yolo paddle-mobile) -elseif (NET STREQUAL "squeezenet") +elseif ("squeezenet" IN_LIST NET) # gen test ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-squeezenet paddle-mobile) -elseif(NET STREQUAL "resnet") +elseif("resnet" IN_LIST NET) # gen test ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet paddle-mobile) @@ -145,6 +145,10 @@ else () ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h 
test_include.h executor_for_test.h)
 target_link_libraries(test-conv-add-relu-op paddle-mobile)
+ # gen test
+ ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
+ target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
+
 #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif()
diff --git a/test/executor_for_test.h b/test/executor_for_test.h
index c9ab4783d6826992ee81ffd63b0391169645576c..93847af20a6d48a6df33dc50f6c6a1db76facf51 100644
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
@@ -43,7 +43,7 @@ template <typename DeviceType, typename OpType>
 class Executor4Test : public Executor<DeviceType> {
  public:
  Executor4Test(Program<DeviceType> p, string op_type,
-               bool use_optimize = false)
+               bool use_optimize = false, int predict_op_count = 1)
      : Executor<DeviceType>() {
    this->use_optimize_ = use_optimize;
    this->program_ = p;
@@ -57,12 +57,14 @@ class Executor4Test : public Executor<DeviceType> {
      LOG(paddle_mobile::LogLevel::kLOG_ERROR)
          << "to_predict_program_ == nullptr";
    }
+
    const std::vector<std::shared_ptr<BlockDesc>> blocks =
        this->to_predict_program_->Blocks();
    for (std::shared_ptr<BlockDesc> block_desc : blocks) {
      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-     for (std::shared_ptr<OpDesc> op : ops) {
-       if (op->Type() == op_type) {
+     for (int i = 0; i < ops.size(); ++i) {
+       auto op = ops[i];
+       if (op->Type() == op_type && i < predict_op_count) {
          DLOG << "matched: " << op->Type();
          /// test first meeting op in program
@@ -72,11 +74,17 @@ class Executor4Test : public Executor<DeviceType> {
              op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
              this->program_.scope);
          this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
-         break;
        }
      }
    }
    this->InitMemory();
+
+   std::shared_ptr<BlockDesc> to_predict_block =
+       this->to_predict_program_->Block(0);
+   auto &ops = this->ops_of_block_[*to_predict_block.get()];
+   for (const auto &op : ops) {
+     op->Init();
+   }
  }
  template
@@ -130,9 +138,6 @@ class Executor4Test : public Executor<DeviceType> {
    auto *output_tensor = con_output->GetMutable<LoDTensor>();
    output_tensor->mutable_data<float>(dDim);
-   std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
-   out_tensor.reset(output_tensor);
-
    std::shared_ptr<BlockDesc> to_predict_block =
        this->to_predict_program_->Block(0);
    for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size();
@@ -141,6 +146,7 @@
      op->Run();
    }
-   return out_tensor;
+   return std::make_shared<paddle_mobile::framework::Tensor>(
+       paddle_mobile::framework::Tensor(*output_tensor));
  }
};
diff --git a/test/framework/test_load.cpp b/test/framework/test_load.cpp
index f4215de46c2bafd732b0092b58c25bf6fcefdf7a..bea7d4ba7d2df1344f0819222fbdb389106fa77e 100644
--- a/test/framework/test_load.cpp
+++ b/test/framework/test_load.cpp
@@ -19,7 +19,9 @@ int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  //  ../../../test/models/googlenet
  //  ../../../test/models/mobilenet
- auto program = loader.Load(g_googlenet, true);
+ // auto program = loader.Load(g_googlenet, true);
+
+ auto program = loader.Load(g_mobilenet_ssd, true);
  // auto program = loader.Load(g_googlenet_combine + "/model",
  //                            g_googlenet_combine +
  //                            "/params", true);
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index 2ab24736397c1e71350335561abbcabcba6e27a4..d230b9469229946fc74f4dc9e1ee6100196ed9aa 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -23,7 +23,7 @@ int main() {
  auto time1 = time();
  if (paddle_mobile.Load(g_googlenet, optimize)) {
    auto time2 = time();
-   DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+   DLOG << "load cost: " << time_diff(time1, time2) << "ms";
    std::vector<float> input;
    std::vector<int64_t> dims{1, 3, 224, 224};
    GetInput<float>(g_test_image_1x3x224x224, &input, dims);
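A note on the `Executor4Test` change above: the constructor now collects the first `predict_op_count` ops whose type matches `op_type` (instead of stopping at the first match) and calls `Init()` on each fused op before it is `Run()`. A minimal usage sketch, assuming a program loaded as in the tests that follow; the op name and template arguments mirror the new fusion test, and `predict_op_count` defaults to 1:

```c++
// Run the first two matching fused ops of the program under test.
// Illustrative only; mirrors the constructor signature introduced above.
Executor4Test<paddle_mobile::CPU,
              paddle_mobile::operators::FusionConvAddBNReluOp<
                  paddle_mobile::CPU, float>>
    executor(program, "fusion_conv_add_bn_relu", true, /*predict_op_count=*/2);
```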
diff --git a/test/net/test_mobilenet+ssd.cpp b/test/net/test_mobilenet+ssd.cpp
index 1a7c4cd49cb1707b9c7783cf74e87e74da39732e..9b4e5f2d3a431001e138977b78994f5dfedbe0a3 100644
--- a/test/net/test_mobilenet+ssd.cpp
+++ b/test/net/test_mobilenet+ssd.cpp
@@ -12,28 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
 int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+ paddle_mobile.SetThreadNum(4);
  auto time1 = time();
- if (paddle_mobile.Load(g_mobilenet_ssd, true)) {
+ auto isok = paddle_mobile.Load(
+     std::string(g_mobilenet_ssd_gesture) + "/model",
+     std::string(g_mobilenet_ssd_gesture) + "/params", true);
+ // auto isok = paddle_mobile.Load(g_mobilenet_ssd, false);
+ if (isok) {
    auto time2 = time();
-   DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+   std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+
    std::vector<float> input;
    std::vector<int64_t> dims{1, 3, 300, 300};
-   Tensor input_tensor;
-   SetupTensor<float>(&input_tensor, {1, 3, 300, 300}, static_cast<float>(0),
-                      static_cast<float>(1));
+   GetInput<float>(g_hand, &input, dims);
-   std::vector<float> input(input_tensor.data<float>(),
-                            input_tensor.data<float>() + input_tensor.numel());
    auto time3 = time();
-   paddle_mobile.Predict(input, dims);
+   auto output = paddle_mobile.Predict(input, dims);
    auto time4 = time();
-   DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+   std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
+             << std::endl;
  }
  return 0;
}
diff --git a/test/net/test_mobilenet.cpp b/test/net/test_mobilenet.cpp
index 2e285695fb79f3ed5471a653c71a10b36ef4e7f2..9fc7226fc12fa7a0c631c9920487c0bd56c90816 100644
--- a/test/net/test_mobilenet.cpp
+++ b/test/net/test_mobilenet.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#include <fstream>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"
@@ -22,20 +22,23 @@ int main() {
  auto time1 = time();
  if (paddle_mobile.Load(g_mobilenet, true)) {
    auto time2 = time();
-   DLOG << "load cost :" << time_diff(time1, time1) << "ms";
+   std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+
    std::vector<float> input;
    std::vector<int64_t> dims{1, 3, 224, 224};
-   Tensor input_tensor;
-   SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
-                      static_cast<float>(1));
-
-   std::vector<float> input(input_tensor.data<float>(),
-                            input_tensor.data<float>() + input_tensor.numel());
-   auto time3 = time();
-   auto vec_result = paddle_mobile.Predict(input, dims);
-   auto time4 = time();
-
-   DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+   GetInput<float>(g_test_image_1x3x224x224, &input, dims);
+
+   for (int i = 0; i < 10; ++i) {
+     auto time3 = time();
+     auto vec_result = paddle_mobile.Predict(input, dims);
+     auto time4 = time();
+     std::vector<float>::iterator biggest =
+         std::max_element(std::begin(vec_result), std::end(vec_result));
+     std::cout << " Max element is " << *biggest << " at position "
+               << std::distance(std::begin(vec_result), biggest) << std::endl;
+     std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
+               << std::endl;
+   }
  }
  return 0;
}
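A note on the timing loop above: each of the ten predictions reports the arg-max of the output vector inline via `std::max_element`. The same lookup, factored into a helper for reuse (a sketch for illustration only, not part of the patch):

```c++
#include <algorithm>
#include <utility>
#include <vector>

// Return (index, score) of the best classification score, exactly the
// computation the mobilenet test performs inline.
inline std::pair<int, float> Top1(const std::vector<float> &scores) {
  auto it = std::max_element(scores.begin(), scores.end());
  return {static_cast<int>(it - scores.begin()), *it};
}
```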
diff --git a/test/operators/test_fusion_conv_add_bn_relu_op.cpp b/test/operators/test_fusion_conv_add_bn_relu_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..81400d987195364c06b4b93d0859469b43f90e7b
--- /dev/null
+++ b/test/operators/test_fusion_conv_add_bn_relu_op.cpp
@@ -0,0 +1,62 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/fusion_conv_add_bn_relu_op.h"
+
+int main() {
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  //  ../models/image_classification_resnet.inference.model
+  auto program = loader.Load(g_mobilenet, true);
+
+  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
+                        "program file read fail");
+
+  Executor4Test<paddle_mobile::CPU,
+                paddle_mobile::operators::FusionConvAddBNReluOp<
+                    paddle_mobile::CPU, float>>
+      executor(program, "fusion_conv_add_bn_relu", true);
+
+  std::cout << "executor 4 test: " << std::endl;
+
+  paddle_mobile::framework::Tensor input;
+  GetInput<float>(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224});
+  // use SetupTensor if there is no local input image:
+  // SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
+  //                    static_cast<float>(1));
+
+  DLOG << " input: " << input;
+
+  auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112});
+  std::cout << "before predict: " << std::endl;
+  auto output =
+      executor.Predict(input, "data", "conv2_1_dw_bn.tmp_2", out_ddim);
+  std::cout << "after predict " << std::endl;
+  auto output_ptr = output->data<float>();
+
+  int stride = output->numel() / 100;
+  for (int i = 0; i < 100; i++) {
+    DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride];
+  }
+
+  // for (int i = 0; i < 100; i++) {
+  //   DLOG << " index:" << i << " value: " << output_ptr[i];
+  // }
+
+  // for (int j = 0; j < output->numel(); ++j) {
+  //   std::cout << " (index: " << j << " value: " << output_ptr[j] << ") ";
+  // }
+  std::cout << std::endl;
+  return 0;
+}
diff --git a/test/test_helper.h b/test/test_helper.h
index 81ad23ff3b4e53db0225630eebaa34878ad4c139..9a5c62c79c44fdf52657ea5facb5f0768810c440 100644
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -16,22 +16,29 @@ limitations under the License. */
 #include <fstream>
 #include <random>
+#include <string>
+#include <vector>
 #include "common/common.h"
 #include "common/log.h"
 #include "framework/ddim.h"
 #include "framework/tensor.h"
-static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
-static const std::string g_squeezenet = "../models/squeezenet";
-static const std::string g_googlenet = "../models/googlenet";
-static const std::string g_mobilenet = "../models/mobilenet";
-static const std::string g_resnet_50 = "../models/resnet_50";
-static const std::string g_resnet = "../models/resnet";
-static const std::string g_googlenet_combine = "../models/googlenet_combine";
-static const std::string g_yolo = "../models/yolo";
-static const std::string g_test_image_1x3x224x224 =
+static const char *g_mobilenet_ssd = "../models/mobilenet+ssd";
+static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture";
+static const char *g_squeezenet = "../models/squeezenet";
+static const char *g_googlenet = "../models/googlenet";
+static const char *g_mobilenet = "../models/mobilenet";
+static const char *g_resnet_50 = "../models/resnet_50";
+static const char *g_resnet = "../models/resnet";
+static const char *g_googlenet_combine = "../models/googlenet_combine";
+static const char *g_yolo = "../models/yolo";
+static const char *g_test_image_1x3x224x224 =
     "../images/test_image_1x3x224x224_float";
+static const char *g_test_image_1x3x224x224_banana =
+    "../images/input_3x224x224_banana";
+static const char *g_hand = "../images/hand_image";
+
 using paddle_mobile::framework::DDim;
 using paddle_mobile::framework::Tensor;
@@ -62,9 +69,9 @@ void GetInput(const std::string &input_name, std::vector<T> *input,
    size *= dim;
  }
- T *input_ptr = (T *)malloc(sizeof(T) * size);
+ T *input_ptr = reinterpret_cast<T *>(malloc(sizeof(T) * size));
  std::ifstream in(input_name, std::ios::in | std::ios::binary);
- in.read((char *)(input_ptr), size * sizeof(T));
+ in.read(reinterpret_cast<char *>(input_ptr), size * sizeof(T));
  in.close();
  for (int i = 0; i < size; ++i) {
    input->push_back(input_ptr[i]);
@@ -79,6 +86,6 @@ void GetInput(const std::string &input_name,
  T *input_ptr = input->mutable_data<T>(dims);
  std::ifstream in(input_name, std::ios::in | std::ios::binary);
- in.read((char *)(input_ptr), input->numel() * sizeof(T));
+ in.read(reinterpret_cast<char *>(input_ptr), input->numel() * sizeof(T));
  in.close();
}
diff --git a/tools/build.sh b/tools/build.sh
index 
ce330e6d631ea1009f28ccf987a50e5f79a032b6..db809f71076e6b6d4aacc53bd8e144db3935cb91 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +NETS="" +declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet") build_for_mac() { if [ ! `which brew` ]; then @@ -38,7 +40,8 @@ build_for_android() { fi if [ -z "$PLATFORM" ]; then - PLATFORM="arm-v7a" # Users could choose "arm-v8a" or other platforms from the command line. + PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform. +# PLATFORM="arm-v8a" fi if [ "${PLATFORM}" = "arm-v7a" ]; then @@ -59,7 +62,8 @@ build_for_android() { ANDROID_PLATFORM_VERSION="android-22" TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake" ANDROID_ARM_MODE="arm" - if [ $# -eq 1 ]; then + + if [ "${#NETS}" > 1 ]; then cmake .. \ -B"../build/release/${PLATFORM}" \ -DANDROID_ABI="${ABI}" \ @@ -69,7 +73,7 @@ build_for_android() { -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ -DANDROID_STL=c++_static \ -DANDROID=true \ - -DNET=$1 \ + -DNET="${NETS}" \ -D"${ARM_PLATFORM}"=true else @@ -92,23 +96,25 @@ build_for_ios() { # rm -rf "../build" PLATFORM="ios" MODE="Release" - BUILD_DIR=../build/release/"${PLATFORM}" + BUILD_DIR=../build/release/"${PLATFORM}"/ TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake" mkdir -p "${BUILD_DIR}" - if [ $# -eq 1 ]; then + if [ "${#NETS}" > 1 ]; then cmake .. \ -B"${BUILD_DIR}" \ -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ -DIOS_PLATFORM=OS \ - -DNET=$1 \ + -DIOS_ARCH="${IOS_ARCH}" \ + -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ + -DNET="${NETS}" \ -DIS_IOS="true" else cmake .. \ -B"${BUILD_DIR}" \ -DCMAKE_BUILD_TYPE="${MODE}" \ - -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ -DIOS_PLATFORM=OS \ + -DIOS_ARCH="${IOS_ARCH}" \ + -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \ -DIS_IOS="true" fi cd "${BUILD_DIR}" @@ -120,7 +126,7 @@ build_for_ios() { } build_error() { - echo "unknown argument" + echo "unknown target : $1" } if [ $# -lt 1 ]; then @@ -128,31 +134,37 @@ if [ $# -lt 1 ]; then echo "available targets: ios|android" echo "sample usage: ./build.sh android" else - if [ $# -eq 2 ]; then - if [ $2 != "googlenet" -a $2 != "mobilenet" -a $2 != "yolo" -a $2 != "squeezenet" -a $2 != "resnet" ]; then - if [ $1 = "android" ]; then - build_for_android - elif [ $1 = "ios" ]; then - build_for_ios - else - build_error - fi - else - if [ $1 = "android" ]; then - build_for_android $2 - elif [ $1 = "ios" ]; then - build_for_ios $2 - else - build_error - fi + params=($@) + for(( i=1; i<$#; i++ )); do + if [ ${i} != 1 ]; then + NETS=$NETS$";" + fi + NETS=$NETS$"${params[i]}" + done + params=${@:2} + + supported=false + for name in ${params[@]}; do + for net in ${supportedNets[@]}; do + match=false + if [ "$name"x = "$net"x ];then + supported=true + match=true + break 1 + fi + done + if [ "$match" = false ];then + echo "${name} not supported!" 
+ echo "supported nets are: ${supportedNets[@]}" + exit -1 fi + done + + if [ $1 = "android" ]; then + build_for_android + elif [ $1 = "ios" ]; then + build_for_ios else - if [ $1 = "android" ]; then - build_for_android - elif [ $1 = "ios" ]; then - build_for_ios - else - build_error - fi - fi + build_error "$1" + fi fi \ No newline at end of file diff --git a/tools/ios-cmake/ios.toolchain.cmake b/tools/ios-cmake/ios.toolchain.cmake index a8735adc8d853a5825a23f1ddf129d0a95199275..4db079d01de8db35fca8fbe63b59e58fd5a3463e 100644 --- a/tools/ios-cmake/ios.toolchain.cmake +++ b/tools/ios-cmake/ios.toolchain.cmake @@ -34,6 +34,7 @@ set (CMAKE_SYSTEM_VERSION 1) set (UNIX True) set (APPLE True) set (IOS True) +set (IOS_ARCH armv7 armv7s arm64) # Required as of cmake 2.8.10 set (CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE) @@ -159,7 +160,6 @@ set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS su # set the architecture for iOS if (${IOS_PLATFORM} STREQUAL "OS") - set (IOS_ARCH armv7 armv7s arm64) elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR") set (IOS_ARCH i386) elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64") diff --git a/tools/net-detail.awk b/tools/net-detail.awk new file mode 100644 index 0000000000000000000000000000000000000000..84d0166ac777b5b7fbd9801665031bb2d51fedbb --- /dev/null +++ b/tools/net-detail.awk @@ -0,0 +1,91 @@ +BEGIN { +print "digraph G {" +} +/op:/ { + id++ + opname[id] = $NF +} +/input/ { + type = "input" + para = $NF + if (input[id]) { + input[id] = input[id] "|" + } + input[id] = input[id] "<" para ">" para +} +/output/ { + type = "output" + para = $NF + if (output[id]) { + output[id] = output[id] "|" + } + output[id] = output[id] "<" para ">" para +} +/attr/ { + type = "attr" + aname = $NF + if (attr_key[id]) { + attr_key[id] = attr_key[id] "|" + attr_value[id] = attr_value[id] "|" + } + attr_key[id] = attr_key[id] $NF +} +/argument/ { + if (type == "attr") { + split($0, arr, " - ") + attr_value[id] = attr_value[id] arr[2] + } else if ((type == "input") || (type == "output")) { + if (!var2id[$NF]) { + var_id++ + var[var_id] = $NF + var2id[$NF] = var_id + } + varid = var2id[$NF] + lid++ + if (type == "input") { + line[lid] = "var_" varid " -> " "op_" id ":<" para ">" + if (xout[$NF]) { + xi++ + xline[xi] = "xop_" xout[$NF] " -> " "xop_" id + } + } else if (type == "output") { + line[lid] = "op_" id ":<" para ">" " -> " "var_" varid + xout[$NF] = id + } + } +} +/var name/ { + varname = $NF + vid = var2id[varname] +} +/var tensor desc dim / { + if (tensor[vid]) tensor[vid] = tensor[vid] " x " + tensor[vid] = tensor[vid] $NF +} +END { + +print "subgraph cluster_G0 {" +for (i = 1; i <= id; i++) { + print "xop_" i "[label=\"" i ". " opname[i] "\"]" +} +for (i = 1; i <= xi; i++) { + print xline[i] +} +print "}" + +for (i = 1; i <= id; i++) { +print "op_" i "[group=op;shape=record;label=\"{{" input[i] "}|" i ". 
" opname[i] "|{" output[i] "}}\"]" +} +for (i = 1; i <= var_id; i++) { +print "var_" i "[label=\"" var[i] " [" tensor[i] "]\"]" +} +for (i = 1; i <= lid; i++) { +print line[i] +} +for (i = 1; i <= id; i++) { +print "attr_" i "[shape=record;label=\"{" attr_key[i] "}|{" attr_value[i] "}\"]" +print "attr_" i " -> " "op_" i ":" +} +print "}" +} + diff --git a/tools/op.cmake b/tools/op.cmake index 456d36262e9abf997a7861838c870e698d64f3c1..ec9768443c5e9825931111803acf1f51c1aa1acd 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -1,4 +1,6 @@ -if (NET STREQUAL "googlenet") +set(FOUND_MATCH OFF) +if ("googlenet" IN_LIST NET) + message("googlenet enabled") set(CONCAT_OP ON) set(CONV_OP ON) set(LRN_OP ON) @@ -8,8 +10,13 @@ if (NET STREQUAL "googlenet") set(POOL_OP ON) set(RELU_OP ON) set(FUSION_CONVADD_OP ON) - set(FUSION_CONVADD_RELU_OP ON) -elseif (NET STREQUAL "mobilenet") + set(FUSION_CONVADDRELU_OP ON) + + set(FOUND_MATCH ON) +endif() + +if ("mobilenet" IN_LIST NET) + message("mobilenet enabled") set(CONV_OP ON) set(ELEMENTWISEADD_OP ON) set(RELU_OP ON) @@ -21,12 +28,23 @@ elseif (NET STREQUAL "mobilenet") set(RESHAPE_OP ON) set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_CONVADD_OP ON) -elseif (NET STREQUAL "yolo") + + set(FOUND_MATCH ON) +endif() + + +if ("yolo" IN_LIST NET) + message("yolo enabled") set(BATCHNORM_OP ON) set(CONV_OP ON) set(RELU_OP ON) set(ELEMENTWISEADD_OP ON) -elseif (NET STREQUAL "squeezenet") + + set(FOUND_MATCH ON) +endif() + +if ("squeezenet" IN_LIST NET) + message("squeezenet enabled") set(CONCAT_OP ON) set(CONV_OP ON) set(RELU_OP ON) @@ -34,15 +52,45 @@ elseif (NET STREQUAL "squeezenet") set(POOL_OP ON) set(RESHAPE_OP ON) set(SOFTMAX_OP ON) -elseif (NET STREQUAL "resnet") + + set(FOUND_MATCH ON) +endif() + + +if ("resnet" IN_LIST NET) + message("resnet enabled") + set(CONCAT_OP ON) set(CONV_OP ON) - set(BATCHNORM_OP ON) + set(RELU_OP ON) set(ELEMENTWISEADD_OP ON) + set(POOL_OP ON) + set(RESHAPE_OP ON) set(SOFTMAX_OP ON) - set(MUL_OP ON) + + set(FOUND_MATCH ON) +endif() + +if ("FPGAnets" IN_LIST NET) + message("FPGAnets enabled") + set(FUSION_CONVADDRELU_OP ON) + set(FUSION_CONVADDBNRELU_OP ON) + set(FUSION_CONVADDBN_OP ON) + set(FUSION_POOLBN_OP ON) + set(FUSION_ELEMENTWISEADDRELU_OP ON) + set(FUSION_FC_OP ON) + set(FUSION_FCRELU_OP ON) + set(REGION_OP ON) set(POOL_OP ON) - set(RELU_OP ON) -else () + set(CONCAT_OP ON) + set(SOFTMAX_OP ON) + set(DROPOUT_OP ON) + + set(FOUND_MATCH ON) +endif() + + +if(NOT FOUND_MATCH) + message("--default--") set(BATCHNORM_OP ON) set(BOXCODER_OP ON) set(CONCAT_OP ON) @@ -50,7 +98,7 @@ else () set(DEPTHWISECONV_OP ON) set(ELEMENTWISEADD_OP ON) set(FUSION_CONVADD_OP ON) - set(CONVADDRELU_OP ON) + set(FUSION_CONVADDRELU_OP ON) set(FUSION_FC_OP ON) set(LRN_OP ON) set(MUL_OP ON) @@ -62,15 +110,17 @@ else () set(SIGMOID_OP ON) set(SOFTMAX_OP ON) set(TRANSPOSE_OP ON) - set(FUSION_CONVADD_RELU_OP ON) set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_DWCONVBNRELU_OP ON) + set(FUSION_CONVBNRELU_OP ON) set(PRELU_OP ON) set(RESIZE_OP ON) set(SCALE_OP ON) set(SLICE_OP ON) set(DROPOUT_OP ON) set(IM2SEQUENCE_OP ON) +endif() + # option(BATCHNORM_OP "" ON) # option(BOXCODER_OP "" ON) # option(CONCAT_OP "" ON) @@ -78,7 +128,7 @@ else () # option(DEPTHWISECONV_OP "" ON) # option(ELEMENTWISEADD_OP "" ON) # option(FUSION_CONVADD_OP "" ON) - # option(CONVADDRELU_OP "" ON) + # option(FUSION_CONVADDRELU_OP "" ON) # option(FUSION_FC_OP "" ON) # option(LRN_OP "" ON) # option(MUL_OP "" ON) @@ -90,8 +140,7 @@ else () # option(SIGMOID_OP "" ON) # option(SOFTMAX_OP "" ON) # 
option(TRANSPOSE_OP "" ON) - # option(FUSION_CONVADD_RELU_OP "" ON) -endif () +# endif () if (BATCHNORM_OP) add_definitions(-DBATCHNORM_OP) @@ -114,8 +163,8 @@ endif() if (FUSION_CONVADD_OP) add_definitions(-DFUSION_CONVADD_OP) endif() -if (CONVADDRELU_OP) - add_definitions(-DCONVADDRELU_OP) +if (FUSION_CONVADDRELU_OP) + add_definitions(-DFUSION_CONVADDRELU_OP) endif() if (FUSION_FC_OP) add_definitions(-DFUSION_FC_OP) @@ -150,15 +199,17 @@ endif() if (TRANSPOSE_OP) add_definitions(-DTRANSPOSE_OP) endif() -if (FUSION_CONVADD_RELU_OP) - add_definitions(-DFUSION_CONVADD_RELU_OP) -endif() if (FUSION_CONVADDBNRELU_OP) add_definitions(-DFUSION_CONVADDBNRELU_OP) endif() if (FUSION_DWCONVBNRELU_OP) add_definitions(-DFUSION_DWCONVBNRELU_OP) endif() + +if (FUSION_CONVBNRELU_OP) + add_definitions(-DFUSION_CONVBNRELU_OP) +endif() + if (PRELU_OP) add_definitions(-DPRELU_OP) endif() @@ -177,3 +228,20 @@ endif() if (IM2SEQUENCE_OP) add_definitions(-DIM2SEQUENCE_OP) endif() + +if (FUSION_CONVADDBN_OP) + add_definitions(-DFUSION_CONVADDBN_OP) +endif() +if (FUSION_FCRELU_OP) + add_definitions(-DFUSION_FCRELU_OP) +endif() +if (FUSION_POOLBN_OP) + add_definitions(-DFUSION_POOLBN_OP) +endif() +if (FUSION_ELEMENTWISEADDRELU_OP) + add_definitions(-DFUSION_ELEMENTWISEADDRELU_OP) +endif() +if (REGION_OP) + add_definitions(-DREGION_OP) +endif() + diff --git a/tools/quantification/CMakeLists.txt b/tools/quantification/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f1ca7fdc2b65638c7158b0933b924c71eadc4a0 --- /dev/null +++ b/tools/quantification/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.6) +project(quali) +add_definitions(-DENABLE_EXCEPTION) + +set(CMAKE_CXX_STANDARD 11) +file(GLOB_RECURSE QULIFICATON_CC src/*.cc src/*.cpp src/*.c src/*.mm) +file(GLOB_RECURSE QULIFICATON_H src/*.h) +include_directories(. src/) + +#add_library(paddle-mobile SHARED ${QULIFICATON_CC} ${QULIFICATON_H} convert.cpp) + +add_executable(quantify convert.cpp ${QULIFICATON_CC} ${QULIFICATON_H}) \ No newline at end of file diff --git a/tools/quantification/README.md b/tools/quantification/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ac729af01e7e73328b884097009dad1d468e7997 --- /dev/null +++ b/tools/quantification/README.md @@ -0,0 +1,39 @@ +# 模型量化脚本 + +#### 量化脚本使用指南 +1. 在PaddleMobile项目目录下(如 ~/PaddleProject/paddle-mobile) + +2. cd到 tools/quantification/ 目录 + +3. cmake编译 + + ``` sh + cmake . + make + ``` + +4. 运行量化脚本 + ```sh + ./quantify (0:seperated. 1:combined ) (输入路径) (输出路径) + # quantify googlenet seperated from /Users/xiebaiyuan/PaddleProject/quali/models/googlenet to ./googlenet_min + ./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min + + ``` + +*注:* +*量化工具中* +*1.seperated模型model文件默认命名为 "__model__";* +*2.combined模型的model文件默认命名为 "model",参数文件默认命名为"params";* + + +##### 整体如下: +以googlenet非combined为例: + +```sh +cd tools/quantification/ +cmake . 
diff --git a/tools/quantification/convert.cpp b/tools/quantification/convert.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..88eef48b39ab8d2aeb1d4e3858ba97ef6360c9a9
--- /dev/null
+++ b/tools/quantification/convert.cpp
@@ -0,0 +1,275 @@
+
+
+#include "src/enforce.h"
+#include "src/var_desc.h"
+#include "src/program_desc.h"
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "src/framework.pb-c.h"
+#include "src/protobuf-c.h"
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+
+const size_t kSize64 = sizeof(uint64_t);
+const size_t kSize32 = sizeof(uint32_t);
+
+char *Get_binary_data(const std::string &filename) {
+
+    FILE *file = fopen(filename.c_str(), "rb");
+
+    PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
+                          filename.c_str());
+    fseek(file, 0, SEEK_END);
+    int64_t size = ftell(file);
+
+    PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
+    rewind(file);
+    auto *data = new char[size];
+    size_t bytes_read = fread(data, 1, static_cast<size_t>(size), file);
+    PADDLE_MOBILE_ENFORCE(bytes_read == size,
+                          "read binary file bytes do not match with fseek");
+    fclose(file);
+    return data;
+}
+
+
+static size_t ReadBuffer(const char *file_name, uint8_t **out) {
+    FILE *fp;
+    fp = fopen(file_name, "rb");
+    PADDLE_MOBILE_ENFORCE(fp != nullptr, " %s open failed !", file_name);
+    fseek(fp, 0, SEEK_END);
+    auto size = static_cast<size_t>(ftell(fp));
+    rewind(fp);
+    *out = reinterpret_cast<uint8_t *>(malloc(size));
+    size_t cur_len = 0;
+    size_t nread;
+    while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
+        cur_len += nread;
+    }
+    fclose(fp);
+    return cur_len;
+}
+
+std::shared_ptr<ProgramDesc> loadParams(const std::string &model_path) {
+    PaddleMobile__Framework__Proto__ProgramDesc *c_program;
+    uint8_t *buf = nullptr;
+    size_t read_size = ReadBuffer(model_path.c_str(), &buf);
+    PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null");
+    c_program = paddle_mobile__framework__proto__program_desc__unpack(
+            nullptr, read_size, buf);
+    PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null");
+    auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
+    return originProgramDesc;
+
+}
+
+// Returns the advanced read cursor so that callers converting combined
+// models keep their offset into the parameter blob.
+char *LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char *dataP, FILE *out_file) {
+    // 1. version
+    uint32_t version = *reinterpret_cast<uint32_t *>(dataP);
+
+    // write version
+    fwrite(&version, kSize32, 1, out_file);
+
+    dataP += kSize32;
+
+    // 2 Lod information
+    auto *lod_level_ptr = new uint64_t();
+    memcpy(lod_level_ptr, dataP, kSize64);
+
+    uint64_t lod_level = 0;
+    // write lod Information
+    fwrite(&lod_level, kSize64, 1, out_file);
+    delete lod_level_ptr;
+
+    dataP += kSize64;
+
+    for (uint64_t i = 0; i < lod_level; ++i) {
+        uint64_t size = *reinterpret_cast<uint64_t *>(dataP);
+        // write lod size
+        fwrite(&size, kSize64, 1, out_file);
+        (dataP) += kSize64;
+
+        std::vector<size_t> tmp(size / sizeof(size_t));
+        for (unsigned long &k : tmp) {
+            k = *reinterpret_cast<size_t *>(dataP);
+            (dataP) += sizeof(size_t);
+        }
+        // write lod size vector
+        fwrite(tmp.data(), sizeof(size_t), tmp.size(), out_file);
+    }
+
+    // 3. tensor version
+    uint32_t tensor_version = *reinterpret_cast<uint32_t *>(dataP);
+    // write tensor version
+    fwrite(&tensor_version, kSize32, 1, out_file);
+    (dataP) += kSize32;
+
+    // 4. tensor desc
+    int32_t size = *reinterpret_cast<int32_t *>(dataP);
+    // write tensor desc
+    fwrite(&size, sizeof(int32_t), 1, out_file);
+    (dataP) += sizeof(int32_t);
+
+    std::unique_ptr<char[]> buf(new char[size]);
+    for (int m = 0; m < size; ++m) {
+        buf.get()[m] = (dataP)[m];
+    }
+
+    fwrite(buf.get(), sizeof(char), static_cast<size_t>(size), out_file);
+    (dataP) += (sizeof(char) * size);
+
+    const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
+    int memory_size = 1;
+    for (auto l : desc.Dims()) {
+        memory_size *= l;
+    }
+
+    void *memory = nullptr;
+    int type_size = 0;
+    switch (desc.DataType()) {
+        case paddle_mobile::framework::VARTYPE_TYPE_FP16:
+            type_size = 2;
+            break;
+        case paddle_mobile::framework::VARTYPE_TYPE_FP32:
+            type_size = 4;
+            break;
+        case paddle_mobile::framework::VARTYPE_TYPE_FP64:
+            type_size = 8;
+            break;
+        case paddle_mobile::framework::VARTYPE_TYPE_INT32:
+            type_size = 4;
+            break;
+        case paddle_mobile::framework::VARTYPE_TYPE_INT64:
+            type_size = 8;
+            break;
+        case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
+            type_size = 1;
+            break;
+        default:
+            break;
+    }
+    size_t tensorSize = sizeof(char) * memory_size * type_size;
+
+    memory = new char[tensorSize];
+
+    for (int n = 0; n < tensorSize; ++n) {
+        static_cast<char *>(memory)[n] = (dataP)[n];
+    }
+    dataP += tensorSize;
+
+    // for float 32
+    float min_value = std::numeric_limits<float>::max();
+    // lowest(), not min(): min() is the smallest positive float, not a lower bound
+    float max_value = std::numeric_limits<float>::lowest();
+
+    for (int k = 0; k < memory_size; ++k) {
+        min_value = std::min(min_value, static_cast<float *>(memory)[k]);
+        max_value = std::max(max_value, static_cast<float *>(memory)[k]);
+    }
+
+    fwrite(&min_value, sizeof(float), 1, out_file);
+    fwrite(&max_value, sizeof(float), 1, out_file);
+
+    for (int g = 0; g < memory_size; ++g) {
+        float value = static_cast<float *>(memory)[g];
+        auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255);
+        fwrite(&factor, sizeof(uint8_t), 1, out_file);
+    }
+    return dataP;
+}
+
+void
+quantificate_combined(const std::string &model_path, const std::string &param_path, const std::string &param_min_path) {
+
+    auto program = loadParams(model_path);
+    char *origin_data = Get_binary_data(param_path);
+    char *data = origin_data;
+    FILE *out_file = fopen(param_min_path.c_str(), "wb");
+    for (const auto &block : program->Blocks()) {
+        for (const auto &var_desc : block->Vars()) {
+            if (var_desc->Persistable()) {
+                if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+                    continue;
+                }
+                // keep the advanced cursor: all vars share one parameter blob
+                data = LoadWithDump(*var_desc, data, out_file);
+            }
+        }
+    }
+    fclose(out_file);
+    delete[] origin_data;
+
+}
+
+void quantificate_seperated(const std::string model_dir, const std::string param_min_path) {
+
+    auto program = loadParams(model_dir + "/__model__");
+
+    std::string shell_command = "mkdir " + param_min_path;
+    system(shell_command.c_str());
+
+    for (const auto &block : program->Blocks()) {
+        for (const auto &var_desc : block->Vars()) {
+            if (var_desc->Persistable()) {
+                if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+                    continue;
+                }
+                std::string file_name = param_min_path + "/" + var_desc->Name();
+                FILE *out_file = fopen(file_name.c_str(), "wb");
+                char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
+                char *data = origin_data;
+                LoadWithDump(*var_desc, data, out_file);
+                delete[] origin_data;
+                fclose(out_file);
+            }
+        }
+    }
+
+}
+
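+// Note (illustrative, not used by this tool): a loader can invert the factor
+// encoding above by reading the per-tensor (min_value, max_value) header and
+// computing
+//     value ≈ min_value + factor / 255.0f * (max_value - min_value);
+// the quantization error is therefore at most (max_value - min_value) / 255 / 2
+// per weight.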
+
+int main(int argc, char **argv) {
+
+    const std::string kNoteEg = "( eg: ./quantify 1 your_combined_model_path output_path or ./quantify 0 your_seperated_model_path output_path)";
+
+    PADDLE_MOBILE_ENFORCE(argc > 1, "we need params.%s ", kNoteEg.c_str());
+
+    std::string action_type = argv[1];
+    PADDLE_MOBILE_ENFORCE(argc > 1 && (action_type == "1" || action_type == "0"),
+                          "only 0 or 1 supported, current is %s %s ",
+                          action_type.c_str(),
+                          kNoteEg.c_str());
+
+    PADDLE_MOBILE_ENFORCE(argc > 2, "we need your model path. %s ", kNoteEg.c_str());
+    std::string base_path = argv[2];
+
+    PADDLE_MOBILE_ENFORCE(argc > 3, "we need your output path. %s ", kNoteEg.c_str());
+    std::string output_path = argv[3];
+
+    if (action_type == "0") {
+        // for seperated
+        const std::string &seperated_min_dir = output_path;
+        quantificate_seperated(base_path, seperated_min_dir);
+        return 0;
+    }
+
+    if (action_type == "1") {
+        // for combined
+        const std::string &combined_min_dir = output_path;
+        std::string model_path = base_path + "/model";
+        std::string param_path = base_path + "/params";
+        quantificate_combined(model_path, param_path, combined_min_dir);
+
+        return 0;
+    }
+
+    return -1;
+}
+
+
+
+
+
+
diff --git a/tools/quantification/src/block_desc_local.cpp b/tools/quantification/src/block_desc_local.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8ad1982c05ed0b1b7c7bec5ef26aa8151f941cf3
--- /dev/null
+++ b/tools/quantification/src/block_desc_local.cpp
@@ -0,0 +1,48 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+//
+//  Created by 谢柏渊 on 2018/7/25.
+//
+#include "src/block_desc_local.h"
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "src/framework.pb-c.h"
+
+std::vector<std::shared_ptr<paddle_mobile::framework::VarDesc>>
+BlockDesc::Vars() const {
+    return vars_;
+}
+
+BlockDesc::BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc)
+        : index_(desc->idx), parent_index_(desc->parent_idx) {
+    for (int i = 0; i < desc->n_vars; ++i) {
+        PaddleMobile__Framework__Proto__VarDesc *var_desc = desc->vars[i];
+        vars_.emplace_back(std::shared_ptr<paddle_mobile::framework::VarDesc>(
+                new paddle_mobile::framework::VarDesc(var_desc)));
+    }
+
+    std::sort(vars_.begin(), vars_.end(),
+              [](std::shared_ptr<paddle_mobile::framework::VarDesc> left,
+                 std::shared_ptr<paddle_mobile::framework::VarDesc> right) {
+                  return left->Name() < right->Name();
+              });
+
+    // for (int j = 0; j < desc->n_ops; ++j) {
+    //     PaddleMobile__Framework__Proto__OpDesc *op_desc = desc->ops[j];
+    //     ops_.emplace_back(new OpDesc(op_desc));
+    // }
+}
diff --git a/tools/quantification/src/block_desc_local.h b/tools/quantification/src/block_desc_local.h
new file mode 100644
index 0000000000000000000000000000000000000000..41c2dc0abbdf8bb006f4152674e92dd1f7d01500
--- /dev/null
+++ b/tools/quantification/src/block_desc_local.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +// +// Created by 谢柏渊 on 2018/7/25. +// + +#ifndef TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_ +#define TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_ + +#include +#include "src/var_desc.h" + +class BlockDesc { + public: + friend class Node; + friend class ProgramOptimize; + BlockDesc() {} + explicit BlockDesc(PaddleMobile__Framework__Proto__BlockDesc *desc); + + const int &ID() const { return index_; } + + const bool &MultiThread() const { return multi_thread_; } + + const int &Parent() const { return parent_index_; } + + bool operator==(const BlockDesc &in_block) const { + return this->ID() == in_block.ID() && this->Parent() == in_block.Parent(); + } + + bool operator<(const BlockDesc &in_block) const { + return this->ID() < in_block.ID() && this->Parent() < in_block.Parent(); + } + + std::vector> Vars() const; + + private: + int index_; + bool multi_thread_; + int parent_index_; + std::vector> vars_; +}; + +#endif // TOOLS_QUANTIFICATION_SRC_BLOCK_DESC_LOCAL_H_ diff --git a/tools/quantification/src/enforce.h b/tools/quantification/src/enforce.h new file mode 100644 index 0000000000000000000000000000000000000000..51d2110e32433686d1b3353bc63b92a564a13e9d --- /dev/null +++ b/tools/quantification/src/enforce.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef ENABLE_EXCEPTION +#include +#include +#include + +#endif + +namespace paddle_mobile { + +#ifdef ENABLE_EXCEPTION +struct PaddleMobileException : public std::exception { + const std::string exception_prefix = "paddle mobile C++ Exception: \n"; + std::string message; + + PaddleMobileException(const char *header, const char *detail, + const char *file, const int line) { + char buffer[1500]; + snprintf(buffer, sizeof(buffer), + "%s| %s \n| [in file] : %s\n| [on line] : %d\n| [detail] : %s\n", + exception_prefix.c_str(), header, file, line, detail); + message = std::string(buffer); + } + const char *what() const noexcept { return message.c_str(); } +}; + +#define PADDLE_MOBILE_THROW_EXCEPTION(...) \ + { \ + char buffer[1000]; \ + snprintf(buffer, sizeof(buffer), __VA_ARGS__); \ + std::string detail(buffer); \ + throw paddle_mobile::PaddleMobileException("Custom Exception", buffer, \ + __FILE__, __LINE__); \ + } + +#define PADDLE_MOBILE_ENFORCE(stat, ...) \ + { \ + if (stat) { \ + } else { \ + char buffer[1000]; \ + snprintf(buffer, sizeof(buffer), __VA_ARGS__); \ + std::string detail(buffer); \ + throw paddle_mobile::PaddleMobileException("paddle-mobile enforce", \ + buffer, __FILE__, __LINE__); \ + } \ + } +#else +#define PADDLE_MOBILE_THROW_EXCEPTION(...) +#define PADDLE_MOBILE_ENFORCE(stat, ...) 
+#endif + +} // namespace paddle_mobile diff --git a/tools/quantification/src/framework.pb-c.c b/tools/quantification/src/framework.pb-c.c new file mode 100644 index 0000000000000000000000000000000000000000..aed0a6c9c0614da74a82cea8c7aa705978dddafc --- /dev/null +++ b/tools/quantification/src/framework.pb-c.c @@ -0,0 +1,1403 @@ +/* Generated by the protocol buffer compiler. DO NOT EDIT! */ +/* Generated from: framework.proto */ + +/* Do not generate deprecated warnings for self */ +#ifndef PROTOBUF_C__NO_DEPRECATED +#define PROTOBUF_C__NO_DEPRECATED +#endif + +#include "framework.pb-c.h" +void paddle_mobile__framework__proto__op_desc__attr__init( + PaddleMobile__Framework__Proto__OpDesc__Attr *message) { + static const PaddleMobile__Framework__Proto__OpDesc__Attr init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__op_desc__var__init( + PaddleMobile__Framework__Proto__OpDesc__Var *message) { + static const PaddleMobile__Framework__Proto__OpDesc__Var init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__VAR__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__op_desc__init( + PaddleMobile__Framework__Proto__OpDesc *message) { + static const PaddleMobile__Framework__Proto__OpDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__op_desc__get_packed_size( + const PaddleMobile__Framework__Proto__OpDesc *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__op_desc__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__OpDesc * +paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__OpDesc *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__op_desc__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__op_desc__free_unpacked( + PaddleMobile__Framework__Proto__OpDesc *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__op_desc__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__op_proto__var__init( + PaddleMobile__Framework__Proto__OpProto__Var *message) { + static const PaddleMobile__Framework__Proto__OpProto__Var init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__op_proto__attr__init( + PaddleMobile__Framework__Proto__OpProto__Attr *message) { + static const PaddleMobile__Framework__Proto__OpProto__Attr init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__ATTR__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__op_proto__init( + PaddleMobile__Framework__Proto__OpProto *message) { + static const PaddleMobile__Framework__Proto__OpProto init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__op_proto__get_packed_size( + const PaddleMobile__Framework__Proto__OpProto *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__op_proto__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__OpProto * 
+paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__OpProto *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__op_proto__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__op_proto__free_unpacked( + PaddleMobile__Framework__Proto__OpProto *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__op_proto__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__var_type__tensor_desc__init( + PaddleMobile__Framework__Proto__VarType__TensorDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__TensorDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TENSOR_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init( + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__LoDTensorDesc + init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc + init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_ARRAY_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__reader_desc__init( + PaddleMobile__Framework__Proto__VarType__ReaderDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__ReaderDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__READER_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__channel_desc__init( + PaddleMobile__Framework__Proto__VarType__ChannelDesc *message) { + static const PaddleMobile__Framework__Proto__VarType__ChannelDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__CHANNEL_DESC__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__tuple__init( + PaddleMobile__Framework__Proto__VarType__Tuple *message) { + static const PaddleMobile__Framework__Proto__VarType__Tuple init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TUPLE__INIT; + *message = init_value; +} +void paddle_mobile__framework__proto__var_type__init( + PaddleMobile__Framework__Proto__VarType *message) { + static const PaddleMobile__Framework__Proto__VarType init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__var_type__get_packed_size( + const PaddleMobile__Framework__Proto__VarType *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__var_type__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} +PaddleMobile__Framework__Proto__VarType * +paddle_mobile__framework__proto__var_type__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__VarType *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__var_type__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__var_type__free_unpacked( + PaddleMobile__Framework__Proto__VarType *message, + ProtobufCAllocator *allocator) 
{ + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__var_type__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__var_desc__init( + PaddleMobile__Framework__Proto__VarDesc *message) { + static const PaddleMobile__Framework__Proto__VarDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__var_desc__get_packed_size( + const PaddleMobile__Framework__Proto__VarDesc *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__var_desc__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__VarDesc * +paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data) { + return (PaddleMobile__Framework__Proto__VarDesc *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__var_desc__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__var_desc__free_unpacked( + PaddleMobile__Framework__Proto__VarDesc *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__var_desc__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__block_desc__init( + PaddleMobile__Framework__Proto__BlockDesc *message) { + static const PaddleMobile__Framework__Proto__BlockDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__BLOCK_DESC__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__block_desc__get_packed_size( + const PaddleMobile__Framework__Proto__BlockDesc *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__block_desc__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__BlockDesc * +paddle_mobile__framework__proto__block_desc__unpack( + ProtobufCAllocator *allocator, size_t len, const uint8_t *data) { + return (PaddleMobile__Framework__Proto__BlockDesc *)protobuf_c_message_unpack( + &paddle_mobile__framework__proto__block_desc__descriptor, allocator, len, + data); +} +void paddle_mobile__framework__proto__block_desc__free_unpacked( + PaddleMobile__Framework__Proto__BlockDesc *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__block_desc__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +void paddle_mobile__framework__proto__program_desc__init( + PaddleMobile__Framework__Proto__ProgramDesc *message) { + static const PaddleMobile__Framework__Proto__ProgramDesc init_value = + PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT; + *message = init_value; +} +size_t paddle_mobile__framework__proto__program_desc__get_packed_size( + const PaddleMobile__Framework__Proto__ProgramDesc *message) { + assert(message->base.descriptor == + &paddle_mobile__framework__proto__program_desc__descriptor); + return protobuf_c_message_get_packed_size( + (const ProtobufCMessage *)(message)); +} + +PaddleMobile__Framework__Proto__ProgramDesc * +paddle_mobile__framework__proto__program_desc__unpack( + ProtobufCAllocator *allocator, size_t len, const uint8_t *data) { + return (PaddleMobile__Framework__Proto__ProgramDesc *) + 
protobuf_c_message_unpack( + &paddle_mobile__framework__proto__program_desc__descriptor, allocator, + len, data); +} +void paddle_mobile__framework__proto__program_desc__free_unpacked( + PaddleMobile__Framework__Proto__ProgramDesc *message, + ProtobufCAllocator *allocator) { + if (!message) return; + assert(message->base.descriptor == + &paddle_mobile__framework__proto__program_desc__descriptor); + protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); +} +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_desc__attr__field_descriptors[12] = { + { + "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, name), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, type), + &paddle_mobile__framework__proto__attr_type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "i", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_i), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, i), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "f", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_FLOAT, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_f), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, f), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "s", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, s), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "ints", 6, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_ints), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, ints), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "floats", 7, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_FLOAT, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_floats), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, floats), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "strings", 8, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_STRING, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_strings), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, strings), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "b", 10, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_b), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, b), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "bools", 11, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_bools), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, bools), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "block_idx", 12, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, + has_block_idx), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, block_idx), + NULL, NULL, 0, /* flags */ 
+ 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "l", 13, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT64, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, has_l), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, l), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name[] = { + 8, /* field[8] = b */ + 10, /* field[10] = block_idx */ + 9, /* field[9] = bools */ + 3, /* field[3] = f */ + 6, /* field[6] = floats */ + 2, /* field[2] = i */ + 5, /* field[5] = ints */ + 11, /* field[11] = l */ + 0, /* field[0] = name */ + 4, /* field[4] = s */ + 7, /* field[7] = strings */ + 1, /* field[1] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_desc__attr__number_ranges[2 + 1] = { + {1, 0}, {10, 8}, {0, 12}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__attr__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpDesc.Attr", + "Attr", + "PaddleMobile__Framework__Proto__OpDesc__Attr", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpDesc__Attr), + 12, + paddle_mobile__framework__proto__op_desc__attr__field_descriptors, + paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name, + 2, + paddle_mobile__framework__proto__op_desc__attr__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__op_desc__attr__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_desc__var__field_descriptors[2] = { + { + "parameter", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, parameter), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "arguments", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_STRING, + offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, n_arguments), + offsetof(PaddleMobile__Framework__Proto__OpDesc__Var, arguments), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_desc__var__field_indices_by_name[] = { + 1, /* field[1] = arguments */ + 0, /* field[0] = parameter */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_desc__var__number_ranges[1 + 1] = { + {1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__var__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpDesc.Var", + "Var", + "PaddleMobile__Framework__Proto__OpDesc__Var", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpDesc__Var), + 2, + paddle_mobile__framework__proto__op_desc__var__field_descriptors, + paddle_mobile__framework__proto__op_desc__var__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_desc__var__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__op_desc__var__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_desc__is_target__default_value = 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_desc__field_descriptors[5] = { + { + "inputs", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpDesc, n_inputs), + 
offsetof(PaddleMobile__Framework__Proto__OpDesc, inputs), + &paddle_mobile__framework__proto__op_desc__var__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "outputs", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpDesc, n_outputs), + offsetof(PaddleMobile__Framework__Proto__OpDesc, outputs), + &paddle_mobile__framework__proto__op_desc__var__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "type", 3, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpDesc, type), NULL, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "attrs", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpDesc, n_attrs), + offsetof(PaddleMobile__Framework__Proto__OpDesc, attrs), + &paddle_mobile__framework__proto__op_desc__attr__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "is_target", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpDesc, has_is_target), + offsetof(PaddleMobile__Framework__Proto__OpDesc, is_target), NULL, + &paddle_mobile__framework__proto__op_desc__is_target__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_desc__field_indices_by_name[] = { + 3, /* field[3] = attrs */ + 0, /* field[0] = inputs */ + 4, /* field[4] = is_target */ + 1, /* field[1] = outputs */ + 2, /* field[2] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_desc__number_ranges[1 + 1] = {{1, 0}, + {0, 5}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpDesc", + "OpDesc", + "PaddleMobile__Framework__Proto__OpDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpDesc), + 5, + paddle_mobile__framework__proto__op_desc__field_descriptors, + paddle_mobile__framework__proto__op_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_desc__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__op_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_proto__var__duplicable__default_value = + 0; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_proto__var__intermediate__default_value = + 0; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_proto__var__dispensable__default_value = + 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_proto__var__field_descriptors[5] = { + { + "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, name), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "comment", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, comment), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "duplicable", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, + has_duplicable), + 
offsetof(PaddleMobile__Framework__Proto__OpProto__Var, duplicable), + NULL, + &paddle_mobile__framework__proto__op_proto__var__duplicable__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "intermediate", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, + has_intermediate), + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, + intermediate), + NULL, + &paddle_mobile__framework__proto__op_proto__var__intermediate__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "dispensable", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, + has_dispensable), + offsetof(PaddleMobile__Framework__Proto__OpProto__Var, dispensable), + NULL, + &paddle_mobile__framework__proto__op_proto__var__dispensable__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_proto__var__field_indices_by_name[] = { + 1, /* field[1] = comment */ + 4, /* field[4] = dispensable */ + 2, /* field[2] = duplicable */ + 3, /* field[3] = intermediate */ + 0, /* field[0] = name */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_proto__var__number_ranges[1 + 1] = { + {1, 0}, {0, 5}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__var__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpProto.Var", + "Var", + "PaddleMobile__Framework__Proto__OpProto__Var", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpProto__Var), + 5, + paddle_mobile__framework__proto__op_proto__var__field_descriptors, + paddle_mobile__framework__proto__op_proto__var__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_proto__var__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__op_proto__var__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const protobuf_c_boolean + paddle_mobile__framework__proto__op_proto__attr__generated__default_value = + 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_proto__attr__field_descriptors[4] = { + { + "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, name), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "type", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, type), + &paddle_mobile__framework__proto__attr_type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "comment", 3, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, comment), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "generated", 4, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, + has_generated), + offsetof(PaddleMobile__Framework__Proto__OpProto__Attr, generated), + NULL, + &paddle_mobile__framework__proto__op_proto__attr__generated__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_proto__attr__field_indices_by_name[] = { + 
2, /* field[2] = comment */ + 3, /* field[3] = generated */ + 0, /* field[0] = name */ + 1, /* field[1] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_proto__attr__number_ranges[1 + 1] = { + {1, 0}, {0, 4}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__attr__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpProto.Attr", + "Attr", + "PaddleMobile__Framework__Proto__OpProto__Attr", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpProto__Attr), + 4, + paddle_mobile__framework__proto__op_proto__attr__field_descriptors, + paddle_mobile__framework__proto__op_proto__attr__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_proto__attr__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__op_proto__attr__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__op_proto__field_descriptors[5] = { + { + "type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto, type), NULL, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "inputs", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpProto, n_inputs), + offsetof(PaddleMobile__Framework__Proto__OpProto, inputs), + &paddle_mobile__framework__proto__op_proto__var__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "outputs", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpProto, n_outputs), + offsetof(PaddleMobile__Framework__Proto__OpProto, outputs), + &paddle_mobile__framework__proto__op_proto__var__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "attrs", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__OpProto, n_attrs), + offsetof(PaddleMobile__Framework__Proto__OpProto, attrs), + &paddle_mobile__framework__proto__op_proto__attr__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "comment", 5, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__OpProto, comment), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__op_proto__field_indices_by_name[] = { + 3, /* field[3] = attrs */ + 4, /* field[4] = comment */ + 1, /* field[1] = inputs */ + 2, /* field[2] = outputs */ + 0, /* field[0] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__op_proto__number_ranges[1 + 1] = {{1, 0}, + {0, 5}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.OpProto", + "OpProto", + "PaddleMobile__Framework__Proto__OpProto", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__OpProto), + 5, + paddle_mobile__framework__proto__op_proto__field_descriptors, + paddle_mobile__framework__proto__op_proto__field_indices_by_name, + 1, + paddle_mobile__framework__proto__op_proto__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__op_proto__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const 
ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__tensor_desc__field_descriptors + [2] = { + { + "data_type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc, + data_type), + &paddle_mobile__framework__proto__var_type__type__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "dims", 2, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT64, + offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc, + n_dims), + offsetof(PaddleMobile__Framework__Proto__VarType__TensorDesc, + dims), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__tensor_desc__field_indices_by_name + [] = { + 0, /* field[0] = data_type */ + 1, /* field[1] = dims */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__tensor_desc__number_ranges[1 + + 1] = { + {1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__tensor_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.TensorDesc", + "TensorDesc", + "PaddleMobile__Framework__Proto__VarType__TensorDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__TensorDesc), + 2, + paddle_mobile__framework__proto__var_type__tensor_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__tensor_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__tensor_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__tensor_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const int32_t + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__lod_level__default_value = + 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_descriptors + [2] = { + { + "tensor", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc, + tensor), + &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "lod_level", 2, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc, + has_lod_level), + offsetof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc, + lod_level), + NULL, + &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__lod_level__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_indices_by_name + [] = { + 1, /* field[1] = lod_level */ + 0, /* field[0] = tensor */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__number_ranges + [1 + 1] = {{1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.LoDTensorDesc", + "LoDTensorDesc", + "PaddleMobile__Framework__Proto__VarType__LoDTensorDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__LoDTensorDesc), + 2, + 
paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const int32_t + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__lod_level__default_value = + 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_descriptors + [2] = { + { + "tensor", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc, + tensor), + &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "lod_level", 2, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_INT32, + offsetof( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc, + has_lod_level), + offsetof( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc, + lod_level), + NULL, + &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__lod_level__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_indices_by_name + [] = { + 1, /* field[1] = lod_level */ + 0, /* field[0] = tensor */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__number_ranges + [1 + 1] = {{1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.LoDTensorArrayDesc", + "LoDTensorArrayDesc", + "PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc), + 2, + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__reader_desc__field_descriptors[1] = { + { + "lod_tensor", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__VarType__ReaderDesc, + n_lod_tensor), + offsetof(PaddleMobile__Framework__Proto__VarType__ReaderDesc, + lod_tensor), + &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__reader_desc__field_indices_by_name + [] = { + 0, /* field[0] = lod_tensor */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__reader_desc__number_ranges[1 + + 1] = { + {1, 0}, {0, 1}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__reader_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + 
"paddle_mobile.framework.proto.VarType.ReaderDesc", + "ReaderDesc", + "PaddleMobile__Framework__Proto__VarType__ReaderDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__ReaderDesc), + 1, + paddle_mobile__framework__proto__var_type__reader_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__reader_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__reader_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__reader_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__channel_desc__field_descriptors + [2] = { + { + "data_type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType__ChannelDesc, + data_type), + &paddle_mobile__framework__proto__var_type__type__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "capacity", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT64, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType__ChannelDesc, + capacity), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__channel_desc__field_indices_by_name + [] = { + 1, /* field[1] = capacity */ + 0, /* field[0] = data_type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__channel_desc__number_ranges[1 + + 1] = + {{1, 0}, {0, 2}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__channel_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.ChannelDesc", + "ChannelDesc", + "PaddleMobile__Framework__Proto__VarType__ChannelDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__ChannelDesc), + 2, + paddle_mobile__framework__proto__var_type__channel_desc__field_descriptors, + paddle_mobile__framework__proto__var_type__channel_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__channel_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__channel_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__tuple__field_descriptors[1] = { + { + "element_type", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_ENUM, + offsetof(PaddleMobile__Framework__Proto__VarType__Tuple, + n_element_type), + offsetof(PaddleMobile__Framework__Proto__VarType__Tuple, + element_type), + &paddle_mobile__framework__proto__var_type__type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__tuple__field_indices_by_name[] = + { + 0, /* field[0] = element_type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__tuple__number_ranges[1 + 1] = { + {1, 0}, {0, 1}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__tuple__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.Tuple", + "Tuple", + "PaddleMobile__Framework__Proto__VarType__Tuple", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType__Tuple), + 1, + 
paddle_mobile__framework__proto__var_type__tuple__field_descriptors, + paddle_mobile__framework__proto__var_type__tuple__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__tuple__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__var_type__tuple__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCEnumValue + paddle_mobile__framework__proto__var_type__type__enum_values_by_number[19] = + { + {"BOOL", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL", + 0}, + {"INT16", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16", + 1}, + {"INT32", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32", + 2}, + {"INT64", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64", + 3}, + {"FP16", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16", + 4}, + {"FP32", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32", + 5}, + {"FP64", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64", + 6}, + {"LOD_TENSOR", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR", 7}, + {"SELECTED_ROWS", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS", + 8}, + {"FEED_MINIBATCH", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH", + 9}, + {"FETCH_LIST", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST", 10}, + {"STEP_SCOPES", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES", + 11}, + {"LOD_RANK_TABLE", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE", + 12}, + {"LOD_TENSOR_ARRAY", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_" + "ARRAY", + 13}, + {"PLACE_LIST", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST", 14}, + {"READER", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER", 15}, + {"CHANNEL", + "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL", 16}, + {"RAW", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW", 17}, + {"TUPLE", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE", + 18}, +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__type__value_ranges[] = {{0, 0}, + {0, 19}}; +static const ProtobufCEnumValueIndex + paddle_mobile__framework__proto__var_type__type__enum_values_by_name[19] = { + {"BOOL", 0}, + {"CHANNEL", 16}, + {"FEED_MINIBATCH", 9}, + {"FETCH_LIST", 10}, + {"FP16", 4}, + {"FP32", 5}, + {"FP64", 6}, + {"INT16", 1}, + {"INT32", 2}, + {"INT64", 3}, + {"LOD_RANK_TABLE", 12}, + {"LOD_TENSOR", 7}, + {"LOD_TENSOR_ARRAY", 13}, + {"PLACE_LIST", 14}, + {"RAW", 17}, + {"READER", 15}, + {"SELECTED_ROWS", 8}, + {"STEP_SCOPES", 11}, + {"TUPLE", 18}, +}; +const ProtobufCEnumDescriptor + paddle_mobile__framework__proto__var_type__type__descriptor = { + PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType.Type", + "Type", + "PaddleMobile__Framework__Proto__VarType__Type", + "paddle_mobile.framework.proto", + 19, + paddle_mobile__framework__proto__var_type__type__enum_values_by_number, + 19, + paddle_mobile__framework__proto__var_type__type__enum_values_by_name, + 1, + paddle_mobile__framework__proto__var_type__type__value_ranges, + NULL, + NULL, + NULL, + NULL /* reserved[1234] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_type__field_descriptors[7] = { + { + "type", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_ENUM, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, type), + &paddle_mobile__framework__proto__var_type__type__descriptor, NULL, + 0, /* 
flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "selected_rows", 2, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_MESSAGE, 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, selected_rows), + &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "lod_tensor", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, lod_tensor), + &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "tensor_array", 4, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_MESSAGE, 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, tensor_array), + &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "reader", 5, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, reader), + &paddle_mobile__framework__proto__var_type__reader_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "channel", 6, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, channel), + &paddle_mobile__framework__proto__var_type__channel_desc__descriptor, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "tuple", 7, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarType, tuple), + &paddle_mobile__framework__proto__var_type__tuple__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_type__field_indices_by_name[] = { + 5, /* field[5] = channel */ + 2, /* field[2] = lod_tensor */ + 4, /* field[4] = reader */ + 1, /* field[1] = selected_rows */ + 3, /* field[3] = tensor_array */ + 6, /* field[6] = tuple */ + 0, /* field[0] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_type__number_ranges[1 + 1] = {{1, 0}, + {0, 7}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarType", + "VarType", + "PaddleMobile__Framework__Proto__VarType", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarType), + 7, + paddle_mobile__framework__proto__var_type__field_descriptors, + paddle_mobile__framework__proto__var_type__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_type__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__var_type__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const protobuf_c_boolean + paddle_mobile__framework__proto__var_desc__persistable__default_value = 0; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__var_desc__field_descriptors[3] = { + { + "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarDesc, name), NULL, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "type", 2, PROTOBUF_C_LABEL_REQUIRED, 
PROTOBUF_C_TYPE_MESSAGE, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__VarDesc, type), + &paddle_mobile__framework__proto__var_type__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "persistable", 3, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_BOOL, + offsetof(PaddleMobile__Framework__Proto__VarDesc, has_persistable), + offsetof(PaddleMobile__Framework__Proto__VarDesc, persistable), + NULL, + &paddle_mobile__framework__proto__var_desc__persistable__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__var_desc__field_indices_by_name[] = { + 0, /* field[0] = name */ + 2, /* field[2] = persistable */ + 1, /* field[1] = type */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__var_desc__number_ranges[1 + 1] = {{1, 0}, + {0, 3}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.VarDesc", + "VarDesc", + "PaddleMobile__Framework__Proto__VarDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__VarDesc), + 3, + paddle_mobile__framework__proto__var_desc__field_descriptors, + paddle_mobile__framework__proto__var_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__var_desc__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__var_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const int32_t + paddle_mobile__framework__proto__block_desc__forward_block_idx__default_value = + -1; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__block_desc__field_descriptors[5] = { + { + "idx", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT32, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__BlockDesc, idx), NULL, + NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "parent_idx", 2, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_INT32, + 0, /* quantifier_offset */ + offsetof(PaddleMobile__Framework__Proto__BlockDesc, parent_idx), + NULL, NULL, 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "vars", 3, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__BlockDesc, n_vars), + offsetof(PaddleMobile__Framework__Proto__BlockDesc, vars), + &paddle_mobile__framework__proto__var_desc__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "ops", 4, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__BlockDesc, n_ops), + offsetof(PaddleMobile__Framework__Proto__BlockDesc, ops), + &paddle_mobile__framework__proto__op_desc__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, + { + "forward_block_idx", 5, PROTOBUF_C_LABEL_OPTIONAL, + PROTOBUF_C_TYPE_INT32, + offsetof(PaddleMobile__Framework__Proto__BlockDesc, + has_forward_block_idx), + offsetof(PaddleMobile__Framework__Proto__BlockDesc, + forward_block_idx), + NULL, + &paddle_mobile__framework__proto__block_desc__forward_block_idx__default_value, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__block_desc__field_indices_by_name[] = { + 4, /* field[4] = forward_block_idx */ + 0, /* field[0] = idx */ + 3, /* field[3] = ops */ + 1, /* field[1] = parent_idx */ + 2, /* 
field[2] = vars */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__block_desc__number_ranges[1 + 1] = { + {1, 0}, {0, 5}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__block_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.BlockDesc", + "BlockDesc", + "PaddleMobile__Framework__Proto__BlockDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__BlockDesc), + 5, + paddle_mobile__framework__proto__block_desc__field_descriptors, + paddle_mobile__framework__proto__block_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__block_desc__number_ranges, + (ProtobufCMessageInit)paddle_mobile__framework__proto__block_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCFieldDescriptor + paddle_mobile__framework__proto__program_desc__field_descriptors[1] = { + { + "blocks", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, + offsetof(PaddleMobile__Framework__Proto__ProgramDesc, n_blocks), + offsetof(PaddleMobile__Framework__Proto__ProgramDesc, blocks), + &paddle_mobile__framework__proto__block_desc__descriptor, NULL, + 0, /* flags */ + 0, NULL, NULL /* reserved1,reserved2, etc */ + }, +}; +static const unsigned + paddle_mobile__framework__proto__program_desc__field_indices_by_name[] = { + 0, /* field[0] = blocks */ +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__program_desc__number_ranges[1 + 1] = { + {1, 0}, {0, 1}}; +const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__program_desc__descriptor = { + PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, + "paddle_mobile.framework.proto.ProgramDesc", + "ProgramDesc", + "PaddleMobile__Framework__Proto__ProgramDesc", + "paddle_mobile.framework.proto", + sizeof(PaddleMobile__Framework__Proto__ProgramDesc), + 1, + paddle_mobile__framework__proto__program_desc__field_descriptors, + paddle_mobile__framework__proto__program_desc__field_indices_by_name, + 1, + paddle_mobile__framework__proto__program_desc__number_ranges, + (ProtobufCMessageInit) + paddle_mobile__framework__proto__program_desc__init, + NULL, + NULL, + NULL /* reserved[123] */ +}; +static const ProtobufCEnumValue + paddle_mobile__framework__proto__attr_type__enum_values_by_number[10] = { + {"INT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT", 0}, + {"FLOAT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT", 1}, + {"STRING", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING", 2}, + {"INTS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS", 3}, + {"FLOATS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS", 4}, + {"STRINGS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS", 5}, + {"BOOLEAN", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN", 6}, + {"BOOLEANS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS", 7}, + {"BLOCK", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK", 8}, + {"LONG", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG", 9}, +}; +static const ProtobufCIntRange + paddle_mobile__framework__proto__attr_type__value_ranges[] = {{0, 0}, + {0, 10}}; +static const ProtobufCEnumValueIndex + paddle_mobile__framework__proto__attr_type__enum_values_by_name[10] = { + {"BLOCK", 8}, {"BOOLEAN", 6}, {"BOOLEANS", 7}, {"FLOAT", 1}, + {"FLOATS", 4}, {"INT", 0}, {"INTS", 3}, {"LONG", 9}, + {"STRING", 2}, {"STRINGS", 5}, +}; +const ProtobufCEnumDescriptor + paddle_mobile__framework__proto__attr_type__descriptor = { + PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC, + 
"paddle_mobile.framework.proto.AttrType", + "AttrType", + "PaddleMobile__Framework__Proto__AttrType", + "paddle_mobile.framework.proto", + 10, + paddle_mobile__framework__proto__attr_type__enum_values_by_number, + 10, + paddle_mobile__framework__proto__attr_type__enum_values_by_name, + 1, + paddle_mobile__framework__proto__attr_type__value_ranges, + NULL, + NULL, + NULL, + NULL /* reserved[1234] */ +}; diff --git a/tools/quantification/src/framework.pb-c.h b/tools/quantification/src/framework.pb-c.h new file mode 100644 index 0000000000000000000000000000000000000000..3d63bad76ad188d02986971bd911d8f30cf0af81 --- /dev/null +++ b/tools/quantification/src/framework.pb-c.h @@ -0,0 +1,579 @@ +/* Generated by the protocol buffer compiler. DO NOT EDIT! */ +/* Generated from: framework.proto */ + +#ifndef PROTOBUF_C_framework_2eproto__INCLUDED +#define PROTOBUF_C_framework_2eproto__INCLUDED + +#include "protobuf-c.h" + +PROTOBUF_C__BEGIN_DECLS + +#if PROTOBUF_C_VERSION_NUMBER < 1000000 +# error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers. +#elif 1003000 < PROTOBUF_C_MIN_COMPILER_VERSION +# error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c. +#endif + +typedef struct _PaddleMobile__Framework__Proto__OpDesc + PaddleMobile__Framework__Proto__OpDesc; +typedef struct _PaddleMobile__Framework__Proto__OpDesc__Attr + PaddleMobile__Framework__Proto__OpDesc__Attr; +typedef struct _PaddleMobile__Framework__Proto__OpDesc__Var + PaddleMobile__Framework__Proto__OpDesc__Var; +typedef struct _PaddleMobile__Framework__Proto__OpProto + PaddleMobile__Framework__Proto__OpProto; +typedef struct _PaddleMobile__Framework__Proto__OpProto__Var + PaddleMobile__Framework__Proto__OpProto__Var; +typedef struct _PaddleMobile__Framework__Proto__OpProto__Attr + PaddleMobile__Framework__Proto__OpProto__Attr; +typedef struct _PaddleMobile__Framework__Proto__VarType + PaddleMobile__Framework__Proto__VarType; +typedef struct _PaddleMobile__Framework__Proto__VarType__TensorDesc + PaddleMobile__Framework__Proto__VarType__TensorDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__LoDTensorDesc + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__ReaderDesc + PaddleMobile__Framework__Proto__VarType__ReaderDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__ChannelDesc + PaddleMobile__Framework__Proto__VarType__ChannelDesc; +typedef struct _PaddleMobile__Framework__Proto__VarType__Tuple + PaddleMobile__Framework__Proto__VarType__Tuple; +typedef struct _PaddleMobile__Framework__Proto__VarDesc + PaddleMobile__Framework__Proto__VarDesc; +typedef struct _PaddleMobile__Framework__Proto__BlockDesc + PaddleMobile__Framework__Proto__BlockDesc; +typedef struct _PaddleMobile__Framework__Proto__ProgramDesc + PaddleMobile__Framework__Proto__ProgramDesc; + +/* --- enums --- */ + +typedef enum _PaddleMobile__Framework__Proto__VarType__Type { + /* + * Pod Types + */ + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL = 0, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16 = 1, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32 = 2, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64 = 3, + 
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6, + /* + * Other types that may need additional descriptions + */ + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR = 7, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SELECTED_ROWS = 8, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FEED_MINIBATCH = 9, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FETCH_LIST = 10, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__STEP_SCOPES = 11, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_RANK_TABLE = 12, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__LOD_TENSOR_ARRAY = 13, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__PLACE_LIST = 14, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__READER = 15, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__CHANNEL = 16, + /* + * Any runtime decided variable type is raw + * raw variables should manage their own allocations + * in operators like nccl_op + */ + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW = 17, + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE = + 18 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE( + PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE) +} PaddleMobile__Framework__Proto__VarType__Type; +typedef enum _PaddleMobile__Framework__Proto__AttrType { + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT = 0, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT = 1, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING = 2, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INTS = 3, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOATS = 4, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRINGS = 5, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN = 6, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8, + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = + 9 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE( + PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE) +} PaddleMobile__Framework__Proto__AttrType; + +/* --- messages --- */ + +struct _PaddleMobile__Framework__Proto__OpDesc__Attr { + ProtobufCMessage base; + char *name; + PaddleMobile__Framework__Proto__AttrType type; + protobuf_c_boolean has_i; + int32_t i; + protobuf_c_boolean has_f; + float f; + char *s; + size_t n_ints; + int32_t *ints; + size_t n_floats; + float *floats; + size_t n_strings; + char **strings; + protobuf_c_boolean has_b; + protobuf_c_boolean b; + size_t n_bools; + protobuf_c_boolean *bools; + protobuf_c_boolean has_block_idx; + int32_t block_idx; + protobuf_c_boolean has_l; + int64_t l; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_desc__attr__descriptor) \ + , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, 0, 0, 0, 0, NULL, \ + 0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0 \ + } + +struct _PaddleMobile__Framework__Proto__OpDesc__Var { + ProtobufCMessage base; + char *parameter; + size_t n_arguments; + char **arguments; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__VAR__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_desc__var__descriptor) \ + , NULL, 0, NULL \ + } + +/* + * OpDesc describes an instance of a C++ framework::OperatorBase + * derived class type. 
+ */ +struct _PaddleMobile__Framework__Proto__OpDesc { + ProtobufCMessage base; + char *type; + size_t n_inputs; + PaddleMobile__Framework__Proto__OpDesc__Var **inputs; + size_t n_outputs; + PaddleMobile__Framework__Proto__OpDesc__Var **outputs; + size_t n_attrs; + PaddleMobile__Framework__Proto__OpDesc__Attr **attrs; + protobuf_c_boolean has_is_target; + protobuf_c_boolean is_target; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_desc__descriptor) \ + , NULL, 0, NULL, 0, NULL, 0, NULL, 0, 0 \ + } + +/* + * VarProto describes the C++ type framework::Variable. + */ +struct _PaddleMobile__Framework__Proto__OpProto__Var { + ProtobufCMessage base; + char *name; + char *comment; + protobuf_c_boolean has_duplicable; + protobuf_c_boolean duplicable; + protobuf_c_boolean has_intermediate; + protobuf_c_boolean intermediate; + protobuf_c_boolean has_dispensable; + protobuf_c_boolean dispensable; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_proto__var__descriptor) \ + , NULL, NULL, 0, 0, 0, 0, 0, 0 \ + } + +/* + * AttrProto describes the C++ type Attribute. + */ +struct _PaddleMobile__Framework__Proto__OpProto__Attr { + ProtobufCMessage base; + char *name; + PaddleMobile__Framework__Proto__AttrType type; + char *comment; + /* + * If that attribute is generated, it means the Paddle third + * language binding has responsibility to fill that + * attribute. End-User should not set that attribute. + */ + protobuf_c_boolean has_generated; + protobuf_c_boolean generated; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__ATTR__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_proto__attr__descriptor) \ + , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, NULL, 0, 0 \ + } + +/* + * OpProto describes a C++ framework::OperatorBase derived class. + */ +struct _PaddleMobile__Framework__Proto__OpProto { + ProtobufCMessage base; + char *type; + size_t n_inputs; + PaddleMobile__Framework__Proto__OpProto__Var **inputs; + size_t n_outputs; + PaddleMobile__Framework__Proto__OpProto__Var **outputs; + size_t n_attrs; + PaddleMobile__Framework__Proto__OpProto__Attr **attrs; + char *comment; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__op_proto__descriptor) \ + , NULL, 0, NULL, 0, NULL, 0, NULL, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarType__TensorDesc { + ProtobufCMessage base; + /* + * Should only be PODType. 
Is enforced in C++ + */ + PaddleMobile__Framework__Proto__VarType__Type data_type; + /* + * [UNK, 640, 480] is saved as [-1, 640, 480] + */ + size_t n_dims; + int64_t *dims; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TENSOR_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__tensor_desc__descriptor) \ + , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, 0, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarType__LoDTensorDesc { + ProtobufCMessage base; + PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor; + protobuf_c_boolean has_lod_level; + int32_t lod_level; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor) \ + , NULL, 0, 0 \ + } + +struct _PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc { + ProtobufCMessage base; + PaddleMobile__Framework__Proto__VarType__TensorDesc *tensor; + protobuf_c_boolean has_lod_level; + int32_t lod_level; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__LO_DTENSOR_ARRAY_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor) \ + , NULL, 0, 0 \ + } + +struct _PaddleMobile__Framework__Proto__VarType__ReaderDesc { + ProtobufCMessage base; + size_t n_lod_tensor; + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc **lod_tensor; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__READER_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__reader_desc__descriptor) \ + , 0, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarType__ChannelDesc { + ProtobufCMessage base; + PaddleMobile__Framework__Proto__VarType__Type data_type; + int64_t capacity; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__CHANNEL_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__channel_desc__descriptor) \ + , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, 0 \ + } + +struct _PaddleMobile__Framework__Proto__VarType__Tuple { + ProtobufCMessage base; + size_t n_element_type; + PaddleMobile__Framework__Proto__VarType__Type *element_type; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TUPLE__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__tuple__descriptor) \ + , 0, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarType { + ProtobufCMessage base; + PaddleMobile__Framework__Proto__VarType__Type type; + PaddleMobile__Framework__Proto__VarType__TensorDesc *selected_rows; + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *lod_tensor; + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *tensor_array; + PaddleMobile__Framework__Proto__VarType__ReaderDesc *reader; + PaddleMobile__Framework__Proto__VarType__ChannelDesc *channel; + PaddleMobile__Framework__Proto__VarType__Tuple *tuple; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_type__descriptor) \ + , PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL, NULL, NULL, NULL, \ + NULL, NULL, NULL \ + } + +struct _PaddleMobile__Framework__Proto__VarDesc { + ProtobufCMessage base; + char *name; + PaddleMobile__Framework__Proto__VarType *type; + protobuf_c_boolean has_persistable; + protobuf_c_boolean persistable; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT \ + { \ + 
PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__var_desc__descriptor) \ + , NULL, NULL, 0, 0 \ + } + +struct _PaddleMobile__Framework__Proto__BlockDesc { + ProtobufCMessage base; + int32_t idx; + int32_t parent_idx; + size_t n_vars; + PaddleMobile__Framework__Proto__VarDesc **vars; + size_t n_ops; + PaddleMobile__Framework__Proto__OpDesc **ops; + protobuf_c_boolean has_forward_block_idx; + int32_t forward_block_idx; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__BLOCK_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__block_desc__descriptor) \ + , 0, 0, 0, NULL, 0, NULL, 0, -1 \ + } + +/* + * Please refer to + * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md + * for more details. + * TODO(panyx0718): A model can have multiple programs. Need a + * way to distinguish them. Maybe ID or name? + */ +struct _PaddleMobile__Framework__Proto__ProgramDesc { + ProtobufCMessage base; + size_t n_blocks; + PaddleMobile__Framework__Proto__BlockDesc **blocks; +}; +#define PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT \ + { \ + PROTOBUF_C_MESSAGE_INIT( \ + &paddle_mobile__framework__proto__program_desc__descriptor) \ + , 0, NULL \ + } + +/* PaddleMobile__Framework__Proto__OpDesc__Attr methods */ +void paddle_mobile__framework__proto__op_desc__attr__init( + PaddleMobile__Framework__Proto__OpDesc__Attr *message); +/* PaddleMobile__Framework__Proto__OpDesc__Var methods */ +void paddle_mobile__framework__proto__op_desc__var__init( + PaddleMobile__Framework__Proto__OpDesc__Var *message); +/* PaddleMobile__Framework__Proto__OpDesc methods */ +void paddle_mobile__framework__proto__op_desc__init( + PaddleMobile__Framework__Proto__OpDesc *message); + +size_t paddle_mobile__framework__proto__op_desc__get_packed_size( + const PaddleMobile__Framework__Proto__OpDesc *message); + +PaddleMobile__Framework__Proto__OpDesc * +paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__op_desc__free_unpacked( + PaddleMobile__Framework__Proto__OpDesc *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__OpProto__Var methods */ +void paddle_mobile__framework__proto__op_proto__var__init( + PaddleMobile__Framework__Proto__OpProto__Var *message); +/* PaddleMobile__Framework__Proto__OpProto__Attr methods */ +void paddle_mobile__framework__proto__op_proto__attr__init( + PaddleMobile__Framework__Proto__OpProto__Attr *message); +/* PaddleMobile__Framework__Proto__OpProto methods */ +void paddle_mobile__framework__proto__op_proto__init( + PaddleMobile__Framework__Proto__OpProto *message); +size_t paddle_mobile__framework__proto__op_proto__get_packed_size( + const PaddleMobile__Framework__Proto__OpProto *message); +PaddleMobile__Framework__Proto__OpProto * +paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__op_proto__free_unpacked( + PaddleMobile__Framework__Proto__OpProto *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__VarType__TensorDesc methods */ +void paddle_mobile__framework__proto__var_type__tensor_desc__init( + PaddleMobile__Framework__Proto__VarType__TensorDesc *message); +/* PaddleMobile__Framework__Proto__VarType__LoDTensorDesc methods */ +void paddle_mobile__framework__proto__var_type__lo_dtensor_desc__init( + PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message); +/* 
PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc methods */ +void paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__init( + PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message); +/* PaddleMobile__Framework__Proto__VarType__ReaderDesc methods */ +void paddle_mobile__framework__proto__var_type__reader_desc__init( + PaddleMobile__Framework__Proto__VarType__ReaderDesc *message); +/* PaddleMobile__Framework__Proto__VarType__ChannelDesc methods */ +void paddle_mobile__framework__proto__var_type__channel_desc__init( + PaddleMobile__Framework__Proto__VarType__ChannelDesc *message); +/* PaddleMobile__Framework__Proto__VarType__Tuple methods */ +void paddle_mobile__framework__proto__var_type__tuple__init( + PaddleMobile__Framework__Proto__VarType__Tuple *message); +/* PaddleMobile__Framework__Proto__VarType methods */ +void paddle_mobile__framework__proto__var_type__init( + PaddleMobile__Framework__Proto__VarType *message); +size_t paddle_mobile__framework__proto__var_type__get_packed_size( + const PaddleMobile__Framework__Proto__VarType *message); +PaddleMobile__Framework__Proto__VarType * +paddle_mobile__framework__proto__var_type__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__var_type__free_unpacked( + PaddleMobile__Framework__Proto__VarType *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__VarDesc methods */ +void paddle_mobile__framework__proto__var_desc__init( + PaddleMobile__Framework__Proto__VarDesc *message); +size_t paddle_mobile__framework__proto__var_desc__get_packed_size( + const PaddleMobile__Framework__Proto__VarDesc *message); +PaddleMobile__Framework__Proto__VarDesc * +paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +void paddle_mobile__framework__proto__var_desc__free_unpacked( + PaddleMobile__Framework__Proto__VarDesc *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__BlockDesc methods */ +void paddle_mobile__framework__proto__block_desc__init( + PaddleMobile__Framework__Proto__BlockDesc *message); +size_t paddle_mobile__framework__proto__block_desc__get_packed_size( + const PaddleMobile__Framework__Proto__BlockDesc *message); +PaddleMobile__Framework__Proto__BlockDesc * +paddle_mobile__framework__proto__block_desc__unpack( + ProtobufCAllocator *allocator, size_t len, const uint8_t *data); +void paddle_mobile__framework__proto__block_desc__free_unpacked( + PaddleMobile__Framework__Proto__BlockDesc *message, + ProtobufCAllocator *allocator); +/* PaddleMobile__Framework__Proto__ProgramDesc methods */ +void paddle_mobile__framework__proto__program_desc__init( + PaddleMobile__Framework__Proto__ProgramDesc *message); +size_t paddle_mobile__framework__proto__program_desc__get_packed_size( + const PaddleMobile__Framework__Proto__ProgramDesc *message); +PaddleMobile__Framework__Proto__ProgramDesc * +paddle_mobile__framework__proto__program_desc__unpack( + ProtobufCAllocator *allocator, size_t len, const uint8_t *data); +void paddle_mobile__framework__proto__program_desc__free_unpacked( + PaddleMobile__Framework__Proto__ProgramDesc *message, + ProtobufCAllocator *allocator); +/* --- per-message closures --- */ + +typedef void (*PaddleMobile__Framework__Proto__OpDesc__Attr_Closure)( + const PaddleMobile__Framework__Proto__OpDesc__Attr *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpDesc__Var_Closure)( + const 
PaddleMobile__Framework__Proto__OpDesc__Var *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpDesc_Closure)( + const PaddleMobile__Framework__Proto__OpDesc *message, void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpProto__Var_Closure)( + const PaddleMobile__Framework__Proto__OpProto__Var *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpProto__Attr_Closure)( + const PaddleMobile__Framework__Proto__OpProto__Attr *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__OpProto_Closure)( + const PaddleMobile__Framework__Proto__OpProto *message, void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__TensorDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__TensorDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__LoDTensorDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__LoDTensorDesc *message, + void *closure_data); +typedef void ( + *PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__LoDTensorArrayDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__ReaderDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__ReaderDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__ChannelDesc_Closure)( + const PaddleMobile__Framework__Proto__VarType__ChannelDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType__Tuple_Closure)( + const PaddleMobile__Framework__Proto__VarType__Tuple *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarType_Closure)( + const PaddleMobile__Framework__Proto__VarType *message, void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__VarDesc_Closure)( + const PaddleMobile__Framework__Proto__VarDesc *message, void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__BlockDesc_Closure)( + const PaddleMobile__Framework__Proto__BlockDesc *message, + void *closure_data); +typedef void (*PaddleMobile__Framework__Proto__ProgramDesc_Closure)( + const PaddleMobile__Framework__Proto__ProgramDesc *message, + void *closure_data); + +/* --- services --- */ + +/* --- descriptors --- */ + +extern const ProtobufCEnumDescriptor + paddle_mobile__framework__proto__attr_type__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__attr__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_desc__var__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__var__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__op_proto__attr__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__tensor_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_desc__descriptor; +extern const ProtobufCMessageDescriptor + paddle_mobile__framework__proto__var_type__lo_dtensor_array_desc__descriptor; +extern const 
ProtobufCMessageDescriptor
+    paddle_mobile__framework__proto__var_type__reader_desc__descriptor;
+extern const ProtobufCMessageDescriptor
+    paddle_mobile__framework__proto__var_type__channel_desc__descriptor;
+extern const ProtobufCMessageDescriptor
+    paddle_mobile__framework__proto__var_type__tuple__descriptor;
+extern const ProtobufCEnumDescriptor
+    paddle_mobile__framework__proto__var_type__type__descriptor;
+extern const ProtobufCMessageDescriptor
+    paddle_mobile__framework__proto__var_desc__descriptor;
+extern const ProtobufCMessageDescriptor
+    paddle_mobile__framework__proto__block_desc__descriptor;
+extern const ProtobufCMessageDescriptor
+    paddle_mobile__framework__proto__program_desc__descriptor;
+
+PROTOBUF_C__END_DECLS
+
+#endif /* PROTOBUF_C_framework_2eproto__INCLUDED */
diff --git a/tools/quantification/src/program_desc.cpp b/tools/quantification/src/program_desc.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f9984832ada5061c7691aeb7fadba86cb5b8c0c
--- /dev/null
+++ b/tools/quantification/src/program_desc.cpp
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+//
+// Created by 谢柏渊 on 2018/7/25.
+//
+
+#include "src/program_desc.h"
+#include <vector>
+
+ProgramDesc::ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc) {
+  for (int i = 0; i < desc->n_blocks; ++i) {
+    blocks_.emplace_back(std::make_shared<BlockDesc>(desc->blocks[i]));
+  }
+}
+
+const std::vector<std::shared_ptr<BlockDesc>> ProgramDesc::Blocks() {
+  return blocks_;
+}
diff --git a/tools/quantification/src/program_desc.h b/tools/quantification/src/program_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..60a0f757b0c907165d7639a41e35a407ef083b59
--- /dev/null
+++ b/tools/quantification/src/program_desc.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+//
+// Created by 谢柏渊 on 2018/7/25.
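+//
+// Usage sketch (illustrative only; `buf` and `size` are assumed to hold a
+// serialized ProgramDesc):
+//
+//   PaddleMobile__Framework__Proto__ProgramDesc *proto =
+//       paddle_mobile__framework__proto__program_desc__unpack(NULL, size,
+//                                                             buf);
+//   ProgramDesc program(proto);
+//   for (const auto &block : program.Blocks()) {
+//     // walk the vars and ops of each block
+//   }
+//   paddle_mobile__framework__proto__program_desc__free_unpacked(proto, NULL);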
+//
+
+#ifndef TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_
+#define TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_
+
+#include <memory>
+#include <vector>
+#include "src/block_desc_local.h"
+#include "src/framework.pb-c.h"
+
+class ProgramDesc {
+ public:
+  //  friend class Node;
+  //
+  //  friend class ProgramOptimize;
+
+  explicit ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc);
+
+  const std::vector<std::shared_ptr<BlockDesc>> Blocks();
+
+ private:
+  std::vector<std::shared_ptr<BlockDesc>> blocks_;
+};
+
+#endif  // TOOLS_QUANTIFICATION_SRC_PROGRAM_DESC_H_
diff --git a/tools/quantification/src/protobuf-c.c b/tools/quantification/src/protobuf-c.c
new file mode 100644
index 0000000000000000000000000000000000000000..1092e3f78b02a343d8c8965ea7b2d777a6fac9ae
--- /dev/null
+++ b/tools/quantification/src/protobuf-c.c
@@ -0,0 +1,2098 @@
+/*
+ * Copyright (c) 2008-2015, Dave Benson and the protobuf-c authors.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*! \file
+ * Support library for `protoc-c` generated code.
+ *
+ * This file implements the public API used by the code generated
+ * by `protoc-c`.
+ *
+ * \authors Dave Benson and the protobuf-c authors
+ *
+ * \copyright 2008-2014. Licensed under the terms of the [BSD-2-Clause] license.
+ */
+
+/**
+ * \todo 64-BIT OPTIMIZATION: certain implementations use 32-bit math
+ * even on 64-bit platforms (uint64_size, uint64_pack, parse_uint64).
+ *
+ * \todo Use size_t consistently.
+ */
+
+#include <stdlib.h> /* for malloc, free */
+#include <string.h> /* for strcmp, strlen, memcpy, memmove, memset */
+
+#include "protobuf-c.h"
+
+#define TRUE 1
+#define FALSE 0
+
+#define PROTOBUF_C__ASSERT_NOT_REACHED() assert(0)
+
+/* Workaround for Microsoft compilers. */
+#ifdef _MSC_VER
+#define inline __inline
+#endif
+
+/**
+ * \defgroup internal Internal functions and macros
+ *
+ * These are not exported by the library but are useful to developers working
+ * on `libprotobuf-c` itself.
+ */
+
+/**
+ * \defgroup macros Utility macros for manipulating structures
+ *
+ * Macros and constants used to manipulate the base "classes" generated by
+ * `protobuf-c`. They also define limits and check correctness.
+ *
+ * \ingroup internal
+ * @{
+ */
+
+/** The maximum length of a 64-bit integer in varint encoding.
*/ +#define MAX_UINT64_ENCODED_SIZE 10 + +#ifndef PROTOBUF_C_UNPACK_ERROR +#define PROTOBUF_C_UNPACK_ERROR(...) +#endif + +const char protobuf_c_empty_string[] = ""; + +/** + * Internal `ProtobufCMessage` manipulation macro. + * + * Base macro for manipulating a `ProtobufCMessage`. Used by STRUCT_MEMBER() and + * STRUCT_MEMBER_PTR(). + */ +#define STRUCT_MEMBER_P(struct_p, struct_offset) \ + ((void *)((uint8_t *)(struct_p) + (struct_offset))) + +/** + * Return field in a `ProtobufCMessage` based on offset. + * + * Take a pointer to a `ProtobufCMessage` and find the field at the offset. + * Cast it to the passed type. + */ +#define STRUCT_MEMBER(member_type, struct_p, struct_offset) \ + (*(member_type *)STRUCT_MEMBER_P((struct_p), (struct_offset))) + +/** + * Return field in a `ProtobufCMessage` based on offset. + * + * Take a pointer to a `ProtobufCMessage` and find the field at the offset. Cast + * it to a pointer to the passed type. + */ +#define STRUCT_MEMBER_PTR(member_type, struct_p, struct_offset) \ + ((member_type *)STRUCT_MEMBER_P((struct_p), (struct_offset))) + +/* Assertions for magic numbers. */ + +#define ASSERT_IS_ENUM_DESCRIPTOR(desc) \ + assert((desc)->magic == PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC) + +#define ASSERT_IS_MESSAGE_DESCRIPTOR(desc) \ + assert((desc)->magic == PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC) + +#define ASSERT_IS_MESSAGE(message) \ + ASSERT_IS_MESSAGE_DESCRIPTOR((message)->descriptor) + +#define ASSERT_IS_SERVICE_DESCRIPTOR(desc) \ + assert((desc)->magic == PROTOBUF_C__SERVICE_DESCRIPTOR_MAGIC) + +/**@}*/ + +/* --- version --- */ + +const char *protobuf_c_version(void) { return PROTOBUF_C_VERSION; } + +uint32_t protobuf_c_version_number(void) { return PROTOBUF_C_VERSION_NUMBER; } + +/* --- allocator --- */ + +static void *system_alloc(void *allocator_data, size_t size) { + return malloc(size); +} + +static void system_free(void *allocator_data, void *data) { free(data); } + +static inline void *do_alloc(ProtobufCAllocator *allocator, size_t size) { + return allocator->alloc(allocator->allocator_data, size); +} + +static inline void do_free(ProtobufCAllocator *allocator, void *data) { + if (data != NULL) allocator->free(allocator->allocator_data, data); +} + +/* + * This allocator uses the system's malloc() and free(). It is the default + * allocator used if NULL is passed as the ProtobufCAllocator to an exported + * function. + */ +static ProtobufCAllocator protobuf_c__allocator = { + .alloc = &system_alloc, + .free = &system_free, + .allocator_data = NULL, +}; + +/* === buffer-simple === */ + +void protobuf_c_buffer_simple_append(ProtobufCBuffer *buffer, size_t len, + const uint8_t *data) { + ProtobufCBufferSimple *simp = (ProtobufCBufferSimple *)buffer; + size_t new_len = simp->len + len; + + if (new_len > simp->alloced) { + ProtobufCAllocator *allocator = simp->allocator; + size_t new_alloced = simp->alloced * 2; + uint8_t *new_data; + + if (allocator == NULL) allocator = &protobuf_c__allocator; + while (new_alloced < new_len) new_alloced += new_alloced; + new_data = do_alloc(allocator, new_alloced); + if (!new_data) return; + memcpy(new_data, simp->data, simp->len); + if (simp->must_free_data) + do_free(allocator, simp->data); + else + simp->must_free_data = TRUE; + simp->data = new_data; + simp->alloced = new_alloced; + } + memcpy(simp->data + simp->len, data, len); + simp->len = new_len; +} + +/** + * \defgroup packedsz protobuf_c_message_get_packed_size() implementation + * + * Routines mainly used by protobuf_c_message_get_packed_size(). 
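+ *
+ * As a worked example (illustrative, using the helpers below): a required
+ * uint32 field with id 1 and value 300 occupies get_tag_size(1) +
+ * uint32_size(300) == 1 + 2 == 3 bytes on the wire.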
+ * + * \ingroup internal + * @{ + */ + +/** + * Return the number of bytes required to store the tag for the field. Includes + * 3 bits for the wire-type, and a single bit that denotes the end-of-tag. + * + * \param number + * Field tag to encode. + * \return + * Number of bytes required. + */ +static inline size_t get_tag_size(uint32_t number) { + if (number < (1UL << 4)) { + return 1; + } else if (number < (1UL << 11)) { + return 2; + } else if (number < (1UL << 18)) { + return 3; + } else if (number < (1UL << 25)) { + return 4; + } else { + return 5; + } +} + +/** + * Return the number of bytes required to store a variable-length unsigned + * 32-bit integer in base-128 varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. + */ +static inline size_t uint32_size(uint32_t v) { + if (v < (1UL << 7)) { + return 1; + } else if (v < (1UL << 14)) { + return 2; + } else if (v < (1UL << 21)) { + return 3; + } else if (v < (1UL << 28)) { + return 4; + } else { + return 5; + } +} + +/** + * Return the number of bytes required to store a variable-length signed 32-bit + * integer in base-128 varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. + */ +static inline size_t int32_size(int32_t v) { + if (v < 0) { + return 10; + } else if (v < (1L << 7)) { + return 1; + } else if (v < (1L << 14)) { + return 2; + } else if (v < (1L << 21)) { + return 3; + } else if (v < (1L << 28)) { + return 4; + } else { + return 5; + } +} + +/** + * Return the ZigZag-encoded 32-bit unsigned integer form of a 32-bit signed + * integer. + * + * \param v + * Value to encode. + * \return + * ZigZag encoded integer. + */ +static inline uint32_t zigzag32(int32_t v) { + if (v < 0) + return (-(uint32_t)v) * 2 - 1; + else + return (uint32_t)(v)*2; +} + +/** + * Return the number of bytes required to store a signed 32-bit integer, + * converted to an unsigned 32-bit integer with ZigZag encoding, using base-128 + * varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. + */ +static inline size_t sint32_size(int32_t v) { return uint32_size(zigzag32(v)); } + +/** + * Return the number of bytes required to store a 64-bit unsigned integer in + * base-128 varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. + */ +static inline size_t uint64_size(uint64_t v) { + uint32_t upper_v = (uint32_t)(v >> 32); + + if (upper_v == 0) { + return uint32_size((uint32_t)v); + } else if (upper_v < (1UL << 3)) { + return 5; + } else if (upper_v < (1UL << 10)) { + return 6; + } else if (upper_v < (1UL << 17)) { + return 7; + } else if (upper_v < (1UL << 24)) { + return 8; + } else if (upper_v < (1UL << 31)) { + return 9; + } else { + return 10; + } +} + +/** + * Return the ZigZag-encoded 64-bit unsigned integer form of a 64-bit signed + * integer. + * + * \param v + * Value to encode. + * \return + * ZigZag encoded integer. + */ +static inline uint64_t zigzag64(int64_t v) { + if (v < 0) + return (-(uint64_t)v) * 2 - 1; + else + return (uint64_t)(v)*2; +} + +/** + * Return the number of bytes required to store a signed 64-bit integer, + * converted to an unsigned 64-bit integer with ZigZag encoding, using base-128 + * varint encoding. + * + * \param v + * Value to encode. + * \return + * Number of bytes required. 
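+ *
+ * Illustrative values, following the helpers above: zigzag64(-1) == 1, so
+ * sint64_size(-1) == 1, whereas int32_size(-1) == 10 because negative
+ * values are sign-extended to a full ten-byte varint.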
+ */ +static inline size_t sint64_size(int64_t v) { return uint64_size(zigzag64(v)); } + +/** + * Calculate the serialized size of a single required message field, including + * the space needed by the preceding tag. + * + * \param field + * Field descriptor for member. + * \param member + * Field to encode. + * \return + * Number of bytes required. + */ +static size_t required_field_get_packed_size( + const ProtobufCFieldDescriptor *field, const void *member) { + size_t rv = get_tag_size(field->id); + + switch (field->type) { + case PROTOBUF_C_TYPE_SINT32: + return rv + sint32_size(*(const int32_t *)member); + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + return rv + int32_size(*(const int32_t *)member); + case PROTOBUF_C_TYPE_UINT32: + return rv + uint32_size(*(const uint32_t *)member); + case PROTOBUF_C_TYPE_SINT64: + return rv + sint64_size(*(const int64_t *)member); + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + return rv + uint64_size(*(const uint64_t *)member); + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + return rv + 4; + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + return rv + 8; + case PROTOBUF_C_TYPE_BOOL: + return rv + 1; + case PROTOBUF_C_TYPE_FLOAT: + return rv + 4; + case PROTOBUF_C_TYPE_DOUBLE: + return rv + 8; + case PROTOBUF_C_TYPE_STRING: { + const char *str = *(char *const *)member; + size_t len = str ? strlen(str) : 0; + return rv + uint32_size(len) + len; + } + case PROTOBUF_C_TYPE_BYTES: { + size_t len = ((const ProtobufCBinaryData *)member)->len; + return rv + uint32_size(len) + len; + } + case PROTOBUF_C_TYPE_MESSAGE: { + const ProtobufCMessage *msg = *(ProtobufCMessage *const *)member; + size_t subrv = msg ? protobuf_c_message_get_packed_size(msg) : 0; + return rv + uint32_size(subrv) + subrv; + } + } + PROTOBUF_C__ASSERT_NOT_REACHED(); + return 0; +} + +/** + * Calculate the serialized size of a single oneof message field, including + * the space needed by the preceding tag. Returns 0 if the oneof field isn't + * selected or is not set. + * + * \param field + * Field descriptor for member. + * \param oneof_case + * Enum value that selects the field in the oneof. + * \param member + * Field to encode. + * \return + * Number of bytes required. + */ +static size_t oneof_field_get_packed_size(const ProtobufCFieldDescriptor *field, + uint32_t oneof_case, + const void *member) { + if (oneof_case != field->id) { + return 0; + } + if (field->type == PROTOBUF_C_TYPE_MESSAGE || + field->type == PROTOBUF_C_TYPE_STRING) { + const void *ptr = *(const void *const *)member; + if (ptr == NULL || ptr == field->default_value) return 0; + } + return required_field_get_packed_size(field, member); +} + +/** + * Calculate the serialized size of a single optional message field, including + * the space needed by the preceding tag. Returns 0 if the optional field isn't + * set. + * + * \param field + * Field descriptor for member. + * \param has + * True if the field exists, false if not. + * \param member + * Field to encode. + * \return + * Number of bytes required. 
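+ *
+ * Worked example (illustrative): an unset optional int32 (has == FALSE)
+ * contributes 0 bytes; set to 300 under field id 1, it contributes 1 tag
+ * byte + 2 varint bytes == 3 bytes.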
+ */ +static size_t optional_field_get_packed_size( + const ProtobufCFieldDescriptor *field, const protobuf_c_boolean has, + const void *member) { + if (field->type == PROTOBUF_C_TYPE_MESSAGE || + field->type == PROTOBUF_C_TYPE_STRING) { + const void *ptr = *(const void *const *)member; + if (ptr == NULL || ptr == field->default_value) return 0; + } else { + if (!has) return 0; + } + return required_field_get_packed_size(field, member); +} + +static protobuf_c_boolean field_is_zeroish( + const ProtobufCFieldDescriptor *field, const void *member) { + protobuf_c_boolean ret = FALSE; + + switch (field->type) { + case PROTOBUF_C_TYPE_BOOL: + ret = (0 == *(const protobuf_c_boolean *)member); + break; + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_SINT32: + case PROTOBUF_C_TYPE_INT32: + case PROTOBUF_C_TYPE_UINT32: + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + ret = (0 == *(const uint32_t *)member); + break; + case PROTOBUF_C_TYPE_SINT64: + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + ret = (0 == *(const uint64_t *)member); + break; + case PROTOBUF_C_TYPE_FLOAT: + ret = (0 == *(const float *)member); + break; + case PROTOBUF_C_TYPE_DOUBLE: + ret = (0 == *(const double *)member); + break; + case PROTOBUF_C_TYPE_STRING: + ret = (NULL == *(const char *const *)member) || + ('\0' == **(const char *const *)member); + break; + case PROTOBUF_C_TYPE_BYTES: + case PROTOBUF_C_TYPE_MESSAGE: + ret = (NULL == *(const void *const *)member); + break; + default: + ret = TRUE; + break; + } + + return ret; +} + +/** + * Calculate the serialized size of a single unlabeled message field, including + * the space needed by the preceding tag. Returns 0 if the field isn't set or + * if it is set to a "zeroish" value (null pointer or 0 for numerical values). + * Unlabeled fields are supported only in proto3. + * + * \param field + * Field descriptor for member. + * \param member + * Field to encode. + * \return + * Number of bytes required. + */ +static size_t unlabeled_field_get_packed_size( + const ProtobufCFieldDescriptor *field, const void *member) { + if (field_is_zeroish(field, member)) return 0; + return required_field_get_packed_size(field, member); +} + +/** + * Calculate the serialized size of repeated message fields, which may consist + * of any number of values (including 0). Includes the space needed by the + * preceding tags (as needed). + * + * \param field + * Field descriptor for member. + * \param count + * Number of repeated field members. + * \param member + * Field to encode. + * \return + * Number of bytes required. 
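+ *
+ * Worked example (illustrative): five uint32 values of 1 under field id 1
+ * cost 5 * (1 tag byte + 1 varint byte) == 10 bytes unpacked; with
+ * PROTOBUF_C_FIELD_FLAG_PACKED they cost 1 tag byte + 1 length byte +
+ * 5 varint bytes == 7 bytes.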
+ */ +static size_t repeated_field_get_packed_size( + const ProtobufCFieldDescriptor *field, size_t count, const void *member) { + size_t header_size; + size_t rv = 0; + unsigned i; + void *array = *(void *const *)member; + + if (count == 0) return 0; + header_size = get_tag_size(field->id); + if (0 == (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED)) header_size *= count; + + switch (field->type) { + case PROTOBUF_C_TYPE_SINT32: + for (i = 0; i < count; i++) rv += sint32_size(((int32_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + for (i = 0; i < count; i++) rv += int32_size(((int32_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_UINT32: + for (i = 0; i < count; i++) rv += uint32_size(((uint32_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_SINT64: + for (i = 0; i < count; i++) rv += sint64_size(((int64_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + for (i = 0; i < count; i++) rv += uint64_size(((uint64_t *)array)[i]); + break; + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + rv += 4 * count; + break; + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + rv += 8 * count; + break; + case PROTOBUF_C_TYPE_BOOL: + rv += count; + break; + case PROTOBUF_C_TYPE_STRING: + for (i = 0; i < count; i++) { + size_t len = strlen(((char **)array)[i]); + rv += uint32_size(len) + len; + } + break; + case PROTOBUF_C_TYPE_BYTES: + for (i = 0; i < count; i++) { + size_t len = ((ProtobufCBinaryData *)array)[i].len; + rv += uint32_size(len) + len; + } + break; + case PROTOBUF_C_TYPE_MESSAGE: + for (i = 0; i < count; i++) { + size_t len = + protobuf_c_message_get_packed_size(((ProtobufCMessage **)array)[i]); + rv += uint32_size(len) + len; + } + break; + } + + if (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED)) + header_size += uint32_size(rv); + return header_size + rv; +} + +/** + * Calculate the serialized size of an unknown field, i.e. one that is passed + * through mostly uninterpreted. This is required for forward compatibility if + * new fields are added to the message descriptor. + * + * \param field + * Unknown field type. + * \return + * Number of bytes required. + */ +static inline size_t unknown_field_get_packed_size( + const ProtobufCMessageUnknownField *field) { + return get_tag_size(field->tag) + field->len; +} + +/**@}*/ + +/* + * Calculate the serialized size of the message. 
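+ *
+ * Typical use (an illustrative sketch, not from the original source): size
+ * the output buffer before calling protobuf_c_message_pack(), e.g.
+ * `uint8_t *buf = malloc(protobuf_c_message_get_packed_size(&m->base));`,
+ * where `m` points at any generated message struct with a ProtobufCMessage
+ * `base` as its first member.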
+ */ +size_t protobuf_c_message_get_packed_size(const ProtobufCMessage *message) { + unsigned i; + size_t rv = 0; + + ASSERT_IS_MESSAGE(message); + for (i = 0; i < message->descriptor->n_fields; i++) { + const ProtobufCFieldDescriptor *field = message->descriptor->fields + i; + const void *member = ((const char *)message) + field->offset; + const void *qmember = ((const char *)message) + field->quantifier_offset; + + if (field->label == PROTOBUF_C_LABEL_REQUIRED) { + rv += required_field_get_packed_size(field, member); + } else if ((field->label == PROTOBUF_C_LABEL_OPTIONAL || + field->label == PROTOBUF_C_LABEL_NONE) && + (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_ONEOF))) { + rv += oneof_field_get_packed_size(field, *(const uint32_t *)qmember, + member); + } else if (field->label == PROTOBUF_C_LABEL_OPTIONAL) { + rv += optional_field_get_packed_size( + field, *(protobuf_c_boolean *)qmember, member); + } else if (field->label == PROTOBUF_C_LABEL_NONE) { + rv += unlabeled_field_get_packed_size(field, member); + } else { + rv += repeated_field_get_packed_size(field, *(const size_t *)qmember, + member); + } + } + for (i = 0; i < message->n_unknown_fields; i++) + rv += unknown_field_get_packed_size(&message->unknown_fields[i]); + return rv; +} + +/** + * \defgroup pack protobuf_c_message_pack() implementation + * + * Routines mainly used by protobuf_c_message_pack(). + * + * \ingroup internal + * @{ + */ + +/** + * Pack an unsigned 32-bit integer in base-128 varint encoding and return the + * number of bytes written, which must be 5 or less. + * + * \param value + * Value to encode. + * \param[out] out + * Packed value. + * \return + * Number of bytes written to `out`. + */ +static inline size_t uint32_pack(uint32_t value, uint8_t *out) { + unsigned rv = 0; + + if (value >= 0x80) { + out[rv++] = value | 0x80; + value >>= 7; + if (value >= 0x80) { + out[rv++] = value | 0x80; + value >>= 7; + if (value >= 0x80) { + out[rv++] = value | 0x80; + value >>= 7; + if (value >= 0x80) { + out[rv++] = value | 0x80; + value >>= 7; + } + } + } + } + /* assert: value<128 */ + out[rv++] = value; + return rv; +} + +/** + * Pack a 64-bit unsigned integer using base-128 varint encoding and return the + * number of bytes written. + * + * \param value + * Value to encode. + * \param[out] out + * Packed value. + * \return + * Number of bytes written to `out`. + */ +static size_t uint64_pack(uint64_t value, uint8_t *out) { + uint32_t hi = (uint32_t)(value >> 32); + uint32_t lo = (uint32_t)value; + unsigned rv; + + if (hi == 0) return uint32_pack((uint32_t)lo, out); + out[0] = (lo) | 0x80; + out[1] = (lo >> 7) | 0x80; + out[2] = (lo >> 14) | 0x80; + out[3] = (lo >> 21) | 0x80; + if (hi < 8) { + out[4] = (hi << 4) | (lo >> 28); + return 5; + } else { + out[4] = ((hi & 7) << 4) | (lo >> 28) | 0x80; + hi >>= 3; + } + rv = 5; + while (hi >= 128) { + out[rv++] = hi | 0x80; + hi >>= 7; + } + out[rv++] = hi; + return rv; +} + +/** + * Pack a ProtobufCBinaryData and return the number of bytes written. The output + * includes a length delimiter. + * + * \param bd + * ProtobufCBinaryData to encode. + * \param[out] out + * Packed value. + * \return + * Number of bytes written to `out`. + */ +static inline size_t binary_data_pack(const ProtobufCBinaryData *bd, + uint8_t *out) { + size_t len = bd->len; + size_t rv = uint32_pack(len, out); + memcpy(out + rv, bd->data, len); + return rv + len; +} + +/** + * Pack a field tag. + * + * Wire-type will be added in required_field_pack(). 
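+ *
+ * Worked example (illustrative): tag_pack(1, out) packs 1 << 3 == 8 as the
+ * single varint byte 0x08, and the 3-bit wire-type is then ORed into the
+ * low bits of out[0].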
+ *
+ * \todo Just call uint64_pack on 64-bit platforms.
+ *
+ * \param id
+ *      Tag value to encode.
+ * \param[out] out
+ *      Packed value.
+ * \return
+ *      Number of bytes written to `out`.
+ */
+static size_t tag_pack(uint32_t id, uint8_t *out) {
+  if (id < (1UL << (32 - 3)))
+    return uint32_pack(id << 3, out);
+  else
+    return uint64_pack(((uint64_t)id) << 3, out);
+}
+
+/**
+ * Given a field type, return the in-memory size.
+ *
+ * \todo Implement as a table lookup.
+ *
+ * \param type
+ *      Field type.
+ * \return
+ *      Size of the field.
+ */
+static inline size_t sizeof_elt_in_repeated_array(ProtobufCType type) {
+  switch (type) {
+    case PROTOBUF_C_TYPE_SINT32:
+    case PROTOBUF_C_TYPE_INT32:
+    case PROTOBUF_C_TYPE_UINT32:
+    case PROTOBUF_C_TYPE_SFIXED32:
+    case PROTOBUF_C_TYPE_FIXED32:
+    case PROTOBUF_C_TYPE_FLOAT:
+    case PROTOBUF_C_TYPE_ENUM:
+      return 4;
+    case PROTOBUF_C_TYPE_SINT64:
+    case PROTOBUF_C_TYPE_INT64:
+    case PROTOBUF_C_TYPE_UINT64:
+    case PROTOBUF_C_TYPE_SFIXED64:
+    case PROTOBUF_C_TYPE_FIXED64:
+    case PROTOBUF_C_TYPE_DOUBLE:
+      return 8;
+    case PROTOBUF_C_TYPE_BOOL:
+      return sizeof(protobuf_c_boolean);
+    case PROTOBUF_C_TYPE_STRING:
+    case PROTOBUF_C_TYPE_MESSAGE:
+      return sizeof(void *);
+    case PROTOBUF_C_TYPE_BYTES:
+      return sizeof(ProtobufCBinaryData);
+  }
+  PROTOBUF_C__ASSERT_NOT_REACHED();
+  return 0;
+}
+
+static inline int int_range_lookup(unsigned n_ranges,
+                                   const ProtobufCIntRange *ranges, int value) {
+  unsigned n;
+  unsigned start;
+
+  if (n_ranges == 0) return -1;
+  start = 0;
+  n = n_ranges;
+  while (n > 1) {
+    unsigned mid = start + n / 2;
+
+    if (value < ranges[mid].start_value) {
+      n = mid - start;
+    } else if (value >=
+               ranges[mid].start_value +
+                   (int)(ranges[mid + 1].orig_index - ranges[mid].orig_index)) {
+      unsigned new_start = mid + 1;
+      n = start + n - new_start;
+      start = new_start;
+    } else
+      return (value - ranges[mid].start_value) + ranges[mid].orig_index;
+  }
+  if (n > 0) {
+    unsigned start_orig_index = ranges[start].orig_index;
+    unsigned range_size = ranges[start + 1].orig_index - start_orig_index;
+
+    if (ranges[start].start_value <= value &&
+        value < (int)(ranges[start].start_value + range_size)) {
+      return (value - ranges[start].start_value) + start_orig_index;
+    }
+  }
+  return -1;
+}
+
+static size_t parse_tag_and_wiretype(size_t len, const uint8_t *data,
+                                     uint32_t *tag_out,
+                                     ProtobufCWireType *wiretype_out) {
+  unsigned max_rv = len > 5 ? 5 : len;
+  uint32_t tag = (data[0] & 0x7f) >> 3;
+  unsigned shift = 4;
+  unsigned rv;
+
+  *wiretype_out = data[0] & 7;
+  if ((data[0] & 0x80) == 0) {
+    *tag_out = tag;
+    return 1;
+  }
+  for (rv = 1; rv < max_rv; rv++) {
+    if (data[rv] & 0x80) {
+      tag |= (data[rv] & 0x7f) << shift;
+      shift += 7;
+    } else {
+      tag |= data[rv] << shift;
+      *tag_out = tag;
+      return rv + 1;
+    }
+  }
+  return 0; /* error: bad header */
+}
+
+/* sizeof(ScannedMember) must be <= (1UL<<BOUND_SIZEOF_SCANNED_MEMBER_LOG2) */
+#define BOUND_SIZEOF_SCANNED_MEMBER_LOG2 5
+typedef struct _ScannedMember ScannedMember;
+/** Field as it's being scanned. */
+struct _ScannedMember {
+  uint32_t tag;              /**< Field tag. */
+  uint8_t wire_type;         /**< Field type. */
+  uint8_t length_prefix_len; /**< Prefix length. */
+  const ProtobufCFieldDescriptor *field; /**< Field descriptor. */
+  size_t len;                /**< Field length. */
+  const uint8_t *data;       /**< Pointer to field data. */
+};
+
+static inline uint32_t scan_length_prefixed_data(size_t len,
+                                                 const uint8_t *data,
+                                                 size_t *prefix_len_out) {
+  unsigned hdr_max = len < 5 ? len : 5;
+  unsigned hdr_len;
+  uint32_t val = 0;
+  unsigned i;
+  unsigned shift = 0;
+
+  for (i = 0; i < hdr_max; i++) {
+    val |= ((uint32_t)(data[i] & 0x7f)) << shift;
+    shift += 7;
+    if ((data[i] & 0x80) == 0) break;
+  }
+  if (i == hdr_max) {
+    PROTOBUF_C_UNPACK_ERROR("error parsing length for length-prefixed data");
+    return 0;
+  }
+  hdr_len = i + 1;
+  *prefix_len_out = hdr_len;
+  if (hdr_len + val > len) {
+    PROTOBUF_C_UNPACK_ERROR("data too short after length-prefix of %u", val);
+    return 0;
+  }
+  return hdr_len + val;
+}
+
+static size_t max_b128_numbers(size_t len, const uint8_t *data) {
+  size_t rv = 0;
+  while (len--)
+    if ((*data++ & 0x80) == 0) ++rv;
+  return rv;
+}
+
+/**@}*/
+
+/**
+ * Merge earlier message into a latter message.
+ *
+ * For numeric types and strings, if the same value appears multiple
+ * times, the parser accepts the last value it sees. For embedded
+ * message fields, the parser merges multiple instances of the same
+ * field.
That is, all singular scalar fields in the latter instance + * replace those in the former, singular embedded messages are merged, + * and repeated fields are concatenated. + * + * The earlier message should be freed after calling this function, as + * some of its fields may have been reused and changed to their default + * values during the merge. + */ +static protobuf_c_boolean merge_messages(ProtobufCMessage *earlier_msg, + ProtobufCMessage *latter_msg, + ProtobufCAllocator *allocator) { + unsigned i; + const ProtobufCFieldDescriptor *fields = latter_msg->descriptor->fields; + for (i = 0; i < latter_msg->descriptor->n_fields; i++) { + if (fields[i].label == PROTOBUF_C_LABEL_REPEATED) { + size_t *n_earlier = + STRUCT_MEMBER_PTR(size_t, earlier_msg, fields[i].quantifier_offset); + uint8_t **p_earlier = + STRUCT_MEMBER_PTR(uint8_t *, earlier_msg, fields[i].offset); + size_t *n_latter = + STRUCT_MEMBER_PTR(size_t, latter_msg, fields[i].quantifier_offset); + uint8_t **p_latter = + STRUCT_MEMBER_PTR(uint8_t *, latter_msg, fields[i].offset); + + if (*n_earlier > 0) { + if (*n_latter > 0) { + /* Concatenate the repeated field */ + size_t el_size = sizeof_elt_in_repeated_array(fields[i].type); + uint8_t *new_field; + + new_field = do_alloc(allocator, (*n_earlier + *n_latter) * el_size); + if (!new_field) return FALSE; + + memcpy(new_field, *p_earlier, *n_earlier * el_size); + memcpy(new_field + *n_earlier * el_size, *p_latter, + *n_latter * el_size); + + do_free(allocator, *p_latter); + do_free(allocator, *p_earlier); + *p_latter = new_field; + *n_latter = *n_earlier + *n_latter; + } else { + /* Zero copy the repeated field from the earlier message */ + *n_latter = *n_earlier; + *p_latter = *p_earlier; + } + /* Make sure the field does not get double freed */ + *n_earlier = 0; + *p_earlier = 0; + } + } else if (fields[i].label == PROTOBUF_C_LABEL_OPTIONAL || + fields[i].label == PROTOBUF_C_LABEL_NONE) { + const ProtobufCFieldDescriptor *field; + uint32_t *earlier_case_p = + STRUCT_MEMBER_PTR(uint32_t, earlier_msg, fields[i].quantifier_offset); + uint32_t *latter_case_p = + STRUCT_MEMBER_PTR(uint32_t, latter_msg, fields[i].quantifier_offset); + protobuf_c_boolean need_to_merge = FALSE; + void *earlier_elem; + void *latter_elem; + const void *def_val; + + if (fields[i].flags & PROTOBUF_C_FIELD_FLAG_ONEOF) { + if (*latter_case_p == 0) { + /* lookup correct oneof field */ + int field_index = int_range_lookup( + latter_msg->descriptor->n_field_ranges, + latter_msg->descriptor->field_ranges, *earlier_case_p); + field = latter_msg->descriptor->fields + field_index; + } else { + /* Oneof is present in the latter message, move on */ + continue; + } + } else { + field = &fields[i]; + } + + earlier_elem = STRUCT_MEMBER_P(earlier_msg, field->offset); + latter_elem = STRUCT_MEMBER_P(latter_msg, field->offset); + def_val = field->default_value; + + switch (field->type) { + case PROTOBUF_C_TYPE_MESSAGE: { + ProtobufCMessage *em = *(ProtobufCMessage **)earlier_elem; + ProtobufCMessage *lm = *(ProtobufCMessage **)latter_elem; + if (em != NULL) { + if (lm != NULL) { + if (!merge_messages(em, lm, allocator)) return FALSE; + /* Already merged */ + need_to_merge = FALSE; + } else { + /* Zero copy the message */ + need_to_merge = TRUE; + } + } + break; + } + case PROTOBUF_C_TYPE_BYTES: { + uint8_t *e_data = ((ProtobufCBinaryData *)earlier_elem)->data; + uint8_t *l_data = ((ProtobufCBinaryData *)latter_elem)->data; + const ProtobufCBinaryData *d_bd = (ProtobufCBinaryData *)def_val; + + need_to_merge = + (e_data != 
NULL && (d_bd == NULL || e_data != d_bd->data)) && + (l_data == NULL || (d_bd != NULL && l_data == d_bd->data)); + break; + } + case PROTOBUF_C_TYPE_STRING: { + char *e_str = *(char **)earlier_elem; + char *l_str = *(char **)latter_elem; + const char *d_str = def_val; + + need_to_merge = e_str != d_str && l_str == d_str; + break; + } + default: { + /* Could be has field or case enum, the logic is + * equivalent, since 0 (FALSE) means not set for + * oneof */ + need_to_merge = (*earlier_case_p != 0) && (*latter_case_p == 0); + break; + } + } + + if (need_to_merge) { + size_t el_size = sizeof_elt_in_repeated_array(field->type); + memcpy(latter_elem, earlier_elem, el_size); + /* + * Reset the element from the old message to 0 + * to make sure earlier message deallocation + * doesn't corrupt zero-copied data in the new + * message, earlier message will be freed after + * this function is called anyway + */ + memset(earlier_elem, 0, el_size); + + if (field->quantifier_offset != 0) { + /* Set the has field or the case enum, + * if applicable */ + *latter_case_p = *earlier_case_p; + *earlier_case_p = 0; + } + } + } + } + return TRUE; +} + +/** + * Count packed elements. + * + * Given a raw slab of packed-repeated values, determine the number of + * elements. This function detects certain kinds of errors but not + * others; the remaining error checking is done by + * parse_packed_repeated_member(). + */ +static protobuf_c_boolean count_packed_elements(ProtobufCType type, size_t len, + const uint8_t *data, + size_t *count_out) { + switch (type) { + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + if (len % 4 != 0) { + PROTOBUF_C_UNPACK_ERROR( + "length must be a multiple of 4 for fixed-length 32-bit types"); + return FALSE; + } + *count_out = len / 4; + return TRUE; + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + if (len % 8 != 0) { + PROTOBUF_C_UNPACK_ERROR( + "length must be a multiple of 8 for fixed-length 64-bit types"); + return FALSE; + } + *count_out = len / 8; + return TRUE; + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + case PROTOBUF_C_TYPE_SINT32: + case PROTOBUF_C_TYPE_UINT32: + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_SINT64: + case PROTOBUF_C_TYPE_UINT64: + *count_out = max_b128_numbers(len, data); + return TRUE; + case PROTOBUF_C_TYPE_BOOL: + *count_out = len; + return TRUE; + case PROTOBUF_C_TYPE_STRING: + case PROTOBUF_C_TYPE_BYTES: + case PROTOBUF_C_TYPE_MESSAGE: + default: + PROTOBUF_C_UNPACK_ERROR("bad protobuf-c type %u for packed-repeated", + type); + return FALSE; + } +} + +static inline uint32_t parse_uint32(unsigned len, const uint8_t *data) { + uint32_t rv = data[0] & 0x7f; + if (len > 1) { + rv |= ((uint32_t)(data[1] & 0x7f) << 7); + if (len > 2) { + rv |= ((uint32_t)(data[2] & 0x7f) << 14); + if (len > 3) { + rv |= ((uint32_t)(data[3] & 0x7f) << 21); + if (len > 4) rv |= ((uint32_t)(data[4]) << 28); + } + } + } + return rv; +} + +static inline uint32_t parse_int32(unsigned len, const uint8_t *data) { + return parse_uint32(len, data); +} + +static inline int32_t unzigzag32(uint32_t v) { + if (v & 1) + return -(v >> 1) - 1; + else + return v >> 1; +} + +static inline uint32_t parse_fixed_uint32(const uint8_t *data) { +#if !defined(WORDS_BIGENDIAN) + uint32_t t; + memcpy(&t, data, 4); + return t; +#else + return data[0] | ((uint32_t)(data[1]) << 8) | ((uint32_t)(data[2]) << 16) | + ((uint32_t)(data[3]) << 24); +#endif +} + +static uint64_t 
parse_uint64(unsigned len, const uint8_t *data) { + unsigned shift, i; + uint64_t rv; + + if (len < 5) return parse_uint32(len, data); + rv = ((uint64_t)(data[0] & 0x7f)) | ((uint64_t)(data[1] & 0x7f) << 7) | + ((uint64_t)(data[2] & 0x7f) << 14) | ((uint64_t)(data[3] & 0x7f) << 21); + shift = 28; + for (i = 4; i < len; i++) { + rv |= (((uint64_t)(data[i] & 0x7f)) << shift); + shift += 7; + } + return rv; +} + +static inline int64_t unzigzag64(uint64_t v) { + if (v & 1) + return -(v >> 1) - 1; + else + return v >> 1; +} + +static inline uint64_t parse_fixed_uint64(const uint8_t *data) { +#if !defined(WORDS_BIGENDIAN) + uint64_t t; + memcpy(&t, data, 8); + return t; +#else + return (uint64_t)parse_fixed_uint32(data) | + (((uint64_t)parse_fixed_uint32(data + 4)) << 32); +#endif +} + +static protobuf_c_boolean parse_boolean(unsigned len, const uint8_t *data) { + unsigned i; + for (i = 0; i < len; i++) + if (data[i] & 0x7f) return TRUE; + return FALSE; +} + +static protobuf_c_boolean parse_required_member( + ScannedMember *scanned_member, void *member, ProtobufCAllocator *allocator, + protobuf_c_boolean maybe_clear) { + unsigned len = scanned_member->len; + const uint8_t *data = scanned_member->data; + ProtobufCWireType wire_type = scanned_member->wire_type; + + switch (scanned_member->field->type) { + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(int32_t *)member = parse_int32(len, data); + return TRUE; + case PROTOBUF_C_TYPE_UINT32: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(uint32_t *)member = parse_uint32(len, data); + return TRUE; + case PROTOBUF_C_TYPE_SINT32: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(int32_t *)member = unzigzag32(parse_uint32(len, data)); + return TRUE; + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + if (wire_type != PROTOBUF_C_WIRE_TYPE_32BIT) return FALSE; + *(uint32_t *)member = parse_fixed_uint32(data); + return TRUE; + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(uint64_t *)member = parse_uint64(len, data); + return TRUE; + case PROTOBUF_C_TYPE_SINT64: + if (wire_type != PROTOBUF_C_WIRE_TYPE_VARINT) return FALSE; + *(int64_t *)member = unzigzag64(parse_uint64(len, data)); + return TRUE; + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + if (wire_type != PROTOBUF_C_WIRE_TYPE_64BIT) return FALSE; + *(uint64_t *)member = parse_fixed_uint64(data); + return TRUE; + case PROTOBUF_C_TYPE_BOOL: + *(protobuf_c_boolean *)member = parse_boolean(len, data); + return TRUE; + case PROTOBUF_C_TYPE_STRING: { + char **pstr = member; + unsigned pref_len = scanned_member->length_prefix_len; + + if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE; + + if (maybe_clear && *pstr != NULL) { + const char *def = scanned_member->field->default_value; + if (*pstr != NULL && *pstr != def) do_free(allocator, *pstr); + } + *pstr = do_alloc(allocator, len - pref_len + 1); + if (*pstr == NULL) return FALSE; + memcpy(*pstr, data + pref_len, len - pref_len); + (*pstr)[len - pref_len] = 0; + return TRUE; + } + case PROTOBUF_C_TYPE_BYTES: { + ProtobufCBinaryData *bd = member; + const ProtobufCBinaryData *def_bd; + unsigned pref_len = scanned_member->length_prefix_len; + + if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE; + + def_bd = scanned_member->field->default_value; + 
if (maybe_clear && bd->data != NULL && + (def_bd == NULL || bd->data != def_bd->data)) { + do_free(allocator, bd->data); + } + if (len - pref_len > 0) { + bd->data = do_alloc(allocator, len - pref_len); + if (bd->data == NULL) return FALSE; + memcpy(bd->data, data + pref_len, len - pref_len); + } else { + bd->data = NULL; + } + bd->len = len - pref_len; + return TRUE; + } + case PROTOBUF_C_TYPE_MESSAGE: { + ProtobufCMessage **pmessage = member; + ProtobufCMessage *subm; + const ProtobufCMessage *def_mess; + protobuf_c_boolean merge_successful = TRUE; + unsigned pref_len = scanned_member->length_prefix_len; + + if (wire_type != PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED) return FALSE; + + def_mess = scanned_member->field->default_value; + subm = + protobuf_c_message_unpack(scanned_member->field->descriptor, + allocator, len - pref_len, data + pref_len); + + if (maybe_clear && *pmessage != NULL && *pmessage != def_mess) { + if (subm != NULL) + merge_successful = merge_messages(*pmessage, subm, allocator); + /* Delete the previous message */ + protobuf_c_message_free_unpacked(*pmessage, allocator); + } + *pmessage = subm; + if (subm == NULL || !merge_successful) return FALSE; + return TRUE; + } + } + return FALSE; +} + +static protobuf_c_boolean parse_oneof_member(ScannedMember *scanned_member, + void *member, + ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + uint32_t *oneof_case = STRUCT_MEMBER_PTR( + uint32_t, message, scanned_member->field->quantifier_offset); + + /* If we have already parsed a member of this oneof, free it. */ + if (*oneof_case != 0) { + /* lookup field */ + int field_index = + int_range_lookup(message->descriptor->n_field_ranges, + message->descriptor->field_ranges, *oneof_case); + const ProtobufCFieldDescriptor *old_field = + message->descriptor->fields + field_index; + size_t el_size = sizeof_elt_in_repeated_array(old_field->type); + + switch (old_field->type) { + case PROTOBUF_C_TYPE_STRING: { + char **pstr = member; + const char *def = old_field->default_value; + if (*pstr != NULL && *pstr != def) do_free(allocator, *pstr); + break; + } + case PROTOBUF_C_TYPE_BYTES: { + ProtobufCBinaryData *bd = member; + const ProtobufCBinaryData *def_bd = old_field->default_value; + if (bd->data != NULL && (def_bd == NULL || bd->data != def_bd->data)) { + do_free(allocator, bd->data); + } + break; + } + case PROTOBUF_C_TYPE_MESSAGE: { + ProtobufCMessage **pmessage = member; + const ProtobufCMessage *def_mess = old_field->default_value; + if (*pmessage != NULL && *pmessage != def_mess) + protobuf_c_message_free_unpacked(*pmessage, allocator); + break; + } + default: + break; + } + + memset(member, 0, el_size); + } + if (!parse_required_member(scanned_member, member, allocator, TRUE)) + return FALSE; + + *oneof_case = scanned_member->tag; + return TRUE; +} + +static protobuf_c_boolean parse_optional_member(ScannedMember *scanned_member, + void *member, + ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + if (!parse_required_member(scanned_member, member, allocator, TRUE)) + return FALSE; + if (scanned_member->field->quantifier_offset != 0) + STRUCT_MEMBER(protobuf_c_boolean, message, + scanned_member->field->quantifier_offset) = TRUE; + return TRUE; +} + +static protobuf_c_boolean parse_repeated_member(ScannedMember *scanned_member, + void *member, + ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + const ProtobufCFieldDescriptor *field = scanned_member->field; + size_t *p_n = STRUCT_MEMBER_PTR(size_t, message, field->quantifier_offset); + size_t 
siz = sizeof_elt_in_repeated_array(field->type); + char *array = *(char **)member; + + if (!parse_required_member(scanned_member, array + siz * (*p_n), allocator, + FALSE)) { + return FALSE; + } + *p_n += 1; + return TRUE; +} + +static unsigned scan_varint(unsigned len, const uint8_t *data) { + unsigned i; + if (len > 10) len = 10; + for (i = 0; i < len; i++) + if ((data[i] & 0x80) == 0) break; + if (i == len) return 0; + return i + 1; +} + +static protobuf_c_boolean parse_packed_repeated_member( + ScannedMember *scanned_member, void *member, ProtobufCMessage *message) { + const ProtobufCFieldDescriptor *field = scanned_member->field; + size_t *p_n = STRUCT_MEMBER_PTR(size_t, message, field->quantifier_offset); + size_t siz = sizeof_elt_in_repeated_array(field->type); + void *array = *(char **)member + siz * (*p_n); + const uint8_t *at = scanned_member->data + scanned_member->length_prefix_len; + size_t rem = scanned_member->len - scanned_member->length_prefix_len; + size_t count = 0; + unsigned i; + + switch (field->type) { + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + count = (scanned_member->len - scanned_member->length_prefix_len) / 4; +#if !defined(WORDS_BIGENDIAN) + goto no_unpacking_needed; +#else + for (i = 0; i < count; i++) { + ((uint32_t *)array)[i] = parse_fixed_uint32(at); + at += 4; + } + break; +#endif + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + count = (scanned_member->len - scanned_member->length_prefix_len) / 8; +#if !defined(WORDS_BIGENDIAN) + goto no_unpacking_needed; +#else + for (i = 0; i < count; i++) { + ((uint64_t *)array)[i] = parse_fixed_uint64(at); + at += 8; + } + break; +#endif + case PROTOBUF_C_TYPE_ENUM: + case PROTOBUF_C_TYPE_INT32: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated int32 value"); + return FALSE; + } + ((int32_t *)array)[count++] = parse_int32(s, at); + at += s; + rem -= s; + } + break; + case PROTOBUF_C_TYPE_SINT32: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated sint32 value"); + return FALSE; + } + ((int32_t *)array)[count++] = unzigzag32(parse_uint32(s, at)); + at += s; + rem -= s; + } + break; + case PROTOBUF_C_TYPE_UINT32: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated enum or uint32 value"); + return FALSE; + } + ((uint32_t *)array)[count++] = parse_uint32(s, at); + at += s; + rem -= s; + } + break; + + case PROTOBUF_C_TYPE_SINT64: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated sint64 value"); + return FALSE; + } + ((int64_t *)array)[count++] = unzigzag64(parse_uint64(s, at)); + at += s; + rem -= s; + } + break; + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_UINT64: + while (rem > 0) { + unsigned s = scan_varint(rem, at); + if (s == 0) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated int64/uint64 value"); + return FALSE; + } + ((int64_t *)array)[count++] = parse_uint64(s, at); + at += s; + rem -= s; + } + break; + case PROTOBUF_C_TYPE_BOOL: + count = rem; + for (i = 0; i < count; i++) { + if (at[i] > 1) { + PROTOBUF_C_UNPACK_ERROR("bad packed-repeated boolean value"); + return FALSE; + } + ((protobuf_c_boolean *)array)[i] = at[i]; + } + break; + default: + PROTOBUF_C__ASSERT_NOT_REACHED(); + } + *p_n += count; + return TRUE; + +#if 
!defined(WORDS_BIGENDIAN) +no_unpacking_needed: + memcpy(array, at, count * siz); + *p_n += count; + return TRUE; +#endif +} + +static protobuf_c_boolean is_packable_type(ProtobufCType type) { + return type != PROTOBUF_C_TYPE_STRING && type != PROTOBUF_C_TYPE_BYTES && + type != PROTOBUF_C_TYPE_MESSAGE; +} + +static protobuf_c_boolean parse_member(ScannedMember *scanned_member, + ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + const ProtobufCFieldDescriptor *field = scanned_member->field; + void *member; + + if (field == NULL) { + ProtobufCMessageUnknownField *ufield = + message->unknown_fields + (message->n_unknown_fields++); + ufield->tag = scanned_member->tag; + ufield->wire_type = scanned_member->wire_type; + ufield->len = scanned_member->len; + ufield->data = do_alloc(allocator, scanned_member->len); + if (ufield->data == NULL) return FALSE; + memcpy(ufield->data, scanned_member->data, ufield->len); + return TRUE; + } + member = (char *)message + field->offset; + switch (field->label) { + case PROTOBUF_C_LABEL_REQUIRED: + return parse_required_member(scanned_member, member, allocator, TRUE); + case PROTOBUF_C_LABEL_OPTIONAL: + case PROTOBUF_C_LABEL_NONE: + if (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_ONEOF)) { + return parse_oneof_member(scanned_member, member, message, allocator); + } else { + return parse_optional_member(scanned_member, member, message, + allocator); + } + case PROTOBUF_C_LABEL_REPEATED: + if (scanned_member->wire_type == PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED && + (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED) || + is_packable_type(field->type))) { + return parse_packed_repeated_member(scanned_member, member, message); + } else { + return parse_repeated_member(scanned_member, member, message, + allocator); + } + } + PROTOBUF_C__ASSERT_NOT_REACHED(); + return 0; +} + +/** + * Initialise messages generated by old code. + * + * This function is used if desc->message_init == NULL (which occurs + * for old code, and which would be useful to support allocating + * descriptors dynamically). + */ +static void message_init_generic(const ProtobufCMessageDescriptor *desc, + ProtobufCMessage *message) { + unsigned i; + + memset(message, 0, desc->sizeof_message); + message->descriptor = desc; + for (i = 0; i < desc->n_fields; i++) { + if (desc->fields[i].default_value != NULL && + desc->fields[i].label != PROTOBUF_C_LABEL_REPEATED) { + void *field = STRUCT_MEMBER_P(message, desc->fields[i].offset); + const void *dv = desc->fields[i].default_value; + + switch (desc->fields[i].type) { + case PROTOBUF_C_TYPE_INT32: + case PROTOBUF_C_TYPE_SINT32: + case PROTOBUF_C_TYPE_SFIXED32: + case PROTOBUF_C_TYPE_UINT32: + case PROTOBUF_C_TYPE_FIXED32: + case PROTOBUF_C_TYPE_FLOAT: + case PROTOBUF_C_TYPE_ENUM: + memcpy(field, dv, 4); + break; + case PROTOBUF_C_TYPE_INT64: + case PROTOBUF_C_TYPE_SINT64: + case PROTOBUF_C_TYPE_SFIXED64: + case PROTOBUF_C_TYPE_UINT64: + case PROTOBUF_C_TYPE_FIXED64: + case PROTOBUF_C_TYPE_DOUBLE: + memcpy(field, dv, 8); + break; + case PROTOBUF_C_TYPE_BOOL: + memcpy(field, dv, sizeof(protobuf_c_boolean)); + break; + case PROTOBUF_C_TYPE_BYTES: + memcpy(field, dv, sizeof(ProtobufCBinaryData)); + break; + + case PROTOBUF_C_TYPE_STRING: + case PROTOBUF_C_TYPE_MESSAGE: + /* + * The next line essentially implements a cast + * from const, which is totally unavoidable. + */ + *(const void **)field = dv; + break; + } + } + } +} + +/**@}*/ + +/* + * ScannedMember slabs (an unpacking implementation detail). 
Before doing real
+ * unpacking, we first scan through the elements to see how many there are (for
+ * repeated fields), and which field to use (for non-repeated fields given
+ * twice).
+ *
+ * In order to avoid allocations for small messages, we keep a stack-allocated
+ * slab of 1 << FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2 (i.e. 16) ScannedMembers.
+ * After we fill that up, we allocate each slab twice as large as the previous
+ * one.
+ */
+#define FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2 4
+
+/*
+ * The number of slabs, including the stack-allocated ones; choose the number so
+ * that we would overflow if we needed a slab larger than provided.
+ */
+#define MAX_SCANNED_MEMBER_SLAB                                       \
+  (sizeof(unsigned int) * 8 - 1 - BOUND_SIZEOF_SCANNED_MEMBER_LOG2 - \
+   FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2)
+
+#define REQUIRED_FIELD_BITMAP_SET(index) \
+  (required_fields_bitmap[(index) / 8] |= (1UL << ((index) % 8)))
+
+#define REQUIRED_FIELD_BITMAP_IS_SET(index) \
+  (required_fields_bitmap[(index) / 8] & (1UL << ((index) % 8)))
+
+ProtobufCMessage *protobuf_c_message_unpack(
+    const ProtobufCMessageDescriptor *desc, ProtobufCAllocator *allocator,
+    size_t len, const uint8_t *data) {
+  ProtobufCMessage *rv;
+  size_t rem = len;
+  const uint8_t *at = data;
+  const ProtobufCFieldDescriptor *last_field = desc->fields + 0;
+  ScannedMember first_member_slab[1UL << FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2];
+
+  /*
+   * scanned_member_slabs[i] is an array of arrays of ScannedMember.
+   * The first slab, scanned_member_slabs[0], simply points at
+   * first_member_slab, declared on the stack above. All subsequent slabs
+   * will be allocated using the allocator.
+   */
+  ScannedMember *scanned_member_slabs[MAX_SCANNED_MEMBER_SLAB + 1];
+  unsigned which_slab = 0;    /* the slab we are currently populating */
+  unsigned in_slab_index = 0; /* number of members in the slab */
+  size_t n_unknown = 0;
+  unsigned f;
+  unsigned j;
+  unsigned i_slab;
+  unsigned last_field_index = 0;
+  unsigned required_fields_bitmap_len;
+  unsigned char required_fields_bitmap_stack[16];
+  unsigned char *required_fields_bitmap = required_fields_bitmap_stack;
+  protobuf_c_boolean required_fields_bitmap_alloced = FALSE;
+
+  ASSERT_IS_MESSAGE_DESCRIPTOR(desc);
+
+  if (allocator == NULL) allocator = &protobuf_c__allocator;
+
+  rv = do_alloc(allocator, desc->sizeof_message);
+  if (!rv) return (NULL);
+  scanned_member_slabs[0] = first_member_slab;
+
+  required_fields_bitmap_len = (desc->n_fields + 7) / 8;
+  if (required_fields_bitmap_len > sizeof(required_fields_bitmap_stack)) {
+    required_fields_bitmap = do_alloc(allocator, required_fields_bitmap_len);
+    if (!required_fields_bitmap) {
+      do_free(allocator, rv);
+      return (NULL);
+    }
+    required_fields_bitmap_alloced = TRUE;
+  }
+  memset(required_fields_bitmap, 0, required_fields_bitmap_len);
+
+  /*
+   * Generated code always defines "message_init". However, we provide a
+   * fallback for (1) users of old protobuf-c generated-code that do not
+   * provide the function, and (2) descriptors constructed from some other
+   * source (most likely, direct construction from the .proto file).
+ */ + if (desc->message_init != NULL) + protobuf_c_message_init(desc, rv); + else + message_init_generic(desc, rv); + + while (rem > 0) { + uint32_t tag; + ProtobufCWireType wire_type; + size_t used = parse_tag_and_wiretype(rem, at, &tag, &wire_type); + const ProtobufCFieldDescriptor *field; + ScannedMember tmp; + + if (used == 0) { + PROTOBUF_C_UNPACK_ERROR("error parsing tag/wiretype at offset %u", + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + /* + * \todo Consider optimizing for field[1].id == tag, if field[1] + * exists! + */ + if (last_field == NULL || last_field->id != tag) { + /* lookup field */ + int field_index = + int_range_lookup(desc->n_field_ranges, desc->field_ranges, tag); + if (field_index < 0) { + field = NULL; + n_unknown++; + } else { + field = desc->fields + field_index; + last_field = field; + last_field_index = field_index; + } + } else { + field = last_field; + } + + if (field != NULL && field->label == PROTOBUF_C_LABEL_REQUIRED) + REQUIRED_FIELD_BITMAP_SET(last_field_index); + + at += used; + rem -= used; + tmp.tag = tag; + tmp.wire_type = wire_type; + tmp.field = field; + tmp.data = at; + tmp.length_prefix_len = 0; + + switch (wire_type) { + case PROTOBUF_C_WIRE_TYPE_VARINT: { + unsigned max_len = rem < 10 ? rem : 10; + unsigned i; + + for (i = 0; i < max_len; i++) + if ((at[i] & 0x80) == 0) break; + if (i == max_len) { + PROTOBUF_C_UNPACK_ERROR("unterminated varint at offset %u", + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + tmp.len = i + 1; + break; + } + case PROTOBUF_C_WIRE_TYPE_64BIT: + if (rem < 8) { + PROTOBUF_C_UNPACK_ERROR("too short after 64bit wiretype at offset %u", + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + tmp.len = 8; + break; + case PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED: { + size_t pref_len; + + tmp.len = scan_length_prefixed_data(rem, at, &pref_len); + if (tmp.len == 0) { + /* NOTE: scan_length_prefixed_data calls UNPACK_ERROR */ + goto error_cleanup_during_scan; + } + tmp.length_prefix_len = pref_len; + break; + } + case PROTOBUF_C_WIRE_TYPE_32BIT: + if (rem < 4) { + PROTOBUF_C_UNPACK_ERROR("too short after 32bit wiretype at offset %u", + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + tmp.len = 4; + break; + default: + PROTOBUF_C_UNPACK_ERROR("unsupported tag %u at offset %u", wire_type, + (unsigned)(at - data)); + goto error_cleanup_during_scan; + } + + if (in_slab_index == + (1UL << (which_slab + FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2))) { + size_t size; + + in_slab_index = 0; + if (which_slab == MAX_SCANNED_MEMBER_SLAB) { + PROTOBUF_C_UNPACK_ERROR("too many fields"); + goto error_cleanup_during_scan; + } + which_slab++; + size = sizeof(ScannedMember) + << (which_slab + FIRST_SCANNED_MEMBER_SLAB_SIZE_LOG2); + scanned_member_slabs[which_slab] = do_alloc(allocator, size); + if (scanned_member_slabs[which_slab] == NULL) + goto error_cleanup_during_scan; + } + scanned_member_slabs[which_slab][in_slab_index++] = tmp; + + if (field != NULL && field->label == PROTOBUF_C_LABEL_REPEATED) { + size_t *n = STRUCT_MEMBER_PTR(size_t, rv, field->quantifier_offset); + if (wire_type == PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED && + (0 != (field->flags & PROTOBUF_C_FIELD_FLAG_PACKED) || + is_packable_type(field->type))) { + size_t count; + if (!count_packed_elements(field->type, tmp.len - tmp.length_prefix_len, + tmp.data + tmp.length_prefix_len, &count)) { + PROTOBUF_C_UNPACK_ERROR("counting packed elements"); + goto error_cleanup_during_scan; + } + *n += count; + } else { + *n += 1; + } + } 
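+    /*
+     * The member is now recorded in the slab and any repeated-element
+     * counts have been updated; the payload itself is decoded later, in
+     * the parse_member() pass below.
+     */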
+ + at += tmp.len; + rem -= tmp.len; + } + + /* allocate space for repeated fields, also check that all required fields + * have been set */ + for (f = 0; f < desc->n_fields; f++) { + const ProtobufCFieldDescriptor *field = desc->fields + f; + if (field->label == PROTOBUF_C_LABEL_REPEATED) { + size_t siz = sizeof_elt_in_repeated_array(field->type); + size_t *n_ptr = STRUCT_MEMBER_PTR(size_t, rv, field->quantifier_offset); + if (*n_ptr != 0) { + unsigned n = *n_ptr; + void *a; + *n_ptr = 0; + assert(rv->descriptor != NULL); +#define CLEAR_REMAINING_N_PTRS() \ + for (f++; f < desc->n_fields; f++) { \ + field = desc->fields + f; \ + if (field->label == PROTOBUF_C_LABEL_REPEATED) \ + STRUCT_MEMBER(size_t, rv, field->quantifier_offset) = 0; \ + } + a = do_alloc(allocator, siz * n); + if (!a) { + CLEAR_REMAINING_N_PTRS(); + goto error_cleanup; + } + STRUCT_MEMBER(void *, rv, field->offset) = a; + } + } else if (field->label == PROTOBUF_C_LABEL_REQUIRED) { + if (field->default_value == NULL && !REQUIRED_FIELD_BITMAP_IS_SET(f)) { + CLEAR_REMAINING_N_PTRS(); + PROTOBUF_C_UNPACK_ERROR("message '%s': missing required field '%s'", + desc->name, field->name); + goto error_cleanup; + } + } + } +#undef CLEAR_REMAINING_N_PTRS + + /* allocate space for unknown fields */ + if (n_unknown) { + rv->unknown_fields = + do_alloc(allocator, n_unknown * sizeof(ProtobufCMessageUnknownField)); + if (rv->unknown_fields == NULL) goto error_cleanup; + } + + /* do real parsing */ + for (i_slab = 0; i_slab <= which_slab; i_slab++) { + unsigned max = + (i_slab == which_slab) ? in_slab_index : (1UL << (i_slab + 4)); + ScannedMember *slab = scanned_member_slabs[i_slab]; + + for (j = 0; j < max; j++) { + if (!parse_member(slab + j, rv, allocator)) { + PROTOBUF_C_UNPACK_ERROR( + "error parsing member %s of %s", + slab->field ? 
slab->field->name : "*unknown-field*", desc->name); + goto error_cleanup; + } + } + } + + /* cleanup */ + for (j = 1; j <= which_slab; j++) do_free(allocator, scanned_member_slabs[j]); + if (required_fields_bitmap_alloced) + do_free(allocator, required_fields_bitmap); + return rv; + +error_cleanup: + protobuf_c_message_free_unpacked(rv, allocator); + for (j = 1; j <= which_slab; j++) do_free(allocator, scanned_member_slabs[j]); + if (required_fields_bitmap_alloced) + do_free(allocator, required_fields_bitmap); + return NULL; + +error_cleanup_during_scan: + do_free(allocator, rv); + for (j = 1; j <= which_slab; j++) do_free(allocator, scanned_member_slabs[j]); + if (required_fields_bitmap_alloced) + do_free(allocator, required_fields_bitmap); + return NULL; +} + +void protobuf_c_message_free_unpacked(ProtobufCMessage *message, + ProtobufCAllocator *allocator) { + const ProtobufCMessageDescriptor *desc; + unsigned f; + + if (message == NULL) return; + + desc = message->descriptor; + + ASSERT_IS_MESSAGE(message); + + if (allocator == NULL) allocator = &protobuf_c__allocator; + message->descriptor = NULL; + for (f = 0; f < desc->n_fields; f++) { + if (0 != (desc->fields[f].flags & PROTOBUF_C_FIELD_FLAG_ONEOF) && + desc->fields[f].id != + STRUCT_MEMBER(uint32_t, message, + desc->fields[f].quantifier_offset)) { + /* This is not the selected oneof, skip it */ + continue; + } + + if (desc->fields[f].label == PROTOBUF_C_LABEL_REPEATED) { + size_t n = + STRUCT_MEMBER(size_t, message, desc->fields[f].quantifier_offset); + void *arr = STRUCT_MEMBER(void *, message, desc->fields[f].offset); + + if (arr != NULL) { + if (desc->fields[f].type == PROTOBUF_C_TYPE_STRING) { + unsigned i; + for (i = 0; i < n; i++) do_free(allocator, ((char **)arr)[i]); + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_BYTES) { + unsigned i; + for (i = 0; i < n; i++) + do_free(allocator, ((ProtobufCBinaryData *)arr)[i].data); + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_MESSAGE) { + unsigned i; + for (i = 0; i < n; i++) + protobuf_c_message_free_unpacked(((ProtobufCMessage **)arr)[i], + allocator); + } + do_free(allocator, arr); + } + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_STRING) { + char *str = STRUCT_MEMBER(char *, message, desc->fields[f].offset); + + if (str && str != desc->fields[f].default_value) do_free(allocator, str); + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_BYTES) { + void *data = + STRUCT_MEMBER(ProtobufCBinaryData, message, desc->fields[f].offset) + .data; + const ProtobufCBinaryData *default_bd; + + default_bd = desc->fields[f].default_value; + if (data != NULL && (default_bd == NULL || default_bd->data != data)) { + do_free(allocator, data); + } + } else if (desc->fields[f].type == PROTOBUF_C_TYPE_MESSAGE) { + ProtobufCMessage *sm; + + sm = STRUCT_MEMBER(ProtobufCMessage *, message, desc->fields[f].offset); + if (sm && sm != desc->fields[f].default_value) + protobuf_c_message_free_unpacked(sm, allocator); + } + } + + for (f = 0; f < message->n_unknown_fields; f++) + do_free(allocator, message->unknown_fields[f].data); + if (message->unknown_fields != NULL) + do_free(allocator, message->unknown_fields); + + do_free(allocator, message); +} + +void protobuf_c_message_init(const ProtobufCMessageDescriptor *descriptor, + void *message) { + descriptor->message_init((ProtobufCMessage *)(message)); +} + +protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *message) { + unsigned i; + + if (!message || !message->descriptor || + message->descriptor->magic != 
PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC) { + return FALSE; + } + + for (i = 0; i < message->descriptor->n_fields; i++) { + const ProtobufCFieldDescriptor *f = message->descriptor->fields + i; + ProtobufCType type = f->type; + ProtobufCLabel label = f->label; + void *field = STRUCT_MEMBER_P(message, f->offset); + + if (label == PROTOBUF_C_LABEL_REPEATED) { + size_t *quantity = STRUCT_MEMBER_P(message, f->quantifier_offset); + + if (*quantity > 0 && *(void **)field == NULL) { + return FALSE; + } + + if (type == PROTOBUF_C_TYPE_MESSAGE) { + ProtobufCMessage **submessage = *(ProtobufCMessage ***)field; + unsigned j; + for (j = 0; j < *quantity; j++) { + if (!protobuf_c_message_check(submessage[j])) return FALSE; + } + } else if (type == PROTOBUF_C_TYPE_STRING) { + char **string = *(char ***)field; + unsigned j; + for (j = 0; j < *quantity; j++) { + if (!string[j]) return FALSE; + } + } else if (type == PROTOBUF_C_TYPE_BYTES) { + ProtobufCBinaryData *bd = *(ProtobufCBinaryData **)field; + unsigned j; + for (j = 0; j < *quantity; j++) { + if (bd[j].len > 0 && bd[j].data == NULL) return FALSE; + } + } + + } else { /* PROTOBUF_C_LABEL_REQUIRED or PROTOBUF_C_LABEL_OPTIONAL */ + + if (type == PROTOBUF_C_TYPE_MESSAGE) { + ProtobufCMessage *submessage = *(ProtobufCMessage **)field; + if (label == PROTOBUF_C_LABEL_REQUIRED || submessage != NULL) { + if (!protobuf_c_message_check(submessage)) return FALSE; + } + } else if (type == PROTOBUF_C_TYPE_STRING) { + char *string = *(char **)field; + if (label == PROTOBUF_C_LABEL_REQUIRED && string == NULL) return FALSE; + } else if (type == PROTOBUF_C_TYPE_BYTES) { + protobuf_c_boolean *has = + STRUCT_MEMBER_P(message, f->quantifier_offset); + ProtobufCBinaryData *bd = field; + if (label == PROTOBUF_C_LABEL_REQUIRED || *has == TRUE) { + if (bd->len > 0 && bd->data == NULL) return FALSE; + } + } + } + } + + return TRUE; +} + +/* === services === */ + +typedef void (*GenericHandler)(void *service, const ProtobufCMessage *input, + ProtobufCClosure closure, void *closure_data); diff --git a/tools/quantification/src/protobuf-c.h b/tools/quantification/src/protobuf-c.h new file mode 100644 index 0000000000000000000000000000000000000000..bd85695b868af6c7b91590196339bc4f7826a256 --- /dev/null +++ b/tools/quantification/src/protobuf-c.h @@ -0,0 +1,921 @@ +/* + * Copyright (c) 2008-2017, Dave Benson and the protobuf-c authors. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/*! \file + * \mainpage Introduction + * + * This is [protobuf-c], a C implementation of [Protocol Buffers]. + * + * This file defines the public API for the `libprotobuf-c` support library. + * This API includes interfaces that can be used directly by client code as well + * as the interfaces used by the code generated by the `protoc-c` compiler. + * + * The `libprotobuf-c` support library performs the actual serialization and + * deserialization of Protocol Buffers messages. It interacts with structures, + * definitions, and metadata generated by the `protoc-c` compiler from .proto + * files. + * + * \authors Dave Benson and the `protobuf-c` authors. + * + * \copyright 2008-2014. Licensed under the terms of the [BSD-2-Clause] license. + * + * [protobuf-c]: https://github.com/protobuf-c/protobuf-c + * [Protocol Buffers]: https://developers.google.com/protocol-buffers/ + * [BSD-2-Clause]: http://opensource.org/licenses/BSD-2-Clause + * + * \page gencode Generated Code + * + * For each enum, we generate a C enum. For each message, we generate a C + * structure which can be cast to a `ProtobufCMessage`. + * + * For each enum and message, we generate a descriptor object that allows us to + * implement a kind of reflection on the structures. + * + * First, some naming conventions: + * + * - The name of the type for enums and messages and services is camel case + * (meaning WordsAreCrammedTogether) except that double underscores are used + * to delimit scopes. For example, the following `.proto` file: + * +~~~{.proto} + package foo.bar; + message BazBah { + optional int32 val = 1; + } +~~~ + * + * would generate a C type `Foo__Bar__BazBah`. + * + * - Identifiers for functions and globals are all lowercase, with camel case + * words separated by single underscores. For example, one of the function + * prototypes generated by `protoc-c` for the above example: + * +~~~{.c} +Foo__Bar__BazBah * + foo__bar__baz_bah__unpack + (ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +~~~ + * + * - Identifiers for enum values contain an uppercase prefix which embeds the + * package name and the enum type name. + * + * - A double underscore is used to separate further components of identifier + * names. + * + * For example, in the name of the unpack function above, the package name + * `foo.bar` has become `foo__bar`, the message name BazBah has become + * `baz_bah`, and the method name is `unpack`. These are all joined with double + * underscores to form the C identifier `foo__bar__baz_bah__unpack`. + * + * We also generate descriptor objects for messages and enums. These are + * declared in the `.pb-c.h` files: + * +~~~{.c} +extern const ProtobufCMessageDescriptor foo__bar__baz_bah__descriptor; +~~~ + * + * The message structures all begin with `ProtobufCMessageDescriptor *` which is + * sufficient to allow them to be cast to `ProtobufCMessage`. + * + * For each message defined in a `.proto` file, we generate a number of + * functions and macros. 
Each function name contains a prefix based on the + * package name and message name in order to make it a unique C identifier. + * + * - `INIT`. Statically initializes a message object, initializing its + * descriptor and setting its fields to default values. Uninitialized + * messages cannot be processed by the protobuf-c library. + * +~~~{.c} +#define FOO__BAR__BAZ_BAH__INIT \ + { PROTOBUF_C_MESSAGE_INIT (&foo__bar__baz_bah__descriptor), 0 } +~~~ + * - `init()`. Initializes a message object, initializing its descriptor and + * setting its fields to default values. Uninitialized messages cannot be + * processed by the protobuf-c library. + * +~~~{.c} +void foo__bar__baz_bah__init + (Foo__Bar__BazBah *message); +~~~ + * - `unpack()`. Unpacks data for a particular message format. Note that the + * `allocator` parameter is usually `NULL` to indicate that the system's + * `malloc()` and `free()` functions should be used for dynamically allocating + * memory. + * +~~~{.c} +Foo__Bar__BazBah * + foo__bar__baz_bah__unpack + (ProtobufCAllocator *allocator, + size_t len, + const uint8_t *data); +~~~ + * + * - `free_unpacked()`. Frees a message object obtained with the `unpack()` + * method. Freeing `NULL` is allowed (the same as with `free()`). + * +~~~{.c} +void foo__bar__baz_bah__free_unpacked + (Foo__Bar__BazBah *message, + ProtobufCAllocator *allocator); +~~~ + * + * - `get_packed_size()`. Calculates the length in bytes of the serialized + * representation of the message object. + * +~~~{.c} +size_t foo__bar__baz_bah__get_packed_size + (const Foo__Bar__BazBah *message); +~~~ + * + * - `pack()`. Pack a message object into a preallocated buffer. Assumes that + * the buffer is large enough. (Use `get_packed_size()` first.) + * +~~~{.c} +size_t foo__bar__baz_bah__pack + (const Foo__Bar__BazBah *message, + uint8_t *out); +~~~ + * + * - `pack_to_buffer()`. Packs a message into a "virtual buffer". This is an + * object which defines an "append bytes" callback to consume data as it is + * serialized. + * +~~~{.c} +size_t foo__bar__baz_bah__pack_to_buffer + (const Foo__Bar__BazBah *message, + ProtobufCBuffer *buffer); +~~~ + * + * \page pack Packing and unpacking messages + * + * To pack a message, first compute the packed size of the message with + * protobuf_c_message_get_packed_size(), then allocate a buffer of at least + * that size, then call protobuf_c_message_pack(). + * + * Alternatively, a message can be serialized without calculating the final size + * first. Use the protobuf_c_message_pack_to_buffer() function and provide a + * ProtobufCBuffer object which implements an "append" method that consumes + * data. + * + * To unpack a message, call the protobuf_c_message_unpack() function. The + * result can be cast to an object of the type that matches the descriptor for + * the message. + * + * The result of unpacking a message should be freed with + * protobuf_c_message_free_unpacked(). 
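+ *
+ * As a sketch (assuming the generated `Foo__Bar__BazBah` type from the example
+ * above, and a serialized buffer `data` of `len` bytes), a complete
+ * unpack/use/free round trip might look like:
+ *
+~~~{.c}
+Foo__Bar__BazBah *msg;
+
+/* A NULL allocator selects the system's malloc() and free(). */
+msg = foo__bar__baz_bah__unpack(NULL, len, data);
+if (msg == NULL)
+    return;     /* `data` was not a valid serialized message */
+/* ... read msg->val ... */
+foo__bar__baz_bah__free_unpacked(msg, NULL);
+~~~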
+ */
+
+#ifndef PROTOBUF_C_H
+#define PROTOBUF_C_H
+
+#include <assert.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+#define PROTOBUF_C__BEGIN_DECLS extern "C" {
+#define PROTOBUF_C__END_DECLS }
+#else
+#define PROTOBUF_C__BEGIN_DECLS
+#define PROTOBUF_C__END_DECLS
+#endif
+
+PROTOBUF_C__BEGIN_DECLS
+
+#if defined(_WIN32) && defined(PROTOBUF_C_USE_SHARED_LIB)
+#ifdef PROTOBUF_C_EXPORT
+#define PROTOBUF_C__API __declspec(dllexport)
+#else
+#define PROTOBUF_C__API __declspec(dllimport)
+#endif
+#else
+#define PROTOBUF_C__API
+#endif
+
+#if !defined(PROTOBUF_C__NO_DEPRECATED) && \
+    ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+#define PROTOBUF_C__DEPRECATED __attribute__((__deprecated__))
+#else
+#define PROTOBUF_C__DEPRECATED
+#endif
+
+#ifndef PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE
+#define PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(enum_name) \
+  , _##enum_name##_IS_INT_SIZE = INT_MAX
+#endif
+
+#define PROTOBUF_C__SERVICE_DESCRIPTOR_MAGIC 0x14159bc3
+#define PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC 0x28aaeef9
+#define PROTOBUF_C__ENUM_DESCRIPTOR_MAGIC 0x114315af
+
+/* Empty string used for initializers */
+extern const char protobuf_c_empty_string[];
+
+/**
+ * \defgroup api Public API
+ *
+ * This is the public API for `libprotobuf-c`. These interfaces are stable and
+ * subject to Semantic Versioning guarantees.
+ *
+ * @{
+ */
+
+/**
+ * Values for the `flags` word in `ProtobufCFieldDescriptor`.
+ */
+typedef enum {
+  /** Set if the field is repeated and marked with the `packed` option. */
+  PROTOBUF_C_FIELD_FLAG_PACKED = (1 << 0),
+
+  /** Set if the field is marked with the `deprecated` option. */
+  PROTOBUF_C_FIELD_FLAG_DEPRECATED = (1 << 1),
+
+  /** Set if the field is a member of a oneof (union). */
+  PROTOBUF_C_FIELD_FLAG_ONEOF = (1 << 2),
+} ProtobufCFieldFlag;
+
+/**
+ * Message field rules.
+ *
+ * \see [Defining A Message Type] in the Protocol Buffers documentation.
+ *
+ * [Defining A Message Type]:
+ *      https://developers.google.com/protocol-buffers/docs/proto#simple
+ */
+typedef enum {
+  /** A well-formed message must have exactly one of this field. */
+  PROTOBUF_C_LABEL_REQUIRED,
+
+  /**
+   * A well-formed message can have zero or one of this field (but not
+   * more than one).
+   */
+  PROTOBUF_C_LABEL_OPTIONAL,
+
+  /**
+   * This field can be repeated any number of times (including zero) in a
+   * well-formed message. The order of the repeated values will be
+   * preserved.
+   */
+  PROTOBUF_C_LABEL_REPEATED,
+
+  /**
+   * This field has no label. This is valid only in proto3 and is
+   * equivalent to OPTIONAL but no "has" quantifier will be consulted.
+   */
+  PROTOBUF_C_LABEL_NONE,
+} ProtobufCLabel;
+
+/**
+ * Field value types.
+ *
+ * \see [Scalar Value Types] in the Protocol Buffers documentation.
+ * + * [Scalar Value Types]: + * https://developers.google.com/protocol-buffers/docs/proto#scalar + */ +typedef enum { + PROTOBUF_C_TYPE_INT32, /**< int32 */ + PROTOBUF_C_TYPE_SINT32, /**< signed int32 */ + PROTOBUF_C_TYPE_SFIXED32, /**< signed int32 (4 bytes) */ + PROTOBUF_C_TYPE_INT64, /**< int64 */ + PROTOBUF_C_TYPE_SINT64, /**< signed int64 */ + PROTOBUF_C_TYPE_SFIXED64, /**< signed int64 (8 bytes) */ + PROTOBUF_C_TYPE_UINT32, /**< unsigned int32 */ + PROTOBUF_C_TYPE_FIXED32, /**< unsigned int32 (4 bytes) */ + PROTOBUF_C_TYPE_UINT64, /**< unsigned int64 */ + PROTOBUF_C_TYPE_FIXED64, /**< unsigned int64 (8 bytes) */ + PROTOBUF_C_TYPE_FLOAT, /**< float */ + PROTOBUF_C_TYPE_DOUBLE, /**< double */ + PROTOBUF_C_TYPE_BOOL, /**< boolean */ + PROTOBUF_C_TYPE_ENUM, /**< enumerated type */ + PROTOBUF_C_TYPE_STRING, /**< UTF-8 or ASCII string */ + PROTOBUF_C_TYPE_BYTES, /**< arbitrary byte sequence */ + PROTOBUF_C_TYPE_MESSAGE, /**< nested message */ +} ProtobufCType; + +/** + * Field wire types. + * + * \see [Message Structure] in the Protocol Buffers documentation. + * + * [Message Structure]: + * https://developers.google.com/protocol-buffers/docs/encoding#structure + */ +typedef enum { + PROTOBUF_C_WIRE_TYPE_VARINT = 0, + PROTOBUF_C_WIRE_TYPE_64BIT = 1, + PROTOBUF_C_WIRE_TYPE_LENGTH_PREFIXED = 2, + /* "Start group" and "end group" wire types are unsupported. */ + PROTOBUF_C_WIRE_TYPE_32BIT = 5, +} ProtobufCWireType; + +struct ProtobufCAllocator; +struct ProtobufCBinaryData; +struct ProtobufCBuffer; +struct ProtobufCBufferSimple; +struct ProtobufCEnumDescriptor; +struct ProtobufCEnumValue; +struct ProtobufCEnumValueIndex; +struct ProtobufCFieldDescriptor; +struct ProtobufCIntRange; +struct ProtobufCMessage; +struct ProtobufCMessageDescriptor; +struct ProtobufCMessageUnknownField; +struct ProtobufCMethodDescriptor; +struct ProtobufCService; +struct ProtobufCServiceDescriptor; + +typedef struct ProtobufCAllocator ProtobufCAllocator; +typedef struct ProtobufCBinaryData ProtobufCBinaryData; +typedef struct ProtobufCBuffer ProtobufCBuffer; +typedef struct ProtobufCBufferSimple ProtobufCBufferSimple; +typedef struct ProtobufCEnumDescriptor ProtobufCEnumDescriptor; +typedef struct ProtobufCEnumValue ProtobufCEnumValue; +typedef struct ProtobufCEnumValueIndex ProtobufCEnumValueIndex; +typedef struct ProtobufCFieldDescriptor ProtobufCFieldDescriptor; +typedef struct ProtobufCIntRange ProtobufCIntRange; +typedef struct ProtobufCMessage ProtobufCMessage; +typedef struct ProtobufCMessageDescriptor ProtobufCMessageDescriptor; +typedef struct ProtobufCMessageUnknownField ProtobufCMessageUnknownField; +typedef struct ProtobufCMethodDescriptor ProtobufCMethodDescriptor; +typedef struct ProtobufCService ProtobufCService; +typedef struct ProtobufCServiceDescriptor ProtobufCServiceDescriptor; + +/** Boolean type. */ +typedef int protobuf_c_boolean; + +typedef void (*ProtobufCClosure)(const ProtobufCMessage *, void *closure_data); +typedef void (*ProtobufCMessageInit)(ProtobufCMessage *); +typedef void (*ProtobufCServiceDestroy)(ProtobufCService *); + +/** + * Structure for defining a custom memory allocator. + */ +struct ProtobufCAllocator { + /** Function to allocate memory. */ + void *(*alloc)(void *allocator_data, size_t size); + + /** Function to free memory. */ + void (*free)(void *allocator_data, void *pointer); + + /** Opaque pointer passed to `alloc` and `free` functions. */ + void *allocator_data; +}; + +/** + * Structure for the protobuf `bytes` scalar type. 
+ * + * The data contained in a `ProtobufCBinaryData` is an arbitrary sequence of + * bytes. It may contain embedded `NUL` characters and is not required to be + * `NUL`-terminated. + */ +struct ProtobufCBinaryData { + size_t len; /**< Number of bytes in the `data` field. */ + uint8_t *data; /**< Data bytes. */ +}; + +/** + * Structure for defining a virtual append-only buffer. Used by + * protobuf_c_message_pack_to_buffer() to abstract the consumption of serialized + * bytes. + * + * `ProtobufCBuffer` "subclasses" may be defined on the stack. For example, to + * write to a `FILE` object: + * +~~~{.c} +typedef struct { + ProtobufCBuffer base; + FILE *fp; +} BufferAppendToFile; + +static void +my_buffer_file_append(ProtobufCBuffer *buffer, + size_t len, + const uint8_t *data) +{ + BufferAppendToFile *file_buf = (BufferAppendToFile *) buffer; + fwrite(data, len, 1, file_buf->fp); // XXX: No error handling! +} +~~~ + * + * To use this new type of ProtobufCBuffer, it could be called as follows: + * +~~~{.c} +... +BufferAppendToFile tmp = {0}; +tmp.base.append = my_buffer_file_append; +tmp.fp = fp; +protobuf_c_message_pack_to_buffer(&message, &tmp); +... +~~~ + */ +struct ProtobufCBuffer { + /** Append function. Consumes the `len` bytes stored at `data`. */ + void (*append)(ProtobufCBuffer *buffer, size_t len, const uint8_t *data); +}; + +/** + * Simple buffer "subclass" of `ProtobufCBuffer`. + * + * A `ProtobufCBufferSimple` object is declared on the stack and uses a + * scratch buffer provided by the user for the initial allocation. It performs + * exponential resizing, using dynamically allocated memory. A + * `ProtobufCBufferSimple` object can be created and used as follows: + * +~~~{.c} +uint8_t pad[128]; +ProtobufCBufferSimple simple = PROTOBUF_C_BUFFER_SIMPLE_INIT(pad); +ProtobufCBuffer *buffer = (ProtobufCBuffer *) &simple; +~~~ + * + * `buffer` can now be used with `protobuf_c_message_pack_to_buffer()`. Once a + * message has been serialized to a `ProtobufCBufferSimple` object, the + * serialized data bytes can be accessed from the `.data` field. + * + * To free the memory allocated by a `ProtobufCBufferSimple` object, if any, + * call PROTOBUF_C_BUFFER_SIMPLE_CLEAR() on the object, for example: + * +~~~{.c} +PROTOBUF_C_BUFFER_SIMPLE_CLEAR(&simple); +~~~ + * + * \see PROTOBUF_C_BUFFER_SIMPLE_INIT + * \see PROTOBUF_C_BUFFER_SIMPLE_CLEAR + */ +struct ProtobufCBufferSimple { + /** "Base class". */ + ProtobufCBuffer base; + /** Number of bytes allocated in `data`. */ + size_t alloced; + /** Number of bytes currently stored in `data`. */ + size_t len; + /** Data bytes. */ + uint8_t *data; + /** Whether `data` must be freed. */ + protobuf_c_boolean must_free_data; + /** Allocator to use. May be NULL to indicate the system allocator. */ + ProtobufCAllocator *allocator; +}; + +/** + * Describes an enumeration as a whole, with all of its values. + */ +struct ProtobufCEnumDescriptor { + /** Magic value checked to ensure that the API is used correctly. */ + uint32_t magic; + + /** The qualified name (e.g., "namespace.Type"). */ + const char *name; + /** The unqualified name as given in the .proto file (e.g., "Type"). */ + const char *short_name; + /** Identifier used in generated C code. */ + const char *c_name; + /** The dot-separated namespace. */ + const char *package_name; + + /** Number elements in `values`. */ + unsigned n_values; + /** Array of distinct values, sorted by numeric value. */ + const ProtobufCEnumValue *values; + + /** Number of elements in `values_by_name`. 
*/ + unsigned n_value_names; + /** Array of named values, including aliases, sorted by name. */ + const ProtobufCEnumValueIndex *values_by_name; + + /** Number of elements in `value_ranges`. */ + unsigned n_value_ranges; + /** Value ranges, for faster lookups by numeric value. */ + const ProtobufCIntRange *value_ranges; + + /** Reserved for future use. */ + void *reserved1; + /** Reserved for future use. */ + void *reserved2; + /** Reserved for future use. */ + void *reserved3; + /** Reserved for future use. */ + void *reserved4; +}; + +/** + * Represents a single value of an enumeration. + */ +struct ProtobufCEnumValue { + /** The string identifying this value in the .proto file. */ + const char *name; + + /** The string identifying this value in generated C code. */ + const char *c_name; + + /** The numeric value assigned in the .proto file. */ + int value; +}; + +/** + * Used by `ProtobufCEnumDescriptor` to look up enum values. + */ +struct ProtobufCEnumValueIndex { + /** Name of the enum value. */ + const char *name; + /** Index into values[] array. */ + unsigned index; +}; + +/** + * Describes a single field in a message. + */ +struct ProtobufCFieldDescriptor { + /** Name of the field as given in the .proto file. */ + const char *name; + + /** Tag value of the field as given in the .proto file. */ + uint32_t id; + + /** Whether the field is `REQUIRED`, `OPTIONAL`, or `REPEATED`. */ + ProtobufCLabel label; + + /** The type of the field. */ + ProtobufCType type; + + /** + * The offset in bytes of the message's C structure's quantifier field + * (the `has_MEMBER` field for optional members or the `n_MEMBER` field + * for repeated members or the case enum for oneofs). + */ + unsigned quantifier_offset; + + /** + * The offset in bytes into the message's C structure for the member + * itself. + */ + unsigned offset; + + /** + * A type-specific descriptor. + * + * If `type` is `PROTOBUF_C_TYPE_ENUM`, then `descriptor` points to the + * corresponding `ProtobufCEnumDescriptor`. + * + * If `type` is `PROTOBUF_C_TYPE_MESSAGE`, then `descriptor` points to + * the corresponding `ProtobufCMessageDescriptor`. + * + * Otherwise this field is NULL. + */ + const void *descriptor; /* for MESSAGE and ENUM types */ + + /** The default value for this field, if defined. May be NULL. */ + const void *default_value; + + /** + * A flag word. Zero or more of the bits defined in the + * `ProtobufCFieldFlag` enum may be set. + */ + uint32_t flags; + + /** Reserved for future use. */ + unsigned reserved_flags; + /** Reserved for future use. */ + void *reserved2; + /** Reserved for future use. */ + void *reserved3; +}; + +/** + * Helper structure for optimizing int => index lookups in the case + * where the keys are mostly consecutive values, as they presumably are for + * enums and fields. + * + * The data structures requires that the values in the original array are + * sorted. + */ +struct ProtobufCIntRange { + int start_value; + unsigned orig_index; + /* + * NOTE: the number of values in the range can be inferred by looking + * at the next element's orig_index. A dummy element is added to make + * this simple. + */ +}; + +/** + * An instance of a message. + * + * `ProtobufCMessage` is a light-weight "base class" for all messages. + * + * In particular, `ProtobufCMessage` doesn't have any allocation policy + * associated with it. That's because it's common to create `ProtobufCMessage` + * objects on the stack. In fact, that's what we recommend for sending messages. 
+ * If the object is allocated from the stack, you can't really have a memory + * leak. + * + * This means that calls to functions like protobuf_c_message_unpack() which + * return a `ProtobufCMessage` must be paired with a call to a free function, + * like protobuf_c_message_free_unpacked(). + */ +struct ProtobufCMessage { + /** The descriptor for this message type. */ + const ProtobufCMessageDescriptor *descriptor; + /** The number of elements in `unknown_fields`. */ + unsigned n_unknown_fields; + /** The fields that weren't recognized by the parser. */ + ProtobufCMessageUnknownField *unknown_fields; +}; + +/** + * Describes a message. + */ +struct ProtobufCMessageDescriptor { + /** Magic value checked to ensure that the API is used correctly. */ + uint32_t magic; + + /** The qualified name (e.g., "namespace.Type"). */ + const char *name; + /** The unqualified name as given in the .proto file (e.g., "Type"). */ + const char *short_name; + /** Identifier used in generated C code. */ + const char *c_name; + /** The dot-separated namespace. */ + const char *package_name; + + /** + * Size in bytes of the C structure representing an instance of this + * type of message. + */ + size_t sizeof_message; + + /** Number of elements in `fields`. */ + unsigned n_fields; + /** Field descriptors, sorted by tag number. */ + const ProtobufCFieldDescriptor *fields; + /** Used for looking up fields by name. */ + const unsigned *fields_sorted_by_name; + + /** Number of elements in `field_ranges`. */ + unsigned n_field_ranges; + /** Used for looking up fields by id. */ + const ProtobufCIntRange *field_ranges; + + /** Message initialisation function. */ + ProtobufCMessageInit message_init; + + /** Reserved for future use. */ + void *reserved1; + /** Reserved for future use. */ + void *reserved2; + /** Reserved for future use. */ + void *reserved3; +}; + +/** + * An unknown message field. + */ +struct ProtobufCMessageUnknownField { + /** The tag number. */ + uint32_t tag; + /** The wire type of the field. */ + ProtobufCWireType wire_type; + /** Number of bytes in `data`. */ + size_t len; + /** Field data. */ + uint8_t *data; +}; + +/** + * Method descriptor. + */ +struct ProtobufCMethodDescriptor { + /** Method name. */ + const char *name; + /** Input message descriptor. */ + const ProtobufCMessageDescriptor *input; + /** Output message descriptor. */ + const ProtobufCMessageDescriptor *output; +}; + +/** + * Service. + */ +struct ProtobufCService { + /** Service descriptor. */ + const ProtobufCServiceDescriptor *descriptor; + /** Function to invoke the service. */ + void (*invoke)(ProtobufCService *service, unsigned method_index, + const ProtobufCMessage *input, ProtobufCClosure closure, + void *closure_data); + /** Function to destroy the service. */ + void (*destroy)(ProtobufCService *service); +}; + +/** + * Service descriptor. + */ +struct ProtobufCServiceDescriptor { + /** Magic value checked to ensure that the API is used correctly. */ + uint32_t magic; + + /** Service name. */ + const char *name; + /** Short version of service name. */ + const char *short_name; + /** C identifier for the service name. */ + const char *c_name; + /** Package name. */ + const char *package; + /** Number of elements in `methods`. */ + unsigned n_methods; + /** Method descriptors, in the order defined in the .proto file. */ + const ProtobufCMethodDescriptor *methods; + /** Sort index of methods. */ + const unsigned *method_indices_by_name; +}; + +/** + * Get the version of the protobuf-c library. 
Note that this is the version of + * the library linked against, not the version of the headers compiled against. + * + * \return A string containing the version number of protobuf-c. + */ +PROTOBUF_C__API +const char *protobuf_c_version(void); + +/** + * Get the version of the protobuf-c library. Note that this is the version of + * the library linked against, not the version of the headers compiled against. + * + * \return A 32 bit unsigned integer containing the version number of + * protobuf-c, represented in base-10 as (MAJOR*1E6) + (MINOR*1E3) + PATCH. + */ +PROTOBUF_C__API +uint32_t protobuf_c_version_number(void); + +/** + * The version of the protobuf-c headers, represented as a string using the same + * format as protobuf_c_version(). + */ +#define PROTOBUF_C_VERSION "1.3.0" + +/** + * The version of the protobuf-c headers, represented as an integer using the + * same format as protobuf_c_version_number(). + */ +#define PROTOBUF_C_VERSION_NUMBER 1003000 + +/** + * The minimum protoc-c version which works with the current version of the + * protobuf-c headers. + */ +#define PROTOBUF_C_MIN_COMPILER_VERSION 1000000 + +/** + * Determine the number of bytes required to store the serialised message. + * + * \param message + * The message object to serialise. + * \return + * Number of bytes. + */ +PROTOBUF_C__API +size_t protobuf_c_message_get_packed_size(const ProtobufCMessage *message); + +/** + * Unpack a serialised message into an in-memory representation. + * + * \param descriptor + * The message descriptor. + * \param allocator + * `ProtobufCAllocator` to use for memory allocation. May be NULL to + * specify the default allocator. + * \param len + * Length in bytes of the serialised message. + * \param data + * Pointer to the serialised message. + * \return + * An unpacked message object. + * \retval NULL + * If an error occurred during unpacking. + */ +PROTOBUF_C__API +ProtobufCMessage *protobuf_c_message_unpack( + const ProtobufCMessageDescriptor *descriptor, ProtobufCAllocator *allocator, + size_t len, const uint8_t *data); + +/** + * Free an unpacked message object. + * + * This function should be used to deallocate the memory used by a call to + * protobuf_c_message_unpack(). + * + * \param message + * The message object to free. May be NULL. + * \param allocator + * `ProtobufCAllocator` to use for memory deallocation. May be NULL to + * specify the default allocator. + */ +PROTOBUF_C__API +void protobuf_c_message_free_unpacked(ProtobufCMessage *message, + ProtobufCAllocator *allocator); + +/** + * Check the validity of a message object. + * + * Makes sure all required fields (`PROTOBUF_C_LABEL_REQUIRED`) are present. + * Recursively checks nested messages. + * + * \retval TRUE + * Message is valid. + * \retval FALSE + * Message is invalid. + */ +PROTOBUF_C__API +protobuf_c_boolean protobuf_c_message_check(const ProtobufCMessage *); + +/** Message initialiser. */ +#define PROTOBUF_C_MESSAGE_INIT(descriptor) \ + { descriptor, 0, NULL } + +/** + * Initialise a message object from a message descriptor. + * + * \param descriptor + * Message descriptor. + * \param message + * Allocated block of memory of size `descriptor->sizeof_message`. + */ +PROTOBUF_C__API +void protobuf_c_message_init(const ProtobufCMessageDescriptor *descriptor, + void *message); + +/** + * Initialise a `ProtobufCBufferSimple` object. 
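+ *
+ * For a usage example, see the `ProtobufCBufferSimple` documentation above.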
+ */
+#define PROTOBUF_C_BUFFER_SIMPLE_INIT(array_of_bytes) \
+  { \
+    {protobuf_c_buffer_simple_append}, sizeof(array_of_bytes), 0, \
+    (array_of_bytes), 0, NULL \
+  }
+
+/**
+ * Clear a `ProtobufCBufferSimple` object, freeing any allocated memory.
+ */
+#define PROTOBUF_C_BUFFER_SIMPLE_CLEAR(simp_buf) \
+  do { \
+    if ((simp_buf)->must_free_data) { \
+      if ((simp_buf)->allocator != NULL) \
+        (simp_buf)->allocator->free((simp_buf)->allocator, (simp_buf)->data); \
+      else \
+        free((simp_buf)->data); \
+    } \
+  } while (0)
+
+/**
+ * The `append` method for `ProtobufCBufferSimple`.
+ *
+ * \param buffer
+ *      The buffer object to append to. Must actually be a
+ *      `ProtobufCBufferSimple` object.
+ * \param len
+ *      Number of bytes in `data`.
+ * \param data
+ *      Data to append.
+ */
+PROTOBUF_C__API
+void protobuf_c_buffer_simple_append(ProtobufCBuffer *buffer, size_t len,
+                                     const unsigned char *data);
+
+/**@}*/
+
+PROTOBUF_C__END_DECLS
+
+#endif /* PROTOBUF_C_H */
diff --git a/tools/quantification/src/tensor_desc.h b/tools/quantification/src/tensor_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..4eadf341db998ae12939d252d585051ba54c3bf0
--- /dev/null
+++ b/tools/quantification/src/tensor_desc.h
@@ -0,0 +1,72 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "src/framework.pb-c.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+enum VarType_Type {
+  VARTYPE_TYPE_BOOL = 0,
+  VARTYPE_TYPE_INT16 = 1,
+  VARTYPE_TYPE_INT32 = 2,
+  VARTYPE_TYPE_INT64 = 3,
+  VARTYPE_TYPE_FP16 = 4,
+  VARTYPE_TYPE_FP32 = 5,
+  VARTYPE_TYPE_FP64 = 6,
+  VARTYPE_TYPE_LOD_TENSOR = 7,
+  VARTYPE_TYPE_SELECTED_ROWS = 8,
+  VARTYPE_TYPE_FEED_MINIBATCH = 9,
+  VARTYPE_TYPE_FETCH_LIST = 10,
+  VARTYPE_TYPE_STEP_SCOPES = 11,
+  VARTYPE_TYPE_STEP_LOD_RANK_TABLE = 12,
+  VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY = 13,
+  VARTYPE_TYPE_STEP_PLACE_LIST = 14,
+  VARTYPE_TYPE_READER = 15,
+  VARTYPE_TYPE_CHANNEL = 16,
+  VARTYPE_TYPE_RAW = 17,
+  VARTYPE_TYPE_TUPLE = 18
+};
+
+class TensorDesc {
+ public:
+  TensorDesc() = default;
+  TensorDesc(const TensorDesc &desc) {
+    this->dims_ = desc.dims_;
+    this->data_type_ = desc.data_type_;
+  }
+
+  explicit TensorDesc(
+      PaddleMobile__Framework__Proto__VarType__TensorDesc *desc) {
+    for (int i = 0; i < desc->n_dims; ++i) {
+      int64_t d = desc->dims[i];
+      dims_.emplace_back(d);
+    }
+    data_type_ = (VarType_Type)desc->data_type;
+  }
+
+  std::vector<int64_t> Dims() const { return dims_; }
+  VarType_Type DataType() const { return data_type_; }
+
+ private:
+  std::vector<int64_t> dims_;
+  VarType_Type data_type_;
+};
+
+}  // namespace framework
+}  // namespace paddle_mobile
diff --git a/tools/quantification/src/var_desc.h b/tools/quantification/src/var_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b9c5ac4d672be2dd8a8a2a2695c2816f9cae05a
--- /dev/null
+++ b/tools/quantification/src/var_desc.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+#include "src/framework.pb-c.h"
+#include "src/tensor_desc.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+class VarDesc {
+ public:
+  VarDesc(const VarDesc &var_desc) {
+    this->data_type_ = var_desc.data_type_;
+    this->name_ = var_desc.name_;
+    this->persistable_ = var_desc.persistable_;
+    this->tensor_desc_ = var_desc.tensor_desc_;
+    this->type_ = var_desc.type_;
+  }
+  explicit VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) {
+    type_ = (VarType_Type)desc->type->type;
+    name_ = std::string(desc->name);
+    persistable_ = static_cast<bool>(desc->persistable);
+
+    switch (type_) {
+      case VARTYPE_TYPE_SELECTED_ROWS:
+        tensor_desc_ = TensorDesc(desc->type->selected_rows);
+        break;
+      case VARTYPE_TYPE_LOD_TENSOR:
+        tensor_desc_ = TensorDesc(desc->type->lod_tensor->tensor);
+        break;
+      case VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY:
+        tensor_desc_ = TensorDesc(desc->type->tensor_array->tensor);
+        break;
+      default:
+        break;
+    }
+    switch (type_) {
+      case VARTYPE_TYPE_CHANNEL:
+        data_type_ = (VarType_Type)desc->type->channel->data_type;
+        break;
+      default:
+        data_type_ = tensor_desc_.DataType();
+        break;
+    }
+  }
+  std::string Name() const { return name_; }
+
+  VarType_Type Type() const { return type_; }
+
+  bool Persistable() const { return persistable_; }
+
+  const TensorDesc &Tensor_desc() const { return tensor_desc_; }
+
+ private:
+  std::string name_;
+  bool persistable_;
+  TensorDesc tensor_desc_;
+  VarType_Type type_;
+  VarType_Type data_type_;
+};
+
+}  // namespace framework
+}  // namespace paddle_mobile
diff --git a/tools/toolchains/arm-android-neon.cmake b/tools/toolchains/arm-android-neon.cmake
index f2fa600b90fb54886838e953e61c1e940569dee6..5e431059a974810b2fd0481e0942447f57bf1286 100644
--- a/tools/toolchains/arm-android-neon.cmake
+++ b/tools/toolchains/arm-android-neon.cmake
@@ -1,2 +1,5 @@
 set(ANDROID_ARM_NEON ON)
-include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
\ No newline at end of file
+set(ANDROID_PIE TRUE)
+set(ANDROID_STL "c++_static")
+set(ANDROID_PLATFORM "android-22")
+include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")