Unverified commit 523a949c, authored by eclipsycn, committed by GitHub

Merge branch 'develop' into develop

-cmake_minimum_required(VERSION 3.0)
+cmake_minimum_required(VERSION 3.6)
 project(paddle-mobile)
 option(DEBUGING "enable debug mode" ON)
-option(USE_OPENMP "openmp support" ON)
+option(USE_OPENMP "openmp support" OFF)
 option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build
...
@@ -15,7 +15,7 @@ file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
 include_directories(src/)
 if(IS_IOS)
-  set(CMAKE_CXX_FLAGS "-fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
+  set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
 else()
   set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
 endif()
...
@@ -43,7 +43,7 @@ if (LOG_PROFILE)
   add_definitions(-DPADDLE_MOBILE_PROFILE)
 endif()
-if(USE_OPENMP)
+if(USE_OPENMP AND NOT IS_IOS)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
   add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
 endif()
...
@@ -104,12 +104,21 @@ else()
   foreach(f ${_tmp_list_h})
     list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
   endforeach()
-endif()
+  file(GLOB_RECURSE _tmp_list src/fpga/*.cpp src/fpga/*.cc)
+  foreach(f ${_tmp_list})
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
+  endforeach()
+  file(GLOB_RECURSE _tmp_list_h src/fpga/*.h)
+  foreach(f ${_tmp_list_h})
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
+  endforeach()
+endif()
 if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
+  add_definitions(-DARMV7)
 else()
   list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
   list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
...
@@ -130,8 +139,8 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
 # NET default
-set(NET "defult" CACHE STRING "select net type")
-set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet")
+set(NET "default" CACHE STRING "select net type")
+set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets")
 include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
...
@@ -153,3 +162,4 @@ if(DEBUGING)
   endif()
 endif()
...
@@ -27,10 +27,10 @@ Paddle-Mobile is a project under the PaddlePaddle organization: a deep learning framework dedicated to embedded platforms.
 - **ARM CPU**
-![](http://7xop3k.com1.z0.glb.clouddn.com/15312108766575.jpg)
+![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_29.png)
 ARM CPUs are paddle-mobile's primary target, and the CPU's generality has always been its strength. Embedded deep learning needs a great deal of hand-written CPU assembly; we are coding intensively to extract every bit of acceleration the hardware offers.
-CPU optimization is still in progress, and only conventional optimizations are applied so far. On an ARM A73, paddle-mobile (arm-v7) currently runs one pass of mobilenet 1.0 in 120+ ms on a single core. This is clearly not our final goal: we are rewriting hot paths in assembly, so there is still large headroom. Only armv7 is supported for now; armv8 support will follow.
+CPU optimization is still in progress, and only conventional optimizations are applied so far. On an ARM A73, paddle-mobile (arm-v7) currently runs one pass of mobilenet 1.0 in 110+ ms on a single core. This is clearly not our final goal: we are rewriting hot paths in assembly, so there is still large headroom. Only armv7 is supported for now; armv8 support will follow.
 - **Mali GPU**
...
# Quantification: model quantization and dequantization
## Background
Models trained from some networks, such as AlexNet, are too large to be practical on mobile devices.
## Ways to shrink an oversized model
1. Choose a model architecture suited to mobile, such as mobilenet, googlenet, yolo, or squeezenet;
2. Use the quantization tool we provide, which shrinks a float32 model to roughly 1/4 of its original size with almost no loss of accuracy; see the sketch below.
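The on-disk format the tool produces (as reflected by the `Executor::LoadMemory` changes later in this diff) stores each tensor's min/max as two leading floats followed by uint8 values, so that a value is recovered as `q * (max - min) / 255 + min`. Below is a minimal sketch of the forward mapping; the helper name `Quantize` is hypothetical, and a non-empty input with max > min is assumed:

```c++
#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch of per-tensor linear quantization: float32 -> uint8 over [min, max].
// Dequantization (as in the executor below): value = q * (max - min) / 255 + min
std::vector<uint8_t> Quantize(const std::vector<float> &src, float *min_value,
                              float *max_value) {
  const auto mm = std::minmax_element(src.begin(), src.end());
  *min_value = *mm.first;
  *max_value = *mm.second;
  const float factor = (*max_value - *min_value) / 255.0f;  // bucket width
  std::vector<uint8_t> dst(src.size());
  for (size_t i = 0; i < src.size(); ++i) {
    // Map each value to the nearest of 256 evenly spaced levels.
    dst[i] = static_cast<uint8_t>((src[i] - *min_value) / factor + 0.5f);
  }
  return dst;
}
```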
- - - - -
## The quantization tool
### Model conversion tool locations:
- [Quantization tool directory](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/tools/quantification)
- [Model conversion tool](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/convert.cpp)
#### Usage
- [Tool usage](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/README.md)
## How to load a quantized model
The Load method gained a quantification parameter, which defaults to false. To load a quantized model, pass true.
[Source code](https://github.com/PaddlePaddle/paddle-mobile/blob/55302b33ea3bd68c9797d8f65e527544792b8095/src/io/paddle_mobile.h)
```c++
bool Load(const std::string &dirname, bool optimize = false,
bool quantification = false, int batch_size = 1);
```
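For example, a sketch of a call site (the model directory path is hypothetical, and the CPU device type with the default precision is assumed):

```c++
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
// optimize = true, quantification = true
paddle_mobile.Load("./quantized_model_dir", true, true);
```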
- - - - -
...
@@ -17,38 +17,46 @@ limitations under the License. */
 namespace paddle_mobile {
-const std::string G_OP_TYPE_CONV = "conv2d";
-const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
-const std::string G_OP_TYPE_BOX_CODER = "box_coder";
-const std::string G_OP_TYPE_CONCAT = "concat";
-const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
-const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
-const std::string G_OP_TYPE_FC = "fusion_fc";
-const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
-const std::string G_OP_TYPE_LRN = "lrn";
-const std::string G_OP_TYPE_MUL = "mul";
-const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
-const std::string G_OP_TYPE_POOL2D = "pool2d";
-const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
-const std::string G_OP_TYPE_RELU = "relu";
-const std::string G_OP_TYPE_RESHAPE = "reshape";
-const std::string G_OP_TYPE_SIGMOID = "sigmoid";
-const std::string G_OP_TYPE_SOFTMAX = "softmax";
-const std::string G_OP_TYPE_TRANSPOSE = "transpose";
-const std::string G_OP_TYPE_SPLIT = "split";
-const std::string G_OP_TYPE_FEED = "feed";
-const std::string G_OP_TYPE_FETCH = "fetch";
-const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
-const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
-const std::string G_OP_TYPE_DROPOUT = "dropout";
+const char *G_OP_TYPE_CONV = "conv2d";
+const char *G_OP_TYPE_BATCHNORM = "batch_norm";
+const char *G_OP_TYPE_BOX_CODER = "box_coder";
+const char *G_OP_TYPE_CONCAT = "concat";
+const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
+const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
+const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
+const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
+const char *G_OP_TYPE_FC = "fusion_fc";
+const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
+const char *G_OP_TYPE_LRN = "lrn";
+const char *G_OP_TYPE_MUL = "mul";
+const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
+const char *G_OP_TYPE_POOL2D = "pool2d";
+const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
+const char *G_OP_TYPE_RELU = "relu";
+const char *G_OP_TYPE_RESHAPE = "reshape";
+const char *G_OP_TYPE_SIGMOID = "sigmoid";
+const char *G_OP_TYPE_SOFTMAX = "softmax";
+const char *G_OP_TYPE_TRANSPOSE = "transpose";
+const char *G_OP_TYPE_SPLIT = "split";
+const char *G_OP_TYPE_FEED = "feed";
+const char *G_OP_TYPE_FETCH = "fetch";
+const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
+const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence";
+const char *G_OP_TYPE_DROPOUT = "dropout";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
+const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
+const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
+    "fusion_elementwise_add_relu";
+const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
+const char *G_OP_TYPE_REGION = "region";
 std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
     op_input_output_key = {
         {G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
+        {G_OP_TYPE_FUSION_DWCONV_BN_RELU, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_BN_RELU, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
         {G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
...
@@ -72,6 +80,11 @@ std::unordered_map<
         {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
         {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
         {G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
-        {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}};
+        {G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_CONV_ADD_BN, {{"Input"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_POOL_BN, {{"X"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU, {{"X", "Y"}, {"Out"}}},
+        {G_OP_TYPE_FUSION_FC_RELU, {{"X", "Y", "Z"}, {"Out"}}},
+        {G_OP_TYPE_REGION, {{"X"}, {"Out"}}}};
 }  // namespace paddle_mobile
...
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 namespace paddle_mobile {
...
@@ -72,33 +73,40 @@ enum PMStatus {
   PMWrongDevice = 0x08 /*!< un-correct device. */
 };
-extern const std::string G_OP_TYPE_CONV;
-extern const std::string G_OP_TYPE_BATCHNORM;
-extern const std::string G_OP_TYPE_BOX_CODER;
-extern const std::string G_OP_TYPE_CONCAT;
-extern const std::string G_OP_TYPE_ELEMENTWISE_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
-extern const std::string G_OP_TYPE_FC;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
-extern const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU;
-extern const std::string G_OP_TYPE_LRN;
-extern const std::string G_OP_TYPE_MUL;
-extern const std::string G_OP_TYPE_MULTICLASS_NMS;
-extern const std::string G_OP_TYPE_POOL2D;
-extern const std::string G_OP_TYPE_PRIOR_BOX;
-extern const std::string G_OP_TYPE_RELU;
-extern const std::string G_OP_TYPE_RESHAPE;
-extern const std::string G_OP_TYPE_SIGMOID;
-extern const std::string G_OP_TYPE_SOFTMAX;
-extern const std::string G_OP_TYPE_TRANSPOSE;
-extern const std::string G_OP_TYPE_SPLIT;
-extern const std::string G_OP_TYPE_FEED;
-extern const std::string G_OP_TYPE_FETCH;
-extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
-extern const std::string G_OP_TYPE_IM2SEQUENCE;
-extern const std::string G_OP_TYPE_DROPOUT;
+extern const char *G_OP_TYPE_CONV;
+extern const char *G_OP_TYPE_BATCHNORM;
+extern const char *G_OP_TYPE_BOX_CODER;
+extern const char *G_OP_TYPE_CONCAT;
+extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
+extern const char *G_OP_TYPE_FC;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
+extern const char *G_OP_TYPE_LRN;
+extern const char *G_OP_TYPE_MUL;
+extern const char *G_OP_TYPE_MULTICLASS_NMS;
+extern const char *G_OP_TYPE_POOL2D;
+extern const char *G_OP_TYPE_PRIOR_BOX;
+extern const char *G_OP_TYPE_RELU;
+extern const char *G_OP_TYPE_RESHAPE;
+extern const char *G_OP_TYPE_SIGMOID;
+extern const char *G_OP_TYPE_SOFTMAX;
+extern const char *G_OP_TYPE_TRANSPOSE;
+extern const char *G_OP_TYPE_SPLIT;
+extern const char *G_OP_TYPE_FEED;
+extern const char *G_OP_TYPE_FETCH;
+extern const char *G_OP_TYPE_DEPTHWISE_CONV;
+extern const char *G_OP_TYPE_IM2SEQUENCE;
+extern const char *G_OP_TYPE_DROPOUT;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN;
+extern const char *G_OP_TYPE_FUSION_POOL_BN;
+extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
+extern const char *G_OP_TYPE_FUSION_FC_RELU;
+extern const char *G_OP_TYPE_REGION;
 extern std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
...
...
@@ -84,7 +84,7 @@ struct Variant {
     if (type_id == typeid(T).hash_code()) {
       return *const_cast<T *>(reinterpret_cast<const T *>(&data));
     } else {
-      PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant ");
+      PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");
       exit(0);
     }
   }
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include "fpga/api/fpga_api.h"
namespace paddle {
namespace mobile {
namespace fpga {
namespace api {
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
static inline int do_ioctl(int req, void *arg) { return ioctl(fd, req, arg); }
int open_device() {
if (fd == -1) {
fd = open(device_path, O_RDWR);
}
return fd;
}
// memory management;
void *fpga_malloc(size_t size) {
  return reinterpret_cast<void *>(mmap64(NULL, size, PROT_READ | PROT_WRITE,
                                         MAP_SHARED, fd, 0));
}
void fpga_free(void *ptr) { munmap(ptr, 0); }
void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int ComputeFpgaConv(struct FpgaConvArgs) { return 0; }
int ComputeFpgaPool(struct FpgaPoolArgs) { return 0; }
int ComputeFpgaEWAdd(struct FpgaEWAddArgs) { return 0; }
} // namespace api
} // namespace fpga
} // namespace mobile
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
// memory management;
namespace paddle {
namespace mobile {
namespace fpga {
namespace api {
int open_device();
int close_device();
void *fpga_malloc(size_t size);
void fpga_free(void *ptr);
void fpga_copy(void *dst, const void *src, size_t num);
struct FpgaVersionArgs {
void *buf;
};
struct MemoryToPhysicalArgs {
const void *src;
uint64_t physical;
};
struct MemoryCopyArgs {
void *src;
void *dst;
size_t size;
};
struct FpgaQuantArgs {
float scale;
};
struct FpgaBNArgs {};
struct FpgaConvArgs {
  bool enable_BN = false;
  bool enable_Relu = false;
  struct FpgaBNArgs bn_parm;
};
struct FpgaPoolArgs {
  bool enable_BN = false;
  struct FpgaBNArgs bn_parm;
};
struct FpgaEWAddArgs { // only support X + Y
bool enable_Relu = false;
};
int ComputeFpgaConv(struct FpgaConvArgs);
int ComputeFpgaPool(struct FpgaPoolArgs);
int ComputeFpgaEWAdd(struct FpgaEWAddArgs);
#define IOCTL_FPGA_MAGIC 'FPGA'
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs)
#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
#define IOCTL_MEM_TOPHY _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryToPhysicalArgs)
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)
} // namespace api
} // namespace fpga
} // namespace mobile
} // namespace paddle
...
@@ -28,6 +28,16 @@ vector<string> OperatorBase<Dtype>::GetOutKeys() const {
   return it->second.second;
 }
+template <typename Dtype>
+vector<string> OperatorBase<Dtype>::GetInputKeys() const {
+  auto it = op_input_output_key.find(type_);
+  if (it == op_input_output_key.end()) {
+    DLOG << type_ << " has no inputs";
+    return {};
+  }
+  return it->second.first;
+}
 template <typename Dtype>
 OperatorBase<Dtype>::OperatorBase(const std::string &type,
                                   const VariableNameMap &inputs,
...
@@ -49,6 +59,11 @@ template <typename Dtype>
 void OperatorBase<Dtype>::Run() const {
   RunImpl();
 #ifdef PADDLE_MOBILE_DEBUG
+  vector<string> input_keys = GetInputKeys();
+  for (const auto key : input_keys) {
+    Tensor *input = GetVarValue<framework::LoDTensor>(key, inputs_, *scope_);
+    DLOG << type_ << " input- " << key << "=" << *input;
+  }
   vector<string> output_keys = GetOutKeys();
   for (const auto key : output_keys) {
     Tensor *out_ = GetVarValue<framework::LoDTensor>(key, outputs_, *scope_);
...
...
@@ -61,6 +61,7 @@ class OperatorBase {
   virtual ~OperatorBase() {}
   void Run() const;
   std::vector<string> GetOutKeys() const;
+  std::vector<string> GetInputKeys() const;
   virtual void RunImpl() const = 0;
   virtual void Init() = 0;
...
@@ -118,6 +119,10 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
   virtual void InferShape() const = 0;
   void Init() {
+    // for (auto i : this->inputs_) {
+    //   DLOG << i.first;
+    //   DLOG << i.second;
+    // }
     PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), " %s kernel init failed",
                           this->type_.c_str());
   }
...
@@ -146,7 +151,7 @@ class OpKernelBase {
   }
 #endif
   virtual void Compute(const P &para) const = 0;
-  virtual bool Init(P *para) { return true; };
+  virtual bool Init(P *para) { return true; }
   virtual ~OpKernelBase() = default;
  private:
...
...
@@ -42,8 +42,17 @@ class FusionOpRegister {
     matchers_[matcher->Type()] = shared_matcher;
   }
-  const std::map<std::string, std::shared_ptr<FusionOpMatcher>> Matchers() {
-    return matchers_;
+  const std::vector<std::shared_ptr<FusionOpMatcher>> Matchers() {
+    std::vector<std::shared_ptr<FusionOpMatcher>> matchers;
+    for (const auto& match : matchers_) {
+      matchers.push_back(match.second);
+    }
+    std::sort(matchers.begin(), matchers.end(),
+              [](std::shared_ptr<FusionOpMatcher> first,
+                 std::shared_ptr<FusionOpMatcher> second) {
+                return first->BeginNode().Depth() > second->BeginNode().Depth();
+              });
+    return matchers;
   }
  private:
...
...
@@ -44,23 +44,6 @@ bool Node::operator==(const Node &in) {
   return true;
 }
-std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(int size) {
-  std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
-  OpDescs(size - 1, &op_descs);
-  return op_descs;
-}
-void Node::OpDescs(int index,
-                   std::vector<std::shared_ptr<framework::OpDesc>> *op_desc) {
-  if (index == 0) {
-    return;
-  }
-  op_desc->push_back(this->op_desc_);
-  for (auto &output : outputs_) {
-    output->OpDescs(index, op_desc);
-  }
-}
 std::shared_ptr<Node> Node::To(int size) {
   std::shared_ptr<Node> node = std::make_shared<Node>();
   this->To(size - 1, node);
...
...
@@ -47,13 +47,10 @@ class Node {
       std::map<std::string, std::vector<std::pair<std::string, std::string>>>
           change,
       std::vector<std::shared_ptr<Node>> *removed_nodes);
-  std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(int size);
   std::shared_ptr<framework::OpDesc> OpDescOfNode() { return op_desc_; }
   std::string Type() { return type_; }
  private:
-  void OpDescs(int size,
-               std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
   void To(int index, std::shared_ptr<Node>);
   void Folder(
       std::shared_ptr<framework::OpDesc> op_desc,
...
...
@@ -78,9 +78,8 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
   }
   for (auto &registed : FusionOpRegister::Instance()->Matchers()) {
-    std::string fusion_type = registed.first;
-    std::shared_ptr<FusionOpMatcher> matcher = registed.second;
-    // DLOG << " registed node \n " << matcher->BeginNode();
+    std::string fusion_type = registed->Type();
+    std::shared_ptr<FusionOpMatcher> matcher = registed;
     auto match_vector = type_map[matcher->BeginType()];
...
...
@@ -30,6 +30,7 @@ class Program {
   std::string model_path;
   std::string para_path;
   bool combined = false;
+  bool quantification = false;
  private:
 };
...
...
@@ -154,7 +154,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
   tensor->Resize(framework::make_ddim(desc.Dims()));
-  void *memory = tensor;
+  void *memory = nullptr;
   int type_size = 0;
   switch (desc.DataType()) {
     case framework::VARTYPE_TYPE_FP16:
...
@@ -179,11 +179,25 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
     default:
       break;
   }
-  for (int n = 0; n < memory_size * type_size; ++n) {
-    static_cast<char *>(memory)[n] = (*data)[n];
-  }
-  (*data) += (sizeof(char) * memory_size * type_size);
+  if (program_.quantification) {
+    float min_value;
+    float max_value;
+    memcpy(&min_value, *data, sizeof(float));
+    memcpy(&max_value, *data + sizeof(float), sizeof(float));
+    *data += 2 * sizeof(float);
+    const float factor = (max_value - min_value) / 255.0;
+    uint8_t *uint8_data = (uint8_t *)(*data);
+    for (int k = 0; k < memory_size; ++k) {
+      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
+    }
+    *data += (memory_size * sizeof(uint8_t));
+  } else {
+    for (int n = 0; n < memory_size * type_size; ++n) {
+      static_cast<char *>(memory)[n] = (*data)[n];
+    }
+    (*data) += (sizeof(char) * memory_size * type_size);
+  }
 }
 template <typename Dtype, Precision P>
...
...
@@ -44,26 +44,29 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
 template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
-    const std::string &dirname, bool optimize, bool can_add_split) {
-  auto program =
-      this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
+    const std::string &dirname, bool optimize, bool quantification,
+    bool can_add_split) {
+  auto program = this->LoadProgram(dirname + "/__model__", optimize,
+                                   quantification, can_add_split);
   program.model_path = dirname;
   return program;
 }
 template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
-    const std::string &model_path, const std::string &para_path,
-    bool optimize) {
+    const std::string &model_path, const std::string &para_path, bool optimize,
+    bool quantification) {
   auto program = this->LoadProgram(model_path, optimize);
   program.para_path = para_path;
   program.combined = true;
+  program.quantification = quantification;
   return program;
 }
 template <typename Dtype, Precision P>
 const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
-    const std::string &model_path, bool optimize, bool can_add_split) {
+    const std::string &model_path, bool optimize, bool quantification,
+    bool can_add_split) {
   std::string model_filename = model_path;
   PaddleMobile__Framework__Proto__ProgramDesc *c_program;
   uint8_t *buf = NULL;
...
@@ -82,6 +85,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
   framework::Program<Dtype, P> program;
   program.originProgram = originProgramDesc;
+  program.quantification = quantification;
   auto scope = std::make_shared<framework::Scope>();
   program.scope = scope;
...
...
@@ -30,6 +30,7 @@ class Loader {
    * */
   const framework::Program<Dtype, P> Load(const std::string &dirname,
                                           bool optimize = false,
+                                          bool quantification = false,
                                           bool can_add_split = false);
   /*
...
@@ -38,11 +39,13 @@ class Loader {
    * */
   const framework::Program<Dtype, P> Load(const std::string &model_path,
                                           const std::string &para_path,
-                                          bool optimize = false);
+                                          bool optimize = false,
+                                          bool quantification = false);
  private:
   const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
                                                  bool optimize = false,
+                                                 bool quantification = false,
                                                  bool can_add_split = false);
 };
...
...
@@ -26,7 +26,7 @@ void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
 template <typename Dtype, Precision P>
 bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
-                                  int batch_size) {
+                                  bool quantification, int batch_size) {
   if (loader_.get() == nullptr) {
     loader_ = std::make_shared<Loader<Dtype, P>>();
   } else {
...
@@ -35,7 +35,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
   if (executor_.get() == nullptr) {
     executor_ = std::make_shared<Executor<Dtype, P>>(
-        loader_->Load(dirname, optimize), batch_size, optimize);
+        loader_->Load(dirname, optimize, quantification), batch_size, optimize);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
...
@@ -46,7 +46,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
 template <typename Dtype, Precision P>
 bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
                                   const std::string &para_path, bool optimize,
-                                  int batch_size) {
+                                  bool quantification, int batch_size) {
   if (loader_.get() == nullptr) {
     loader_ = std::make_shared<Loader<Dtype, P>>();
   } else {
...
@@ -55,7 +55,8 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
   if (executor_.get() == nullptr) {
     executor_ = std::make_shared<Executor<Dtype, P>>(
-        loader_->Load(model_path, para_path, optimize), batch_size, optimize);
+        loader_->Load(model_path, para_path, optimize, quantification),
+        batch_size, optimize);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
...
...
@@ -39,14 +39,18 @@ class PaddleMobile {
    * @b load a fluid model stored as separate files
    * */
   bool Load(const std::string &dirname, bool optimize = false,
-            int batch_size = 1);
+            bool quantification = false, int batch_size = 1);
   /*
    * @b load combine format fluid mode
    * @b load the model stored in the combined format
    * */
   bool Load(const std::string &model_path, const std::string &para_path,
-            bool optimize = false, int batch_size = 1);
+            bool optimize = false, bool quantification = false,
+            int batch_size = 1);
+  /*
+   * @b set the number of threads; takes effect when openmp is enabled in cmake
+   * */
   void SetThreadNum(int num);
   /*
...
...
@@ -16,10 +16,32 @@ limitations under the License. */
 #include <cstdlib>
 #include <cstring>
+#ifdef PADDLE_MOBILE_FPGA
+#include "fpga/api/fpga_api.h"
+#endif
 namespace paddle_mobile {
 namespace memory {
 const int MALLOC_ALIGN = 64;
+#ifdef PADDLE_MOBILE_FPGA
+namespace api = paddle::mobile::fpga::api;
+void Copy(void *dst, const void *src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+void *Alloc(size_t size) { return api::fpga_malloc(size); }
+void Free(void *ptr) {
+  if (ptr) {
+    api::fpga_free(ptr);
+  }
+}
+#else
 void Copy(void *dst, const void *src, size_t num) {
   std::memcpy(dst, src, num);
 }
...
@@ -42,5 +64,7 @@ void Free(void *ptr) {
   }
 }
+#endif
 }  // namespace memory
 }  // namespace paddle_mobile
...
@@ -26,7 +26,7 @@ void BatchNormOp<Dtype, T>::InferShape() const {
   auto x_dims = this->param_.InputX()->dims();
   this->param_.OutputY()->Resize(x_dims);
 }
-template class BatchNormOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
...
...
@@ -47,7 +47,7 @@ void BoxCoderOp<Dtype, T>::InferShape() const {
   this->param_.OutputBox()->Resize(framework::make_ddim(
       {input_targetbox_dims[0], input_priorbox_dims[0], 4}));
 }
-template class BoxCoderOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
...
...
@@ -56,7 +56,6 @@ void ConcatOp<Dtype, T>::InferShape() const {
   this->param_.Out()->Resize(out_dims);
 }
-template class ConcatOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
...
...
@@ -48,8 +48,6 @@ void ConvOp<Dtype, T>::InferShape() const {
   this->param_.Output()->Resize(ddim);
 }
-template class ConvOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
...
...
@@ -49,8 +49,6 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
   this->param_.Output()->Resize(ddim);
 }
-template class DepthwiseConvOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
...
...
@@ -22,7 +22,7 @@ void DropoutOp<Dtype, T>::InferShape() const {
   auto input_dims = this->param_.InputX()->dims();
   this->param_.Out()->Resize(input_dims);
 }
-template class DropoutOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
...
...
@@ -24,7 +24,7 @@ void ElementwiseAddOp<Dtype, T>::InferShape() const {
   auto x_dim = this->param_.InputX()->dims();
   this->param_.Out()->Resize(x_dim);
 }
-template class ElementwiseAddOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
...
...
@@ -14,10 +14,7 @@ limitations under the License. */
 #include "feed_op.h"
 namespace paddle_mobile {
-namespace operators {
-template class FeedOp<CPU, float>;
-}
+namespace operators {}
 }  // namespace paddle_mobile
 namespace ops = paddle_mobile::operators;
...
...
@@ -14,10 +14,7 @@ limitations under the License. */
 #include "fetch_op.h"
 namespace paddle_mobile {
-namespace operators {
-template class FetchOp<CPU, float>;
-}
+namespace operators {}
 }  // namespace paddle_mobile
 namespace ops = paddle_mobile::operators;
...
...
@@ -45,7 +45,6 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
   this->param_.Output()->Resize(ddim);
 }
-template class FusionConvAddOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
...
...
@@ -36,8 +36,6 @@ class FusionConvAddMatcher : public framework::FusionOpMatcher {
   void FolderNodes(
       framework::Node *node,
       std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
-    vector<std::shared_ptr<framework::OpDesc>> origin_descs =
-        node->OpDescs(node_.Depth());
     node->Folder(node_.Depth(), Type(),
                  {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes);
   }
...
@@ -68,11 +66,11 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
 #ifdef PADDLE_MOBILE_CPU
-//#ifndef CONV_ADD_REGISTER
-// static framework::FusionOpRegistrar convadd_registrar(
-//     new FusionConvAddMatcher());
-//#define CONV_ADD_REGISTER
-//#endif
+#ifndef CONV_ADD_REGISTER
+static framework::FusionOpRegistrar convadd_registrar(
+    new FusionConvAddMatcher());
+#define CONV_ADD_REGISTER
+#endif
 #endif
...
...
@@ -44,7 +44,7 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
   framework::DDim ddim = framework::make_ddim(output_shape);
   this->param_.Output()->Resize(ddim);
 }
-template class FusionConvAddBNReluOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
...
...
@@ -39,8 +39,6 @@ class FusionConvAddBNReluMatcher : public framework::FusionOpMatcher {
   void FolderNodes(
       framework::Node *node,
       std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
-    vector<std::shared_ptr<framework::OpDesc>> origin_descs =
-        node->OpDescs(node_.Depth());
     node->Folder(node_.Depth(), Type(),
                  {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
                   {G_OP_TYPE_BATCHNORM,
...
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef CONVADDRELU_OP
+#ifdef FUSION_CONVADDRELU_OP
 #include "fusion_conv_add_relu_op.h"
 #include "operators/math/conv_func.h"
...
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef CONVADDRELU_OP
+#ifdef FUSION_CONVADDRELU_OP
 #pragma once
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#include "operators/fusion_conv_bn_relu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvBNReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_bn_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionConvBNReluMatcher : public framework::FusionOpMatcher {
public:
FusionConvBNReluMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionConvBNReluOp : public framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam,
operators::ConvBNReluKernel<DeviceType, T>> {
public:
FusionConvBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam,
operators::ConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam,
operators::ConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar(
new FusionConvBNReluMatcher());
#define FUSION_CONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
...
@@ -44,7 +44,7 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
   framework::DDim ddim = framework::make_ddim(output_shape);
   this->param_.Output()->Resize(ddim);
 }
-template class FusionDWConvBNReluOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
...
...
@@ -38,8 +38,6 @@ class FusionDWConvBNReluMatcher : public framework::FusionOpMatcher {
   void FolderNodes(
       framework::Node *node,
       std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
-    vector<std::shared_ptr<framework::OpDesc>> origin_descs =
-        node->OpDescs(node_.Depth());
     node->Folder(node_.Depth(), Type(),
                  {{G_OP_TYPE_BATCHNORM,
                    {{"Scale", "Scale"},
...
...
@@ -50,7 +50,6 @@ void FusionFcOp<Dtype, T>::InferShape() const {
   this->param_.Out()->Resize(ddim);
 }
-template class FusionFcOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
...
...
@@ -47,8 +47,6 @@ void Im2SequenceOp<Dtype, T>::InferShape() const {
   this->param_.Output()->Resize(ddim);
 }
-template class Im2SequenceOp<CPU, float>;
 }  // namespace operators
 }  // namespace paddle_mobile
...
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef FUSION_CONVADD_RELU_OP
+#ifdef FUSION_CONVADDRELU_OP
 #include "operators/kernel/conv_add_relu_kernel.h"
 #include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h"
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#include "operators/kernel/conv_bn_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_bn_relu_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvBNReluKernel<CPU, float>::Init(FusionConvBNReluParam *param) {
const Tensor *mean = param->InputMean();
const Tensor *variance = param->InputVariance();
const Tensor *scale = param->InputScale();
const Tensor *bias = param->InputBias();
const float epsilon = param->Epsilon();
// DLOG << "variance: " << *variance;
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
const int C = mean->numel();
float inv_std_ptr[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({C});
auto new_bias_ptr = new_bias->mutable_data<float>({C});
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
return true;
}
template <>
void ConvBNReluKernel<CPU, float>::Compute(
const FusionConvBNReluParam &param) const {
ConvBNReluCompute<float>(param);
}
template class ConvBNReluKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
...
@@ -54,7 +54,40 @@ void BatchnormCompute(const BatchNormParam &param) {
   int HXW = H * W;
-#ifdef ARMV7
+#if __ARM_NEON
+#if __aarch64__
+  float *inv_std_ptr = new float[C];
+  for (int i = 0; i < C; i++) {
+    inv_std_ptr[i] =
+        1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
+  }
+  Tensor new_scale;
+  auto new_scale_ptr = new_scale.mutable_data<float>(framework::make_ddim({C}));
+  Tensor new_bias;
+  auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
+  /// ((x - est_mean) * (inv_var) * scale + bias equal to
+  /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
+  for (int i = 0; i < C; i++) {
+    new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
+    new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
+    {
+      for (int n = 0; n < N; n++) {
+        for (int h = 0; h < H; h++) {
+          int tmp_index = n * stride0 + i * stride1 + h * stride2;
+          for (int w = 0; w < W; w++) {
+            int index = tmp_index + w;
+            out_ptr[index] =
+                input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
+          }
+        }
+      }
+    }
+  }
+  delete[] inv_std_ptr;
+#else
   if (HXW > 32) {
     int NXC = N * C;
     float *inv_std_ptr = new float[NXC * 4];
...
@@ -229,6 +262,7 @@ void BatchnormCompute(const BatchNormParam &param) {
     delete[] inv_std_ptr;
   }
+#endif
 #else
   float *inv_std_ptr = new float[C];
   for (int i = 0; i < C; i++) {
...
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef FUSION_CONVADD_RELU_OP
+#ifdef FUSION_CONVADDRELU_OP
 #pragma once
 #include <vector>
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#pragma once
#include <vector>
#include "operators/math/depthwise_conv_3x3.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
void ConvBNReluBasic(const FusionConvBNReluParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor new_bias = *param.NewBias();
Tensor new_scale = *param.NewScale();
Tensor *output = param.Output();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>(
filter_slice, false, col_matrix, false, static_cast<float>(1),
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
}
}
}
template <typename P>
void ConvBNReluCompute(const FusionConvBNReluParam &param) {
if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
// math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
// param.Output(), param.NewScale(),
// param.NewBias(), 1);
math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else {
ConvBNReluBasic(param);
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
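
For reference, the NewScale and NewBias tensors consumed above come from folding the batch-norm statistics into a per-channel affine transform, so that bn(conv(x)) == conv(x) * new_scale + new_bias. A minimal sketch of that folding, assuming plain float vectors rather than the framework's Tensor type:

#include <cmath>
#include <cstddef>
#include <vector>

// Fold batch-norm into a per-channel scale and bias so that
// bn(y) = gamma * (y - mean) / sqrt(var + eps) + beta
// becomes y * new_scale + new_bias.
void FoldBatchNorm(const std::vector<float> &gamma, const std::vector<float> &beta,
                   const std::vector<float> &mean, const std::vector<float> &variance,
                   float epsilon, std::vector<float> *new_scale,
                   std::vector<float> *new_bias) {
  const std::size_t c = gamma.size();
  new_scale->resize(c);
  new_bias->resize(c);
  for (std::size_t i = 0; i < c; ++i) {
    const float inv_std = 1.0f / std::sqrt(variance[i] + epsilon);
    (*new_scale)[i] = gamma[i] * inv_std;
    (*new_bias)[i] = beta[i] - mean[i] * gamma[i] * inv_std;
  }
}

Folding once up front is what lets the compute loop above stay a pure conv-plus-affine pass.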
...@@ -76,15 +76,20 @@ void PoolCompute(const PoolParam &param) { ...@@ -76,15 +76,20 @@ void PoolCompute(const PoolParam &param) {
} }
} else if (ksize[0] == 2 && ksize[0] == ksize[1]) { } else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
#ifndef IOS #if __ARM_NEON
#if __aarch64__
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
#else
if (pooling_type == "max") { if (pooling_type == "max") {
math::Pool2x2Max(strides, paddings, in_x, out); math::Pool2x2Max(strides, paddings, in_x, out);
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
math::Pool2x2Avg(strides, paddings, in_x, out); math::Pool2x2Avg(strides, paddings, in_x, out);
} }
#endif
#else #else
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
#endif #endif // __ARM_NEON
} else { } else {
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out); PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
} }
......
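
The Pool2x2Max/Pool2x2Avg fast paths selected above reduce each 2x2 window with pairwise NEON operations. A rough sketch of one stride-2 max-pooling step, assuming a NEON target and pointers with at least 8 valid input floats and 4 valid output floats:

#include <arm_neon.h>

// One output-row step: reads 8 floats from two adjacent input rows and
// produces 4 pooled outputs (2x2 window, stride 2).
inline void MaxPool2x2Step(const float *row0, const float *row1, float *out) {
  float32x4x2_t r0 = vld2q_f32(row0);  // de-interleave even/odd columns
  float32x4x2_t r1 = vld2q_f32(row1);
  float32x4_t m0 = vmaxq_f32(r0.val[0], r0.val[1]);  // horizontal pair max, row 0
  float32x4_t m1 = vmaxq_f32(r1.val[0], r1.val[1]);  // horizontal pair max, row 1
  vst1q_f32(out, vmaxq_f32(m0, m1));                 // vertical max
}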
...@@ -68,6 +68,7 @@ void sigmoid(const Tensor *X, Tensor *Y) { ...@@ -68,6 +68,7 @@ void sigmoid(const Tensor *X, Tensor *Y) {
input_outer_ptr++; input_outer_ptr++;
} }
} }
#else
#endif #endif
} }
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#ifdef FUSION_CONVADD_RELU_OP #ifdef FUSION_CONVADDRELU_OP
#include <vector> #include <vector>
#include "framework/ddim.h" #include "framework/ddim.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVBNRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvBNReluKernel
: public OpKernelBase<DeviceType, FusionConvBNReluParam> {
public:
void Compute(const FusionConvBNReluParam &param) const;
bool Init(FusionConvBNReluParam *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -24,7 +24,7 @@ void LrnOp<Dtype, T>::InferShape() const { ...@@ -24,7 +24,7 @@ void LrnOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims(); auto x_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dims); this->param_.Out()->Resize(x_dims);
} }
template class LrnOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "operators/math/depthwise_conv_3x3.h" #include "operators/math/depthwise_conv_3x3.h"
#ifdef __ARM_NEON #if __ARM_NEON
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
#include <vector> #include <vector>
...@@ -23,7 +23,6 @@ namespace math { ...@@ -23,7 +23,6 @@ namespace math {
void DepthwiseConv3x3(const Tensor *input, vector<int> strides, void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
vector<int> paddings, const Tensor *filter, Tensor *bias, vector<int> paddings, const Tensor *filter, Tensor *bias,
Tensor *output, bool if_bias) { Tensor *output, bool if_bias) {
#ifdef __ARM_NEON
const int batch_size = input->dims()[0]; const int batch_size = input->dims()[0];
const int input_height = input->dims()[2]; const int input_height = input->dims()[2];
...@@ -181,7 +180,27 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides, ...@@ -181,7 +180,27 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
} }
} else { } else {
#if defined(ARMV17) #if __ARM_NEON
#if __aarch64__
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t v_filter1 = vld1q_f32(filter1);
const float32x4_t v_filter2 = vld1q_f32(filter2);
const float32x4_t v_filter3 = vld1q_f32(filter3);
float32x4_t mula = vmulq_f32(data1, v_filter1);
mula = vmlaq_f32(mula, data2, v_filter2);
mula = vmlaq_f32(mula, data3, v_filter3);
float32x2_t res = vpadd_f32(
vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
res = vpadd_f32(res, res);
if (if_bias) {
output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
} else {
output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
}
#else
asm volatile( asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t" "vld1.32 {q1}, [%[pos1]] \n\t"
...@@ -209,26 +228,10 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides, ...@@ -209,26 +228,10 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
[filter2] "r"(filter2), [filter3] "r"(filter3), [filter2] "r"(filter2), [filter3] "r"(filter3),
[output_ptr] "r"(output_ptr), [zero] "r"(zero) [output_ptr] "r"(output_ptr), [zero] "r"(zero)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
#endif // __aarch64__
#else #else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t v_filter1 = vld1q_f32(filter1); #endif // __ARM_NEON
const float32x4_t v_filter2 = vld1q_f32(filter2);
const float32x4_t v_filter3 = vld1q_f32(filter3);
float32x4_t mula = vmulq_f32(data1, v_filter1);
mula = vmlaq_f32(mula, data2, v_filter2);
mula = vmlaq_f32(mula, data3, v_filter3);
float32x2_t res = vpadd_f32(
vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
res = vpadd_f32(res, res);
if (if_bias) {
output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
} else {
output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
}
#endif
} }
} }
} }
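
The aarch64 intrinsics added above compute each 3-tap row product with a 4-wide load, then zero the junk fourth lane before the horizontal reduction. The same trick in isolation, assuming 4 floats are readable at both pointers (the loads intentionally over-read one element):

#include <arm_neon.h>

// Dot product of a 3-tap filter row with 3 input pixels using 4-wide loads:
// lane 3 is garbage, so it is zeroed before the pairwise adds.
inline float Dot3(const float *in, const float *filt) {
  float32x4_t v = vmulq_f32(vld1q_f32(in), vld1q_f32(filt));
  v = vsetq_lane_f32(0.f, v, 3);                              // discard lane 3
  float32x2_t s = vadd_f32(vget_low_f32(v), vget_high_f32(v));
  s = vpadd_f32(s, s);                                        // total in lane 0
  return vget_lane_f32(s, 0);
}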
...@@ -239,12 +242,11 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides, ...@@ -239,12 +242,11 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
input_data += input_batch_stride; input_data += input_batch_stride;
output_data += output_batch_stride; output_data += output_batch_stride;
} }
#endif
} }
void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor *bias, bool if_bias) { Tensor *output, Tensor *bias, bool if_bias) {
#ifdef __ARM_NEON #if __ARM_NEON
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>(); const float *filter_data = filter->data<float>();
float *output_data = output->data<float>(); float *output_data = output->data<float>();
...@@ -520,7 +522,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter, ...@@ -520,7 +522,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale, Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) { const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON #if __ARM_NEON
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>(); const float *filter_data = filter->data<float>();
float *output_data = output->data<float>(); float *output_data = output->data<float>();
...@@ -824,7 +826,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter, ...@@ -824,7 +826,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter, void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale, Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) { const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON #if __ARM_NEON
const int batch_size = input->dims()[0]; const int batch_size = input->dims()[0];
...@@ -1022,7 +1024,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter, ...@@ -1022,7 +1024,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor bias, bool if_bias) { Tensor *output, Tensor bias, bool if_bias) {
#ifdef __ARM_NEON #if __ARM_NEON
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>(); const float *filter_data = filter->data<float>();
float *output_data = output->data<float>(); float *output_data = output->data<float>();
...@@ -1225,7 +1227,7 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter, ...@@ -1225,7 +1227,7 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale, Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) { const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON #if __ARM_NEON
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>(); const float *filter_data = filter->data<float>();
float *output_data = output->data<float>(); float *output_data = output->data<float>();
......
(This file's diff is collapsed.)
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#define B(i, j) B[(i)*ldb + (j)] #define B(i, j) B[(i)*ldb + (j)]
#define C(i, j) C[(i)*ldc + (j)] #define C(i, j) C[(i)*ldc + (j)]
#define MR 4 #define MR 6
#define NR 8 #define NR 8
#define s_min(i, j) ((i) < (j) ? (i) : (j)) #define s_min(i, j) ((i) < (j) ? (i) : (j))
...@@ -28,6 +28,7 @@ namespace paddle_mobile { ...@@ -28,6 +28,7 @@ namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
/*
// Copy blocks of matrix A into contiguous memory (ColMajor) // Copy blocks of matrix A into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
...@@ -35,14 +36,17 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda, ...@@ -35,14 +36,17 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
// Copy blocks of matrix B into contiguous memory (ColMajor) // Copy blocks of matrix B into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
*/
// Copy blocks of matrix A into contiguous memory (RowMajor) // Copy blocks of matrix A into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda, void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
float *buffer); float *buffer);
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
// Copy blocks of matrix B into contiguous memory (RowMajor) // Copy blocks of matrix B into contiguous memory (RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb, void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer); float *buffer);
// Blocked matrix multiplication // Blocked matrix multiplication
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
...@@ -51,7 +55,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b, ...@@ -51,7 +55,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a, void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc, const float *b, float beta, float *c, float *C, int ldc,
bool relu, float *new_scale, float *new_bias); bool relu, float *new_scale, float *new_bias);
/*
// Vector-matrix multiplication (M = 1) // Vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc, const float *B, int ldb, float beta, float *C, int ldc,
...@@ -60,10 +64,12 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -60,10 +64,12 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A, void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C, int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias); int ldc, bool relu, float *new_scale, float *new_bias);
*/
// Compute a smaller block of the C matrix // Compute a smaller block of the C matrix
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc); void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc); void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
// Write back the results of the blocked matrix multiplication // Write back the results of the blocked matrix multiplication
// C = A * B // C = A * B
...@@ -81,6 +87,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, ...@@ -81,6 +87,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias); float *new_scale, float *new_bias);
/*
// Write back the results of the vector-matrix multiplication // Write back the results of the vector-matrix multiplication
// C = A * B // C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc); void VecWriteBasic(int n, float *c, float *C, int ldc);
...@@ -96,6 +103,7 @@ void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale, ...@@ -96,6 +103,7 @@ void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
// C = A * B, batchnorm(C), relu(C) // C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale, void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias); float *new_bias);
*/
// 32-bit float matrix multiplication // 32-bit float matrix multiplication
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
......
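
The MR bump from 4 to 6 pairs the new PackMatrixA_6r with AddDot6x8, i.e. a 6x8 micro-kernel over packed panels. A scalar reference of what that micro-kernel computes, useful for validating the NEON version, assuming the usual packed layout (6 values of A and 8 values of B per k step):

// Reference micro-kernel: c[6x8] += A panel (6 x k, 6 values per k step)
// times B panel (k x 8, 8 values per k step); c has row stride ldc.
void AddDot6x8Ref(int k, const float *a, const float *b, float *c, int ldc) {
  for (int p = 0; p < k; ++p) {
    for (int i = 0; i < 6; ++i) {
      for (int j = 0; j < 8; ++j) {
        c[i * ldc + j] += a[p * 6 + i] * b[p * 8 + j];
      }
    }
  }
}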
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#include "operators/math/im2col.h" #include "operators/math/im2col.h"
#include <vector> #include <vector>
#ifdef __ARM_NEON #ifdef __ARM_NEON
#include "arm_neon.h" #include <arm_neon.h>
#endif #endif
#include "common/types.h" #include "common/types.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -69,7 +69,7 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> { ...@@ -69,7 +69,7 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
int channels_col = im_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
const T *im_data = im.data<T>(); const T *im_data = im.data<T>();
T *col_data = col->data<T>(); T *col_data = col->data<T>();
#ifdef __ARM_NEON #if __ARM_NEON
const int osize = col_height; const int osize = col_height;
const int isize = im_height; const int isize = im_height;
bool pad1 = padding[0] > 0; bool pad1 = padding[0] > 0;
......
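
Im2ColFunctor above is what turns convolution into the GEMM consumed by the kernels earlier in this diff. A minimal scalar reference for the strided, padded 2-D case without dilation (a sketch, not the project's implementation):

#include <cstddef>
#include <vector>

// im2col for a [C, H, W] image into [C*kh*kw, out_h*out_w] columns.
// stride/pad apply symmetrically; out-of-bounds pixels are zero.
void Im2ColRef(const float *im, int C, int H, int W, int kh, int kw,
               int stride, int pad, std::vector<float> *col) {
  const int out_h = (H + 2 * pad - kh) / stride + 1;
  const int out_w = (W + 2 * pad - kw) / stride + 1;
  col->assign(static_cast<std::size_t>(C) * kh * kw * out_h * out_w, 0.f);
  for (int c = 0; c < C * kh * kw; ++c) {
    const int w_off = c % kw, h_off = (c / kw) % kh, ch = c / (kh * kw);
    for (int oh = 0; oh < out_h; ++oh) {
      for (int ow = 0; ow < out_w; ++ow) {
        const int ih = oh * stride - pad + h_off;
        const int iw = ow * stride - pad + w_off;
        if (ih >= 0 && ih < H && iw >= 0 && iw < W) {
          (*col)[(c * out_h + oh) * out_w + ow] = im[(ch * H + ih) * W + iw];
        }
      }
    }
  }
}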
...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef POOL_OP #ifdef POOL_OP
#include "pool_2x2.h" #include "operators/math/pool_2x2.h"
#include <algorithm>
#include <vector>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -21,10 +23,10 @@ namespace math { ...@@ -21,10 +23,10 @@ namespace math {
void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input, void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) { Tensor *output) {
#ifdef __ARM_NEON #if __ARM_NEON
#ifdef ARMV7
#if __aarch64__
#else
const int batch_size = input->dims()[0]; const int batch_size = input->dims()[0];
const int input_height = input->dims()[2]; const int input_height = input->dims()[2];
...@@ -93,15 +95,16 @@ void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -93,15 +95,16 @@ void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
output_data += output_batch_stride; output_data += output_batch_stride;
} }
#endif #endif
#else
#endif #endif
} }
void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input, void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) { Tensor *output) {
#ifdef __ARM_NEON #if __ARM_NEON
#ifdef ARMV7 #if __aarch64__
#else
const int batch_size = input->dims()[0]; const int batch_size = input->dims()[0];
const int input_height = input->dims()[2]; const int input_height = input->dims()[2];
...@@ -171,12 +174,9 @@ void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -171,12 +174,9 @@ void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
input_data += input_batch_stride; input_data += input_batch_stride;
output_data += output_batch_stride; output_data += output_batch_stride;
} }
#else
// TODO(): to imp other asm
#endif #endif
#else
#endif #endif
} }
......
...@@ -17,7 +17,7 @@ limitations under the License. */ ...@@ -17,7 +17,7 @@ limitations under the License. */
#include <omp.h> #include <omp.h>
#endif #endif
#include "framework/tensor.h" #include "framework/tensor.h"
#include "pool_3x3.h" #include "operators/math/pool_3x3.h"
#if __ARM_NEON #if __ARM_NEON
#include <arm_neon.h> #include <arm_neon.h>
#endif // __ARM_NEON #endif // __ARM_NEON
...@@ -518,6 +518,8 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) { ...@@ -518,6 +518,8 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
input_data += input_batch_stride; input_data += input_batch_stride;
out_data += output_batch_stride; out_data += output_batch_stride;
} }
#else
#endif #endif
} }
...@@ -582,7 +584,18 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -582,7 +584,18 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
} }
output_seg[ph * output_width + pw] = max_value; output_seg[ph * output_width + pw] = max_value;
} else { } else {
#if defined(ARMV7) #if __aarch64__
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos1 + input_width);
const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width);
const float32x4_t max_data =
vmaxq_f32(vmaxq_f32(data1, data2), data3);
float32x2_t res =
vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
vget_low_f32(max_data));
res = vpmax_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0);
#else
asm volatile( asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t" "vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t" "vld1.32 {q2}, [%[pos2]] \n\t"
...@@ -598,17 +611,6 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -598,17 +611,6 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
[pos2] "r"(pos2), [pos3] "r"(pos3), [pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max) [output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
: "memory", "q1", "q2", "q3", "q4"); : "memory", "q1", "q2", "q3", "q4");
#else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos1 + input_width);
const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width);
const float32x4_t max_data =
vmaxq_f32(vmaxq_f32(data1, data2), data3);
float32x2_t res =
vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
vget_low_f32(max_data));
res = vpmax_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0);
#endif #endif
} }
} }
...@@ -676,8 +678,8 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -676,8 +678,8 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
} }
output_seg[ph * output_width + pw] = sum / 9.0; output_seg[ph * output_width + pw] = sum / 9.0;
} else { } else {
#if defined(ARMV7) #if __aarch64__
#else
asm volatile( asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t" "vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t" "vld1.32 {q2}, [%[pos2]] \n\t"
...@@ -696,7 +698,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -696,7 +698,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
[output_ptr] "r"(output_ptr), [zero] "r"(zero), [output_ptr] "r"(output_ptr), [zero] "r"(zero),
[nine_ptr] "r"(nine_ptr) [nine_ptr] "r"(nine_ptr)
: "memory", "r6", "q1", "q2", "q3", "q4"); : "memory", "r6", "q1", "q2", "q3", "q4");
#else #endif
const float32x4_t data1 = vld1q_f32(pos1); const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2); const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3); const float32x4_t data3 = vld1q_f32(pos3);
...@@ -707,7 +709,6 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -707,7 +709,6 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
vget_low_f32(sum_data)); vget_low_f32(sum_data));
res = vpadd_f32(res, res); res = vpadd_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0; output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
#endif
} }
} }
} }
...@@ -715,6 +716,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input, ...@@ -715,6 +716,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
input_data += input_batch_stride; input_data += input_batch_stride;
output_data += output_batch_stride; output_data += output_batch_stride;
} }
#else
#endif #endif
} }
} // namespace math } // namespace math
......
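
The __aarch64__ branch added to Pool3x3Max reduces a 3-wide maximum in a 4-wide register by poisoning the unused lane with -INT_MAX before the pairwise max. The same reduction in isolation (a sketch):

#include <arm_neon.h>
#include <climits>

// Max of the first 3 lanes of a 4-wide vector: poison lane 3, then pairwise max.
inline float Max3(float32x4_t v) {
  v = vsetq_lane_f32(-static_cast<float>(INT_MAX), v, 3);
  float32x2_t m = vpmax_f32(vget_low_f32(v), vget_high_f32(v));
  m = vpmax_f32(m, m);  // lane 0 now holds max(v[0], v[1], v[2])
  return vget_lane_f32(m, 0);
}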
...@@ -135,6 +135,7 @@ class SoftmaxFuntor<CPU, T> { ...@@ -135,6 +135,7 @@ class SoftmaxFuntor<CPU, T> {
} }
} }
} }
#else
#endif // ARM_NEON #endif // ARM_NEON
public: public:
......
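
For reference, the NEON path guarded above accelerates the standard max-shifted softmax; the scalar computation it reduces to looks like this (a sketch):

#include <algorithm>
#include <cmath>

// Numerically stable softmax over n values, in place: subtract the max
// before exponentiating, then normalize by the sum.
void SoftmaxRef(float *x, int n) {
  const float m = *std::max_element(x, x + n);
  float sum = 0.f;
  for (int i = 0; i < n; ++i) sum += (x[i] = std::exp(x[i] - m));
  for (int i = 0; i < n; ++i) x[i] /= sum;
}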
...@@ -50,7 +50,7 @@ void MulOp<Dtype, T>::InferShape() const { ...@@ -50,7 +50,7 @@ void MulOp<Dtype, T>::InferShape() const {
framework::DDim ddim = framework::make_ddim(output_dims); framework::DDim ddim = framework::make_ddim(output_dims);
this->param_.Out()->Resize(ddim); this->param_.Out()->Resize(ddim);
} }
template class MulOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -34,7 +34,7 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const { ...@@ -34,7 +34,7 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const {
// pre size, will change in Compute. // pre size, will change in Compute.
this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6})); this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6}));
} }
template class MultiClassNMSOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -232,7 +232,6 @@ class ConvParam : OpParam { ...@@ -232,7 +232,6 @@ class ConvParam : OpParam {
Print &operator<<(Print &printer, const ConvParam &conv_param); Print &operator<<(Print &printer, const ConvParam &conv_param);
#endif #endif
#ifdef ELEMENTWISEADD_OP
class ElementwiseAddParam : OpParam { class ElementwiseAddParam : OpParam {
public: public:
ElementwiseAddParam(const VariableNameMap &inputs, ElementwiseAddParam(const VariableNameMap &inputs,
...@@ -259,6 +258,8 @@ class ElementwiseAddParam : OpParam { ...@@ -259,6 +258,8 @@ class ElementwiseAddParam : OpParam {
int axis_; int axis_;
}; };
#ifdef FUSION_ELEMENTWISEADDRELU_OP
using ElementwiseAddReluParam = ElementwiseAddParam;
#endif #endif
#ifdef MUL_OP #ifdef MUL_OP
...@@ -371,7 +372,7 @@ class BatchNormParam : OpParam { ...@@ -371,7 +372,7 @@ class BatchNormParam : OpParam {
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope); input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs); epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs); momentum_ = GetAttr<float>("momentum", attrs);
is_test_ = GetAttr<bool>("is_test", attrs); // is_test_ = GetAttr<bool>("is_test", attrs);
} }
const Tensor *InputX() const { return input_x_; } const Tensor *InputX() const { return input_x_; }
...@@ -421,7 +422,7 @@ class PoolParam : public OpParam { ...@@ -421,7 +422,7 @@ class PoolParam : public OpParam {
strides_ = GetAttr<vector<int>>("strides", attrs); strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs); paddings_ = GetAttr<vector<int>>("paddings", attrs);
ceil_mode_ = GetAttr<bool>("ceil_mode", attrs); ceil_mode_ = GetAttr<bool>("ceil_mode", attrs);
gloabal_pooling_ = GetAttr<bool>("global_pooling", attrs); global_pooling_ = GetAttr<bool>("global_pooling", attrs);
} }
const Tensor *Input() const { return input_; } const Tensor *Input() const { return input_; }
...@@ -438,7 +439,7 @@ class PoolParam : public OpParam { ...@@ -438,7 +439,7 @@ class PoolParam : public OpParam {
bool isCeilMode() const { return ceil_mode_; } bool isCeilMode() const { return ceil_mode_; }
bool isGlobalPooling() const { return gloabal_pooling_; } bool isGlobalPooling() const { return global_pooling_; }
private: private:
Tensor *input_; Tensor *input_;
...@@ -448,9 +449,82 @@ class PoolParam : public OpParam { ...@@ -448,9 +449,82 @@ class PoolParam : public OpParam {
vector<int> strides_; vector<int> strides_;
vector<int> paddings_; vector<int> paddings_;
bool ceil_mode_; bool ceil_mode_;
bool gloabal_pooling_ = false; bool global_pooling_ = false;
}; };
#endif
#ifdef FUSION_POOLBN_OP
class FusionPoolBNParam : OpParam {
public:
FusionPoolBNParam(const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
const Scope &scope) {
input_ = InputXFrom<LoDTensor>(inputs, scope);
pooling_type_ = GetAttr<string>("pooling_type", attrs);
ksize_ = GetAttr<vector<int>>("ksize", attrs);
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
ceil_mode_ = GetAttr<bool>("ceil_mode", attrs);
global_pooling_ = GetAttr<bool>("global_pooling", attrs);
output_y_ = OutputYFrom<LoDTensor>(outputs, scope);
input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *Input() const { return input_; }
const string &PoolingType() const { return pooling_type_; }
const vector<int> &Ksize() const { return ksize_; }
const vector<int> &Strides() const { return strides_; }
const vector<int> &Paddings() const { return paddings_; }
bool isCeilMode() const { return ceil_mode_; }
bool isGlobalPooling() const { return global_pooling_; }
Tensor *OutputY() const { return output_y_; }
const Tensor *InputBias() const { return input_bias_; }
const Tensor *InputMean() const { return input_mean_; }
const Tensor *InputScale() const { return input_scale_; }
const Tensor *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
const string &DataFormat() const { return data_format_; }
private:
Tensor *input_;
string pooling_type_;
vector<int> ksize_;
vector<int> strides_;
vector<int> paddings_;
bool ceil_mode_;
bool global_pooling_ = false;
Tensor *output_y_;
Tensor *input_bias_;
Tensor *input_mean_;
Tensor *input_scale_;
Tensor *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
string data_format_;
};
#endif #endif
#ifdef PRIORBOX_OP #ifdef PRIORBOX_OP
...@@ -875,7 +949,6 @@ class PReluParam : public OpParam { ...@@ -875,7 +949,6 @@ class PReluParam : public OpParam {
}; };
#endif #endif
#ifdef FUSION_FC_OP
class FusionFcParam : public OpParam { class FusionFcParam : public OpParam {
public: public:
FusionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs, FusionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
...@@ -911,9 +984,11 @@ class FusionFcParam : public OpParam { ...@@ -911,9 +984,11 @@ class FusionFcParam : public OpParam {
int y_num_col_dims_; int y_num_col_dims_;
int axis_; int axis_;
}; };
#ifdef FUSION_FCRELU_OP
using FusionFcReluParam = FusionFcParam;
#endif #endif
#ifdef FUSION_CONVADD_OP
class FusionConvAddParam : public OpParam { class FusionConvAddParam : public OpParam {
public: public:
FusionConvAddParam(const VariableNameMap &inputs, FusionConvAddParam(const VariableNameMap &inputs,
...@@ -960,9 +1035,8 @@ class FusionConvAddParam : public OpParam { ...@@ -960,9 +1035,8 @@ class FusionConvAddParam : public OpParam {
}; };
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
#endif
#ifdef FUSION_CONVADD_RELU_OP #ifdef FUSION_CONVADDRELU_OP
class FusionConvAddReluParam : public FusionConvAddParam { class FusionConvAddReluParam : public FusionConvAddParam {
public: public:
FusionConvAddReluParam(const VariableNameMap &inputs, FusionConvAddReluParam(const VariableNameMap &inputs,
...@@ -993,7 +1067,7 @@ class FusionConvAddBNReluParam : public OpParam { ...@@ -993,7 +1067,7 @@ class FusionConvAddBNReluParam : public OpParam {
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope); input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs); epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs); momentum_ = GetAttr<float>("momentum", attrs);
is_test_ = GetAttr<bool>("is_test", attrs); // is_test_ = GetAttr<bool>("is_test", attrs);
} }
Tensor *Bias() const { return bias_; } Tensor *Bias() const { return bias_; }
...@@ -1055,8 +1129,91 @@ class FusionConvAddBNReluParam : public OpParam { ...@@ -1055,8 +1129,91 @@ class FusionConvAddBNReluParam : public OpParam {
Tensor *new_bias_; Tensor *new_bias_;
Tensor *new_scale_; Tensor *new_scale_;
}; };
#endif
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); #ifdef FUSION_CONVADDBN_OP
class FusionConvAddBNParam : public OpParam {
public:
FusionConvAddBNParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
bias_ = InputYFrom<LoDTensor>(inputs, scope);
axis_ = GetAttr<int>("axis", attrs);
filter_ = FilterFrom<LoDTensor>(inputs, scope);
input_ = InputFrom<LoDTensor>(inputs, scope);
output_y_ = OutputYFrom<LoDTensor>(outputs, scope);
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
dilations_ = GetAttr<vector<int>>("dilations", attrs);
groups = GetAttr<int>("groups", attrs);
input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
Tensor *Bias() const { return bias_; }
const int &Axis() const { return axis_; }
const Tensor *Input() const { return input_; }
const Tensor *Filter() const { return filter_; }
Tensor *OutputY() const { return output_y_; }
const vector<int> &Strides() const { return strides_; }
const vector<int> &Paddings() const { return paddings_; }
const vector<int> &Dilations() const { return dilations_; }
const int &Groups() const { return groups; }
const Tensor *InputBias() const { return input_bias_; }
const Tensor *InputMean() const { return input_mean_; }
const Tensor *InputScale() const { return input_scale_; }
const Tensor *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; }
void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; }
const Tensor *NewScale() const { return new_scale_; }
const Tensor *NewBias() const { return new_bias_; }
protected:
Tensor *bias_;
int axis_;
Tensor *input_;
Tensor *output_y_;
Tensor *filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
int groups;
Tensor *input_bias_;
Tensor *input_mean_;
Tensor *input_scale_;
Tensor *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
Tensor *new_bias_;
Tensor *new_scale_;
};
#endif #endif
#ifdef FUSION_DWCONVBNRELU_OP #ifdef FUSION_DWCONVBNRELU_OP
...@@ -1078,7 +1235,7 @@ class FusionDWConvBNReluParam : public OpParam { ...@@ -1078,7 +1235,7 @@ class FusionDWConvBNReluParam : public OpParam {
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope); input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs); epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs); momentum_ = GetAttr<float>("momentum", attrs);
is_test_ = GetAttr<bool>("is_test", attrs); // is_test_ = GetAttr<bool>("is_test", attrs);
} }
const Tensor *Input() const { return input_; } const Tensor *Input() const { return input_; }
...@@ -1139,6 +1296,85 @@ class FusionDWConvBNReluParam : public OpParam { ...@@ -1139,6 +1296,85 @@ class FusionDWConvBNReluParam : public OpParam {
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param); Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
#endif #endif
#ifdef FUSION_CONVBNRELU_OP
class FusionConvBNReluParam : public OpParam {
public:
FusionConvBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
filter_ = FilterFrom<LoDTensor>(inputs, scope);
input_ = InputFrom<LoDTensor>(inputs, scope);
output_ = OutFrom<LoDTensor>(outputs, scope);
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
dilations_ = GetAttr<vector<int>>("dilations", attrs);
groups = GetAttr<int>("groups", attrs);
input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *Input() const { return input_; }
const Tensor *Filter() const { return filter_; }
Tensor *Output() const { return output_; }
const vector<int> &Strides() const { return strides_; }
const vector<int> &Paddings() const { return paddings_; }
const vector<int> &Dilations() const { return dilations_; }
const int &Groups() const { return groups; }
const Tensor *InputBias() const { return input_bias_; }
const Tensor *InputMean() const { return input_mean_; }
const Tensor *InputScale() const { return input_scale_; }
const Tensor *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; }
void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; }
const Tensor *NewScale() const { return new_scale_; }
const Tensor *NewBias() const { return new_bias_; }
protected:
Tensor *input_;
Tensor *output_;
Tensor *filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
int groups;
Tensor *input_bias_;
Tensor *input_mean_;
Tensor *input_scale_;
Tensor *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
Tensor *new_bias_;
Tensor *new_scale_;
};
#endif
#ifdef IM2SEQUENCE_OP #ifdef IM2SEQUENCE_OP
class Im2SequenceParam : public OpParam { class Im2SequenceParam : public OpParam {
public: public:
...@@ -1190,5 +1426,9 @@ class DropoutParam : public OpParam { ...@@ -1190,5 +1426,9 @@ class DropoutParam : public OpParam {
}; };
#endif #endif
#ifdef REGION_OP
class RegionParam : public OpParam {};
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -54,7 +54,7 @@ void PoolOp<DeviceType, T>::InferShape() const { ...@@ -54,7 +54,7 @@ void PoolOp<DeviceType, T>::InferShape() const {
} }
this->param_.Output()->Resize(framework::make_ddim(output_shape)); this->param_.Output()->Resize(framework::make_ddim(output_shape));
} }
template class PoolOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -23,7 +23,7 @@ void PReluOp<Dtype, T>::InferShape() const { ...@@ -23,7 +23,7 @@ void PReluOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims(); auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims); this->param_.Out()->Resize(input_dims);
} }
template class PReluOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -44,7 +44,7 @@ void PriorBoxOp<Dtype, T>::InferShape() const { ...@@ -44,7 +44,7 @@ void PriorBoxOp<Dtype, T>::InferShape() const {
this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec)); this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec));
this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec)); this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec));
} }
template class PriorBoxOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -23,7 +23,7 @@ void ReluOp<Dtype, T>::InferShape() const { ...@@ -23,7 +23,7 @@ void ReluOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims(); auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims); this->param_.Out()->Resize(input_dims);
} }
template class ReluOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -27,7 +27,7 @@ void ReshapeOp<Dtype, T>::InferShape() const { ...@@ -27,7 +27,7 @@ void ReshapeOp<Dtype, T>::InferShape() const {
auto out_dims = ValidateShape(shape, input_x_dims); auto out_dims = ValidateShape(shape, input_x_dims);
this->param_.Out()->Resize(out_dims); this->param_.Out()->Resize(out_dims);
} }
template class ReshapeOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -24,7 +24,7 @@ void ResizeOp<Dtype, T>::InferShape() const { ...@@ -24,7 +24,7 @@ void ResizeOp<Dtype, T>::InferShape() const {
auto out_dims = CalOutputShape(this->param_); auto out_dims = CalOutputShape(this->param_);
this->param_.Out()->Resize(out_dims); this->param_.Out()->Resize(out_dims);
} }
template class ResizeOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -24,7 +24,7 @@ void ScaleOp<Dtype, T>::InferShape() const { ...@@ -24,7 +24,7 @@ void ScaleOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims(); auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims); this->param_.Out()->Resize(input_dims);
} }
template class ScaleOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -22,7 +22,7 @@ template <typename DeviceType, typename T> ...@@ -22,7 +22,7 @@ template <typename DeviceType, typename T>
void SigmoidOp<DeviceType, T>::InferShape() const { void SigmoidOp<DeviceType, T>::InferShape() const {
this->param_.Out()->Resize(this->param_.InputX()->dims()); this->param_.Out()->Resize(this->param_.InputX()->dims());
} }
template class SigmoidOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -23,7 +23,7 @@ template <typename Dtype, typename T> ...@@ -23,7 +23,7 @@ template <typename Dtype, typename T>
void SliceOp<Dtype, T>::InferShape() const { void SliceOp<Dtype, T>::InferShape() const {
/// todo: add InputShape() detection. /// todo: add InputShape() detection.
} }
template class SliceOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -22,7 +22,7 @@ template <typename DeviceType, typename T> ...@@ -22,7 +22,7 @@ template <typename DeviceType, typename T>
void SoftmaxOp<DeviceType, T>::InferShape() const { void SoftmaxOp<DeviceType, T>::InferShape() const {
this->param_.Out()->Resize(this->param_.InputX()->dims()); this->param_.Out()->Resize(this->param_.InputX()->dims());
} }
template class SoftmaxOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -47,7 +47,7 @@ void TransposeOp<Dtype, T>::InferShape() const { ...@@ -47,7 +47,7 @@ void TransposeOp<Dtype, T>::InferShape() const {
} }
this->param_.Out()->Resize(out_dims); this->param_.Out()->Resize(out_dims);
} }
template class TransposeOp<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
set(dir ${CMAKE_CURRENT_SOURCE_DIR}) set(dir ${CMAKE_CURRENT_SOURCE_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build") set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build")
if (NET STREQUAL "googlenet") if ("googlenet" IN_LIST NET)
# gen test # gen test
ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-googlenet paddle-mobile) target_link_libraries(test-googlenet paddle-mobile)
elseif (NET STREQUAL "mobilenet") elseif ("mobilenet" IN_LIST NET)
# gen test # gen test
ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenet paddle-mobile) target_link_libraries(test-mobilenet paddle-mobile)
elseif (NET STREQUAL "yolo") elseif ("yolo" IN_LIST NET)
# gen test # gen test
ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yolo paddle-mobile) target_link_libraries(test-yolo paddle-mobile)
elseif (NET STREQUAL "squeezenet") elseif ("squeezenet" IN_LIST NET)
# gen test # gen test
ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-squeezenet paddle-mobile) target_link_libraries(test-squeezenet paddle-mobile)
elseif(NET STREQUAL "resnet") elseif("resnet" IN_LIST NET)
# gen test # gen test
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet paddle-mobile) target_link_libraries(test-resnet paddle-mobile)
...@@ -145,6 +145,10 @@ else () ...@@ -145,6 +145,10 @@ else ()
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile) target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif() endif()
...@@ -43,7 +43,7 @@ template <typename DeviceType, typename OpType> ...@@ -43,7 +43,7 @@ template <typename DeviceType, typename OpType>
class Executor4Test : public Executor<DeviceType> { class Executor4Test : public Executor<DeviceType> {
public: public:
Executor4Test(Program<DeviceType> p, string op_type, Executor4Test(Program<DeviceType> p, string op_type,
bool use_optimize = false) bool use_optimize = false, int predict_op_count = 1)
: Executor<DeviceType>() { : Executor<DeviceType>() {
this->use_optimize_ = use_optimize; this->use_optimize_ = use_optimize;
this->program_ = p; this->program_ = p;
...@@ -57,12 +57,14 @@ class Executor4Test : public Executor<DeviceType> { ...@@ -57,12 +57,14 @@ class Executor4Test : public Executor<DeviceType> {
LOG(paddle_mobile::LogLevel::kLOG_ERROR) LOG(paddle_mobile::LogLevel::kLOG_ERROR)
<< "to_predict_program_ == nullptr"; << "to_predict_program_ == nullptr";
} }
const std::vector<std::shared_ptr<BlockDesc>> blocks = const std::vector<std::shared_ptr<BlockDesc>> blocks =
this->to_predict_program_->Blocks(); this->to_predict_program_->Blocks();
for (std::shared_ptr<BlockDesc> block_desc : blocks) { for (std::shared_ptr<BlockDesc> block_desc : blocks) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops(); std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (std::shared_ptr<OpDesc> op : ops) { for (int i = 0; i < ops.size(); ++i) {
if (op->Type() == op_type) { auto op = ops[i];
if (op->Type() == op_type && i < predict_op_count) {
DLOG << "匹配到: " << op->Type(); DLOG << "匹配到: " << op->Type();
/// test first meeting op in program /// test first meeting op in program
...@@ -72,11 +74,17 @@ class Executor4Test : public Executor<DeviceType> { ...@@ -72,11 +74,17 @@ class Executor4Test : public Executor<DeviceType> {
op->Type(), op->GetInputs(), op->GetOutputs(), op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), this->program_.scope); op->GetAttrMap(), this->program_.scope);
this->ops_of_block_[*block_desc.get()].push_back(op_ptr); this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
break;
} }
} }
} }
this->InitMemory(); this->InitMemory();
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
this->to_predict_program_->Block(0);
auto &ops = this->ops_of_block_[*to_predict_block.get()];
for (const auto &op : ops) {
op->Init();
}
} }
template <typename T = LoDTensor> template <typename T = LoDTensor>
...@@ -130,9 +138,6 @@ class Executor4Test : public Executor<DeviceType> { ...@@ -130,9 +138,6 @@ class Executor4Test : public Executor<DeviceType> {
auto *output_tensor = con_output->GetMutable<LoDTensor>(); auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>(dDim); output_tensor->mutable_data<float>(dDim);
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block = std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
this->to_predict_program_->Block(0); this->to_predict_program_->Block(0);
for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size(); for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size();
...@@ -141,6 +146,7 @@ class Executor4Test : public Executor<DeviceType> { ...@@ -141,6 +146,7 @@ class Executor4Test : public Executor<DeviceType> {
op->Run(); op->Run();
} }
return out_tensor; return std::make_shared<paddle_mobile::framework::Tensor>(
paddle_mobile::framework::Tensor(*output_tensor));
} }
}; };
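
With the new predict_op_count parameter the harness can register the first N matching ops rather than only the first, and every registered op now gets Init() before Run(). A hypothetical instantiation, modeled on the fusion test later in this diff (the count of 2 is an illustrative assumption):

// Hypothetical usage: register and run the first two matching fusion ops.
// Assumes the same setup as the fusion test (../test_include.h and a loaded
// `program`).
Executor4Test<paddle_mobile::CPU,
              paddle_mobile::operators::FusionConvAddBNReluOp<
                  paddle_mobile::CPU, float>>
    executor(program, "fusion_conv_add_bn_relu", true,
             /*predict_op_count=*/2);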
...@@ -19,7 +19,9 @@ int main() { ...@@ -19,7 +19,9 @@ int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../../../test/models/googlenet // ../../../test/models/googlenet
// ../../../test/models/mobilenet // ../../../test/models/mobilenet
auto program = loader.Load(g_googlenet, true); // auto program = loader.Load(g_googlenet, true);
auto program = loader.Load(g_mobilenet_ssd, true);
// auto program = loader.Load(g_googlenet_combine + "/model", // auto program = loader.Load(g_googlenet_combine + "/model",
// g_googlenet_combine + // g_googlenet_combine +
// "/params", true); // "/params", true);
......
...@@ -23,7 +23,7 @@ int main() { ...@@ -23,7 +23,7 @@ int main() {
auto time1 = time(); auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) { if (paddle_mobile.Load(g_googlenet, optimize)) {
auto time2 = time(); auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms"; DLOG << "load cost: " << time_diff(time1, time1) << "ms";
std::vector<float> input; std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224}; std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims); GetInput<float>(g_test_image_1x3x224x224, &input, dims);
......
...@@ -12,28 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,28 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <fstream> #include <iostream>
#include "../test_helper.h" #include "../test_helper.h"
#include "../test_include.h" #include "../test_include.h"
int main() { int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time(); auto time1 = time();
if (paddle_mobile.Load(g_mobilenet_ssd, true)) { auto isok = paddle_mobile.Load(
std::string(g_mobilenet_ssd_gesture) + "/model",
std::string(g_mobilenet_ssd_gesture) + "/params", true);
// auto isok = paddle_mobile.Load(g_mobilenet_ssd, false);
if (isok) {
auto time2 = time(); auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms"; std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 300, 300}; std::vector<int64_t> dims{1, 3, 300, 300};
Tensor input_tensor; GetInput<float>(g_hand, &input, dims);
SetupTensor<float>(&input_tensor, {1, 3, 300, 300}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time(); auto time3 = time();
paddle_mobile.Predict(input, dims); auto output = paddle_mobile.Predict(input, dims);
auto time4 = time(); auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
} }
return 0; return 0;
} }
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <fstream> #include <iostream>
#include "../test_helper.h" #include "../test_helper.h"
#include "../test_include.h" #include "../test_include.h"
...@@ -22,20 +22,23 @@ int main() { ...@@ -22,20 +22,23 @@ int main() {
auto time1 = time(); auto time1 = time();
if (paddle_mobile.Load(g_mobilenet, true)) { if (paddle_mobile.Load(g_mobilenet, true)) {
auto time2 = time(); auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms"; std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224}; std::vector<int64_t> dims{1, 3, 224, 224};
Tensor input_tensor; GetInput<float>(g_test_image_1x3x224x224, &input, dims);
SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
static_cast<float>(1)); for (int i = 0; i < 10; ++i) {
auto time3 = time();
std::vector<float> input(input_tensor.data<float>(), auto vec_result = paddle_mobile.Predict(input, dims);
input_tensor.data<float>() + input_tensor.numel()); auto time4 = time();
auto time3 = time(); std::vector<float>::iterator biggest =
auto vec_result = paddle_mobile.Predict(input, dims); std::max_element(std::begin(vec_result), std::end(vec_result));
auto time4 = time(); std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
DLOG << "predict cost :" << time_diff(time3, time4) << "ms"; std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
}
} }
return 0; return 0;
......
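
The argmax printed above is only a class index; mapping it to a readable name requires the label file distributed with the model. One hedged way to load such a file (the path and the one-label-per-line format are assumptions):

#include <fstream>
#include <string>
#include <vector>

// Load one label per line, e.g. from ImageNet's synset_words.txt.
std::vector<std::string> LoadLabels(const std::string &path) {
  std::vector<std::string> labels;
  std::ifstream in(path);
  for (std::string line; std::getline(in, line);) labels.push_back(line);
  return labels;
}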
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/fusion_conv_add_bn_relu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_mobilenet, true);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::FusionConvAddBNReluOp<
paddle_mobile::CPU, float>>
executor(program, "fusion_conv_add_bn_relu", true);
std::cout << "executor 4 test: " << std::endl;
paddle_mobile::framework::Tensor input;
GetInput<float>(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224});
// // Use SetupTensor if no local input image is available.
// SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
// static_cast<float>(1));
DLOG << " fuck: " << input;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112});
std::cout << "before predict: " << std::endl;
auto output =
executor.Predict(input, "data", "conv2_1_dw_bn.tmp_2", out_ddim);
std::cout << "after predict " << std::endl;
auto output_ptr = output->data<float>();
int stride = output->numel() / 100;
for (int i = 0; i < 100; i++) {
DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride];
}
// for (int i = 0; i < 100; i++) {
// DLOG << " index:" << i << " value: "<< output_ptr[i];
// }
// for (int j = 0; j < output->numel(); ++j) {
// std::cout << " (index: " << j << " value: " << output_ptr[j] << ") ";
// }
std::cout << std::endl;
return 0;
}
@@ -16,22 +16,29 @@ limitations under the License. */
 
 #include <fstream>
 #include <random>
+#include <string>
+#include <vector>
 
 #include "common/common.h"
 #include "common/log.h"
 #include "framework/ddim.h"
 #include "framework/tensor.h"
 
-static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
-static const std::string g_squeezenet = "../models/squeezenet";
-static const std::string g_googlenet = "../models/googlenet";
-static const std::string g_mobilenet = "../models/mobilenet";
-static const std::string g_resnet_50 = "../models/resnet_50";
-static const std::string g_resnet = "../models/resnet";
-static const std::string g_googlenet_combine = "../models/googlenet_combine";
-static const std::string g_yolo = "../models/yolo";
-static const std::string g_test_image_1x3x224x224 =
+static const char *g_mobilenet_ssd = "../models/mobilenet+ssd";
+static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture";
+static const char *g_squeezenet = "../models/squeezenet";
+static const char *g_googlenet = "../models/googlenet";
+static const char *g_mobilenet = "../models/mobilenet";
+static const char *g_resnet_50 = "../models/resnet_50";
+static const char *g_resnet = "../models/resnet";
+static const char *g_googlenet_combine = "../models/googlenet_combine";
+static const char *g_yolo = "../models/yolo";
+static const char *g_test_image_1x3x224x224 =
     "../images/test_image_1x3x224x224_float";
+static const char *g_test_image_1x3x224x224_banana =
+    "../images/input_3x224x224_banana";
+static const char *g_hand = "../images/hand_image";
 
 using paddle_mobile::framework::DDim;
 using paddle_mobile::framework::Tensor;
@@ -62,9 +69,9 @@ void GetInput(const std::string &input_name, std::vector<T> *input,
     size *= dim;
   }
 
-  T *input_ptr = (T *)malloc(sizeof(T) * size);
+  T *input_ptr = reinterpret_cast<T *>(malloc(sizeof(T) * size));
   std::ifstream in(input_name, std::ios::in | std::ios::binary);
-  in.read((char *)(input_ptr), size * sizeof(T));
+  in.read(reinterpret_cast<char *>(input_ptr), size * sizeof(T));
   in.close();
   for (int i = 0; i < size; ++i) {
     input->push_back(input_ptr[i]);
@@ -79,6 +86,6 @@ void GetInput(const std::string &input_name,
   T *input_ptr = input->mutable_data<T>(dims);
 
   std::ifstream in(input_name, std::ios::in | std::ios::binary);
-  in.read((char *)(input_ptr), input->numel() * sizeof(T));
+  in.read(reinterpret_cast<char *>(input_ptr), input->numel() * sizeof(T));
  in.close();
 }
 #!/usr/bin/env bash
+NETS=""
+declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet")
 
 build_for_mac() {
     if [ ! `which brew` ]; then
@@ -38,7 +40,8 @@ build_for_android() {
     fi
 
     if [ -z "$PLATFORM" ]; then
-        PLATFORM="arm-v7a" # Users could choose "arm-v8a" or other platforms from the command line.
+        PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform.
+        # PLATFORM="arm-v8a"
     fi
 
     if [ "${PLATFORM}" = "arm-v7a" ]; then
@@ -59,7 +62,8 @@ build_for_android() {
     ANDROID_PLATFORM_VERSION="android-22"
     TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
     ANDROID_ARM_MODE="arm"
-    if [ $# -eq 1 ]; then
+
+    if [ "${#NETS}" -gt 0 ]; then
         cmake .. \
             -B"../build/release/${PLATFORM}" \
             -DANDROID_ABI="${ABI}" \
@@ -69,7 +73,7 @@ build_for_android() {
             -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
             -DANDROID_STL=c++_static \
             -DANDROID=true \
-            -DNET=$1 \
+            -DNET="${NETS}" \
             -D"${ARM_PLATFORM}"=true
     else
@@ -92,23 +96,25 @@ build_for_ios() {
 #    rm -rf "../build"
     PLATFORM="ios"
     MODE="Release"
-    BUILD_DIR=../build/release/"${PLATFORM}"
+    BUILD_DIR=../build/release/"${PLATFORM}"/
     TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
     mkdir -p "${BUILD_DIR}"
-    if [ $# -eq 1 ]; then
+    if [ "${#NETS}" -gt 0 ]; then
         cmake .. \
             -B"${BUILD_DIR}" \
             -DCMAKE_BUILD_TYPE="${MODE}" \
-            -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
             -DIOS_PLATFORM=OS \
-            -DNET=$1 \
+            -DIOS_ARCH="${IOS_ARCH}" \
+            -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
+            -DNET="${NETS}" \
             -DIS_IOS="true"
     else
         cmake .. \
             -B"${BUILD_DIR}" \
             -DCMAKE_BUILD_TYPE="${MODE}" \
-            -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
             -DIOS_PLATFORM=OS \
+            -DIOS_ARCH="${IOS_ARCH}" \
+            -DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
             -DIS_IOS="true"
     fi
     cd "${BUILD_DIR}"
@@ -120,7 +126,7 @@ build_for_ios() {
 }
 
 build_error() {
-    echo "unknown argument"
+    echo "unknown target : $1"
 }
 
 if [ $# -lt 1 ]; then
@@ -128,31 +134,37 @@ if [ $# -lt 1 ]; then
     echo "available targets: ios|android"
     echo "sample usage: ./build.sh android"
 else
-    if [ $# -eq 2 ]; then
-        if [ $2 != "googlenet" -a $2 != "mobilenet" -a $2 != "yolo" -a $2 != "squeezenet" -a $2 != "resnet" ]; then
-            if [ $1 = "android" ]; then
-                build_for_android
-            elif [ $1 = "ios" ]; then
-                build_for_ios
-            else
-                build_error
-            fi
-        else
-            if [ $1 = "android" ]; then
-                build_for_android $2
-            elif [ $1 = "ios" ]; then
-                build_for_ios $2
-            else
-                build_error
-            fi
-        fi
-    else
-        if [ $1 = "android" ]; then
-            build_for_android
-        elif [ $1 = "ios" ]; then
-            build_for_ios
-        else
-            build_error
-        fi
-    fi
+    params=($@)
+    for (( i=1; i<$#; i++ )); do
+        if [ ${i} != 1 ]; then
+            NETS=$NETS$";"
+        fi
+        NETS=$NETS$"${params[i]}"
+    done
+    params=${@:2}
+
+    supported=false
+    for name in ${params[@]}; do
+        match=false
+        for net in ${supportedNets[@]}; do
+            if [ "$name"x = "$net"x ]; then
+                supported=true
+                match=true
+                break 1
+            fi
+        done
+        if [ "$match" = false ]; then
+            echo "${name} not supported!"
+            echo "supported nets are: ${supportedNets[@]}"
+            exit -1
+        fi
+    done
+
+    if [ $1 = "android" ]; then
+        build_for_android
+    elif [ $1 = "ios" ]; then
+        build_for_ios
+    else
+        build_error "$1"
+    fi
 fi
\ No newline at end of file
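With this change, build.sh accepts any number of supported nets after the build target and joins them into the semicolon-separated NETS list that is passed as -DNET. A usage sketch (the net choices are illustrative):

```sh
# compile only the operators needed by googlenet and mobilenet
./build.sh android googlenet mobilenet

# no nets given: build with the full default operator set
./build.sh ios
```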
@@ -34,6 +34,7 @@ set (CMAKE_SYSTEM_VERSION 1)
 set (UNIX True)
 set (APPLE True)
 set (IOS True)
+set (IOS_ARCH armv7 armv7s arm64)
 
 # Required as of cmake 2.8.10
 set (CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
@@ -159,7 +160,6 @@ set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS su
 
 # set the architecture for iOS
 if (${IOS_PLATFORM} STREQUAL "OS")
-    set (IOS_ARCH armv7 armv7s arm64)
 elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR")
     set (IOS_ARCH i386)
 elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64")
BEGIN {
print "digraph G {"
}
/op:/ {
id++
opname[id] = $NF
}
/input/ {
type = "input"
para = $NF
if (input[id]) {
input[id] = input[id] "|"
}
input[id] = input[id] "<" para ">" para
}
/output/ {
type = "output"
para = $NF
if (output[id]) {
output[id] = output[id] "|"
}
output[id] = output[id] "<" para ">" para
}
/attr/ {
type = "attr"
aname = $NF
if (attr_key[id]) {
attr_key[id] = attr_key[id] "|"
attr_value[id] = attr_value[id] "|"
}
attr_key[id] = attr_key[id] $NF
}
/argument/ {
if (type == "attr") {
split($0, arr, " - ")
attr_value[id] = attr_value[id] arr[2]
} else if ((type == "input") || (type == "output")) {
if (!var2id[$NF]) {
var_id++
var[var_id] = $NF
var2id[$NF] = var_id
}
varid = var2id[$NF]
lid++
if (type == "input") {
line[lid] = "var_" varid " -> " "op_" id ":<" para ">"
if (xout[$NF]) {
xi++
xline[xi] = "xop_" xout[$NF] " -> " "xop_" id
}
} else if (type == "output") {
line[lid] = "op_" id ":<" para ">" " -> " "var_" varid
xout[$NF] = id
}
}
}
/var name/ {
varname = $NF
vid = var2id[varname]
}
/var tensor desc dim / {
if (tensor[vid]) tensor[vid] = tensor[vid] " x "
tensor[vid] = tensor[vid] $NF
}
END {
print "subgraph cluster_G0 {"
for (i = 1; i <= id; i++) {
print "xop_" i "[label=\"" i ". " opname[i] "\"]"
}
for (i = 1; i <= xi; i++) {
print xline[i]
}
print "}"
for (i = 1; i <= id; i++) {
print "op_" i "[group=op;shape=record;label=\"{{" input[i] "}|<op>" i ". " opname[i] "|{" output[i] "}}\"]"
}
for (i = 1; i <= var_id; i++) {
print "var_" i "[label=\"" var[i] " [" tensor[i] "]\"]"
}
for (i = 1; i <= lid; i++) {
print line[i]
}
for (i = 1; i <= id; i++) {
print "attr_" i "[shape=record;label=\"{" attr_key[i] "}|{" attr_value[i] "}\"]"
print "attr_" i " -> " "op_" i ":<op>"
}
print "}"
}
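This awk program turns an operator/variable dump (lines containing "op:", "input", "output", "attr", "argument", "var name", and "var tensor desc dim") into a Graphviz graph of the network. A hypothetical invocation — the script name net2dot.awk and log file net.log are placeholders, and Graphviz must be installed:

```sh
# 'net2dot.awk' and 'net.log' are placeholder names
awk -f net2dot.awk net.log > net.dot
dot -Tpng net.dot -o net.png
```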
-if (NET STREQUAL "googlenet")
+set(FOUND_MATCH OFF)
+
+if ("googlenet" IN_LIST NET)
+    message("googlenet enabled")
     set(CONCAT_OP ON)
     set(CONV_OP ON)
     set(LRN_OP ON)
@@ -8,8 +10,13 @@ if (NET STREQUAL "googlenet")
     set(POOL_OP ON)
     set(RELU_OP ON)
     set(FUSION_CONVADD_OP ON)
-    set(FUSION_CONVADD_RELU_OP ON)
-elseif (NET STREQUAL "mobilenet")
+    set(FUSION_CONVADDRELU_OP ON)
+    set(FOUND_MATCH ON)
+endif()
+
+if ("mobilenet" IN_LIST NET)
+    message("mobilenet enabled")
     set(CONV_OP ON)
     set(ELEMENTWISEADD_OP ON)
     set(RELU_OP ON)
@@ -21,12 +28,23 @@ elseif (NET STREQUAL "mobilenet")
     set(RESHAPE_OP ON)
     set(FUSION_CONVADDBNRELU_OP ON)
     set(FUSION_CONVADD_OP ON)
-elseif (NET STREQUAL "yolo")
+    set(FOUND_MATCH ON)
+endif()
+
+if ("yolo" IN_LIST NET)
+    message("yolo enabled")
     set(BATCHNORM_OP ON)
     set(CONV_OP ON)
     set(RELU_OP ON)
     set(ELEMENTWISEADD_OP ON)
-elseif (NET STREQUAL "squeezenet")
+    set(FOUND_MATCH ON)
+endif()
+
+if ("squeezenet" IN_LIST NET)
+    message("squeezenet enabled")
     set(CONCAT_OP ON)
     set(CONV_OP ON)
     set(RELU_OP ON)
@@ -34,15 +52,45 @@ elseif (NET STREQUAL "squeezenet")
     set(POOL_OP ON)
     set(RESHAPE_OP ON)
     set(SOFTMAX_OP ON)
-elseif (NET STREQUAL "resnet")
+    set(FOUND_MATCH ON)
+endif()
+
+if ("resnet" IN_LIST NET)
+    message("resnet enabled")
+    set(CONCAT_OP ON)
     set(CONV_OP ON)
-    set(BATCHNORM_OP ON)
+    set(RELU_OP ON)
     set(ELEMENTWISEADD_OP ON)
+    set(POOL_OP ON)
+    set(RESHAPE_OP ON)
     set(SOFTMAX_OP ON)
+    set(MUL_OP ON)
+    set(FOUND_MATCH ON)
+endif()
+
+if ("FPGAnets" IN_LIST NET)
+    message("FPGAnets enabled")
+    set(FUSION_CONVADDRELU_OP ON)
+    set(FUSION_CONVADDBNRELU_OP ON)
+    set(FUSION_CONVADDBN_OP ON)
+    set(FUSION_POOLBN_OP ON)
+    set(FUSION_ELEMENTWISEADDRELU_OP ON)
+    set(FUSION_FC_OP ON)
+    set(FUSION_FCRELU_OP ON)
+    set(REGION_OP ON)
     set(POOL_OP ON)
-    set(RELU_OP ON)
-else ()
+    set(CONCAT_OP ON)
+    set(SOFTMAX_OP ON)
+    set(DROPOUT_OP ON)
+    set(FOUND_MATCH ON)
+endif()
+
+if(NOT FOUND_MATCH)
+    message("--default--")
     set(BATCHNORM_OP ON)
     set(BOXCODER_OP ON)
     set(CONCAT_OP ON)
@@ -50,7 +98,7 @@ else ()
     set(DEPTHWISECONV_OP ON)
     set(ELEMENTWISEADD_OP ON)
     set(FUSION_CONVADD_OP ON)
-    set(CONVADDRELU_OP ON)
+    set(FUSION_CONVADDRELU_OP ON)
     set(FUSION_FC_OP ON)
     set(LRN_OP ON)
     set(MUL_OP ON)
@@ -62,15 +110,17 @@ else ()
     set(SIGMOID_OP ON)
     set(SOFTMAX_OP ON)
     set(TRANSPOSE_OP ON)
-    set(FUSION_CONVADD_RELU_OP ON)
     set(FUSION_CONVADDBNRELU_OP ON)
     set(FUSION_DWCONVBNRELU_OP ON)
+    set(FUSION_CONVBNRELU_OP ON)
     set(PRELU_OP ON)
     set(RESIZE_OP ON)
     set(SCALE_OP ON)
     set(SLICE_OP ON)
     set(DROPOUT_OP ON)
     set(IM2SEQUENCE_OP ON)
+endif()
 
 # option(BATCHNORM_OP "" ON)
 # option(BOXCODER_OP "" ON)
 # option(CONCAT_OP "" ON)
@@ -78,7 +128,7 @@ else ()
 # option(DEPTHWISECONV_OP "" ON)
 # option(ELEMENTWISEADD_OP "" ON)
 # option(FUSION_CONVADD_OP "" ON)
-# option(CONVADDRELU_OP "" ON)
+# option(FUSION_CONVADDRELU_OP "" ON)
 # option(FUSION_FC_OP "" ON)
 # option(LRN_OP "" ON)
 # option(MUL_OP "" ON)
@@ -90,8 +140,7 @@ else ()
 # option(SIGMOID_OP "" ON)
 # option(SOFTMAX_OP "" ON)
 # option(TRANSPOSE_OP "" ON)
-# option(FUSION_CONVADD_RELU_OP "" ON)
-endif ()
+# endif ()
 
 if (BATCHNORM_OP)
     add_definitions(-DBATCHNORM_OP)
@@ -114,8 +163,8 @@ endif()
 if (FUSION_CONVADD_OP)
     add_definitions(-DFUSION_CONVADD_OP)
 endif()
-if (CONVADDRELU_OP)
-    add_definitions(-DCONVADDRELU_OP)
+if (FUSION_CONVADDRELU_OP)
+    add_definitions(-DFUSION_CONVADDRELU_OP)
 endif()
 if (FUSION_FC_OP)
     add_definitions(-DFUSION_FC_OP)
@@ -150,15 +199,17 @@ endif()
 if (TRANSPOSE_OP)
     add_definitions(-DTRANSPOSE_OP)
 endif()
-if (FUSION_CONVADD_RELU_OP)
-    add_definitions(-DFUSION_CONVADD_RELU_OP)
-endif()
 if (FUSION_CONVADDBNRELU_OP)
     add_definitions(-DFUSION_CONVADDBNRELU_OP)
 endif()
 if (FUSION_DWCONVBNRELU_OP)
     add_definitions(-DFUSION_DWCONVBNRELU_OP)
 endif()
+if (FUSION_CONVBNRELU_OP)
+    add_definitions(-DFUSION_CONVBNRELU_OP)
+endif()
 if (PRELU_OP)
     add_definitions(-DPRELU_OP)
 endif()
@@ -177,3 +228,20 @@ endif()
 if (IM2SEQUENCE_OP)
     add_definitions(-DIM2SEQUENCE_OP)
 endif()
+
+if (FUSION_CONVADDBN_OP)
+    add_definitions(-DFUSION_CONVADDBN_OP)
+endif()
+if (FUSION_FCRELU_OP)
+    add_definitions(-DFUSION_FCRELU_OP)
+endif()
+if (FUSION_POOLBN_OP)
+    add_definitions(-DFUSION_POOLBN_OP)
+endif()
+if (FUSION_ELEMENTWISEADDRELU_OP)
+    add_definitions(-DFUSION_ELEMENTWISEADDRELU_OP)
+endif()
+if (REGION_OP)
+    add_definitions(-DREGION_OP)
+endif()
cmake_minimum_required(VERSION 3.6)
project(quali)
add_definitions(-DENABLE_EXCEPTION)
set(CMAKE_CXX_STANDARD 11)
file(GLOB_RECURSE QULIFICATON_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE QULIFICATON_H src/*.h)
include_directories(. src/)
#add_library(paddle-mobile SHARED ${QULIFICATON_CC} ${QULIFICATON_H} convert.cpp)
add_executable(quantify convert.cpp ${QULIFICATON_CC} ${QULIFICATON_H})
\ No newline at end of file
# Model quantization script

#### Usage guide

1. Start from the paddle-mobile project directory (e.g. ~/PaddleProject/paddle-mobile).
2. cd into the tools/quantification/ directory.
3. Build with cmake:

```sh
cmake .
make
```

4. Run the quantization tool:

```sh
./quantify (0: separated, 1: combined) (input path) (output path)
# quantify the separated googlenet model from /Users/xiebaiyuan/PaddleProject/quali/models/googlenet to ./googlenet_min
./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min
```

*Notes on the quantization tool:*
*1. For a separated model, the model file is expected to be named "__model__".*
*2. For a combined model, the model file is expected to be named "model" and the parameter file "params".*

##### Putting it all together

Taking the non-combined googlenet as an example:

```sh
cd tools/quantification/
cmake .
make

./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min
```
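
For reference, the tool (see LoadWithDump in convert.cpp below) writes each tensor as its original header, then a float min/max pair, then one uint8 per element. A minimal dequantization sketch, assuming a buffer laid out that way:

```cpp
#include <cstdint>
#include <vector>

// Recover approximate float values from one quantized tensor body: the
// writer stores min, max, then round((v - min) / (max - min) * 255) per
// element, so reading side inverts that mapping.
std::vector<float> Dequantize(float min_value, float max_value,
                              const std::vector<uint8_t> &quantized) {
  std::vector<float> values;
  values.reserve(quantized.size());
  const float scale = (max_value - min_value) / 255.0f;
  for (uint8_t q : quantized) {
    values.push_back(min_value + q * scale);
  }
  return values;
}
```

The trade-off: float32 weights shrink to roughly a quarter of their size, at the cost of at most one quantization step of precision per weight.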
#include "src/enforce.h"
#include "src/var_desc.h"
#include "src/program_desc.h"
#include <cstdlib>
#include <string>
#include <cmath>
#include <iostream>
#include <utility>
#include <vector>
#include "src/framework.pb-c.h"
#include "src/protobuf-c.h"
#include <fstream>
#include <iostream>
const size_t kSize64 = sizeof(uint64_t);
const size_t kSize32 = sizeof(uint32_t);
char *Get_binary_data(const std::string &filename) {
FILE *file = fopen(filename.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
filename.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
auto *data = new char[size];
size_t bytes_read = fread(data, 1, static_cast<size_t>(size), file);
  PADDLE_MOBILE_ENFORCE(bytes_read == static_cast<size_t>(size),
                        "bytes read do not match the file size from fseek");
fclose(file);
return data;
}
static size_t ReadBuffer(const char *file_name, uint8_t **out) {
FILE *fp;
fp = fopen(file_name, "rb");
PADDLE_MOBILE_ENFORCE(fp != nullptr, " %s open failed !", file_name);
fseek(fp, 0, SEEK_END);
auto size = static_cast<size_t>(ftell(fp));
rewind(fp);
*out = reinterpret_cast<uint8_t *>(malloc(size));
size_t cur_len = 0;
size_t nread;
while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
cur_len += nread;
}
fclose(fp);
return cur_len;
}
std::shared_ptr<ProgramDesc> loadParams(const std::string &model_path) {
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = nullptr;
size_t read_size = ReadBuffer(model_path.c_str(), &buf);
PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null");
c_program = paddle_mobile__framework__proto__program_desc__unpack(
nullptr, read_size, buf);
PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null");
auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
return originProgramDesc;
}
void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char *dataP, FILE *out_file) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(dataP);
// write version
fwrite(&version, kSize32, 1, out_file);
dataP += kSize32;
  // 2. LoD information: copy the level through so the read pointer stays
  // aligned with the per-level data consumed below
  uint64_t lod_level = *reinterpret_cast<uint64_t *>(dataP);
  // write LoD level
  fwrite(&lod_level, kSize64, 1, out_file);
  dataP += kSize64;
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(dataP);
// write lod size
fwrite(&size, kSize64, 1, out_file);
(dataP) += kSize64;
std::vector<size_t> tmp(size / sizeof(size_t));
for (unsigned long &k : tmp) {
k = *reinterpret_cast<size_t *>(dataP);
(dataP) += sizeof(size_t);
}
// write lod size vector
    fwrite(tmp.data(), sizeof(size_t), tmp.size(), out_file);
}
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(dataP);
// write tensor version
fwrite(&tensor_version, kSize32, 1, out_file);
(dataP) += kSize32;
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(dataP);
// write tensor desc
fwrite(&size, sizeof(int32_t), 1, out_file);
(dataP) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (dataP)[m];
}
fwrite(buf.get(), sizeof(char), static_cast<size_t>(size), out_file);
(dataP) += (sizeof(char) * size);
const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
void *memory = nullptr;
int type_size = 0;
switch (desc.DataType()) {
case paddle_mobile::framework::VARTYPE_TYPE_FP16:
type_size = 2;
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP32:
type_size = 4;
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT32:
type_size = 4;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
type_size = 1;
break;
default:
break;
}
size_t tensorSize = sizeof(char) * memory_size * type_size;
memory = new char[tensorSize];
for (int n = 0; n < tensorSize; ++n) {
static_cast<char *>(memory)[n] = (dataP)[n];
}
dataP += tensorSize;
// for float 32
float min_value = std::numeric_limits<float>::max();
  float max_value = std::numeric_limits<float>::lowest();
for (int k = 0; k < memory_size; ++k) {
min_value = std::min(min_value, static_cast<float *> (memory)[k]);
max_value = std::max(max_value, static_cast<float *> (memory)[k]);
}
fwrite(&min_value, sizeof(float), 1, out_file);
fwrite(&max_value, sizeof(float), 1, out_file);
for (int g = 0; g < memory_size; ++g) {
float value = static_cast<float *> (memory)[g];
auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255);
fwrite(&factor, sizeof(uint8_t), 1, out_file);
}
}
void
quantificate_combined(const std::string &model_path, const std::string &param_path, const std::string &param_min_path) {
auto program = loadParams(model_path);
char *origin_data = Get_binary_data(param_path);
char *data = origin_data;
FILE *out_file = fopen(param_min_path.c_str(), "wb");
for (const auto &block : program->Blocks()) {
for (const auto &var_desc : block->Vars()) {
if (var_desc->Persistable()) {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadWithDump(*var_desc, data, out_file);
}
}
}
fclose(out_file);
  delete[] origin_data;
}
void quantificate_seperated(const std::string model_dir, const std::string param_min_path) {
auto program = loadParams(model_dir + "/__model__");
std::string shell_command = "mkdir " + param_min_path;
system(shell_command.c_str());
for (const auto &block : program->Blocks()) {
for (const auto &var_desc : block->Vars()) {
if (var_desc->Persistable()) {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
std::string file_name = param_min_path + "/" + var_desc->Name();
FILE *out_file = fopen(file_name.c_str(), "wb");
char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
char *data = origin_data;
LoadWithDump(*var_desc, data, out_file);
        delete[] origin_data;
fclose(out_file);
}
}
}
}
int main(int argc, char **argv) {
const std::string kNoteEg = "( eg: ./quantify 1 your_combined_model_path output_path or ./quantify 0 your_seperated_model_path output_path)";
PADDLE_MOBILE_ENFORCE(argc > 1, "wee need params.%s ", kNoteEg.c_str());
std::string action_type = argv[1];
  PADDLE_MOBILE_ENFORCE(argc > 1 && (action_type == "1" || action_type == "0"),
                        "only 0 or 1 supported, current is %s %s ",
                        action_type.c_str(), kNoteEg.c_str());
PADDLE_MOBILE_ENFORCE(argc > 2, "we need your model path. %s ", kNoteEg.c_str());
std::string base_path = argv[2];
PADDLE_MOBILE_ENFORCE(argc > 3, "we need your output path. %s ", kNoteEg.c_str());
std::string output_path = argv[3];
if (action_type == "0") {
// for seperated
const std::string &seperated_min_dir = output_path;
quantificate_seperated(base_path, seperated_min_dir);
return 0;
}
if (action_type == "1") {
// for combined
const std::string &combined_min_dir = output_path;
std::string model_path = base_path + "/model";
std::string param_path = base_path + "/params";
quantificate_combined(model_path, param_path, combined_min_dir);
return 0;
}
return -1;
}
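
As a sanity check on the scheme above: the encoder stores a per-tensor min/max pair and rounds each weight to the nearest of 256 evenly spaced levels, so the per-weight reconstruction error is bounded by half a quantization step (assuming max > min):

$$|v - \hat{v}| \le \frac{\max - \min}{2 \cdot 255}$$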
(11 additional file diffs in this commit are collapsed and not shown.)
 set(ANDROID_ARM_NEON ON)
-include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
\ No newline at end of file
+set(ANDROID_PIE TRUE)
+set(ANDROID_STL "c++_static")
+set(ANDROID_PLATFORM "android-22")
+include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")