Commit 7b3df2f0 authored by dolphin8, committed by GitHub

Merge branch 'develop' into develop

cmake_minimum_required(VERSION 3.0)
cmake_minimum_required(VERSION 3.6)
project(paddle-mobile)
option(DEBUGING "enable debug mode" ON)
option(USE_OPENMP "openmp support" ON)
option(USE_OPENMP "openmp support" OFF)
option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" ON)
# select the platform to build
option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" OFF)
option(FPGA "fpga" OFF)
option(QUANTI "quantification" OFF)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
include_directories(src/)
if(IS_IOS)
set(CMAKE_CXX_FLAGS "-fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
else()
set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
endif()
......@@ -43,7 +44,7 @@ if (LOG_PROFILE)
add_definitions(-DPADDLE_MOBILE_PROFILE)
endif()
if(USE_OPENMP)
if(USE_OPENMP AND NOT IS_IOS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
endif()
......@@ -104,12 +105,21 @@ else()
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
file(GLOB_RECURSE _tmp_list src/fpga/*.cpp src/fpga/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/fpga/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
add_definitions(-DARMV7)
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
......@@ -131,7 +141,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
# NET default
set(NET "defult" CACHE STRING "select net type")
set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet")
set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets")
include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
......@@ -153,3 +163,7 @@ if(DEBUGING)
endif()
endif()
if (QUANTI)
add_subdirectory(tools/quantification)
endif ()
......@@ -27,10 +27,10 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms
- **ARM CPU**
![](http://7xop3k.com1.z0.glb.clouddn.com/15312108766575.jpg)
![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_18.png)
The ARM CPU is paddle-mobile's primary target; the ubiquity of CPUs has always been its advantage. Embedded deep learning requires a large amount of hand-written CPU assembly, and we are coding intensively to squeeze every bit of acceleration out of the hardware.
ARM CPU optimization is still in progress; only generic CPU optimizations are applied so far. On an ARM A73, paddle-mobile (arm-v7) currently runs one MobileNet 1.0 inference in 120+ ms on a single core. This is clearly not our final goal: we are rewriting large parts in assembly, so there is still substantial headroom. Only armv7 is supported at the moment; armv8 support will follow.
ARM CPU optimization is still in progress; only generic CPU optimizations are applied so far. On an ARM A73, paddle-mobile (arm-v7) currently runs one MobileNet 1.0 inference in 110+ ms on a single core. This is clearly not our final goal: we are rewriting large parts in assembly, so there is still substantial headroom. Only armv7 is supported at the moment; armv8 support will follow.
- **Mali GPU**
......
......@@ -24,6 +24,8 @@ const std::string G_OP_TYPE_CONCAT = "concat";
const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const std::string G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const std::string G_OP_TYPE_FC = "fusion_fc";
const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const std::string G_OP_TYPE_LRN = "lrn";
......@@ -42,11 +44,21 @@ const std::string G_OP_TYPE_FETCH = "fetch";
const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
const std::string G_OP_TYPE_DROPOUT = "dropout";
const std::string G_OP_TYPE_FUSION_CONV_RELU = "fusion_conv_relu";
const std::string G_OP_TYPE_FUSION_CONV_BN_SCALE = "fusion_conv_bn_scale";
const std::string G_OP_TYPE_FUSION_CONV_BN_SCALE_RELU =
"fusion_conv_bn_scale_relu";
const std::string G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
"fusion_elementwise_add_relu";
const std::string G_OP_TYPE_REGION = "region";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key = {
{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FUSION_DWCONV_BN_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_BN_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
......@@ -70,6 +82,12 @@ std::unordered_map<
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
{G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}};
{G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_BN_SCALE, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_BN_SCALE_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_POOL_BN, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_REGION, {{"X"}, {"Out"}}}};
} // namespace paddle_mobile
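Note: a minimal sketch of how this table is consumed (GetInputKeys/GetOutKeys in framework/operator.cpp below perform the same lookup; the include path is an assumption):

#include <iostream>
#include "common/types.h"  // assumed header for the declarations above

void PrintFusionKeys() {
  using namespace paddle_mobile;
  auto it = op_input_output_key.find(G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU);
  if (it == op_input_output_key.end()) return;
  for (const auto &in : it->second.first) {    // "X", "Y"
    std::cout << "input:  " << in << std::endl;
  }
  for (const auto &out : it->second.second) {  // "Out"
    std::cout << "output: " << out << std::endl;
  }
}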
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
namespace paddle_mobile {
......@@ -81,6 +82,8 @@ extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const std::string G_OP_TYPE_FC;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU;
extern const std::string G_OP_TYPE_FUSION_CONV_BN_RELU;
extern const std::string G_OP_TYPE_LRN;
extern const std::string G_OP_TYPE_MUL;
......@@ -99,6 +102,13 @@ extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
extern const std::string G_OP_TYPE_IM2SEQUENCE;
extern const std::string G_OP_TYPE_DROPOUT;
extern const std::string G_OP_TYPE_FUSION_CONV_RELU;
extern const std::string G_OP_TYPE_FUSION_CONV_BN_SCALE;
extern const std::string G_OP_TYPE_FUSION_CONV_BN_SCALE_RELU;
extern const std::string G_OP_TYPE_FUSION_POOL_BN;
extern const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
extern const std::string G_OP_TYPE_REGION;
extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key;
......
......@@ -84,7 +84,7 @@ struct Variant {
if (type_id == typeid(T).hash_code()) {
return *const_cast<T *>(reinterpret_cast<const T *>(&data));
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant ");
PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");
exit(0);
}
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include "fpga/api/fpga_api.h"
namespace paddle {
namespace mobile {
namespace fpga {
namespace api {
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
static inline int do_ioctl(int req, void *arg) { return ioctl(fd, req, arg); }
int open_device() {
if (fd == -1) {
fd = open(device_path, O_RDWR);
}
return fd;
}
// memory management;
void *fpga_malloc(size_t size) {
return reinterpret_cast<void *>(
    mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
}
void fpga_free(void *ptr) { munmap(ptr, 0); }
void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
} // namespace api
} // namespace fpga
} // namespace mobile
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstddef>
#include <iostream>
#include <limits>
// memory management;
namespace paddle {
namespace mobile {
namespace fpga {
namespace api {
int open_device();
int close_device();
void *fpga_malloc(size_t size);
void fpga_free(void *ptr);
void fpga_copy(void *dst, const void *src, size_t num);
struct CnnVersionArgs {
void *buf;
};
struct QuantArgs {
float scale;
};
struct BatchNormalizationArgs {
bool enable;
};
struct ScaleArgs {};
#define IOCTL_CNN_MAGIC 'CNN'
#define IOCTL_VERSION _IOW(IOCTL_CNN_MAGIC, 1, struct CnnVersionArgs)
#define IOCTL_GET_QUANT _IOW(IOCTL_CNN_MAGIC, 2, struct QuantArgs)
#define IOCTL_SET_QUANT _IOW(IOCTL_CNN_MAGIC, 3, struct QuantArgs)
} // namespace api
} // namespace fpga
} // namespace mobile
} // namespace paddle
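Note: a minimal usage sketch of the API declared above. The call sequence is illustrative only and assumes /dev/fpgadrv0 exists; error handling is elided:

#include "fpga/api/fpga_api.h"

int FpgaSmokeTest() {
  namespace api = paddle::mobile::fpga::api;
  if (api::open_device() < 0) {
    return -1;  // device node missing or no permission
  }
  float host[16] = {0.f};
  void *buf = api::fpga_malloc(sizeof(host));  // mmap64-backed buffer on the device fd
  api::fpga_copy(buf, host, sizeof(host));     // currently a plain memcpy
  api::fpga_free(buf);
  return 0;
}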
......@@ -28,6 +28,16 @@ vector<string> OperatorBase<Dtype>::GetOutKeys() const {
return it->second.second;
}
template <typename Dtype>
vector<string> OperatorBase<Dtype>::GetInputKeys() const {
auto it = op_input_output_key.find(type_);
if (it == op_input_output_key.end()) {
DLOG << type_ << " has no inputs";
return {};
}
return it->second.first;
}
template <typename Dtype>
OperatorBase<Dtype>::OperatorBase(const std::string &type,
const VariableNameMap &inputs,
......@@ -49,6 +59,11 @@ template <typename Dtype>
void OperatorBase<Dtype>::Run() const {
RunImpl();
#ifdef PADDLE_MOBILE_DEBUG
vector<string> input_keys = GetInputKeys();
for (const auto key : input_keys) {
Tensor *input = GetVarValue<framework::LoDTensor>(key, inputs_, *scope_);
DLOG << type_ << " input- " << key << "=" << *input;
}
vector<string> output_keys = GetOutKeys();
for (const auto key : output_keys) {
Tensor *out_ = GetVarValue<framework::LoDTensor>(key, outputs_, *scope_);
......
......@@ -61,6 +61,7 @@ class OperatorBase {
virtual ~OperatorBase() {}
void Run() const;
std::vector<string> GetOutKeys() const;
std::vector<string> GetInputKeys() const;
virtual void RunImpl() const = 0;
virtual void Init() = 0;
......@@ -118,6 +119,10 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
virtual void InferShape() const = 0;
void Init() {
// for (auto i : this->inputs_) {
// DLOG << i.first;
// DLOG << i.second;
// }
PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), " %s kernel init failed",
this->type_.c_str());
}
......@@ -146,7 +151,7 @@ class OpKernelBase {
}
#endif
virtual void Compute(const P &para) const = 0;
virtual bool Init(P *para) { return true; };
virtual bool Init(P *para) { return true; }
virtual ~OpKernelBase() = default;
private:
......
......@@ -42,8 +42,17 @@ class FusionOpRegister {
matchers_[matcher->Type()] = shared_matcher;
}
const std::map<std::string, std::shared_ptr<FusionOpMatcher>> Matchers() {
return matchers_;
const std::vector<std::shared_ptr<FusionOpMatcher>> Matchers() {
std::vector<std::shared_ptr<FusionOpMatcher>> matchers;
for (const auto& match : matchers_) {
matchers.push_back(match.second);
}
std::sort(matchers.begin(), matchers.end(),
[](std::shared_ptr<FusionOpMatcher> first,
std::shared_ptr<FusionOpMatcher> second) {
return first->BeginNode().Depth() > second->BeginNode().Depth();
});
return matchers;
}
private:
......
......@@ -44,23 +44,6 @@ bool Node::operator==(const Node &in) {
return true;
}
std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(int size) {
std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
OpDescs(size - 1, &op_descs);
return op_descs;
}
void Node::OpDescs(int index,
std::vector<std::shared_ptr<framework::OpDesc>> *op_desc) {
if (index == 0) {
return;
}
op_desc->push_back(this->op_desc_);
for (auto &output : outputs_) {
output->OpDescs(index, op_desc);
}
}
std::shared_ptr<Node> Node::To(int size) {
std::shared_ptr<Node> node = std::make_shared<Node>();
this->To(size - 1, node);
......
......@@ -47,13 +47,10 @@ class Node {
std::map<std::string, std::vector<std::pair<std::string, std::string>>>
change,
std::vector<std::shared_ptr<Node>> *removed_nodes);
std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(int size);
std::shared_ptr<framework::OpDesc> OpDescOfNode() { return op_desc_; }
std::string Type() { return type_; }
private:
void OpDescs(int size,
std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
void To(int index, std::shared_ptr<Node>);
void Folder(
std::shared_ptr<framework::OpDesc> op_desc,
......
......@@ -78,9 +78,8 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
}
for (auto &registed : FusionOpRegister::Instance()->Matchers()) {
std::string fusion_type = registed.first;
std::shared_ptr<FusionOpMatcher> matcher = registed.second;
// DLOG << " registed node \n " << matcher->BeginNode();
std::string fusion_type = registed->Type();
std::shared_ptr<FusionOpMatcher> matcher = registed;
auto match_vector = type_map[matcher->BeginType()];
......
......@@ -30,6 +30,7 @@ class Program {
std::string model_path;
std::string para_path;
bool combined = false;
bool quantification = false;
private:
};
......
......@@ -154,7 +154,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
tensor->Resize(framework::make_ddim(desc.Dims()));
void *memory = tensor;
void *memory = nullptr;
int type_size = 0;
switch (desc.DataType()) {
case framework::VARTYPE_TYPE_FP16:
......@@ -179,11 +179,25 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
default:
break;
}
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
if (program_.quantification) {
float min_value;
float max_value;
memcpy(&min_value, *data, sizeof(float));
memcpy(&max_value, *data + sizeof(float), sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = (uint8_t *)(*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
}
*data += (memory_size * sizeof(uint8_t));
} else {
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
}
(*data) += (sizeof(char) * memory_size * type_size);
}
(*data) += (sizeof(char) * memory_size * type_size);
}
template <typename Dtype, Precision P>
......
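Note: the quantification branch above restores 8-bit weights to float. Each quantized tensor is laid out as two floats (min, max) followed by memory_size uint8 values, and every value is mapped back as q * (max - min) / 255 + min. A standalone sketch of that step (function and names are illustrative):

#include <cstdint>
#include <cstring>
#include <vector>

std::vector<float> Dequantize(const uint8_t *blob, size_t memory_size) {
  float min_value, max_value;
  std::memcpy(&min_value, blob, sizeof(float));
  std::memcpy(&max_value, blob + sizeof(float), sizeof(float));
  const uint8_t *q = blob + 2 * sizeof(float);
  const float factor = (max_value - min_value) / 255.0f;
  std::vector<float> out(memory_size);
  for (size_t k = 0; k < memory_size; ++k) {
    out[k] = q[k] * factor + min_value;  // maps 0..255 back onto [min, max]
  }
  return out;
}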
......@@ -44,26 +44,29 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &dirname, bool optimize, bool can_add_split) {
auto program =
this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
const std::string &dirname, bool optimize, bool quantification,
bool can_add_split) {
auto program = this->LoadProgram(dirname + "/__model__", optimize,
quantification, can_add_split);
program.model_path = dirname;
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &model_path, const std::string &para_path,
bool optimize) {
const std::string &model_path, const std::string &para_path, bool optimize,
bool quantification) {
auto program = this->LoadProgram(model_path, optimize);
program.para_path = para_path;
program.combined = true;
program.quantification = quantification;
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
const std::string &model_path, bool optimize, bool can_add_split) {
const std::string &model_path, bool optimize, bool quantification,
bool can_add_split) {
std::string model_filename = model_path;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = NULL;
......@@ -82,6 +85,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
framework::Program<Dtype, P> program;
program.originProgram = originProgramDesc;
program.quantification = quantification;
auto scope = std::make_shared<framework::Scope>();
program.scope = scope;
......
......@@ -30,6 +30,7 @@ class Loader {
* */
const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
/*
......@@ -38,11 +39,13 @@ class Loader {
* */
const framework::Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false);
bool optimize = false,
bool quantification = false);
private:
const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
};
......
......@@ -26,7 +26,7 @@ void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
int batch_size) {
bool quantification, int batch_size) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
} else {
......@@ -35,7 +35,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
loader_->Load(dirname, optimize), batch_size, optimize);
loader_->Load(dirname, optimize, quantification), batch_size, optimize);
} else {
LOG(kLOG_INFO) << "executor inited";
}
......@@ -46,7 +46,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
const std::string &para_path, bool optimize,
int batch_size) {
bool quantification, int batch_size) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
} else {
......@@ -55,7 +55,8 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
loader_->Load(model_path, para_path, optimize), batch_size, optimize);
loader_->Load(model_path, para_path, optimize, quantification),
batch_size, optimize);
} else {
LOG(kLOG_INFO) << "executor inited";
}
......
......@@ -39,14 +39,18 @@ class PaddleMobile {
* @b Load a fluid model stored as separate model/parameter files
* */
bool Load(const std::string &dirname, bool optimize = false,
int batch_size = 1);
bool quantification = false, int batch_size = 1);
/*
* @b Load a combined-format fluid model (single model file plus parameter file)
* */
bool Load(const std::string &model_path, const std::string &para_path,
bool optimize = false, int batch_size = 1);
bool optimize = false, bool quantification = false,
int batch_size = 1);
/*
* @b Set the number of threads; takes effect when openmp is enabled in cmake
* */
void SetThreadNum(int num);
/*
......
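Note: a hypothetical call site for the extended Load() overloads above. The header path, template arguments (defaults assumed), and model paths are illustrative, not part of this change:

#include "io/paddle_mobile.h"  // assumed header path

void LoadQuantizedModel() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> engine;
  engine.SetThreadNum(4);  // takes effect when openmp is enabled in cmake
  // separate-file model: optimize = true, quantification = true, batch_size = 1
  engine.Load("./mobilenet", true, true, 1);
  // combined model file plus parameter file:
  // engine.Load("./model", "./params", true, true, 1);
}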
......@@ -16,10 +16,32 @@ limitations under the License. */
#include <cstdlib>
#include <cstring>
#ifdef PADDLE_MOBILE_FPGA
#include "fpga/api/fpga_api.h"
#endif
namespace paddle_mobile {
namespace memory {
const int MALLOC_ALIGN = 64;
#ifdef PADDLE_MOBILE_FPGA
namespace api = paddle::mobile::fpga::api;
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
void *Alloc(size_t size) { return api::fpga_malloc(size); }
void Free(void *ptr) {
if (ptr) {
api::fpga_free(ptr);
}
}
#else
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
......@@ -42,5 +64,7 @@ void Free(void *ptr) {
}
}
#endif
} // namespace memory
} // namespace paddle_mobile
......@@ -26,7 +26,7 @@ void BatchNormOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
this->param_.OutputY()->Resize(x_dims);
}
template class BatchNormOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -47,7 +47,7 @@ void BoxCoderOp<Dtype, T>::InferShape() const {
this->param_.OutputBox()->Resize(framework::make_ddim(
{input_targetbox_dims[0], input_priorbox_dims[0], 4}));
}
template class BoxCoderOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -56,7 +56,6 @@ void ConcatOp<Dtype, T>::InferShape() const {
this->param_.Out()->Resize(out_dims);
}
template class ConcatOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -48,8 +48,6 @@ void ConvOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class ConvOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -49,8 +49,6 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class DepthwiseConvOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -22,7 +22,7 @@ void DropoutOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class DropoutOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -24,7 +24,7 @@ void ElementwiseAddOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dim);
}
template class ElementwiseAddOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -14,10 +14,7 @@ limitations under the License. */
#include "feed_op.h"
namespace paddle_mobile {
namespace operators {
template class FeedOp<CPU, float>;
}
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
......
......@@ -14,10 +14,7 @@ limitations under the License. */
#include "fetch_op.h"
namespace paddle_mobile {
namespace operators {
template class FetchOp<CPU, float>;
}
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
......
......@@ -45,7 +45,6 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class FusionConvAddOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -36,8 +36,6 @@ class FusionConvAddMatcher : public framework::FusionOpMatcher {
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes);
}
......
......@@ -44,7 +44,7 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
template class FusionConvAddBNReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -39,8 +39,6 @@ class FusionConvAddBNReluMatcher : public framework::FusionOpMatcher {
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#include "operators/fusion_conv_bn_relu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvBNReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_bn_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionConvBNReluMatcher : public framework::FusionOpMatcher {
public:
FusionConvBNReluMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionConvBNReluOp : public framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam,
operators::ConvBNReluKernel<DeviceType, T>> {
public:
FusionConvBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam,
operators::ConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam,
operators::ConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar(
new FusionConvBNReluMatcher());
#define FUSION_CONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
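Note: the matcher above declares the conv > batch_norm > relu chain with operator> and FolderNodes() collapses the matched sub-graph into a single fusion op, remapping the extra batch-norm inputs. This commit also adds the G_OP_TYPE_FUSION_CONV_RELU string but no matcher for it; a hypothetical matcher following the same pattern would look like this (sketch only, not part of the change):

class FusionConvReluMatcher : public framework::FusionOpMatcher {
 public:
  FusionConvReluMatcher() {
    node_ = framework::Node(G_OP_TYPE_CONV);
    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
  }

  void FolderNodes(
      framework::Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
    // relu only consumes the conv output, so no extra inputs need remapping
    node->Folder(node_.Depth(), Type(), {}, removed_nodes);
  }

  std::string Type() { return G_OP_TYPE_FUSION_CONV_RELU; }
};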
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DWCONVBNRELU_OP
#include "operators/fusion_dwconv_bn_relu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DWCONVBNRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/dwconv_bn_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionDWConvBNReluMatcher : public framework::FusionOpMatcher {
public:
FusionDWConvBNReluMatcher() {
node_ = framework::Node(G_OP_TYPE_DEPTHWISE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DWCONV_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionDWConvBNReluOp : public framework::OperatorWithKernel<
DeviceType, FusionDWConvBNReluParam,
operators::DWConvBNReluKernel<DeviceType, T>> {
public:
FusionDWConvBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDWConvBNReluParam,
operators::DWConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionDWConvBNReluParam,
operators::DWConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_DWCONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar(
new FusionDWConvBNReluMatcher());
#define FUSION_DWCONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_DWCONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar(
new FusionDWConvBNReluMatcher());
#define FUSION_DWCONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_dwconv_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -50,7 +50,6 @@ void FusionFcOp<Dtype, T>::InferShape() const {
this->param_.Out()->Resize(ddim);
}
template class FusionFcOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -47,8 +47,6 @@ void Im2SequenceOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class Im2SequenceOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#include "operators/kernel/conv_add_bn_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_bn_relu_func.h"
#include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h"
namespace paddle_mobile {
namespace operators {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#include "operators/kernel/conv_bn_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_bn_relu_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvBNReluKernel<CPU, float>::Init(FusionConvBNReluParam *param) {
const Tensor *mean = param->InputMean();
const Tensor *variance = param->InputVariance();
const Tensor *scale = param->InputScale();
const Tensor *bias = param->InputBias();
const float epsilon = param->Epsilon();
// DLOG << "variance: " << *variance;
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
const int C = mean->numel();
float inv_std_ptr[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({C});
auto new_bias_ptr = new_bias->mutable_data<float>({C});
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
return true;
}
template <>
void ConvBNReluKernel<CPU, float>::Compute(
const FusionConvBNReluParam &param) const {
ConvBNReluCompute<float>(param);
}
template class ConvBNReluKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
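Note: Init() above folds the batch-norm parameters into a single per-channel affine transform, so Compute() only applies a scale and a bias after the convolution. With gamma = Scale, beta = Bias, mu = Mean and sigma^2 = Variance:

\[
\hat{\gamma}_c = \frac{\gamma_c}{\sqrt{\sigma_c^2 + \varepsilon}}, \qquad
\hat{\beta}_c = \beta_c - \mu_c\,\hat{\gamma}_c, \qquad
\mathrm{BN}(y)_c = \hat{\gamma}_c\, y_c + \hat{\beta}_c
\]

new_scale stores gamma-hat and new_bias stores beta-hat; the same folding is used by the DWConvBNRelu and ConvAddBNRelu kernels in this commit.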
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DWCONVBNRELU_OP
#include "operators/kernel/dwconv_bn_relu_kernel.h"
#include "operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DWConvBNReluKernel<CPU, float>::Init(FusionDWConvBNReluParam *param) {
const Tensor *mean = param->InputMean();
const Tensor *variance = param->InputVariance();
const Tensor *scale = param->InputScale();
const Tensor *bias = param->InputBias();
const float epsilon = param->Epsilon();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
const int C = mean->numel();
float inv_std_ptr[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({C});
auto new_bias_ptr = new_bias->mutable_data<float>({C});
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
return true;
}
template <>
void DWConvBNReluKernel<CPU, float>::Compute(
const FusionDWConvBNReluParam &param) const {
DWConvBNReluCompute<float>(param);
}
template class DWConvBNReluKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -54,7 +54,40 @@ void BatchnormCompute(const BatchNormParam &param) {
int HXW = H * W;
#ifdef ARMV7
#if __ARM_NEON
#if __aarch64__
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr = new_scale.mutable_data<float>(framework::make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
/// (x - est_mean) * inv_var * scale + bias is equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
}
}
}
}
delete[] inv_std_ptr;
#else
if (HXW > 32) {
int NXC = N * C;
float *inv_std_ptr = new float[NXC * 4];
......@@ -229,6 +262,7 @@ void BatchnormCompute(const BatchNormParam &param) {
delete[] inv_std_ptr;
}
#endif
#else
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#pragma once
#include <vector>
#include "operators/math/depthwise_conv_3x3.h"
#include "operators/op_param.h"
......@@ -23,14 +25,9 @@ namespace operators {
void ConvAddBNReluBasic(const FusionConvAddBNReluParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
Tensor new_bias = *param.NewBias();
Tensor new_scale = *param.NewScale();
int axis = param.Axis();
Tensor *output = param.Output();
math::expand_bias(bias, axis, output->dims());
output->ShareDataWith(bias);
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
......@@ -107,7 +104,7 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam &param) {
math::matmulWithBn<float>(
filter_slice, false, col_matrix, false, static_cast<float>(1),
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias);
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
}
}
}
......@@ -121,7 +118,7 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam &param) {
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), 1);
param.NewBias(), true);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#pragma once
#include <vector>
#include "operators/math/depthwise_conv_3x3.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
void ConvBNReluBasic(const FusionConvBNReluParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor new_bias = *param.NewBias();
Tensor new_scale = *param.NewScale();
Tensor *output = param.Output();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>(
filter_slice, false, col_matrix, false, static_cast<float>(1),
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
}
}
}
template <typename P>
void ConvBNReluCompute(const FusionConvBNReluParam &param) {
if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
// math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
// param.Output(), param.NewScale(),
// param.NewBias(), 1);
math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else {
ConvBNReluBasic(param);
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
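Note: ConvBNReluBasic above lowers the convolution to im2col (or vol2col) followed by a GEMM, with the folded scale/bias applied during the write-back (matmulWithBn). A minimal single-channel, stride-1, no-padding im2col sketch to illustrate the lowering (not the library's Im2ColFunctor):

#include <vector>

// input: H x W image (row-major); result: (kh * kw) x (oh * ow) column matrix
std::vector<float> Im2ColSketch(const std::vector<float> &in, int H, int W,
                                int kh, int kw) {
  const int oh = H - kh + 1, ow = W - kw + 1;
  std::vector<float> col(kh * kw * oh * ow);
  for (int r = 0; r < kh; ++r)
    for (int c = 0; c < kw; ++c)
      for (int y = 0; y < oh; ++y)
        for (int x = 0; x < ow; ++x)
          col[((r * kw + c) * oh + y) * ow + x] = in[(y + r) * W + (x + c)];
  return col;
}
// A 1 x (kh*kw) filter row multiplied with this matrix gives the 1 x (oh*ow) output map.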
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DWCONVBNRELU_OP
#pragma once
#include <vector>
#include "operators/math/depthwise_conv_3x3.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
void DWConvBNReluBasic(const FusionDWConvBNReluParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor new_bias = *param.NewBias();
Tensor new_scale = *param.NewScale();
Tensor *output = param.Output();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>(
filter_slice, false, col_matrix, false, static_cast<float>(1),
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
}
}
}
template <typename P>
void DWConvBNReluCompute(const FusionDWConvBNReluParam &param) {
if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
// math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
// param.Output(), param.NewScale(),
// param.NewBias(), 1);
math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else {
DWConvBNReluBasic(param);
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -76,15 +76,20 @@ void PoolCompute(const PoolParam &param) {
}
} else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
#ifndef IOS
#if __ARM_NEON
#if __aarch64__
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
#else
if (pooling_type == "max") {
math::Pool2x2Max(strides, paddings, in_x, out);
} else if (pooling_type == "avg") {
math::Pool2x2Avg(strides, paddings, in_x, out);
}
#endif
#else
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
#endif
#endif // __ARM_NEON
} else {
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
}
......
......@@ -68,6 +68,7 @@ void sigmoid(const Tensor *X, Tensor *Y) {
input_outer_ptr++;
}
}
#else
#endif
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVBNRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvBNReluKernel
: public OpKernelBase<DeviceType, FusionConvBNReluParam> {
public:
void Compute(const FusionConvBNReluParam &param) const;
bool Init(FusionConvBNReluParam *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_DWCONVBNRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class DWConvBNReluKernel
: public OpKernelBase<DeviceType, FusionDWConvBNReluParam> {
public:
void Compute(const FusionDWConvBNReluParam &param) const;
bool Init(FusionDWConvBNReluParam *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -24,7 +24,7 @@ void LrnOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dims);
}
template class LrnOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/depthwise_conv_3x3.h"
#ifdef __ARM_NEON
#if __ARM_NEON
#include <arm_neon.h>
#endif
#include <vector>
......@@ -23,7 +23,6 @@ namespace math {
void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
vector<int> paddings, const Tensor *filter, Tensor *bias,
Tensor *output, bool if_bias) {
#ifdef __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
......@@ -181,7 +180,27 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
}
} else {
#if defined(ARMV17)
#if __ARM_NEON
#if __aarch64__
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t v_filter1 = vld1q_f32(filter1);
const float32x4_t v_filter2 = vld1q_f32(filter2);
const float32x4_t v_filter3 = vld1q_f32(filter3);
float32x4_t mula = vmulq_f32(data1, v_filter1);
mula = vmlaq_f32(mula, data2, v_filter2);
mula = vmlaq_f32(mula, data3, v_filter3);
float32x2_t res = vpadd_f32(
vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
res = vpadd_f32(res, res);
if (if_bias) {
output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
} else {
output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
}
#else
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
......@@ -209,26 +228,10 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
[filter2] "r"(filter2), [filter3] "r"(filter3),
[output_ptr] "r"(output_ptr), [zero] "r"(zero)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
#endif // __aarch64__
#else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t v_filter1 = vld1q_f32(filter1);
const float32x4_t v_filter2 = vld1q_f32(filter2);
const float32x4_t v_filter3 = vld1q_f32(filter3);
float32x4_t mula = vmulq_f32(data1, v_filter1);
mula = vmlaq_f32(mula, data2, v_filter2);
mula = vmlaq_f32(mula, data3, v_filter3);
float32x2_t res = vpadd_f32(
vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
res = vpadd_f32(res, res);
if (if_bias) {
output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
} else {
output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
}
#endif
#endif // __ARM_NEON
}
}
}
......@@ -239,12 +242,11 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#endif
}
void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor *bias, bool if_bias) {
#ifdef __ARM_NEON
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......@@ -520,7 +522,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......@@ -824,7 +826,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON
#if __ARM_NEON
const int batch_size = input->dims()[0];
......@@ -1022,7 +1024,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor bias, bool if_bias) {
#ifdef __ARM_NEON
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......@@ -1225,7 +1227,7 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......
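In the __aarch64__ branch added above, the ARMv7 inline assembly is replaced by NEON intrinsics: three rows of input and filter taps are multiplied and accumulated into a single float32x4_t, lane 3 is zeroed (only three of the four lanes carry valid taps), and two pairwise adds collapse the vector to a scalar. A self-contained sketch of that horizontal reduction is shown below; the function name HorizontalSum3 is assumed for illustration only.

#if __ARM_NEON
#include <arm_neon.h>
// Hedged sketch: sum the first three lanes of a float32x4_t, the same
// reduction the depthwise 3x3 kernel performs after its multiply-accumulates.
static inline float HorizontalSum3(float32x4_t v) {
  v = vsetq_lane_f32(0.f, v, 3);  // lane 3 holds no valid data
  float32x2_t res = vpadd_f32(vget_high_f32(v), vget_low_f32(v));
  res = vpadd_f32(res, res);
  return vget_lane_f32(res, 0);
}
#endif  // __ARM_NEON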
(This diff is collapsed.)
......@@ -28,6 +28,7 @@ namespace paddle_mobile {
namespace operators {
namespace math {
/*
// Pack blocks of matrix A into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
......@@ -35,6 +36,7 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
// Pack blocks of matrix B into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
*/
// Pack blocks of matrix A into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
......@@ -51,7 +53,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
/*
// Vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
......@@ -60,6 +62,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias);
*/
// Compute a smaller block of matrix C
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
......@@ -81,6 +84,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias);
/*
// Write back vector-matrix multiplication results
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
......@@ -96,6 +100,7 @@ void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
*/
// 32-bit float matrix multiplication
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
......
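The packing declarations above copy panels of A and B into contiguous buffers so the inner GEMM kernel streams memory sequentially. The sketch below illustrates the general idea of row-major A packing with a 4-row micro-panel (suggested by AddDot4x4, but not confirmed as the library's panel size); it is a simplified illustration, not PackMatrixA_ itself, and it pads tail rows with zeros.

// Hedged sketch: pack A (RowMajor, row stride lda) into 4-row micro-panels.
// buffer must hold k * round_up(m, 4) floats; rows past m are zero-padded.
void PackA_RowMajor4(int m, int k, const float *A, int lda, float *buffer) {
  const int MR = 4;  // assumed micro-panel height
  for (int i = 0; i < m; i += MR) {
    for (int p = 0; p < k; ++p) {
      for (int r = 0; r < MR; ++r) {
        *buffer++ = (i + r < m) ? A[(i + r) * lda + p] : 0.f;
      }
    }
  }
}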
......@@ -15,7 +15,7 @@ limitations under the License. */
#include "operators/math/im2col.h"
#include <vector>
#ifdef __ARM_NEON
#include "arm_neon.h"
#include <arm_neon.h>
#endif
#include "common/types.h"
namespace paddle_mobile {
......@@ -69,7 +69,7 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
int channels_col = im_channels * filter_height * filter_width;
const T *im_data = im.data<T>();
T *col_data = col->data<T>();
#ifdef __ARM_NEON
#if __ARM_NEON
const int osize = col_height;
const int isize = im_height;
bool pad1 = padding[0] > 0;
......
......@@ -50,7 +50,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out, float beta,
bool relu, framework::Tensor *new_scale,
framework::Tensor *new_bias) {
framework::Tensor *new_bias, int group) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -71,7 +71,8 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
N, beta, matrix_out->data<float>(), N, relu,
new_scale->data<float>(), new_bias->data<float>());
new_scale->data<float>() + group,
new_bias->data<float>() + group);
}
} // namespace math
......
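The new group argument above offsets the fused batch-norm parameters: SgemmWithBn now receives new_scale + group and new_bias + group, so a grouped convolution (for example a depthwise convolution run as one small GEMM per group) applies the scale and bias belonging to that group instead of always starting at element 0. A hedged sketch of the idea with plain arrays, using an invented helper name, is:

#include <cstddef>
// Hedged sketch: apply per-group scale/bias to a grouped GEMM output laid
// out as one block of n values per group (ApplyBnPerGroup is illustrative).
void ApplyBnPerGroup(float *C, size_t groups, size_t n,
                     const float *new_scale, const float *new_bias) {
  for (size_t g = 0; g < groups; ++g) {
    float *Cg = C + g * n;  // output block of group g
    for (size_t j = 0; j < n; ++j) {
      Cg[j] = Cg[j] * new_scale[g] + new_bias[g];  // parameters offset by g
    }
  }
}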
......@@ -31,7 +31,8 @@ template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu,
framework::Tensor *new_scale, framework::Tensor *new_bias);
framework::Tensor *new_scale, framework::Tensor *new_bias,
int group);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "pool_2x2.h"
#include "operators/math/pool_2x2.h"
#include <algorithm>
#include <vector>
namespace paddle_mobile {
namespace operators {
......@@ -21,10 +23,10 @@ namespace math {
void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#ifdef __ARM_NEON
#ifdef ARMV7
#if __ARM_NEON
#if __aarch64__
#else
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
......@@ -93,15 +95,16 @@ void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
output_data += output_batch_stride;
}
#endif
#else
#endif
}
void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#ifdef __ARM_NEON
#if __ARM_NEON
#ifdef ARMV7
#if __aarch64__
#else
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
......@@ -171,12 +174,9 @@ void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#else
// TODO(): to imp other asm
#endif
#else
#endif
}
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <omp.h>
#endif
#include "framework/tensor.h"
#include "pool_3x3.h"
#include "operators/math/pool_3x3.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
......@@ -518,6 +518,8 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
input_data += input_batch_stride;
out_data += output_batch_stride;
}
#else
#endif
}
......@@ -582,7 +584,18 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
}
output_seg[ph * output_width + pw] = max_value;
} else {
#if defined(ARMV7)
#if __aarch64__
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos1 + input_width);
const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width);
const float32x4_t max_data =
vmaxq_f32(vmaxq_f32(data1, data2), data3);
float32x2_t res =
vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
vget_low_f32(max_data));
res = vpmax_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0);
#else
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t"
......@@ -598,17 +611,6 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
[pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
: "memory", "q1", "q2", "q3", "q4");
#else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos1 + input_width);
const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width);
const float32x4_t max_data =
vmaxq_f32(vmaxq_f32(data1, data2), data3);
float32x2_t res =
vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
vget_low_f32(max_data));
res = vpmax_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0);
#endif
}
}
......@@ -676,8 +678,8 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
}
output_seg[ph * output_width + pw] = sum / 9.0;
} else {
#if defined(ARMV7)
#if __aarch64__
#else
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t"
......@@ -696,7 +698,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
[output_ptr] "r"(output_ptr), [zero] "r"(zero),
[nine_ptr] "r"(nine_ptr)
: "memory", "r6", "q1", "q2", "q3", "q4");
#else
#endif
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
......@@ -707,7 +709,6 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
vget_low_f32(sum_data));
res = vpadd_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
#endif
}
}
}
......@@ -715,6 +716,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#else
#endif
}
} // namespace math
......
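The Pool3x3Max __aarch64__ branch mirrors the depthwise-conv change: vmaxq_f32 takes the elementwise maximum of three rows, lane 3 is forced to -INT_MAX so it cannot win, and two pairwise maxima reduce the vector to a scalar. A standalone sketch of that 3-lane horizontal max follows; the name HorizontalMax3 is assumed for illustration.

#if __ARM_NEON
#include <arm_neon.h>
#include <climits>
// Hedged sketch: maximum of the first three lanes of a float32x4_t,
// matching the reduction in the 3x3 max-pool intrinsics path.
static inline float HorizontalMax3(float32x4_t v) {
  v = vsetq_lane_f32(static_cast<float>(-INT_MAX), v, 3);  // lane 3 must not win
  float32x2_t res = vpmax_f32(vget_high_f32(v), vget_low_f32(v));
  res = vpmax_f32(res, res);
  return vget_lane_f32(res, 0);
}
#endif  // __ARM_NEON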
......@@ -135,6 +135,7 @@ class SoftmaxFuntor<CPU, T> {
}
}
}
#else
#endif // ARM_NEON
public:
......
......@@ -50,7 +50,7 @@ void MulOp<Dtype, T>::InferShape() const {
framework::DDim ddim = framework::make_ddim(output_dims);
this->param_.Out()->Resize(ddim);
}
template class MulOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -34,7 +34,7 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const {
// pre size, will change in Compute.
this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6}));
}
template class MultiClassNMSOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -371,7 +371,7 @@ class BatchNormParam : OpParam {
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
is_test_ = GetAttr<bool>("is_test", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *InputX() const { return input_x_; }
......@@ -1059,6 +1059,165 @@ class FusionConvAddBNReluParam : public OpParam {
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
#endif
#ifdef FUSION_DWCONVBNRELU_OP
class FusionDWConvBNReluParam : public OpParam {
public:
FusionDWConvBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
filter_ = FilterFrom<LoDTensor>(inputs, scope);
input_ = InputFrom<LoDTensor>(inputs, scope);
output_ = OutFrom<LoDTensor>(outputs, scope);
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
dilations_ = GetAttr<vector<int>>("dilations", attrs);
groups = GetAttr<int>("groups", attrs);
input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *Input() const { return input_; }
const Tensor *Filter() const { return filter_; }
Tensor *Output() const { return output_; }
const vector<int> &Strides() const { return strides_; }
const vector<int> &Paddings() const { return paddings_; }
const vector<int> &Dilations() const { return dilations_; }
const int &Groups() const { return groups; }
const Tensor *InputBias() const { return input_bias_; }
const Tensor *InputMean() const { return input_mean_; }
const Tensor *InputScale() const { return input_scale_; }
const Tensor *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; }
void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; }
const Tensor *NewScale() const { return new_scale_; }
const Tensor *NewBias() const { return new_bias_; }
protected:
Tensor *input_;
Tensor *output_;
Tensor *filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
int groups;
Tensor *input_bias_;
Tensor *input_mean_;
Tensor *input_scale_;
Tensor *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
Tensor *new_bias_;
Tensor *new_scale_;
};
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
#endif
#ifdef FUSION_CONVBNRELU_OP
class FusionConvBNReluParam : public OpParam {
public:
FusionConvBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
filter_ = FilterFrom<LoDTensor>(inputs, scope);
input_ = InputFrom<LoDTensor>(inputs, scope);
output_ = OutFrom<LoDTensor>(outputs, scope);
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
dilations_ = GetAttr<vector<int>>("dilations", attrs);
groups = GetAttr<int>("groups", attrs);
input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *Input() const { return input_; }
const Tensor *Filter() const { return filter_; }
Tensor *Output() const { return output_; }
const vector<int> &Strides() const { return strides_; }
const vector<int> &Paddings() const { return paddings_; }
const vector<int> &Dilations() const { return dilations_; }
const int &Groups() const { return groups; }
const Tensor *InputBias() const { return input_bias_; }
const Tensor *InputMean() const { return input_mean_; }
const Tensor *InputScale() const { return input_scale_; }
const Tensor *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; }
void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; }
const Tensor *NewScale() const { return new_scale_; }
const Tensor *NewBias() const { return new_bias_; }
protected:
Tensor *input_;
Tensor *output_;
Tensor *filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
int groups;
Tensor *input_bias_;
Tensor *input_mean_;
Tensor *input_scale_;
Tensor *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
Tensor *new_bias_;
Tensor *new_scale_;
};
#endif
#ifdef IM2SEQUENCE_OP
class Im2SequenceParam : public OpParam {
public:
......
......@@ -54,7 +54,7 @@ void PoolOp<DeviceType, T>::InferShape() const {
}
this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
template class PoolOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -23,7 +23,7 @@ void PReluOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class PReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -44,7 +44,7 @@ void PriorBoxOp<Dtype, T>::InferShape() const {
this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec));
this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec));
}
template class PriorBoxOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -23,7 +23,7 @@ void ReluOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class ReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -27,7 +27,7 @@ void ReshapeOp<Dtype, T>::InferShape() const {
auto out_dims = ValidateShape(shape, input_x_dims);
this->param_.Out()->Resize(out_dims);
}
template class ReshapeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -24,7 +24,7 @@ void ResizeOp<Dtype, T>::InferShape() const {
auto out_dims = CalOutputShape(this->param_);
this->param_.Out()->Resize(out_dims);
}
template class ResizeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -24,7 +24,7 @@ void ScaleOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class ScaleOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -22,7 +22,7 @@ template <typename DeviceType, typename T>
void SigmoidOp<DeviceType, T>::InferShape() const {
this->param_.Out()->Resize(this->param_.InputX()->dims());
}
template class SigmoidOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -23,7 +23,7 @@ template <typename Dtype, typename T>
void SliceOp<Dtype, T>::InferShape() const {
/// todo: add InputShape() detection.
}
template class SliceOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -22,7 +22,7 @@ template <typename DeviceType, typename T>
void SoftmaxOp<DeviceType, T>::InferShape() const {
this->param_.Out()->Resize(this->param_.InputX()->dims());
}
template class SoftmaxOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -47,7 +47,7 @@ void TransposeOp<Dtype, T>::InferShape() const {
}
this->param_.Out()->Resize(out_dims);
}
template class TransposeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -19,7 +19,9 @@ int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto program = loader.Load(g_googlenet, true);
// auto program = loader.Load(g_googlenet, true);
auto program = loader.Load(g_mobilenet_ssd, true);
// auto program = loader.Load(g_googlenet_combine + "/model",
// g_googlenet_combine +
// "/params", true);
......
......@@ -23,7 +23,7 @@ int main() {
auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) {
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
DLOG << "load cost: " << time_diff(time1, time1) << "ms";
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
......
......@@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
if (paddle_mobile.Load(g_mobilenet_ssd, true)) {
auto isok = paddle_mobile.Load(g_mobilenet_ssd_gesture + "/model",
g_mobilenet_ssd_gesture + "/params", true);
// auto isok = paddle_mobile.Load(g_mobilenet_ssd, false);
if (isok) {
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
std::vector<int64_t> dims{1, 3, 300, 300};
Tensor input_tensor;
......@@ -33,7 +37,8 @@ int main() {
auto time3 = time();
paddle_mobile.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
}
return 0;
}
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
......@@ -22,7 +22,7 @@ int main() {
auto time1 = time();
if (paddle_mobile.Load(g_mobilenet, true)) {
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<int64_t> dims{1, 3, 224, 224};
Tensor input_tensor;
......@@ -35,7 +35,8 @@ int main() {
auto vec_result = paddle_mobile.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
}
return 0;
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#include <fstream>
#include <random>
#include <string>
#include <vector>
#include "common/common.h"
#include "common/log.h"
......@@ -23,6 +25,8 @@ limitations under the License. */
#include "framework/tensor.h"
static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
static const std::string g_mobilenet_ssd_gesture =
"../models/mobilenet+ssd_gesture";
static const std::string g_squeezenet = "../models/squeezenet";
static const std::string g_googlenet = "../models/googlenet";
static const std::string g_mobilenet = "../models/mobilenet";
......@@ -62,9 +66,9 @@ void GetInput(const std::string &input_name, std::vector<T> *input,
size *= dim;
}
T *input_ptr = (T *)malloc(sizeof(T) * size);
T *input_ptr = reinterpret_cast<T *>(malloc(sizeof(T) * size));
std::ifstream in(input_name, std::ios::in | std::ios::binary);
in.read((char *)(input_ptr), size * sizeof(T));
in.read(reinterpret_cast<char *>(input_ptr), size * sizeof(T));
in.close();
for (int i = 0; i < size; ++i) {
input->push_back(input_ptr[i]);
......@@ -79,6 +83,6 @@ void GetInput(const std::string &input_name,
T *input_ptr = input->mutable_data<T>(dims);
std::ifstream in(input_name, std::ios::in | std::ios::binary);
in.read((char *)(input_ptr), input->numel() * sizeof(T));
in.read(reinterpret_cast<char *>(input_ptr), input->numel() * sizeof(T));
in.close();
}
......@@ -38,7 +38,8 @@ build_for_android() {
fi
if [ -z "$PLATFORM" ]; then
PLATFORM="arm-v7a" # Users could choose "arm-v8a" or other platforms from the command line.
PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform.
# PLATFORM="arm-v8a"
fi
if [ "${PLATFORM}" = "arm-v7a" ]; then
......@@ -92,23 +93,28 @@ build_for_ios() {
# rm -rf "../build"
PLATFORM="ios"
MODE="Release"
BUILD_DIR=../build/release/"${PLATFORM}"
# IOS_ARCH="armv7"
# IOS_ARCH="armv7s"
IOS_ARCH="arm64" # Users could choose "armv7" or "armv7s" platforms.
BUILD_DIR=../build/release/"${PLATFORM}"/"${IOS_ARCH}"
TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
mkdir -p "${BUILD_DIR}"
if [ $# -eq 1 ]; then
cmake .. \
-B"${BUILD_DIR}" \
-DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DIOS_PLATFORM=OS \
-DIOS_ARCH="${IOS_ARCH}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DNET=$1 \
-DIS_IOS="true"
else
cmake .. \
-B"${BUILD_DIR}" \
-DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DIOS_PLATFORM=OS \
-DIOS_ARCH="${IOS_ARCH}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DIS_IOS="true"
fi
cd "${BUILD_DIR}"
......
......@@ -159,7 +159,6 @@ set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS su
# set the architecture for iOS
if (${IOS_PLATFORM} STREQUAL "OS")
set (IOS_ARCH armv7 armv7s arm64)
elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR")
set (IOS_ARCH i386)
elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64")
......
......@@ -42,6 +42,16 @@ elseif (NET STREQUAL "resnet")
set(MUL_OP ON)
set(POOL_OP ON)
set(RELU_OP ON)
elseif (NET STREQUAL "FPGAnets")
set(FUSION_CONVRELU_OP ON)
set(FUSION_CONVBNSCALE_OP ON)
set(FUSION_CONVBNSCALERELU_OP ON)
set(FUSION_POOLBN_OP ON)
set(FUSION_ELEMENTWISEADDRELU_OP ON)
set(REGION_OP ON)
set(POOL_OP ON)
set(CONCAT_OP ON)
set(SOFTMAX_OP ON)
else ()
set(BATCHNORM_OP ON)
set(BOXCODER_OP ON)
......@@ -64,6 +74,8 @@ else ()
set(TRANSPOSE_OP ON)
set(FUSION_CONVADD_RELU_OP ON)
set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_DWCONVBNRELU_OP ON)
set(FUSION_CONVBNRELU_OP ON)
set(PRELU_OP ON)
set(RESIZE_OP ON)
set(SCALE_OP ON)
......@@ -155,6 +167,14 @@ endif()
if (FUSION_CONVADDBNRELU_OP)
add_definitions(-DFUSION_CONVADDBNRELU_OP)
endif()
if (FUSION_DWCONVBNRELU_OP)
add_definitions(-DFUSION_DWCONVBNRELU_OP)
endif()
if (FUSION_CONVBNRELU_OP)
add_definitions(-DFUSION_CONVBNRELU_OP)
endif()
if (PRELU_OP)
add_definitions(-DPRELU_OP)
endif()
......@@ -173,3 +193,23 @@ endif()
if (IM2SEQUENCE_OP)
add_definitions(-DIM2SEQUENCE_OP)
endif()
if (FUSION_CONVRELU_OP)
add_definitions(-DFUSION_CONVRELU_OP)
endif()
if (FUSION_CONVBNSCALE_OP)
add_definitions(-DFUSION_CONVBNSCALE_OP)
endif()
if (FUSION_CONVBNSCALERELU_OP)
add_definitions(-DFUSION_CONVBNSCALERELU_OP)
endif()
if (FUSION_POOLBN_OP)
add_definitions(-DFUSION_POOLBN_OP)
endif()
if (FUSION_ELEMENTWISEADDRELU_OP)
add_definitions(-DFUSION_ELEMENTWISEADDRELU_OP)
endif()
if (REGION_OP)
add_definitions(-DREGION_OP)
endif()
set(dir ${CMAKE_CURRENT_SOURCE_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build")
ADD_EXECUTABLE(convert convert.cpp)
target_link_libraries(convert paddle-mobile)
\ No newline at end of file
#include "io/paddle_mobile.h"
#include <cstdlib>
using std::string;
static const std::string g_googlenet_combine = "../models/googlenet_combine";
static const std::string g_googlenet = "../models/googlenet";
using paddle_mobile::Executor;
using paddle_mobile::framework::Program;
char *Get_binary_data(std::string filename) {
FILE *file = fopen(filename.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
filename.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
char *data = new char[size];
size_t bytes_read = fread(data, 1, size, file);
PADDLE_MOBILE_ENFORCE(bytes_read == size,
"read binary file bytes do not match with fseek");
DLOG << "Get_binary_data end";
fclose(file);
return data;
}
void LoadWithDump(const paddle_mobile::framework::VarDesc var_desc,
paddle_mobile::framework::LoDTensor *tensor, char **data, FILE *out_file) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(*data);
// write version
fwrite(&version, sizeof(uint32_t), 1, out_file);
(*data) += sizeof(uint32_t);
// 2 Lod information
uint64_t *lod_level_ptr = new uint64_t();
memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
uint64_t lod_level = 0;
// write lod Information
fwrite(&lod_level, sizeof(uint64_t), 1, out_file);
delete lod_level_ptr;
(*data) += sizeof(uint64_t);
auto &lod = *tensor->mutable_lod();
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(*data);
// write lod size
fwrite(&size, sizeof(uint64_t), 1, out_file);
(*data) += sizeof(uint64_t);
std::vector<size_t> tmp(size / sizeof(size_t));
for (int k = 0; k < tmp.size(); ++k) {
tmp[k] = *reinterpret_cast<size_t *>(*data);
(*data) += sizeof(size_t);
}
// write lod size vector
fwrite(tmp.data(), sizeof(size_t), tmp.size(), out_file);
lod[i] = tmp;
}
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
// write tensor version
fwrite(&tensor_version, sizeof(uint32_t), 1, out_file);
(*data) += sizeof(uint32_t);
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(*data);
// write tensor desc
fwrite(&size, sizeof(int32_t), 1, out_file);
(*data) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (*data)[m];
}
fwrite(buf.get(), sizeof(char), size, out_file);
(*data) += (sizeof(char) * size);
const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
tensor->Resize(paddle_mobile::framework::make_ddim(desc.Dims()));
void *memory = tensor;
int type_size = 0;
switch (desc.DataType()) {
case paddle_mobile::framework::VARTYPE_TYPE_FP16:
type_size = 2;
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP32:
type_size = 4;
memory = tensor->mutable_data<float>();
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT32:
type_size = 4;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
type_size = 1;
break;
default:
break;
}
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
}
(*data) += (sizeof(char) * memory_size * type_size);
// for float 32
float min_value = std::numeric_limits<float>::max();
float max_value = std::numeric_limits<float>::lowest();  // lowest(), not min(): values may be negative
for (int k = 0; k < memory_size; ++k) {
min_value = std::min(min_value, static_cast<float *>(memory)[k]);
max_value = std::max(max_value, static_cast<float *>(memory)[k]);
}
fwrite(&min_value, sizeof(float), 1, out_file);
fwrite(&max_value, sizeof(float), 1, out_file);
for (int g = 0; g < memory_size; ++g) {
float value = static_cast<float *>(memory)[g];
uint8_t factor = static_cast<uint8_t>(std::round((value - min_value) / (max_value - min_value) * 255));
fwrite(&factor, sizeof(uint8_t), 1, out_file);
}
}
void quantificate_combined(std::string model_path, std::string param_path, std::string param_min_path) {
paddle_mobile::Loader<paddle_mobile::CPU, paddle_mobile::Precision::FP32> loader;
bool optimize = true;
auto program = loader.Load(model_path, param_path, optimize);
char *origin_data = Get_binary_data(program.para_path);
char *data = origin_data;
FILE *out_file = fopen(param_min_path.c_str(), "wb");
for (const auto &block : program.originProgram->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
auto tensor = var->template GetMutable<paddle_mobile::framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadWithDump(*var_desc, tensor, &data, out_file);
}
}
}
fclose(out_file);
delete[] origin_data;  // allocated with new char[] in Get_binary_data
}
void quantificate_seperated(std::string model_dir, std::string param_min_path) {
paddle_mobile::Loader<paddle_mobile::CPU, paddle_mobile::Precision::FP32> loader;
bool optimize = true;
auto program = loader.Load(model_dir, optimize);
std::string shell_command = "mkdir " + param_min_path;
system(shell_command.c_str());
for (const auto &block : program.originProgram->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
auto tensor = var->template GetMutable<paddle_mobile::framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
std::string file_name = param_min_path + "/" + var_desc->Name();
FILE *out_file = fopen(file_name.c_str(), "wb");
char *origin_data =
Get_binary_data(program.model_path + "/" + var_desc->Name());
char *data = origin_data;
LoadWithDump(*var_desc, tensor, &data, out_file);
delete[] origin_data;  // allocated with new char[] in Get_binary_data
fclose(out_file);
}
}
}
}
int main() {
std::string filename = "params_min";
std::string model_path = g_googlenet_combine + "/model";
std::string param_path = g_googlenet_combine + "/params";
std::string dirname = "param_min_dir";
std::string model_dir = g_googlenet;
// quantificate_combined(model_path, param_path,filename);
quantificate_seperated(model_dir, dirname);
return 0;
}
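For reference, the dump written above stores each float tensor as a per-tensor [min, max] pair followed by one uint8 per element, with factor = round((value - min) / (max - min) * 255). Recovering an approximate float is the inverse mapping; the sketch below shows that dequantization under the assumption that min, max and the uint8 payload are read back in the same order (DequantizeTensor is an illustrative name, not part of the tool).

#include <cstdint>
#include <vector>
// Hedged sketch: invert the 8-bit min/max quantization written by LoadWithDump.
// value ~= min + factor / 255 * (max - min)
std::vector<float> DequantizeTensor(const std::vector<uint8_t> &factors,
                                    float min_value, float max_value) {
  std::vector<float> values;
  values.reserve(factors.size());
  const float range = max_value - min_value;
  for (uint8_t f : factors) {
    values.push_back(min_value + static_cast<float>(f) / 255.f * range);
  }
  return values;
}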