提交 abc1eaf3 编写于 作者: E eclipsess

conflict

...@@ -29,7 +29,10 @@ if(DEBUGING) ...@@ -29,7 +29,10 @@ if(DEBUGING)
message(STATUS "debugging mode") message(STATUS "debugging mode")
add_definitions(-DPADDLE_MOBILE_DEBUG) add_definitions(-DPADDLE_MOBILE_DEBUG)
else() else()
if(FPGA)
else()
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif()
endif() endif()
if(USE_EXCEPTION) if(USE_EXCEPTION)
...@@ -93,8 +96,7 @@ else() ...@@ -93,8 +96,7 @@ else()
endif() endif()
if(FPGA) if(FPGA)
set(DEBUGING ON) message("FPGA mode enabled")
add_definitions(-DPADDLE_MOBILE_DEBUG)
add_definitions(-DPADDLE_MOBILE_FPGA) add_definitions(-DPADDLE_MOBILE_FPGA)
else() else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc) file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
...@@ -177,6 +179,10 @@ if(DEBUGING) ...@@ -177,6 +179,10 @@ if(DEBUGING)
else() else()
add_subdirectory(test) add_subdirectory(test)
endif() endif()
elseif(FPGA)
add_subdirectory(test)
endif() endif()
...@@ -83,7 +83,7 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 ...@@ -83,7 +83,7 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平
- **FPGA** - **FPGA**
FPGA实现正在进行中,是基于Xilinx的ZU5目标开发板。 目前已经支持 ZCU102 开发板。
- **灵活性** - **灵活性**
...@@ -112,6 +112,7 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 ...@@ -112,6 +112,7 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平
开发文档主要是关于编译、运行等问题。做为开发者,它可以和贡献文档共同结合使用。 开发文档主要是关于编译、运行等问题。做为开发者,它可以和贡献文档共同结合使用。
* [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md) * [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md) * [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
* [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md)
### 贡献文档 ### 贡献文档
- [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md) - [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
......
# FPGA开发文档
FPGA平台的代码在Xilinx ZCU102 revision 1.0开发板测试Resnet50成功,预测结果正确。
## 准备硬件
___
1. 购买Xilinx ZCU102 revision 1.0 开发板
2. 下载Xilinx ZCU102 Ubuntu[镜像文件](https://www.xilinx.com/member/forms/download/xef.html?filename=Ubuntu_Desktop_Release_2018_1.zip),并烧录进SD卡。
   * Windows系统可使用Win32DiskImager
   * Linux系统使用dd命令:dd if=name.img of=/dev/sdb
3. 将SD卡插入电脑,替换分区1中已有的BOOT.BIN、image.ub为[BOOT.BIN、image.ub](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)
4. 将SD卡插入ZCU102开发板,设置开发板拨码开关为SD卡启动,上电启动Linux系统。
5. 装载驱动:sudo insmod [fpgadrv.ko](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)
## 编译工程
___
1. 将最新的paddle mobile 代码复制到ZCU102开发板中。
2. 进入paddle-mobile根目录, CMakeLists.txt 设置平台为 option(FPGA "fpga support" ON)。CPU和MALI\_GPU选项设置为OFF。
3. 执行以下命令,可在./test/build下生成test-resnet50可执行程序。
* mkdir build
* cd build
* cmake ..
* make
## 准备模型和数据
___
1. 模型文件放在./test/models/resnet50中。将[\_\_model\_\_](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)文件复制到此文件夹下。
2. 另外下载模型[权重文件](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar),解压后也放在./test/models/resnet50 中。
3. 将数据文件[image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)复制到/test/images下。此数据文件对应着标准数据集中的ILSVRC2012_val_00000885.JPEG,分类标签为80, 对应着"black grouse".
## 运行程序
___
1. 进入./test/build目录。
2. sudo ./test-resnet50
3. 如果DEBUG选项打开,屏幕会输出很多中间打印信息,最终打印出预测分类结果为80。
...@@ -22,6 +22,7 @@ const char *G_OP_TYPE_BATCHNORM = "batch_norm"; ...@@ -22,6 +22,7 @@ const char *G_OP_TYPE_BATCHNORM = "batch_norm";
const char *G_OP_TYPE_BOX_CODER = "box_coder"; const char *G_OP_TYPE_BOX_CODER = "box_coder";
const char *G_OP_TYPE_CONCAT = "concat"; const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu"; const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu"; const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu"; const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
...@@ -34,6 +35,7 @@ const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add"; ...@@ -34,6 +35,7 @@ const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn"; const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul"; const char *G_OP_TYPE_MUL = "mul";
const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms"; const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
const char *G_OP_TYPE_POLYGON_BOX_TRANSFORM = "polygon_box_transform";
const char *G_OP_TYPE_POOL2D = "pool2d"; const char *G_OP_TYPE_POOL2D = "pool2d";
const char *G_OP_TYPE_PRIOR_BOX = "prior_box"; const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
const char *G_OP_TYPE_RELU = "relu"; const char *G_OP_TYPE_RELU = "relu";
...@@ -94,9 +96,11 @@ std::unordered_map< ...@@ -94,9 +96,11 @@ std::unordered_map<
{G_OP_TYPE_FUSION_CONV_BN_ADD_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_BN_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}}, {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
......
...@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cstdlib> #pragma once
#include <cstdlib>
#include <cstring>
#include <string>
#include "common/enforce.h" #include "common/enforce.h"
#include "common/log.h" #include "common/log.h"
#pragma once
namespace paddle_mobile { namespace paddle_mobile {
template <int ID, typename Type> template <int ID, typename Type>
struct IDToType { struct IDToType {
typedef Type type_t; typedef Type type_t;
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "fpga/filter.h" #include "fpga/filter.h"
#include "fpga/image.h" #include "fpga/image.h"
#define FPGA_TEST_MODE #define FPGA_TEST_MODE
//#define PADDLE_MOBILE_OS_LINUX #define PADDLE_MOBILE_OS_LINUX
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
...@@ -149,7 +149,7 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -149,7 +149,7 @@ int ComputeBasicConv(const struct ConvArgs &args) {
return do_ioctl(IOCTL_CONFIG_CONV, &args); return do_ioctl(IOCTL_CONFIG_CONV, &args);
} }
int ComputeFpgaConv(const struct WrapperConvArgs &args) { int ComputeFpgaConv(const struct SplitConvArgs &args) {
#ifdef FPGA_TEST_MODE #ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFPGAConv==========="; DLOG << "=============ComputeFPGAConv===========";
DLOG << " filter_num:" << args.filter_num DLOG << " filter_num:" << args.filter_num
...@@ -194,8 +194,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -194,8 +194,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_TEST_MODE #ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaEWAdd==========="; DLOG << "=============ComputeFpgaEWAdd===========";
DLOG << " relu_enabled:" << args.relu_enabled DLOG << " relu_enabled:" << args.relu_enabled
<< " const0:" << fp16_2_fp32(short(args.const0)) << " const0:" << fp16_2_fp32(int16_t(args.const0))
<< " const1:" << fp16_2_fp32(short(args.const1)); << " const1:" << fp16_2_fp32(int16_t(args.const1));
DLOG << " image0_address:" << args.image0.address DLOG << " image0_address:" << args.image0.address
<< " image0_scale_address:" << args.image0.scale_address << " image0_scale_address:" << args.image0.scale_address
<< " image0_channels:" << args.image0.channels << " image0_channels:" << args.image0.channels
...@@ -383,10 +383,10 @@ void format_concat_output(framework::Tensor *out, int height, int width, ...@@ -383,10 +383,10 @@ void format_concat_output(framework::Tensor *out, int height, int width,
out->reset_data_ptr(data_ptr); out->reset_data_ptr(data_ptr);
} }
void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter, framework::Tensor *out, framework::Tensor *filter,
bool relu_enabled, int group_num, int stride_h, int stride_w, bool relu_enabled, int group_num, int stride_h,
int padding_h, int padding_w, float *bs_ptr) { int stride_w, int padding_h, int padding_w, float *bs_ptr) {
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>(); auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>(); auto out_ptr = out->data<float>();
......
...@@ -89,7 +89,7 @@ struct ConcatArgs { ...@@ -89,7 +89,7 @@ struct ConcatArgs {
uint32_t width; uint32_t width;
}; };
struct WrapperConvArgs { struct SplitConvArgs {
uint32_t split_num; uint32_t split_num;
uint32_t group_num; uint32_t group_num;
uint32_t filter_num; uint32_t filter_num;
...@@ -98,6 +98,14 @@ struct WrapperConvArgs { ...@@ -98,6 +98,14 @@ struct WrapperConvArgs {
struct ConcatArgs concat_arg; struct ConcatArgs concat_arg;
}; };
// Arguments for a grouped convolution on the FPGA: each group is executed as
// one SplitConvArgs and the per-group outputs are joined via concat_arg.
// NOTE(review): conv_args looks like a caller-managed array of group_num
// entries — ownership/lifetime is not documented here; confirm at call sites.
struct GroupConvArgs {
  uint32_t group_num;             // number of convolution groups
  uint32_t filter_num;            // total number of filters across groups
  struct ImageOutputArgs output;  // final (concatenated) output image
  struct SplitConvArgs* conv_args;  // per-group convolution arguments
  struct ConcatArgs concat_arg;   // how to concatenate group outputs
};
struct PoolingArgs { struct PoolingArgs {
int16_t mode; // mode: 0:max, 1:avg int16_t mode; // mode: 0:max, 1:avg
half kernel_reciprocal; half kernel_reciprocal;
...@@ -159,30 +167,6 @@ struct MemoryCacheArgs { ...@@ -159,30 +167,6 @@ struct MemoryCacheArgs {
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs) #define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs) #define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
// Error/status codes shared with the FPGA kernel driver; negative values
// are returned by the driver's ioctl interface to signal failures.
enum FPGA_ERR_TYPE {
  ERR_IOCTL_CMD = -1,
  ERR_TIMEOUT = -2,
  ERR_COMPLETION_TIMEOUT = -3,
  ERR_INVALID_FPGA_ADDR = -4,
  ERR_NOMEM = -5,
  ERR_NO_RESERVE_MEM = -6,
  ERR_COPY_FROM_USER = -7,
  ERR_COPY_TO_USER = -8,
  ERR_DEL_TIMER = -9,
  ERR_ENABLE_MSI = -10,
  ERR_REGISTER_IRQ = -11,
  ERR_PCIE_REGISTER = -12,
  ERR_PCIE_PROBE = -13,
  ERR_REGISTER_BLOCK = -14,
  ERR_ALLOC_GENDISK = -15,
  ERR_INIT_QUEUE = -16,
  ERR_WAIT = -17,
  ERR_ECC_ERROR = -31,
  ERR_FPGA_FAIL_STOP = -64,
  ERR_FPGA_DEBUG_STOP = -113,
  DEV_TMP_UNAVAILABLE = -128  // device temporarily unavailable (not an error)
};
//============================== API ============================= //============================== API =============================
int open_device(); int open_device();
...@@ -195,7 +179,7 @@ int fpga_flush(void* address, size_t size); ...@@ -195,7 +179,7 @@ int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size); int fpga_invalidate(void* address, size_t size);
int PerformBypass(const struct BypassArgs& args); int PerformBypass(const struct BypassArgs& args);
int ComputeFpgaConv(const struct WrapperConvArgs& args); int ComputeFpgaConv(const struct SplitConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args); int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args); int ComputeFpgaEWAdd(const struct EWAddArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args); int ComputeFPGAConcat(const struct ConcatArgs& args);
...@@ -220,10 +204,10 @@ void format_bias_scale_array(float** bias_scale_array, ...@@ -220,10 +204,10 @@ void format_bias_scale_array(float** bias_scale_array,
void format_concat_output(framework::Tensor* out, int height, int width, void format_concat_output(framework::Tensor* out, int height, int width,
int image_num, uint32_t* channel_num); int image_num, uint32_t* channel_num);
void fill_conv_arg(struct WrapperConvArgs* arg, framework::Tensor* input, void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter, framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h, int stride_w, bool relu_enabled, int group_num, int stride_h,
int padding_h, int padding_w, float* bs_ptr); int stride_w, int padding_h, int padding_w, float* bs_ptr);
half fp32_2_fp16(float fp32_num); half fp32_2_fp16(float fp32_num);
float fp16_2_fp32(half fp16_num); float fp16_2_fp32(half fp16_num);
......
...@@ -27,9 +27,6 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { ...@@ -27,9 +27,6 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment = int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
if (num_per_div_before_alignment == num_per_div_after_alignment) {
return;
}
int num_element = int num_element =
2 * div_num * num_per_div_after_alignment; // including bias & scale 2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned = float *ptr_aligned =
......
...@@ -21,7 +21,10 @@ namespace paddle_mobile { ...@@ -21,7 +21,10 @@ namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace filter { namespace filter {
// Compute how many filters of size chw (channel*height*width) fit into one
// hardware division. chw is rounded up to a multiple of 16 (the hardware
// word granularity); the result is capped at the hardware limit of 2048.
int calc_division_capacity(int chw) {
  const int rounded_words = (chw + 15) / 16;
  int capacity = 2048 / rounded_words * 32;
  if (capacity > 2048) {
    capacity = 2048;
  }
  return capacity;
}
int calc_split_num(int num, int division_capacity) { int calc_split_num(int num, int division_capacity) {
return (num + division_capacity - 1) / division_capacity; return (num + division_capacity - 1) / division_capacity;
......
...@@ -156,7 +156,7 @@ class AttrReader { ...@@ -156,7 +156,7 @@ class AttrReader {
template <typename T> template <typename T>
inline T Get(const string &name) const { inline T Get(const string &name) const {
PADDLE_MOBILE_ENFORCE(attrs_.count(name) != 0, PADDLE_MOBILE_ENFORCE(attrs_.count(name) != 0,
"%s should be in AttributeMap", name); "%s should be in AttributeMap", name.c_str());
return ((Attribute)attrs_.at(name)).Get<T>(); return ((Attribute)attrs_.at(name)).Get<T>();
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/data_type.h"
#include <stdint.h>
#include <string>
#include <unordered_map>
namespace paddle_mobile {
namespace framework {
// Bidirectional lookup tables between C++ runtime types and the protobuf
// VarType enum used in serialized model files. Populated once by
// InitDataTypeMap via RegisterType.
struct DataTypeMap {
  // C++ type -> proto enum value
  std::unordered_map<std::type_index,
                     _PaddleMobile__Framework__Proto__VarType__Type>
      cpp_to_proto_;
  // proto enum value (as int) -> C++ type
  std::unordered_map<int, std::type_index> proto_to_cpp_;
  // proto enum value (as int) -> readable type name (e.g. "float")
  std::unordered_map<int, std::string> proto_to_str_;
  // C++ type -> sizeof(type) in bytes
  std::unordered_map<std::type_index, size_t> cpp_to_size_;
};
// Forward declaration; the table is built below, after RegisterType.
static DataTypeMap* InitDataTypeMap();

// Returns the singleton type-mapping table.
// C++11 removes the need for manual locking: concurrent callers wait if the
// static local variable is still being initialized.
// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
static DataTypeMap& gDataTypeMap() {
  static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
  return *g_data_type_map_;
}
// Record the mapping between C++ type T and its proto enum value, under the
// given readable name, in all four lookup tables of the map.
template <typename T>
static inline void RegisterType(
    DataTypeMap* map, _PaddleMobile__Framework__Proto__VarType__Type proto_type,
    const std::string& name) {
  const int proto_key = static_cast<int>(proto_type);
  map->cpp_to_proto_.emplace(typeid(T), proto_type);
  map->proto_to_cpp_.emplace(proto_key, typeid(T));
  map->proto_to_str_.emplace(proto_key, name);
  map->cpp_to_size_.emplace(typeid(T), sizeof(T));
}
// Build the singleton type-mapping table; called exactly once from
// gDataTypeMap(). The RegType macro registers one C++ type under its
// stringified name (e.g. RegType(float, ...) -> name "float").
static DataTypeMap* InitDataTypeMap() {
  auto retv = new DataTypeMap();
#define RegType(cc_type, proto_type) \
  RegisterType<cc_type>(retv, proto_type, #cc_type)

  // NOTE: add custom types here.
  // RegType(float16, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16);
  RegType(float, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32);
  RegType(double, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64);
  RegType(int, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32);
  RegType(int64_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64);
  RegType(bool, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL);
  RegType(size_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T);
  RegType(int16_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16);
  RegType(uint8_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8);
  RegType(int8_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8);
#undef RegType
  return retv;
}
// Map a C++ std::type_index to the proto VarType enum stored in model files.
// Throws (via PADDLE_MOBILE_THROW_EXCEPTION) if the type was never registered.
_PaddleMobile__Framework__Proto__VarType__Type ToDataType(
    std::type_index type) {
  const auto& table = gDataTypeMap().cpp_to_proto_;
  const auto entry = table.find(type);
  if (entry != table.end()) {
    return entry->second;
  }
  PADDLE_MOBILE_THROW_EXCEPTION("Not support %s as tensor type", type.name());
}
// Reverse mapping: proto VarType enum -> C++ std::type_index.
// Throws if the enum value was never registered.
std::type_index ToTypeIndex(
    _PaddleMobile__Framework__Proto__VarType__Type type) {
  const auto& table = gDataTypeMap().proto_to_cpp_;
  const auto entry = table.find(static_cast<int>(type));
  if (entry != table.end()) {
    return entry->second;
  }
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Not support _PaddleMobile__Framework__Proto__VarType__Type(%d) as "
      "tensor type",
      static_cast<int>(type));
}
// Human-readable name for a proto VarType enum value (e.g. "float").
// Throws if the enum value was never registered.
std::string DataTypeToString(
    const _PaddleMobile__Framework__Proto__VarType__Type type) {
  const auto& table = gDataTypeMap().proto_to_str_;
  const auto entry = table.find(static_cast<int>(type));
  if (entry != table.end()) {
    return entry->second;
  }
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Not support _PaddleMobile__Framework__Proto__VarType__Type(%d) as "
      "tensor type",
      static_cast<int>(type));
}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <typeindex>
#include "common/enforce.h"
#include "framework/framework.pb-c.h"
namespace paddle_mobile {
namespace framework {
extern _PaddleMobile__Framework__Proto__VarType__Type ToDataType(
std::type_index type);
extern std::type_index ToTypeIndex(
_PaddleMobile__Framework__Proto__VarType__Type type);
// Dispatch on a runtime dtype enum: invokes visitor.template apply<T>() with
// the C++ type T matching `type`. Throws for enum values with no case below.
template <typename Visitor>
inline void VisitDataType(_PaddleMobile__Framework__Proto__VarType__Type type,
                          Visitor visitor) {
  switch (type) {
    // FP16 support is not enabled yet (no float16 type registered).
    // case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16:
    //   visitor.template apply<float16>();
    //   break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32:
      visitor.template apply<float>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64:
      visitor.template apply<double>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32:
      visitor.template apply<int>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64:
      visitor.template apply<int64_t>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL:
      visitor.template apply<bool>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8:
      visitor.template apply<uint8_t>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16:
      visitor.template apply<int16_t>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8:
      visitor.template apply<int8_t>();
      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Not supported %d", type);
  }
}
extern std::string DataTypeToString(
const _PaddleMobile__Framework__Proto__VarType__Type type);
// Stream a proto VarType enum as its registered human-readable name.
inline std::ostream& operator<<(
    std::ostream& out,
    const _PaddleMobile__Framework__Proto__VarType__Type& type) {
  return out << DataTypeToString(type);
}
} // namespace framework
} // namespace paddle_mobile
...@@ -64,6 +64,9 @@ limitations under the License. */ ...@@ -64,6 +64,9 @@ limitations under the License. */
// load requared ops // load requared ops
LOAD_OP(feed) LOAD_OP(feed)
LOAD_OP(fetch) LOAD_OP(fetch)
#ifdef FILL_CONSTANT_OP
LOAD_OP(fill_constant)
#endif
#ifdef BATCHNORM_OP #ifdef BATCHNORM_OP
LOAD_OP2(batch_norm, CPU, MALI_GPU); LOAD_OP2(batch_norm, CPU, MALI_GPU);
#endif #endif
...@@ -199,6 +202,9 @@ LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA); ...@@ -199,6 +202,9 @@ LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA);
#ifdef MULTICLASSNMS_OP #ifdef MULTICLASSNMS_OP
LOAD_OP1(multiclass_nms, CPU); LOAD_OP1(multiclass_nms, CPU);
#endif #endif
#ifdef POLYGONBOXTRANSFORM_OP
LOAD_OP1(polygon_box_transform, CPU);
#endif
#ifdef SUM_OP #ifdef SUM_OP
LOAD_OP1(sum, CPU); LOAD_OP1(sum, CPU);
#endif #endif
......
...@@ -32,7 +32,7 @@ template <typename Dtype> ...@@ -32,7 +32,7 @@ template <typename Dtype>
vector<string> OperatorBase<Dtype>::GetInputKeys() const { vector<string> OperatorBase<Dtype>::GetInputKeys() const {
auto it = op_input_output_key.find(type_); auto it = op_input_output_key.find(type_);
if (it == op_input_output_key.end()) { if (it == op_input_output_key.end()) {
DLOG << type_ << " has no outputs"; DLOG << type_ << " has no inputs";
return {}; return {};
} }
return it->second.first; return it->second.first;
......
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include <vector> #include <vector>
#include "framework/lod_tensor.h" #include "framework/lod_tensor.h"
#include "framework/mixed_vector.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#include "memory/t_malloc.h" #include "memory/t_malloc.h"
#include "mixed_vector.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
......
...@@ -338,10 +338,14 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) { ...@@ -338,10 +338,14 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
for (int i = 0; i < tensor.numel(); i += stride) { for (int i = 0; i < tensor.numel(); i += stride) {
if (tensor.type() == typeid(float)) { if (tensor.type() == typeid(float)) {
printer << tensor.data<float>()[i] << " "; printer << tensor.data<float>()[i] << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
} else if (tensor.type() == typeid(int64_t)) { } else if (tensor.type() == typeid(int64_t)) {
printer << tensor.data<int64_t>()[i] << " "; printer << tensor.data<int64_t>()[i] << " ";
} else if (tensor.type() == typeid(int8_t)) { } else if (tensor.type() == typeid(int8_t)) {
printer << tensor.data<int8_t>()[i] << " "; printer << static_cast<int>(tensor.data<int8_t>()[i]) << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
} }
} }
#endif #endif
......
...@@ -29,7 +29,14 @@ PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor( ...@@ -29,7 +29,14 @@ PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) { bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
paddle_mobile_.reset(new PaddleMobile<Dtype, P>()); paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
if (!config.model_dir.empty()) {
if (config.memory_pack.from_memory) {
DLOG << "load from memory!";
paddle_mobile_->LoadCombinedMemory(config.memory_pack.model_size,
config.memory_pack.model_buf,
config.memory_pack.combined_params_size,
config.memory_pack.combined_params_buf);
} else if (!config.model_dir.empty()) {
paddle_mobile_->Load(config.model_dir, config.optimize, paddle_mobile_->Load(config.model_dir, config.optimize,
config.quantification, config.batch_size); config.quantification, config.batch_size);
} else if (!config.prog_file.empty() && !config.param_file.empty()) { } else if (!config.prog_file.empty() && !config.param_file.empty()) {
......
...@@ -80,12 +80,13 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size, ...@@ -80,12 +80,13 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
} }
template <typename Dtype> template <typename Dtype>
void LoadMemInternal(void **data, framework::LoDTensor *tensor) { static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
bool quant_uint8 = false) {
char **data_buf = reinterpret_cast<char **>(data); char **data_buf = reinterpret_cast<char **>(data);
int64_t size = tensor->numel(); int64_t size = tensor->numel();
Dtype *tensor_data = tensor->mutable_data<Dtype>(); Dtype *tensor_data = tensor->mutable_data<Dtype>();
if (0) { if (quant_uint8) {
// TODO(hjchen2) should be moved into operator init function // should be moved into operator init function
float min_value; float min_value;
float max_value; float max_value;
memcpy(&min_value, data_buf, sizeof(float)); memcpy(&min_value, data_buf, sizeof(float));
...@@ -141,7 +142,8 @@ void Executor<Dtype, P>::LoadMemory( ...@@ -141,7 +142,8 @@ void Executor<Dtype, P>::LoadMemory(
// parse tensor from stream // parse tensor from stream
switch (tensor_desc.DataType()) { switch (tensor_desc.DataType()) {
case framework::VARTYPE_TYPE_FP32: case framework::VARTYPE_TYPE_FP32:
LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor); LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
program_.quantification);
break; break;
case framework::VARTYPE_TYPE_INT8: case framework::VARTYPE_TYPE_INT8:
LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor); LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
......
...@@ -111,6 +111,14 @@ class PaddlePredictor { ...@@ -111,6 +111,14 @@ class PaddlePredictor {
PaddlePredictor() = default; PaddlePredictor() = default;
}; };
// Describes a model held entirely in memory (combined __model__ + params
// buffers) for loading through the predictor API without touching disk.
// NOTE(review): buffer ownership/lifetime is not documented here — the
// caller appears to retain ownership of both buffers; confirm at call sites.
struct PaddleModelMemoryPack {
  bool from_memory = false;                // true => load from these buffers
  size_t model_size = 0;                   // byte size of model_buf
  uint8_t* model_buf = nullptr;            // serialized program description
  size_t combined_params_size = 0;         // byte size of combined_params_buf
  uint8_t* combined_params_buf = nullptr;  // combined parameter data
};
struct PaddleMobileConfig : public PaddlePredictor::Config { struct PaddleMobileConfig : public PaddlePredictor::Config {
enum Precision { FP32 = 0 }; enum Precision { FP32 = 0 };
enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
...@@ -124,6 +132,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config { ...@@ -124,6 +132,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config {
int thread_num = 1; int thread_num = 1;
std::string prog_file; std::string prog_file;
std::string param_file; std::string param_file;
struct PaddleModelMemoryPack memory_pack;
}; };
// A factory to help create different predictors. // A factory to help create different predictors.
......
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef DEQUANT_OP
#include "operators/dequantize_op.h" #include "operators/dequantize_op.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -30,3 +32,5 @@ namespace ops = paddle_mobile::operators; ...@@ -30,3 +32,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp); REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp);
#endif #endif
#endif
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef DEQUANT_OP
#pragma once #pragma once
#include <string> #include <string>
...@@ -41,3 +43,5 @@ class DequantizeOp ...@@ -41,3 +43,5 @@ class DequantizeOp
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef ELEMENTWISEMUL_OP #ifdef ELEMENTWISEMUL_OP
#include "elementwise_mul_op.h" #include "operators/elementwise_mul_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Registration translation unit for the fill_constant operator.
// Compiled only when the op is enabled in the build (FILL_CONSTANT_OP).
#ifdef FILL_CONSTANT_OP

#include "operators/fill_constant_op.h"

namespace ops = paddle_mobile::operators;

// Register the same operator class for every enabled backend.
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fill_constant, ops::FillConstantOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(fill_constant, ops::FillConstantOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fill_constant, ops::FillConstantOp);
#endif

#endif  // FILL_CONSTANT_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FILL_CONSTANT_OP
#pragma once
#include <string>
#include "framework/data_type.h"
#include "framework/operator.h"
#include "framework/selected_rows.h"
#include "operators/math/math_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
// fill_constant operator: fills its output with a scalar value.
// The output variable may hold a LoDTensor or a SelectedRows (in which
// case the rows' value tensor is filled); shape and dtype are taken from
// the op attributes via FillConstantParam.
template <typename DeviceType, typename T>
class FillConstantOp : public framework::OperatorBase<DeviceType> {
 public:
  FillConstantOp(const string &type, const VariableNameMap &inputs,
                 const VariableNameMap &outputs,
                 const framework::AttributeMap attrs,
                 std::shared_ptr<framework::Scope> scope)
      : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
                                            scope),
        param_(inputs, outputs, attrs, *scope) {}

  // Resolve the destination tensor, resize it to the attribute shape,
  // allocate storage of the requested dtype, then fill with Value().
  void RunImpl() const {
    const auto dtype =
        static_cast<_PaddleMobile__Framework__Proto__VarType__Type>(
            param_.DataDtype());
    const auto fill_value = param_.Value();
    auto *out_var = param_.OutVar();

    framework::Tensor *dest = nullptr;
    if (out_var->template IsType<framework::LoDTensor>()) {
      dest = out_var->template GetMutable<framework::LoDTensor>();
    } else if (out_var->template IsType<framework::SelectedRows>()) {
      dest = out_var->template GetMutable<framework::SelectedRows>()
                 ->mutable_value();
    } else {
      // Any other variable type is unsupported.
      PADDLE_MOBILE_THROW_EXCEPTION(
          "fill constant op's output only"
          "supports SelectedRows and LoDTensor");
    }

    dest->Resize(framework::make_ddim(param_.Shape()));
    dest->mutable_data(framework::ToTypeIndex(dtype));
    math::set_constant(dest, fill_value);
  }

  void Init() {}

  // Shape inference depends only on the "shape" attribute; no inputs
  // are consulted.
  void InferShape() const {
    PADDLE_MOBILE_ENFORCE(
        param_.Out() != nullptr,
        "Output (Out) of fill_constant op should not be null.");
    param_.Out()->Resize(framework::make_ddim(param_.Shape()));
  }

 protected:
  FillConstantParam<DeviceType> param_;
};
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_MOBILE_CPU #ifdef DEQUANT_OP
#include "operators/kernel/dequantize_kernel.h" #include "operators/kernel/dequantize_kernel.h"
...@@ -38,7 +38,8 @@ void DequantizeKernel<CPU, float>::Compute( ...@@ -38,7 +38,8 @@ void DequantizeKernel<CPU, float>::Compute(
const int32_t *x = input->data<const int32_t>(); const int32_t *x = input->data<const int32_t>();
float *y = output->mutable_data<float>(); float *y = output->mutable_data<float>();
size_t size = output->numel(); size_t size = output->numel();
float scale = 1.f / (activation_scale * weight_scale); // float scale = 1.f / (activation_scale * weight_scale);
float scale = activation_scale / weight_scale;
#if defined(__ARM_NEON__) || defined(__ARM_NEON) #if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = size >> 4; size_t loop = size >> 4;
size_t remain = size & 0xF; size_t remain = size & 0xF;
......
...@@ -31,6 +31,8 @@ void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) const { ...@@ -31,6 +31,8 @@ void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) const {
param.Out()->set_lod(param.InputX()->lod()); param.Out()->set_lod(param.InputX()->lod());
} }
template class MulKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -12,57 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,57 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef MUL_OP #ifdef POLYGONBOXTRANSFORM_OP
#include "operators/kernel/mul_kernel.h" #include "operators/kernel/polygon_box_transform_kernel.h"
#include "operators/kernel/central-arm-func/polygon_box_transform_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool MulKernel<FPGA, float>::Init(MulParam<FPGA> *param) { bool PolygonBoxTransformKernel<CPU, float>::Init(
bool relu_enabled = false; PolygonBoxTransformParam<CPU> *param) {
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto filter = const_cast<LoDTensor *>(param->InputY());
auto out = param->Out();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = 0;
}
int num = (uint32_t)filter->dims()[1];
int chw = (uint32_t)filter->dims()[0];
PADDLE_MOBILE_ENFORCE(
chw == input_x->numel(),
"Filter element num should be equal to IFM element num");
int height = (uint32_t)input_x->dims()[2];
int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width;
out->Resize(framework::make_ddim({1, channel, 1, 1}));
filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
0, bs_ptr);
param->SetFpgaArgs(conv_arg);
return true; return true;
} }
template <> template <>
void MulKernel<FPGA, float>::Compute(const MulParam<FPGA> &param) const { void PolygonBoxTransformKernel<CPU, float>::Compute(
fpga::ComputeFpgaConv(param.FpgaArgs()); const PolygonBoxTransformParam<CPU> &param) const {
PolygonBoxTransformCompute<float>(param);
} }
} // namespace operators } // namespace operators
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_MOBILE_CPU #ifdef QUANT_OP
#include "operators/kernel/quantize_kernel.h" #include "operators/kernel/quantize_kernel.h"
#include <cmath> #include <cmath>
...@@ -225,7 +225,7 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale, ...@@ -225,7 +225,7 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
const float *x = input->data<const float>(); const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>(); int8_t *y = output->mutable_data<int8_t>();
size_t size = input->numel(); size_t size = input->numel();
#ifdef defined(__ARM_NEON__) || defined(__ARM_NEON) #if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = size >> 4; size_t loop = size >> 4;
size_t remain = size & 0xF; size_t remain = size & 0xF;
for (size_t i = 0; i < loop; ++i) { for (size_t i = 0; i < loop; ++i) {
...@@ -280,17 +280,18 @@ void QuantizeKernel<CPU, float>::Compute( ...@@ -280,17 +280,18 @@ void QuantizeKernel<CPU, float>::Compute(
} }
max_abs = std::max(max_abs, 1e-6f); max_abs = std::max(max_abs, 1e-6f);
// only support int8 currently // only support int8 currently
float online_scale = 127 / max_abs; float scale = 127 / max_abs;
param.online_scale_->mutable_data<float>()[0] = online_scale; param.online_scale_->mutable_data<float>()[0] = max_abs;
switch (param.round_type_) { switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN: case ROUND_NEAREST_TO_EVEN:
quantize_round_to_even(input, online_scale, output); quantize_round_to_even(input, scale, output);
break; break;
case ROUND_NEAREST_TOWARDS_ZERO: case ROUND_NEAREST_TOWARDS_ZERO:
quantize_round_to_zero(input, online_scale, output); quantize_round_to_zero(input, scale, output);
break; break;
case ROUND_NEAREST_AWAY_ZERO: case ROUND_NEAREST_AWAY_ZERO:
quantize_round_to_nearest(input, online_scale, output); quantize_round_to_nearest(input, scale, output);
break;
default: default:
LOG(kLOG_ERROR) << "round type is not supported."; LOG(kLOG_ERROR) << "round type is not supported.";
break; break;
......
...@@ -16,24 +16,27 @@ limitations under the License. */ ...@@ -16,24 +16,27 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "operators/math/conv_arm_int8.h"
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv_3x3.h" #include "operators/math/depthwise_conv_3x3.h"
#include "operators/math/im2col.h" #include "operators/math/im2col.h"
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include "operators/math/pad.h"
#include "operators/math/vol2col.h" #include "operators/math/vol2col.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename Dtype>
inline void ConvBasic(const ConvParam<CPU> &param) { inline void ConvBasic(const ConvParam<CPU> &param) {
const Tensor *input = param.Input(); const Tensor *input = param.Input();
Tensor filter = *param.Filter(); Tensor filter = *param.Filter();
Tensor *output = param.Output(); Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups(); int groups = param.Groups();
std::vector<int> strides = param.Strides(); const std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings(); const std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations(); const std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
...@@ -57,7 +60,7 @@ inline void ConvBasic(const ConvParam<CPU> &param) { ...@@ -57,7 +60,7 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
Tensor col; Tensor col;
Tensor col_matrix; Tensor col_matrix;
if (is_expand) { if (is_expand) {
col.mutable_data<float>(col_shape); col.mutable_data<Dtype>(col_shape);
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
} }
...@@ -76,8 +79,8 @@ inline void ConvBasic(const ConvParam<CPU> &param) { ...@@ -76,8 +79,8 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
int in_step = static_cast<int>(input->dims()[1]) / groups; int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups; int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col; math::Vol2ColFunctor<CPU, Dtype> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col; math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Dtype> im2col;
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
...@@ -96,6 +99,7 @@ inline void ConvBasic(const ConvParam<CPU> &param) { ...@@ -96,6 +99,7 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
std::vector<int>{paddings[0], paddings[1], paddings[0], std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]}, paddings[1]},
&col); &col);
} else if (data_dim == 3U) { } else if (data_dim == 3U) {
// vol2col // vol2col
vol2col(in_slice, dilations, strides, paddings, &col); vol2col(in_slice, dilations, strides, paddings, &col);
...@@ -104,15 +108,70 @@ inline void ConvBasic(const ConvParam<CPU> &param) { ...@@ -104,15 +108,70 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
math::matmul<Dtype>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(1), &out_slice,
static_cast<float>(0)); static_cast<float>(0));
} }
} }
} }
// Int8 convolution entry point. Dispatches to a hand-written direct
// kernel when one exists for the (kernel size, stride) pair, otherwise
// falls back to the generic im2col + gemm path (ConvBasic<int8_t>).
// Fix: use nullptr (not the integer literal 0) for null function
// pointers in the dispatch table and the lookup result.
inline void ConvCompute_int8(const ConvParam<CPU> &param) {
  typedef void (*ConvFunc)(const Tensor &input, const Tensor &kernel,
                           Tensor *output);
  // Indexed by [kernel_size - 1][stride - 1]; nullptr means no direct
  // kernel is available for that combination.
  static ConvFunc conv_funcs_table[7][5] = {
      {nullptr, nullptr, nullptr, nullptr, nullptr},  // k = 1
      {nullptr, nullptr, nullptr, nullptr, nullptr},
      {conv3x3s1_int8, nullptr, nullptr, nullptr, nullptr},  // k = 3
      {nullptr, nullptr, nullptr, nullptr, nullptr},
      {conv5x5s1_int8, nullptr, nullptr, nullptr, nullptr},  // k = 5
      {nullptr, nullptr, nullptr, nullptr, nullptr},
      {nullptr, nullptr, nullptr, nullptr, nullptr},  // k = 7
  };
  const Tensor *input = param.Input();
  Tensor *filter = param.Filter();
  Tensor *output = param.Output();
  int groups = param.Groups();
  const std::vector<int> &strides = param.Strides();
  const std::vector<int> &paddings = param.Paddings();
  const std::vector<int> &dilations = param.Dilations();
  int kernel_h = filter->dims()[2];
  int kernel_w = filter->dims()[3];
  // Int8 inputs accumulate into 32-bit outputs.
  output->mutable_data<int32_t>();

  // Direct kernels only cover square kernels (< 8), equal square
  // strides (< 6), a single group, and no dilation.
  ConvFunc conv_func = nullptr;
  if (strides[1] == strides[0] && strides[1] < 6 && kernel_h == kernel_w &&
      kernel_h < 8 && groups == 1 && dilations[0] == dilations[1] &&
      dilations[1] == 1) {
    conv_func = conv_funcs_table[kernel_h - 1][strides[0] - 1];
  }
  if (conv_func != nullptr) {
    int batch_size = input->dims()[0];
    math::PadFunctor<CPU, int8_t> pad;

    Tensor input_pad;
    for (int i = 0; i < batch_size; ++i) {
      Tensor in_batch = input->Slice(i, i + 1);
      Tensor out_batch = output->Slice(i, i + 1);
      if (paddings[0] == 0 && paddings[1] == 0) {
        // No padding requested: feed the batch slice directly.
        input_pad = in_batch;
      } else {
        // Materialize a padded copy of the input first; the direct
        // kernels run over an already-padded input (padding applied by
        // PadFunctor — presumably zero padding; confirm in PadFunctor).
        framework::DDim pad_shape = in_batch.dims();
        pad_shape[2] += 2 * paddings[0];
        pad_shape[3] += 2 * paddings[1];
        input_pad.mutable_data<int8_t>(pad_shape);
        pad(in_batch, paddings[0], paddings[1], &input_pad);
      }
      conv_func(input_pad, *filter, &out_batch);
    }
  } else {
    ConvBasic<int8_t>(param);
  }
}
template <typename P> template <typename P>
void ConvCompute(const ConvParam<CPU> &param) { void ConvCompute(const ConvParam<CPU> &param) {
if (param.Input()->type() == typeid(int8_t)) {
ConvCompute_int8(param);
} else {
param.Output()->mutable_data<float>();
if (param.Groups() == param.Input()->dims()[1] && if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] && param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] && param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
...@@ -126,7 +185,8 @@ void ConvCompute(const ConvParam<CPU> &param) { ...@@ -126,7 +185,8 @@ void ConvCompute(const ConvParam<CPU> &param) {
math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
param.Filter(), nullptr, param.Output(), false); param.Filter(), nullptr, param.Output(), false);
} else { } else {
ConvBasic(param); ConvBasic<float>(param);
}
} }
} }
......
...@@ -44,7 +44,7 @@ void DepthwiseConvCompute(const ConvParam<CPU> &param) { ...@@ -44,7 +44,7 @@ void DepthwiseConvCompute(const ConvParam<CPU> &param) {
Bias, false); Bias, false);
} else { } else {
ConvBasic(param); ConvBasic<float>(param);
} }
} }
......
...@@ -15,8 +15,12 @@ limitations under the License. */ ...@@ -15,8 +15,12 @@ limitations under the License. */
#ifdef ELEMENTWISEADD_OP #ifdef ELEMENTWISEADD_OP
#pragma once #pragma once
#include "operators/math/elementwise_op_function.h" #include "operators/math/elementwise_op_function.h"
#include "operators/op_param.h" #include "operators/op_param.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -33,8 +37,61 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) { ...@@ -33,8 +37,61 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
Tensor *Out = param.Out(); Tensor *Out = param.Out();
Out->mutable_data<float>(); Out->mutable_data<float>();
int axis = param.Axis(); int axis = param.Axis();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
const auto &x_dims = input_x->dims();
const auto &y_dims = input_y->dims();
/// axis = -1 represent the last dimensions.
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
size_t batch = 1;
size_t channels = 1;
size_t elementwise_num = 1;
for (int i = 0; i < axis; ++i) {
batch *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels *= y_dims[i];
}
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
elementwise_num *= x_dims[i];
}
const float *bias_data = input_y->data<float>();
const float *input_data = input_x->data<float>();
float *output_data = Out->mutable_data<float>();
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
size_t offset = (i * channels + j) * elementwise_num;
const float *input = input_data + offset;
const float *bias = bias_data + j;
float *output = output_data + offset;
int loop = elementwise_num >> 0x4;
int remain = elementwise_num & 0xF;
for (int k = 0; k < loop; ++k) {
float32x4_t rb = vdupq_n_f32(*bias);
float32x4_t r0 = vld1q_f32(input);
float32x4_t r1 = vld1q_f32(input + 4);
float32x4_t r2 = vld1q_f32(input + 8);
float32x4_t r3 = vld1q_f32(input + 12);
r0 = vaddq_f32(r0, rb);
r1 = vaddq_f32(r1, rb);
r2 = vaddq_f32(r2, rb);
r3 = vaddq_f32(r3, rb);
vst1q_f32(output, r0);
vst1q_f32(output + 4, r1);
vst1q_f32(output + 8, r2);
vst1q_f32(output + 12, r3);
input += 16;
output += 16;
}
for (int k = 0; k < remain; ++k) {
output[k] = input[k] + *bias;
}
}
}
#else
ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis, ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
AddFunctor<float>(), Out); AddFunctor<float>(), Out);
#endif
} }
template class ElementwiseAddKernel<CPU, float>; template class ElementwiseAddKernel<CPU, float>;
......
...@@ -58,7 +58,7 @@ void MulCompute(const MulParam<CPU> &param) { ...@@ -58,7 +58,7 @@ void MulCompute(const MulParam<CPU> &param) {
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY(); const Tensor *input_y = param.InputY();
Tensor *out = param.Out(); Tensor *out = param.Out();
out->mutable_data<float>();
const Tensor x_matrix = const Tensor x_matrix =
input_x->dims().size() > 2 input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
...@@ -71,15 +71,21 @@ void MulCompute(const MulParam<CPU> &param) { ...@@ -71,15 +71,21 @@ void MulCompute(const MulParam<CPU> &param) {
if (out_dim.size() != 2) { if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
} }
if (param.InputX()->type() == typeid(int8_t)) {
out->mutable_data<int32_t>();
math::matmul<int8_t>(x_matrix, false, y_matrix, false,
static_cast<int8_t>(1), out, static_cast<int8_t>(0));
} else {
out->mutable_data<float>();
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1), math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(0)); out, static_cast<float>(0));
}
if (out_dim.size() != 2) { if (out_dim.size() != 2) {
out->Resize(out_dim); out->Resize(out_dim);
} }
} }
template class MulKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#pragma once
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Converts EAST-style geometry offsets into absolute polygon box
// coordinates: for each location, output = grid_coordinate * 4 - offset,
// where even channels use the x (width) coordinate and odd channels use
// the y (height) coordinate.
template <typename P>
void PolygonBoxTransformCompute(const PolygonBoxTransformParam<CPU>& param) {
  const auto* input = param.Input();
  const auto& in_dims = input->dims();
  const auto* in_data = input->data<float>();
  auto* output = param.Output();
  auto* out_data = output->mutable_data<float>();

  const int64_t batch = in_dims[0];
  const int64_t geo_channels = in_dims[1];
  const int64_t h = in_dims[2];
  const int64_t w = in_dims[3];

  // Iterate over (batch * channel) planes; within each plane walk the
  // spatial grid row by row.
  for (int64_t plane = 0; plane < batch * geo_channels; ++plane) {
    const bool x_channel = (plane % 2 == 0);
    for (int64_t y = 0; y < h; ++y) {
      for (int64_t x = 0; x < w; ++x) {
        const int64_t idx = (plane * h + y) * w + x;
        const int64_t grid = x_channel ? x : y;
        out_data[idx] = grid * 4 - in_data[idx];
      }
    }
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -17,6 +17,9 @@ limitations under the License. */ ...@@ -17,6 +17,9 @@ limitations under the License. */
#include <operators/math/transform.h> #include <operators/math/transform.h>
#include "operators/op_param.h" #include "operators/op_param.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -37,71 +40,100 @@ void ReluCompute(const ReluParam<CPU> &param) { ...@@ -37,71 +40,100 @@ void ReluCompute(const ReluParam<CPU> &param) {
auto *out_ptr = out->mutable_data<float>(); auto *out_ptr = out->mutable_data<float>();
int numel = input_x->numel(); int numel = input_x->numel();
// if (numel > 64) { #if defined(__ARM_NEON__) || defined(__ARM_NEON)
// asm volatile( #if __aarch64__
// "pld [%[input_x_ptr], #0] \n\t" if (numel > 0) {
// "vmov.f32 q8, #0.0 \n\t" int loop = numel >> 0x4;
// "subs %[num], %[num], #32 \n\t" int remain = numel & 0xF;
// "blt end_num_%= \n\t" float32x4_t zero = vdupq_n_f32(0.f);
// "loop_num_%=: \n\t" for (int i = 0; i < loop; ++i) {
// "pld [%[input_x_ptr], #1024] \n\t" float32x4_t r0 = vld1q_f32(input_x_ptr);
// float32x4_t r1 = vld1q_f32(input_x_ptr + 4);
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" float32x4_t r2 = vld1q_f32(input_x_ptr + 8);
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" float32x4_t r3 = vld1q_f32(input_x_ptr + 12);
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" r0 = vmaxq_f32(r0, zero);
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" r1 = vmaxq_f32(r1, zero);
// r2 = vmaxq_f32(r2, zero);
// "vmax.f32 q0, q0, q8 \n\t" r3 = vmaxq_f32(r3, zero);
// "vmax.f32 q1, q1, q8 \n\t" vst1q_f32(out_ptr, r0);
// "vmax.f32 q2, q2, q8 \n\t" vst1q_f32(out_ptr + 4, r1);
// "vmax.f32 q3, q3, q8 \n\t" vst1q_f32(out_ptr + 8, r2);
// "vmax.f32 q4, q4, q8 \n\t" vst1q_f32(out_ptr + 12, r3);
// "vmax.f32 q5, q5, q8 \n\t" input_x_ptr += 16;
// "vmax.f32 q6, q6, q8 \n\t" out_ptr += 16;
// "vmax.f32 q7, q7, q8 \n\t" }
// for (int i = 0; i < remain; ++i) {
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" out_ptr[i] = (input_x_ptr[i] > 0) * input_x_ptr[i];
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" }
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" #else
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" if (numel > 64) {
// asm volatile(
// "subs %[num], %[num], #32 \n\t" "pld [%[input_x_ptr], #0] \n\t"
// "bge loop_num_%= \n\t" "vmov.f32 q8, #0.0 \n\t"
// "end_num_%=: \n\t" "subs %[num], %[num], #32 \n\t"
// "cmp %[num], #0 \n\t" "blt end_num_%= \n\t"
// "bge end_%= \n\t" "loop_num_%=: \n\t"
// "mov r6, #4 \n\t" "pld [%[input_x_ptr], #1024] \n\t"
// "mul r5, %[num], r6 \n\t"
// "add %[input_x_ptr], %[input_x_ptr], r5 \n\t" "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
// "vmax.f32 q0, q0, q8 \n\t" "vmax.f32 q0, q0, q8 \n\t"
// "vmax.f32 q1, q1, q8 \n\t" "vmax.f32 q1, q1, q8 \n\t"
// "vmax.f32 q2, q2, q8 \n\t" "vmax.f32 q2, q2, q8 \n\t"
// "vmax.f32 q3, q3, q8 \n\t" "vmax.f32 q3, q3, q8 \n\t"
// "vmax.f32 q4, q4, q8 \n\t" "vmax.f32 q4, q4, q8 \n\t"
// "vmax.f32 q5, q5, q8 \n\t" "vmax.f32 q5, q5, q8 \n\t"
// "vmax.f32 q6, q6, q8 \n\t" "vmax.f32 q6, q6, q8 \n\t"
// "vmax.f32 q7, q7, q8 \n\t" "vmax.f32 q7, q7, q8 \n\t"
// "add %[out_ptr], %[out_ptr], r5 \n\t"
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
// "end_%=: \n\t"
// : "subs %[num], %[num], #32 \n\t"
// : "bge loop_num_%= \n\t"
// [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "end_num_%=: \n\t"
// "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "cmp %[num], #0 \n\t"
// "q7", "q8", "r5", "bge end_%= \n\t"
// "r6"); "mov r6, #4 \n\t"
// } else { "mul r5, %[num], r6 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"add %[out_ptr], %[out_ptr], r5 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"end_%=: \n\t"
:
:
[out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
"r6");
#endif
} else {
#endif
ReluFunctor<float> func_; ReluFunctor<float> func_;
math::Transform trans; math::Transform trans;
trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_); trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
// } #if defined(__ARM_NEON__) || defined(__ARM_NEON)
}
#endif
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -15,11 +15,14 @@ limitations under the License. */ ...@@ -15,11 +15,14 @@ limitations under the License. */
#ifdef SUM_OP #ifdef SUM_OP
#pragma once #pragma once
#include <vector>
#include "operators/math/selected_rows_functor.h" #include "operators/math/selected_rows_functor.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using LoDTensorArray = std::vector<LoDTensor>; using LoDTensorArray = std::vector<LoDTensor>;
template <typename P> template <typename P>
void SumCompute(const SumParam<CPU> &param) { void SumCompute(const SumParam<CPU> &param) {
auto inputsvars = param.InputsVars(); auto inputsvars = param.InputsVars();
...@@ -63,31 +66,21 @@ void SumCompute(const SumParam<CPU> &param) { ...@@ -63,31 +66,21 @@ void SumCompute(const SumParam<CPU> &param) {
std::unique_ptr<framework::SelectedRows> in0; std::unique_ptr<framework::SelectedRows> in0;
if (in_place) { if (in_place) {
// If is in_place, we store the input[0] to in0 // If is in_place, we store the input[0] to in0
auto *in_sel0 = inputsvars[0]->Get<SelectedRows>(); auto *in_sel0 = inputsvars[0]->Get<framework::SelectedRows>();
auto &rows = in_sel0->rows(); auto &rows = in_sel0->rows();
//#ifdef PADDLE_WITH_CUDA
// std::vector<int64_t> rows_in_cpu;
// rows_in_cpu.reserve(rows.size());
// for (auto item : rows) {
// rows_in_cpu.push_back(item);
// }
// in0.reset(new framework::SelectedRows(rows_in_cpu,
// in_sel0.height()));
//#else
in0.reset(new framework::SelectedRows(rows, in_sel0->height())); in0.reset(new framework::SelectedRows(rows, in_sel0->height()));
//#endif
in0->mutable_value()->ShareDataWith(in_sel0->value()); in0->mutable_value()->ShareDataWith(in_sel0->value());
} }
auto get_selected_row = [&](size_t i) -> const SelectedRows & { auto get_selected_row = [&](size_t i) -> const framework::SelectedRows & {
if (i == 0 && in0) { if (i == 0 && in0) {
return *in0.get(); return *in0.get();
} else { } else {
return *(inputsvars[i]->Get<SelectedRows>()); return *(inputsvars[i]->Get<framework::SelectedRows>());
} }
}; };
auto *out = outvar->GetMutable<SelectedRows>(); auto *out = outvar->GetMutable<framework::SelectedRows>();
out->mutable_rows()->clear(); out->mutable_rows()->clear();
auto *out_value = out->mutable_value(); auto *out_value = out->mutable_value();
...@@ -150,8 +143,6 @@ void SumCompute(const SumParam<CPU> &param) { ...@@ -150,8 +143,6 @@ void SumCompute(const SumParam<CPU> &param) {
} }
} }
} else { } else {
if (outvar->IsType<framework::Tensor>()) {
}
PADDLE_MOBILE_THROW_EXCEPTION( PADDLE_MOBILE_THROW_EXCEPTION(
"Unexpected branch, output variable type is %s", outvar->Type().name()); "Unexpected branch, output variable type is %s", outvar->Type().name());
} }
......
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef DEQUANT_OP
#pragma once #pragma once
#include "framework/operator.h" #include "framework/operator.h"
...@@ -30,3 +32,5 @@ class DequantizeKernel ...@@ -30,3 +32,5 @@ class DequantizeKernel
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -23,8 +23,6 @@ limitations under the License. */ ...@@ -23,8 +23,6 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using namespace framework;
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class ElementwiseMulKernel class ElementwiseMulKernel
: public framework::OpKernelBase<DeviceType, : public framework::OpKernelBase<DeviceType,
......
...@@ -66,10 +66,11 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) { ...@@ -66,10 +66,11 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1], param->Groups(), param->Strides()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
......
...@@ -65,10 +65,11 @@ bool ConvAddBNReluKernel<FPGA, float>::Init( ...@@ -65,10 +65,11 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1], param->Groups(), param->Strides()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -47,10 +47,11 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) { ...@@ -47,10 +47,11 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1], param->Groups(), param->Strides()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -59,10 +59,11 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) { ...@@ -59,10 +59,11 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1], param->Groups(), param->Strides()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -59,10 +59,11 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) { ...@@ -59,10 +59,11 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1], param->Groups(), param->Strides()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -53,9 +53,9 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) { ...@@ -53,9 +53,9 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
0, bs_ptr); 0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -54,9 +54,9 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { ...@@ -54,9 +54,9 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
0, bs_ptr); 0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Kernel declaration for the polygon_box_transform operator.
// DeviceType selects the backend (e.g. CPU); T is the element type.
// The actual Compute/Init bodies are defined in the per-backend .cpp files.
template <typename DeviceType, typename T>
class PolygonBoxTransformKernel
    : public framework::OpKernelBase<DeviceType,
                                     PolygonBoxTransformParam<DeviceType>> {
 public:
  // Runs the transform for one op invocation using the prepared param.
  void Compute(const PolygonBoxTransformParam<DeviceType>& param) const;
  // One-time setup; returns true on success.
  bool Init(PolygonBoxTransformParam<DeviceType>* param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef QUANT_OP
#pragma once #pragma once
#include "framework/operator.h" #include "framework/operator.h"
...@@ -30,3 +32,5 @@ class QuantizeKernel ...@@ -30,3 +32,5 @@ class QuantizeKernel
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -21,8 +21,6 @@ limitations under the License. */ ...@@ -21,8 +21,6 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using namespace framework;
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class SumKernel class SumKernel
: public framework::OpKernelBase<DeviceType, SumParam<DeviceType>> { : public framework::OpKernelBase<DeviceType, SumParam<DeviceType>> {
......
此差异已折叠。
此差异已折叠。
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
// 3x3, stride-1 int8 convolution; writes the result into *output.
void conv3x3s1_int8(const framework::Tensor& input,
                    const framework::Tensor& weight, framework::Tensor* output);

// 3x3, stride-1 int8 convolution, 4-channel-blocked variant
// (processes 4 output channels at a time).
void conv3x3s1_int8_4c(const framework::Tensor& input,
                       const framework::Tensor& weight,
                       framework::Tensor* output);

// 5x5, stride-1 int8 convolution; writes the result into *output.
void conv5x5s1_int8(const framework::Tensor& input,
                    const framework::Tensor& weight, framework::Tensor* output);
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -3379,7 +3379,7 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, ...@@ -3379,7 +3379,7 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
// 对 B 分块 // 对 B 分块
NC = L1 / (KC * sizeof(float)); NC = L1 / (KC * sizeof(float));
if (NC == 0) { if (NC == 0) {
NC == NR; NC = NR;
} else { } else {
int nblock_num = (n + NC - 1) / NC; int nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num; NC = (n + nblock_num - 1) / nblock_num;
...@@ -3662,7 +3662,7 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -3662,7 +3662,7 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
b_ptr = b; b_ptr = b;
int kc1 = k / 8; int kc1 = k / 8;
int kc2 = k % 8; int kc2 = k % 8;
int step = 4 * ldc; int step = sizeof(float) * ldc;
asm volatile( asm volatile(
"pld [%[a_ptr]] \n\t" "pld [%[a_ptr]] \n\t"
"pld [%[a_ptr], #64] \n\t" "pld [%[a_ptr], #64] \n\t"
...@@ -3866,11 +3866,10 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -3866,11 +3866,10 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
: :
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc2] "r"(kc2), [step] "r"(step) [kc2] "r"(kc2), [step] "r"(step)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__ #endif // __aarch64__
#else
#endif // __ARM_NEON #endif // __ARM_NEON
} }
......
...@@ -22,9 +22,11 @@ limitations under the License. */ ...@@ -22,9 +22,11 @@ limitations under the License. */
#define C(i, j) C[(i)*ldc + (j)] #define C(i, j) C[(i)*ldc + (j)]
#if __aarch64__ #if __aarch64__
#define MR_INT8 4
#define MR 6 #define MR 6
#define NR 16 #define NR 16
#else #else
#define MR_INT8 4
#define MR 6 #define MR 6
#define NR 8 #define NR 8
#endif #endif
...@@ -96,6 +98,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -96,6 +98,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
float *c, float *C, int ldc, float *p, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
/* /*
// 向量矩阵乘法 (M = 1) // 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
...@@ -139,6 +142,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -139,6 +142,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *new_scale, float *new_bias); float *new_scale, float *new_bias);
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1); float *new_scale, float *new_bias, float *bias1);
/* /*
// 向量矩阵乘法结果回写 // 向量矩阵乘法结果回写
// C = A * B // C = A * B
...@@ -185,15 +189,67 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -185,15 +189,67 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
const float *B, int ldb, float *C, int ldc, float *p, const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
// 8 bits function cluster begins
// 8 bits int small block inner product
void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
// 8 bits int inner product
void InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
const int8_t *a, const int8_t *b, int8_t beta,
int32_t *c, int32_t *C, int32_t ldc, bool relu,
int8_t *bias);
// 8 bits int pack function
void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
// 8 bits int matrix product
void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C,
int32_t ldc, bool relu, int8_t *bias);
// 8 bits int write back
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B
void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc);
// C = A * B + C
void WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias
void WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias, relu(C)
void WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
private: private:
int MC = 0; int MC = 0;
int KC = 0; int KC = 0;
int NC = 0; int NC = 0;
// 32位 float
float *packedA; float *packedA;
float *packedB; float *packedB;
float *packedC; float *packedC;
float *zero; float *zero;
// 8 bits int
int8_t *packedA_int8;
int8_t *packedB_int8;
int32_t *packedC_int8;
int8_t *zero_int8;
}; };
} // namespace math } // namespace math
......
此差异已折叠。
...@@ -28,15 +28,11 @@ namespace math { ...@@ -28,15 +28,11 @@ namespace math {
* [input_channels, filter_height, filter_width, output_height, * [input_channels, filter_height, filter_width, output_height,
* output_width] * output_width]
*/ */
template <class T> template <>
class Im2ColFunctor<ColFormat::kCFO, CPU, T> { void Im2ColFunctor<ColFormat::kCFO, CPU, float>::operator()(
public: const framework::Tensor &im, const std::vector<int> &dilation,
void operator()(const framework::Tensor &im, const std::vector<int> &dilation, const std::vector<int> &stride, const std::vector<int> &padding,
const std::vector<int> &stride, framework::Tensor *col) {
const std::vector<int> &padding, framework::Tensor *col) {
// PADDLE_ENFORCE(im.dims().size() == 3);
// PADDLE_ENFORCE(col->dims().size() == 5);
int im_channels = im.dims()[0]; int im_channels = im.dims()[0];
int im_height = im.dims()[1]; int im_height = im.dims()[1];
int im_width = im.dims()[2]; int im_width = im.dims()[2];
...@@ -45,30 +41,9 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> { ...@@ -45,30 +41,9 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
int col_height = col->dims()[3]; int col_height = col->dims()[3];
int col_width = col->dims()[4]; int col_width = col->dims()[4];
// PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2]
// -
// ((dilation[0] * (filter_height - 1)
// + 1))) /
// stride[0] +
// 1,
// col_height,
// "Output_height and
// padding(padding_up, padding_down)
// are " "inconsistent.");
// PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3]
// -
// ((dilation[1] * (filter_width - 1)
// + 1))) /
// stride[1] +
// 1,
// col_width,
// "Output_height and
// padding(padding_up, padding_down)
// are " "inconsistent.");
int channels_col = im_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
const T *im_data = im.data<T>(); const float *im_data = im.data<float>();
T *col_data = col->data<T>(); float *col_data = col->data<float>();
#if __ARM_NEON #if __ARM_NEON
const int osize = col_height; const int osize = col_height;
const int isize = im_height; const int isize = im_height;
...@@ -249,8 +224,8 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> { ...@@ -249,8 +224,8 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
col_data += 9 * oosize; col_data += 9 * oosize;
im_data += isize * isize; im_data += isize * isize;
} }
} else if (stride[0] == 2 && filter_height == 3 && pad1 && } else if (stride[0] == 2 && filter_height == 3 && pad1 && dilation[0] == 1 &&
dilation[0] == 1 && im_height > 2 && im_height == im_width) { im_height > 2 && im_height == im_width) {
for (int c = 0; c < im_channels; ++c) { for (int c = 0; c < im_channels; ++c) {
int oosize = osize * osize; int oosize = osize * osize;
int nk4 = osize / 4; int nk4 = osize / 4;
...@@ -396,15 +371,13 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> { ...@@ -396,15 +371,13 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
for (int h = 0; h < col_height; ++h) { for (int h = 0; h < col_height; ++h) {
int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
for (int w = 0; w < col_width; ++w) { for (int w = 0; w < col_width; ++w) {
int im_col_idx = int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
w * stride[1] - padding[1] + w_offset * dilation[1];
int col_idx = (c * col_height + h) * col_width + w; int col_idx = (c * col_height + h) * col_width + w;
int im_idx = int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
(im_row_idx + c_im * im_height) * im_width + im_col_idx;
col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
im_col_idx < 0 || im_col_idx >= im_width) im_col_idx < 0 || im_col_idx >= im_width)
? static_cast<T>(0) ? static_cast<float>(0)
: im_data[im_idx]; : im_data[im_idx];
} }
} }
...@@ -424,14 +397,138 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> { ...@@ -424,14 +397,138 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
im_col_idx < 0 || im_col_idx >= im_width) im_col_idx < 0 || im_col_idx >= im_width)
? static_cast<T>(0) ? static_cast<float>(0)
: im_data[im_idx]; : im_data[im_idx];
} }
} }
} }
#endif #endif
}
void ExtractToImg(const int8_t *im_data, int8_t *col_data, const int im_height,
const int im_width, const int col_height, const int col_width,
const int padding_h, const int padding_w, const int stride_h,
const int stride_w, const int kh, const int kw) {
int h = padding_h - kh;
int w = padding_w - kw;
int col_start_height = h > 0 ? (h + stride_h - 1) / stride_h : 0;
int col_start_width = w > 0 ? (w + stride_w - 1) / stride_w : 0;
int start_height = kh + col_start_height * stride_h - padding_h;
int start_width = kw + col_start_width * stride_w - padding_w;
int end_height = (col_height - col_start_height) * stride_h + start_height;
end_height = end_height > im_height ? im_height : end_height;
int end_width = (col_width - col_start_width) * stride_w + start_width;
end_width = end_width > im_width ? im_width : end_width;
int extract = (end_width - start_width + stride_w - 1) / stride_w;
im_data += start_height * im_width + start_width;
col_data += col_start_height * col_width + col_start_width;
for (int i = start_height; i < end_height; i += stride_h) {
if (stride_w == 1) {
memcpy(col_data, im_data, extract * sizeof(int8_t));
} else if (stride_w == 2) {
int s = 0;
#if __ARM_NEON
for (; s < extract - 15; s += 16) {
int8x16x2_t img = vld2q_s8(im_data + s * 2);
vst1q_s8(col_data + s, img.val[0]);
} }
}; #endif
for (; s < extract; ++s) {
col_data[s] = im_data[s * 2];
}
} else if (stride_w == 3) {
int s = 0;
#if __ARM_NEON
for (; s < extract - 15; s += 16) {
int8x16x3_t img = vld3q_s8(im_data + s * 3);
vst1q_s8(col_data + s, img.val[0]);
}
#endif
for (; s < extract; ++s) {
col_data[s] = im_data[s * 3];
}
} else if (stride_w == 4) {
int s = 0;
#if __ARM_NEON
for (; s < extract - 15; s += 16) {
int8x16x4_t img = vld4q_s8(im_data + s * 4);
vst1q_s8(col_data + s, img.val[0]);
}
#endif
for (; s < extract; ++s) {
col_data[s] = im_data[s * 4];
}
} else {
PADDLE_MOBILE_THROW_EXCEPTION("stride_w must be one of 1, 2, 3 and 4.");
}
im_data += im_width * stride_h;
col_data += col_width;
}
}
/*
 * im = [input_channels, input_height, input_width]
 * col =
 * [input_channels, filter_height, filter_width, output_height,
 * output_width]
 *
 * int8 specialization of im2col (channels-first order). On ARM it
 * zero-fills the column buffer once and then copies only the in-bounds
 * elements via ExtractToImg; otherwise it falls back to the generic
 * per-element loop with explicit padding checks.
 */
template <>
void Im2ColFunctor<ColFormat::kCFO, CPU, int8_t>::operator()(
    const framework::Tensor &im, const std::vector<int> &dilation,
    const std::vector<int> &stride, const std::vector<int> &padding,
    framework::Tensor *col) {
  int im_channels = im.dims()[0];
  int im_height = im.dims()[1];
  int im_width = im.dims()[2];
  int filter_height = col->dims()[1];
  int filter_width = col->dims()[2];
  int col_height = col->dims()[3];
  int col_width = col->dims()[4];

  // Total number of column planes: one per (channel, kh, kw) triple.
  int channels_col = im_channels * filter_height * filter_width;
  const int8_t *im_data = im.data<int8_t>();
  int8_t *col_data = col->data<int8_t>();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Fast path: no dilation and stride <= 4 (ExtractToImg's supported range).
  if (stride[0] <= 4 && dilation[0] == 1 && dilation[0] == dilation[1]) {
    // pad 0: pre-zero so ExtractToImg only needs to write valid elements.
    memset(col_data, 0, col->numel() * sizeof(int8_t));
    for (int ic = 0; ic < im_channels; ++ic) {
      for (int kh = 0; kh < filter_height; ++kh) {
        for (int kw = 0; kw < filter_width; ++kw) {
          ExtractToImg(im_data, col_data, im_height, im_width, col_height,
                       col_width, padding[0], padding[1], stride[0], stride[1],
                       kh, kw);
          // Advance to the next (channel, kh, kw) column plane.
          col_data += col_height * col_width;
        }
      }
      im_data += im_height * im_width;
    }
  } else {
#endif
    // Generic path: compute the source index per element, zero when the
    // sampled position falls inside the padding border.
    for (int c = 0; c < channels_col; ++c) {
      int w_offset = c % filter_width;
      int h_offset = (c / filter_width) % filter_height;
      int c_im = c / (filter_width * filter_height);
      for (int h = 0; h < col_height; ++h) {
        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
        for (int w = 0; w < col_width; ++w) {
          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
          int col_idx = (c * col_height + h) * col_width + w;
          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
          col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
                               im_col_idx < 0 || im_col_idx >= im_width)
                                  ? static_cast<int8_t>(0)
                                  : im_data[im_idx];
        }
      }
    }
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  }
#endif
}
/* /*
* im = [input_channels, input_height, input_width] * im = [input_channels, input_height, input_width]
...@@ -456,27 +553,6 @@ class Col2ImFunctor<ColFormat::kCFO, CPU, T> { ...@@ -456,27 +553,6 @@ class Col2ImFunctor<ColFormat::kCFO, CPU, T> {
int col_height = col.dims()[3]; int col_height = col.dims()[3];
int col_width = col.dims()[4]; int col_width = col.dims()[4];
// PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2]
// -
// ((dilation[0] * (filter_height - 1)
// + 1))) /
// stride[0] +
// 1,
// col_height,
// "Output_height and
// padding(padding_up, padding_down)
// are " "inconsistent.");
// PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3]
// -
// ((dilation[1] * (filter_width - 1)
// + 1))) /
// stride[1] +
// 1,
// col_width,
// "Output_height and
// padding(padding_up, padding_down)
// are " "inconsistent.");
int channels_col = im_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
T *im_data = im->data<T>(); T *im_data = im->data<T>();
...@@ -503,9 +579,9 @@ class Col2ImFunctor<ColFormat::kCFO, CPU, T> { ...@@ -503,9 +579,9 @@ class Col2ImFunctor<ColFormat::kCFO, CPU, T> {
}; };
template class Im2ColFunctor<ColFormat::kCFO, CPU, float>; template class Im2ColFunctor<ColFormat::kCFO, CPU, float>;
// template class Im2ColFunctor<ColFormat::kCFO, CPU, double>; template class Im2ColFunctor<ColFormat::kCFO, CPU, int8_t>;
template class Col2ImFunctor<ColFormat::kCFO, CPU, float>; template class Col2ImFunctor<ColFormat::kCFO, CPU, float>;
template class Col2ImFunctor<ColFormat::kCFO, CPU, double>; template class Col2ImFunctor<ColFormat::kCFO, CPU, int8_t>;
/* /*
* im = [input_channels, input_height, input_width] * im = [input_channels, input_height, input_width]
...@@ -519,8 +595,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> { ...@@ -519,8 +595,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
void operator()(const framework::Tensor &im, const std::vector<int> &dilation, void operator()(const framework::Tensor &im, const std::vector<int> &dilation,
const std::vector<int> &stride, const std::vector<int> &stride,
const std::vector<int> &padding, framework::Tensor *col) { const std::vector<int> &padding, framework::Tensor *col) {
// PADDLE_ENFORCE(im.dims().size() == 3);
// PADDLE_ENFORCE(col->dims().size() == 5);
int im_channels = im.dims()[0]; int im_channels = im.dims()[0];
int im_height = im.dims()[1]; int im_height = im.dims()[1];
int im_width = im.dims()[2]; int im_width = im.dims()[2];
...@@ -528,18 +602,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> { ...@@ -528,18 +602,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
int filter_width = col->dims()[4]; int filter_width = col->dims()[4];
int col_height = col->dims()[0]; int col_height = col->dims()[0];
int col_width = col->dims()[1]; int col_width = col->dims()[1];
// PADDLE_ENFORCE_EQ(
// (im_height + padding[0] + padding[2] -
// filter_height) / stride[0]
// + 1, col_height, "Output_height and
// padding(padding_up,
// padding_down) are " "inconsistent.");
// PADDLE_ENFORCE_EQ(
// (im_width + padding[1] + padding[3] -
// filter_width) / stride[1] +
// 1, col_width, "col_width and padding(padding_left,
// padding_right)
// are " "inconsistent.");
const T *im_data = im.data<T>(); const T *im_data = im.data<T>();
T *col_data = col->data<T>(); T *col_data = col->data<T>();
...@@ -589,8 +651,6 @@ class Col2ImFunctor<ColFormat::kOCF, CPU, T> { ...@@ -589,8 +651,6 @@ class Col2ImFunctor<ColFormat::kOCF, CPU, T> {
const std::vector<int> &dilation, const std::vector<int> &dilation,
const std::vector<int> &stride, const std::vector<int> &stride,
const std::vector<int> &padding, framework::Tensor *im) { const std::vector<int> &padding, framework::Tensor *im) {
// PADDLE_ENFORCE(im->dims().size() == 3);
// PADDLE_ENFORCE(col.dims().size() == 5);
int im_channels = im->dims()[0]; int im_channels = im->dims()[0];
int im_height = im->dims()[1]; int im_height = im->dims()[1];
int im_width = im->dims()[2]; int im_width = im->dims()[2];
...@@ -599,19 +659,6 @@ class Col2ImFunctor<ColFormat::kOCF, CPU, T> { ...@@ -599,19 +659,6 @@ class Col2ImFunctor<ColFormat::kOCF, CPU, T> {
int col_height = col.dims()[0]; int col_height = col.dims()[0];
int col_width = col.dims()[1]; int col_width = col.dims()[1];
// PADDLE_ENFORCE_EQ(
// (im_height + padding[0] + padding[2] -
// filter_height) / stride[0]
// + 1, col_height, "Output_height and
// padding(padding_up,
// padding_down) are " "inconsistent.");
// PADDLE_ENFORCE_EQ(
// (im_width + padding[1] + padding[3] -
// filter_width) / stride[1] +
// 1, col_width, "col_width and padding(padding_left,
// padding_right)
// are " "inconsistent.");
T *im_data = im->data<T>(); T *im_data = im->data<T>();
const T *col_data = col.data<T>(); const T *col_data = col.data<T>();
...@@ -651,9 +698,7 @@ class Col2ImFunctor<ColFormat::kOCF, CPU, T> { ...@@ -651,9 +698,7 @@ class Col2ImFunctor<ColFormat::kOCF, CPU, T> {
}; };
template class Im2ColFunctor<ColFormat::kOCF, CPU, float>; template class Im2ColFunctor<ColFormat::kOCF, CPU, float>;
template class Im2ColFunctor<ColFormat::kOCF, CPU, double>;
template class Col2ImFunctor<ColFormat::kOCF, CPU, float>; template class Col2ImFunctor<ColFormat::kOCF, CPU, float>;
template class Col2ImFunctor<ColFormat::kOCF, CPU, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -15,12 +15,31 @@ limitations under the License. */ ...@@ -15,12 +15,31 @@ limitations under the License. */
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include <cstring> #include <cstring>
#include <string> #include <string>
#include "framework/data_type.h"
#include "framework/tensor.h"
#include "operators/math/gemm.h" #include "operators/math/gemm.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
// Functor used with framework::VisitDataType: fills the captured tensor
// with `value`, cast to whatever element type T the visitor dispatches on.
struct TensorSetConstant {
  TensorSetConstant(framework::Tensor *tensor, float value)
      : tensor_(tensor), value_(value) {}
  // Invoked by VisitDataType with T = the tensor's runtime element type.
  template <typename T>
  void apply() const {
    auto *begin = tensor_->mutable_data<T>();
    std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
  }
  framework::Tensor *tensor_;  // target tensor (not owned)
  float value_;                // fill value, converted to T in apply()
};
// Fills every element of `tensor` with `value`, dispatching on the
// tensor's runtime data type via framework::VisitDataType.
void set_constant(framework::Tensor *tensor, float value) {
  framework::VisitDataType(framework::ToDataType(tensor->type()),
                           TensorSetConstant(tensor, value));
}
template <> template <>
void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
...@@ -135,7 +154,7 @@ template <typename T> ...@@ -135,7 +154,7 @@ template <typename T>
struct ClearTensor<CPU, T> { struct ClearTensor<CPU, T> {
void operator()(framework::Tensor *tensor) { void operator()(framework::Tensor *tensor) {
auto size = tensor->numel(); auto size = tensor->numel();
auto *tensor_data = tensor->data<float>(); auto *tensor_data = tensor->data<T>();
memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT
} }
}; };
...@@ -151,9 +170,9 @@ struct RowwiseAdd<CPU, T> { ...@@ -151,9 +170,9 @@ struct RowwiseAdd<CPU, T> {
PADDLE_MOBILE_ENFORCE((output->dims() == in_dims), PADDLE_MOBILE_ENFORCE((output->dims() == in_dims),
"output->dims() must be equal to in_dims."); "output->dims() must be equal to in_dims.");
auto *input_data = input.data<float>(); auto *input_data = input.data<T>();
auto *out_data = output->data<float>(); auto *out_data = output->data<T>();
auto *vec_data = vector.data<float>(); auto *vec_data = vector.data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) { for (int64_t j = 0; j < size; ++j) {
out_data[i * size + j] = input_data[i * size + j] + vec_data[j]; out_data[i * size + j] = input_data[i * size + j] + vec_data[j];
......
...@@ -15,17 +15,20 @@ limitations under the License. */ ...@@ -15,17 +15,20 @@ limitations under the License. */
#pragma once #pragma once
#include <cmath> #include <cmath>
#include <string>
#include "framework/tensor.h" #include "framework/tensor.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
void set_constant(framework::Tensor *tensor, float value);
template <typename T> template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false, framework::Tensor *matrix_out, T beta, bool relu = false,
float *bias = nullptr); T *bias = nullptr);
template <typename T> template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstring>
#include <string>
#include "operators/math/gemm.h"
#include "operators/math/math_function.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <>
void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
                    const framework::Tensor &matrix_b, bool trans_b,
                    int8_t alpha, framework::Tensor *matrix_out, int8_t beta,
                    bool relu, int8_t *bias) {
  // int8 GEMM: matrix_out(int32) = alpha * op(A)(int8) * B(int8) + beta * C,
  // with optional fused relu and bias (handled inside Gemm::Sgemm).
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
  PADDLE_MOBILE_ENFORCE(
      dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
      "The input and output of matmul must be matrix");
  int32_t M = dim_out[0];
  int32_t N = dim_out[1];
  int32_t K = (!trans_a) ? dim_a[1] : dim_a[0];
  Gemm gemm;
  if (trans_a) {
    // Gemm expects a row-major, non-transposed A; materialize A^T into a
    // scratch buffer before dispatching.
    int32_t numel = matrix_a.numel();
    int32_t m = matrix_a.dims()[0];
    int32_t n = matrix_a.dims()[1];
    const int8_t *tmp = matrix_a.data<int8_t>();
    int8_t *a = static_cast<int8_t *>(
        paddle_mobile::memory::Alloc(sizeof(int8_t) * numel));
    int32_t index = 0;
    for (int32_t j = 0; j < n; j++) {
      for (int32_t i = 0; i < m; i++) {
        a[index++] = tmp[i * n + j];
      }
    }
    gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
               matrix_out->data<int32_t>(), N, relu, bias);
    // The transposed copy is owned locally; free it to avoid leaking one
    // buffer per transposed matmul call.
    paddle_mobile::memory::Free(a);
  } else {
    gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
               matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N,
               relu, bias);
  }
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/pad.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <typename T>
class PadFunctor<CPU, T> {
 public:
  // Copies `input` (layout [N, C, H, W]) into the center of `output`,
  // leaving a zero border of pad_h rows (top/bottom) and pad_w columns
  // (left/right) around every channel plane. `output` must already be
  // sized to [N, C, H + 2*pad_h, W + 2*pad_w].
  void operator()(const framework::Tensor &input, const int pad_h,
                  const int pad_w, framework::Tensor *output) {
    const T *in_data = input.data<T>();
    T *out_data = output->mutable_data<T>();
    const framework::DDim &in_shape = input.dims();
    const framework::DDim &out_shape = output->dims();
    // Zero everything up front so the padded border needs no extra writes.
    memset(out_data, 0, sizeof(T) * output->numel());
    const int batch = in_shape[0];
    const int channels = in_shape[1];
    const int in_h = in_shape[2];
    const int in_w = in_shape[3];
    const int out_w = out_shape[3];
    // Copy one interior row at a time, addressing both tensors by index
    // rather than bumping pointers.
    int plane = 0;
    for (int n = 0; n < batch; ++n) {
      for (int c = 0; c < channels; ++c, ++plane) {
        const T *src = in_data + static_cast<int64_t>(plane) * in_h * in_w;
        T *dst = out_data +
                 (static_cast<int64_t>(plane) * (in_h + 2 * pad_h) + pad_h) *
                     out_w;
        for (int h = 0; h < in_h; ++h) {
          memcpy(dst + pad_w, src, sizeof(T) * in_w);
          dst += out_w;
          src += in_w;
        }
      }
    }
  }
};
template class PadFunctor<CPU, float>;
template class PadFunctor<CPU, int8_t>;
} // namespace math
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
// Zero-pads a 4-D tensor along its last two dimensions: pad_h rows are added
// on each of top/bottom and pad_w columns on each of left/right of every
// plane (see the per-device specializations for the implementation).
template <typename DeviceType, typename T>
class PadFunctor {
 public:
  // input:  source tensor, indexed as [dim0, dim1, dim2, dim3].
  // pad_h:  zero rows added on each side along dim2.
  // pad_w:  zero columns added on each side along dim3.
  // output: pre-sized destination tensor; border elements are set to zero.
  void operator()(const framework::Tensor &input, const int pad_h,
                  const int pad_w, framework::Tensor *output);
};
} // namespace math
} // namespace operators
} // namespace paddle_mobile
...@@ -32,9 +32,6 @@ class Vol2ColFunctor<CPU, T> { ...@@ -32,9 +32,6 @@ class Vol2ColFunctor<CPU, T> {
void operator()(const Tensor &vol, const std::vector<int> &dilations, void operator()(const Tensor &vol, const std::vector<int> &dilations,
const std::vector<int> &strides, const std::vector<int> &strides,
const std::vector<int> &paddings, Tensor *col) const { const std::vector<int> &paddings, Tensor *col) const {
// PADDLE_ENFORCE(vol.dims().size() == 4);
// PADDLE_ENFORCE(col->dims().size() == 7);
int input_channels = vol.dims()[0]; int input_channels = vol.dims()[0];
int input_depth = vol.dims()[1]; int input_depth = vol.dims()[1];
int input_height = vol.dims()[2]; int input_height = vol.dims()[2];
...@@ -48,32 +45,6 @@ class Vol2ColFunctor<CPU, T> { ...@@ -48,32 +45,6 @@ class Vol2ColFunctor<CPU, T> {
int channels_col = int channels_col =
input_channels * filter_depth * filter_height * filter_width; input_channels * filter_depth * filter_height * filter_width;
// PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
// ((dilations[0] * (filter_depth - 1)
// + 1))) /
// strides[0] +
// 1,
// output_depth,
// "input_depth and output_depth are "
// "mismatching.");
// PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
// ((dilations[1] * (filter_height -
// 1) + 1))) /
// strides[1] +
// 1,
// output_height,
// "input_height and output_height are
// "
// "mismatching.");
// PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
// ((dilations[2] * (filter_width - 1)
// + 1))) /
// strides[2] +
// 1,
// output_width,
// "input_width and output_width are "
// "mismatching.");
const T *vol_data = vol.data<T>(); const T *vol_data = vol.data<T>();
T *col_data = col->data<T>(); T *col_data = col->data<T>();
...@@ -119,9 +90,6 @@ class Col2VolFunctor<CPU, T> { ...@@ -119,9 +90,6 @@ class Col2VolFunctor<CPU, T> {
void operator()(const Tensor &col, const std::vector<int> &dilations, void operator()(const Tensor &col, const std::vector<int> &dilations,
const std::vector<int> &strides, const std::vector<int> &strides,
const std::vector<int> &paddings, Tensor *vol) const { const std::vector<int> &paddings, Tensor *vol) const {
// PADDLE_ENFORCE(vol->dims().size() == 4);
// PADDLE_ENFORCE(col.dims().size() == 7);
int input_channels = vol->dims()[0]; int input_channels = vol->dims()[0];
int input_depth = vol->dims()[1]; int input_depth = vol->dims()[1];
int input_height = vol->dims()[2]; int input_height = vol->dims()[2];
...@@ -135,31 +103,6 @@ class Col2VolFunctor<CPU, T> { ...@@ -135,31 +103,6 @@ class Col2VolFunctor<CPU, T> {
int channels_col = int channels_col =
input_channels * filter_depth * filter_height * filter_width; input_channels * filter_depth * filter_height * filter_width;
// PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
// ((dilations[0] * (filter_depth - 1)
// + 1))) /
// strides[0] +
// 1,
// output_depth,
// "input_depth and output_depth are "
// "mismatching.");
// PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
// ((dilations[1] * (filter_height -
// 1) + 1))) /
// strides[1] +
// 1,
// output_height,
// "input_height and output_height are
// "
// "mismatching.");
// PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
// ((dilations[2] * (filter_width - 1)
// + 1))) /
// strides[2] +
// 1,
// output_width,
// "input_width and output_width are "
// "mismatching.");
T *vol_data = vol->data<T>(); T *vol_data = vol->data<T>();
const T *col_data = col.data<T>(); const T *col_data = col.data<T>();
...@@ -195,9 +138,9 @@ class Col2VolFunctor<CPU, T> { ...@@ -195,9 +138,9 @@ class Col2VolFunctor<CPU, T> {
}; };
template class Vol2ColFunctor<CPU, float>; template class Vol2ColFunctor<CPU, float>;
template class Vol2ColFunctor<CPU, double>; template class Vol2ColFunctor<CPU, int8_t>;
template class Col2VolFunctor<CPU, float>; template class Col2VolFunctor<CPU, float>;
template class Col2VolFunctor<CPU, double>; template class Col2VolFunctor<CPU, int8_t>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -546,11 +546,11 @@ class MulParam : OpParam { ...@@ -546,11 +546,11 @@ class MulParam : OpParam {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -999,6 +999,28 @@ class MultiClassNMSParam : public OpParam { ...@@ -999,6 +999,28 @@ class MultiClassNMSParam : public OpParam {
}; };
#endif #endif
#ifdef POLYGONBOXTRANSFORM_OP
// Parameter pack for the polygon_box_transform operator: resolves the
// operator's single input and single output tensors from the scope at
// construction time and exposes them through accessors.
template <typename Dtype>
class PolygonBoxTransformParam : public OpParam {
  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
  typedef typename DtypeTensorTrait<Dtype>::rtype RType;

 public:
  // Looks up the "Input" and "Output" variables in `scope`; `attrs` is
  // unused because this op has no attributes.
  PolygonBoxTransformParam(const VariableNameMap &inputs,
                           const VariableNameMap &outputs,
                           const AttributeMap &attrs, const Scope &scope) {
    input_ = InputFrom<GType>(inputs, scope);
    output_ = OutputFrom<GType>(outputs, scope);
  }
  const RType *Input() const { return input_; }
  RType *Output() const { return output_; }

 private:
  RType *input_;   // non-owning; tensor lives in the scope
  RType *output_;  // non-owning; tensor lives in the scope
};
#endif
template <typename Dtype> template <typename Dtype>
class FeedParam : public OpParam { class FeedParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType; typedef typename DtypeTensorTrait<Dtype>::gtype GType;
...@@ -1041,6 +1063,42 @@ class FetchParam : public OpParam { ...@@ -1041,6 +1063,42 @@ class FetchParam : public OpParam {
RType *out_; RType *out_;
}; };
#ifdef FILL_CONSTANT_OP
// Parameter pack for the fill_constant operator: captures the output
// variable plus the "dtype", "shape" and "value" attributes that describe
// the constant tensor to create.
template <typename Dtype>
class FillConstantParam : public OpParam {
  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
  typedef typename DtypeTensorTrait<Dtype>::rtype RType;

 public:
  FillConstantParam(const VariableNameMap &inputs,
                    const VariableNameMap &outputs, const AttributeMap &attrs,
                    const Scope &scope) {
    // Both the raw Variable and its typed tensor view are kept: the kernel
    // needs the Variable to decide how to materialize the output.
    out_var_ = OutVarFrom(outputs, scope);
    out_ = OutFrom<GType>(outputs, scope);
    dtype_ = GetAttr<int>("dtype", attrs);
    shape_ = GetAttr<vector<int>>("shape", attrs);
    value_ = GetAttr<float>("value", attrs);
  }
  Variable *OutVar() const { return out_var_; }
  RType *Out() const { return out_; }
  // Framework data-type enum value for the output tensor.
  const int &DataDtype() const { return dtype_; }
  const vector<int> &Shape() const { return shape_; }
  // Scalar the output tensor is filled with.
  const float &Value() const { return value_; }

 private:
  Variable *out_var_;  // non-owning; variable lives in the scope
  RType *out_;         // non-owning; tensor lives in the scope
  int dtype_;
  vector<int> shape_;
  float value_;
};
#endif
#ifdef TRANSPOSE_OP #ifdef TRANSPOSE_OP
template <typename Dtype> template <typename Dtype>
class TransposeParam : public OpParam { class TransposeParam : public OpParam {
...@@ -1401,11 +1459,11 @@ class FusionFcParam : public OpParam { ...@@ -1401,11 +1459,11 @@ class FusionFcParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
...@@ -1441,11 +1499,11 @@ class FusionConvAddParam : public ConvParam<Dtype> { ...@@ -1441,11 +1499,11 @@ class FusionConvAddParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
...@@ -1496,11 +1554,11 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> { ...@@ -1496,11 +1554,11 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -1554,11 +1612,11 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> { ...@@ -1554,11 +1612,11 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -1629,11 +1687,11 @@ class FusionConvAddBNReluParam : public ConvParam<Dtype> { ...@@ -1629,11 +1687,11 @@ class FusionConvAddBNReluParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -1715,11 +1773,11 @@ class FusionConvBNAddReluParam : public ConvParam<Dtype> { ...@@ -1715,11 +1773,11 @@ class FusionConvBNAddReluParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -1782,11 +1840,11 @@ class FusionConvBNParam : public ConvParam<Dtype> { ...@@ -1782,11 +1840,11 @@ class FusionConvBNParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -1857,11 +1915,11 @@ class FusionConvAddBNParam : public ConvParam<Dtype> { ...@@ -1857,11 +1915,11 @@ class FusionConvAddBNParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -1983,11 +2041,11 @@ class FusionConvBNReluParam : public ConvParam<Dtype> { ...@@ -1983,11 +2041,11 @@ class FusionConvBNReluParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -2272,6 +2330,7 @@ class ShapeParam : public OpParam { ...@@ -2272,6 +2330,7 @@ class ShapeParam : public OpParam {
}; };
#endif #endif
#ifdef QUANT_OP
template <typename Dtype> template <typename Dtype>
class QuantizeParam : public OpParam { class QuantizeParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType; typedef typename DtypeTensorTrait<Dtype>::gtype GType;
...@@ -2282,14 +2341,12 @@ class QuantizeParam : public OpParam { ...@@ -2282,14 +2341,12 @@ class QuantizeParam : public OpParam {
const AttributeMap &attrs, const Scope &scope) { const AttributeMap &attrs, const Scope &scope) {
input_ = InputXFrom<GType>(inputs, scope); input_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope); out_ = OutFrom<GType>(outputs, scope);
if (HasAttr("is_static", attrs)) {
is_static_ = GetAttr<bool>("is_static", attrs);
}
// online // online
// scale = max(abs(x)) // scale = max(abs(x))
online_scale_ = GetVarValue<GType>("OutScale", outputs, scope); online_scale_ = GetVarValue<GType>("OutScale", outputs, scope);
// offline // offline
if (HasAttr("static_scale", attrs)) { if (HasAttr("static_scale", attrs)) {
is_static_ = true;
static_scale_ = GetAttr<float>("static_scale", attrs); static_scale_ = GetAttr<float>("static_scale", attrs);
} }
// x = round(scale * x) // x = round(scale * x)
...@@ -2311,9 +2368,11 @@ class QuantizeParam : public OpParam { ...@@ -2311,9 +2368,11 @@ class QuantizeParam : public OpParam {
float static_scale_ = 1.0f; float static_scale_ = 1.0f;
// round method type // round method type
// nearest_zero and nearest_even is valid currently // nearest_zero and nearest_even is valid currently
RoundType round_type_ = ROUND_NEAREST_TO_EVEN; RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
}; };
#endif
#ifdef DEQUANT_OP
template <typename Dtype> template <typename Dtype>
class DequantizeParam : public OpParam { class DequantizeParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType; typedef typename DtypeTensorTrait<Dtype>::gtype GType;
...@@ -2341,6 +2400,7 @@ class DequantizeParam : public OpParam { ...@@ -2341,6 +2400,7 @@ class DequantizeParam : public OpParam {
RType *activation_scale_; RType *activation_scale_;
float weight_scale_; float weight_scale_;
}; };
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#include "operators/polygon_box_transform_op.h"
namespace paddle_mobile {
namespace operators {
// Validates the operator's inputs and sets the output shape. The output of
// polygon_box_transform has exactly the same dims as the input; only the
// first two input dims are constrained (rank 4, even channel count).
template <typename Dtype, typename T>
void PolygonBoxTransformOp<Dtype, T>::InferShape() const {
  // Fixed copy-pasted error text: these messages previously referred to the
  // unrelated "get_shape" op.
  PADDLE_MOBILE_ENFORCE(
      this->param_.Input() != nullptr,
      "Input (Input) of polygon_box_transform op should not be null.");
  PADDLE_MOBILE_ENFORCE(
      this->param_.Output() != nullptr,
      "Output (Output) of polygon_box_transform op should not be null.");
  auto input_dims = this->param_.Input()->dims();
  PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "input's rank must be 4.");
  PADDLE_MOBILE_ENFORCE(input_dims[1] % 2 == 0,
                        "input's second dimension must be even.");
  this->param_.Output()->Resize(input_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(polygon_box_transform, ops::PolygonBoxTransformOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/polygon_box_transform_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using paddle_mobile::framework::Tensor;
// Operator wrapper for polygon_box_transform, wiring the param class and the
// device kernel into the OperatorWithKernel framework. Shape inference is
// implemented out of line (see the matching .cpp).
template <typename DeviceType, typename T>
class PolygonBoxTransformOp
    : public framework::OperatorWithKernel<
          DeviceType, PolygonBoxTransformParam<DeviceType>,
          operators::PolygonBoxTransformKernel<DeviceType, T>> {
 public:
  // Forwards everything to the base operator; no extra state is held here.
  PolygonBoxTransformOp(const std::string &type, const VariableNameMap &inputs,
                        const VariableNameMap &outputs,
                        const framework::AttributeMap &attrs,
                        std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, PolygonBoxTransformParam<DeviceType>,
            operators::PolygonBoxTransformKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  // NOTE(review): the inherited-constructor declaration below is redundant
  // with the explicit constructor above — confirm before removing either.
  using framework::OperatorWithKernel<
      DeviceType, PolygonBoxTransformParam<DeviceType>,
      operators::PolygonBoxTransformKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;

 protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef QUANT_OP
#include "operators/quantize_op.h" #include "operators/quantize_op.h"
#include <vector> #include <vector>
...@@ -33,3 +35,5 @@ namespace ops = paddle_mobile::operators; ...@@ -33,3 +35,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp); REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp);
#endif #endif
#endif
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef QUANT_OP
#pragma once #pragma once
#include <string> #include <string>
...@@ -40,3 +42,5 @@ class QuantizeOp : public framework::OperatorWithKernel< ...@@ -40,3 +42,5 @@ class QuantizeOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -26,7 +26,7 @@ void SumOp<Dtype, T>::InferShape() const { ...@@ -26,7 +26,7 @@ void SumOp<Dtype, T>::InferShape() const {
auto inputs = this->param_.Inputs(); auto inputs = this->param_.Inputs();
const size_t n = inputs.size(); const size_t n = inputs.size();
std::vector<DDim> inputs_dims; std::vector<framework::DDim> inputs_dims;
inputs_dims.reserve(n); inputs_dims.reserve(n);
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
inputs_dims.push_back(inputs[i]->dims()); inputs_dims.push_back(inputs[i]->dims());
...@@ -65,7 +65,6 @@ REGISTER_OPERATOR_CPU(sum, ops::SumOp); ...@@ -65,7 +65,6 @@ REGISTER_OPERATOR_CPU(sum, ops::SumOp);
REGISTER_OPERATOR_MALI_GPU(sum, ops::ConcatOp); REGISTER_OPERATOR_MALI_GPU(sum, ops::ConcatOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(sum, ops::ConcatOp);
#endif #endif
#endif #endif
此差异已折叠。
...@@ -84,7 +84,7 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { ...@@ -84,7 +84,7 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
} }
paddle_mobile::operators::math::Gemm gemm; paddle_mobile::operators::math::Gemm gemm;
gemm.SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, gemm.SgemmWithBn(m, n, k, 1, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias,
nullptr); nullptr);
int eq = 0; int eq = 0;
int neq = 0; int neq = 0;
......
此差异已折叠。
...@@ -28,13 +28,11 @@ limitations under the License. */ ...@@ -28,13 +28,11 @@ limitations under the License. */
int main() { int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4); paddle_mobile.SetThreadNum(1);
Tensor aa, bb, cc, scale, bias; Tensor aa, bb, cc;
auto aaptr = aa.mutable_data<float>({m, k}); auto aaptr = aa.mutable_data<float>({m, k});
auto bbptr = bb.mutable_data<float>({k, n}); auto bbptr = bb.mutable_data<float>({k, n});
auto ccptr = cc.mutable_data<float>({m, n}); auto ccptr = cc.mutable_data<float>({m, n});
auto scaleptr = scale.mutable_data<float>({m});
auto biasptr = bias.mutable_data<float>({m});
for (int i = 0; i < m * k; ++i) { for (int i = 0; i < m * k; ++i) {
aaptr[i] = 2; aaptr[i] = 2;
...@@ -45,23 +43,55 @@ int main() { ...@@ -45,23 +43,55 @@ int main() {
for (int i = 0; i < m * n; ++i) { for (int i = 0; i < m * n; ++i) {
ccptr[i] = 2; ccptr[i] = 2;
} }
for (int i = 0; i < m; ++i) {
scaleptr[i] = 1; Tensor aa_int8, bb_int8, cc_int8;
biasptr[i] = 0; auto aaptr_int8 = aa_int8.mutable_data<int8_t>({m, k});
auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
auto ccptr_int8 = cc_int8.mutable_data<int32_t>({m, n});
for (int i = 0; i < m * k; ++i) {
aaptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < k * n; ++i) {
bbptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < m * n; ++i) {
ccptr_int8[i] = static_cast<int32_t>(2);
} }
auto time1 = time(); // float
// warm-up 10 times
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>( paddle_mobile::operators::math::matmul<float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0), aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, biasptr); false, nullptr);
}
// paddle_mobile::operators::math::matmulWithBn<float>( auto time1 = time();
// aa, false, bb, false, static_cast<float>(1), &cc, for (int j = 0; j < 10; ++j) {
// static_cast<float>(0), true, &scale, &bias, 0); paddle_mobile::operators::math::matmul<float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr);
} }
auto time2 = time(); auto time2 = time();
std::cout << "gemm cost :" << time_diff(time1, time2) / 10 << "ms\n"; std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
// int8_t
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<int8_t>(0), false, nullptr);
}
auto time3 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<int8_t>(0), false, nullptr);
}
auto time4 = time();
std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n";
return 0; return 0;
} }
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
...@@ -59,7 +59,7 @@ int TestDequqntizeOp() { ...@@ -59,7 +59,7 @@ int TestDequqntizeOp() {
framework::Tensor output_cmp; framework::Tensor output_cmp;
output_cmp.Resize(dim); output_cmp.Resize(dim);
float dequant_scale = 1.f / (1.27 * 1.74); float dequant_scale = 1.27 / 1.74;
dequantize(input, dequant_scale, &output_cmp); dequantize(input, dequant_scale, &output_cmp);
const float* output_cmp_data = output_cmp.data<float>(); const float* output_cmp_data = output_cmp.data<float>();
for (int i = 0; i < output->numel(); ++i) { for (int i = 0; i < output->numel(); ++i) {
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册