diff --git a/CMakeLists.txt b/CMakeLists.txt index 9268c9a2d1ab3791805c539eb408560bc3aaff26..bdbf5a6ea604400fb5087976df0e1e9c279fd78d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,7 +29,10 @@ if(DEBUGING) message(STATUS "debugging mode") add_definitions(-DPADDLE_MOBILE_DEBUG) else() - add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) + if(FPGA) + else() + add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) + endif() endif() if(USE_EXCEPTION) @@ -93,8 +96,7 @@ else() endif() if(FPGA) - set(DEBUGING ON) - add_definitions(-DPADDLE_MOBILE_DEBUG) + message("FPGA mode enabled") add_definitions(-DPADDLE_MOBILE_FPGA) else() file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc) @@ -177,6 +179,10 @@ if(DEBUGING) else() add_subdirectory(test) endif() +elseif(FPGA) + add_subdirectory(test) endif() + +
diff --git a/README.md b/README.md index b86860830066cf1b622ff3b449803b0446794b74..1a478db3770e1f5e518594fd2fefabb686cf3c38 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms. - **FPGA** - The FPGA implementation is in progress, targeting the Xilinx ZU5 development board. + The ZCU102 development board is now supported. - **Flexibility** @@ -112,6 +112,7 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms. The development documentation mainly covers building and running; as a developer, you can use it together with the contribution documentation. * [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md) * [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md) +* [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md) ### Contribution documentation - [Contribution documentation link](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
diff --git a/doc/development_fpga.md b/doc/development_fpga.md new file mode 100644 index 0000000000000000000000000000000000000000..14cc57c6b4055e8c4e45d8b673eb1e3be22ae256 --- /dev/null +++ b/doc/development_fpga.md @@ -0,0 +1,37 @@
+# FPGA Development Documentation
+
+The FPGA code has been tested with Resnet50 on a Xilinx ZCU102 revision 1.0 development board, and the prediction results are correct.
+
+## Preparing the hardware
+___
+
+1. Get a Xilinx ZCU102 revision 1.0 development board.
+2. Download the Xilinx ZCU102 Ubuntu [image file](https://www.xilinx.com/member/forms/download/xef.html?filename=Ubuntu_Desktop_Release_2018_1.zip) and write it to an SD card.
+    * On Windows, you can use Win32DiskImager.
+    * On Linux, use the dd command: dd if=name.img of=/dev/sdb
+3. Mount the SD card on a computer and replace the existing BOOT.BIN and image.ub on partition 1 with [BOOT.BIN and image.ub](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz).
+4. Insert the SD card into the ZCU102 board, set the boot-mode DIP switches to SD-card boot, and power on to boot Linux.
+5. Load the driver: sudo insmod [fpgadrv.ko](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)
+
+
+## Building the project
+___
+1. Copy the latest paddle-mobile code onto the ZCU102 board.
+2. In the paddle-mobile root directory, set the platform in CMakeLists.txt to option(FPGA "fpga support" ON), and set the CPU and MALI\_GPU options to OFF (equivalently, pass -DFPGA=ON -DCPU=OFF -DMALI_GPU=OFF to cmake).
+3. Run the following commands; this generates the test-resnet50 executable under ./test/build.
+    * mkdir build
+    * cd build
+    * cmake ..
+    * make
+
+## Preparing the model and data
+___
+1. The model files go in ./test/models/resnet50. Copy the [\_\_model\_\_](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz) file into that folder.
+2. Download the model [weight files](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar) and extract them into ./test/models/resnet50 as well.
+3. Copy the data file [image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz) into ./test/images. It corresponds to ILSVRC2012_val_00000885.JPEG from the standard dataset, whose class label is 80, "black grouse".
+
+## Running the program
+___
+1. Enter the ./test/build directory.
+2. sudo ./test-resnet50
+3. Depending on whether the DEBUG option is enabled, the program prints a large amount of intermediate information; the final predicted class printed is 80.
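For reference, step 3 above corresponds to the following configuration expressed through the public C++ API in src/io/paddle_inference_api.h. This is an illustrative sketch only, not part of the patch; the enum and field names follow that header as extended later in this diff, and input preparation plus the factory call are elided.

#include "io/paddle_inference_api.h"

// Illustrative only: the set-up test-resnet50 effectively runs with.
paddle_mobile::PaddleMobileConfig MakeFpgaResnet50Config() {
  paddle_mobile::PaddleMobileConfig config;
  config.device = paddle_mobile::PaddleMobileConfig::kFPGA;
  config.precision = paddle_mobile::PaddleMobileConfig::FP32;
  config.model_dir = "./test/models/resnet50";  // __model__ plus extracted weights
  config.batch_size = 1;
  // Hand the config to the predictor factory the header declares, feed it
  // image_src_float, and expect class 80 ("black grouse") on top.
  return config;
}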
diff --git a/src/common/types.cpp b/src/common/types.cpp index 46e5bfab3711ac81f5438cb21105843f52183e15..8c8de7765161e61dc75036a87a34fc6abd2df43e 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -22,6 +22,7 @@ const char *G_OP_TYPE_BATCHNORM = "batch_norm"; const char *G_OP_TYPE_BOX_CODER = "box_coder"; const char *G_OP_TYPE_CONCAT = "concat"; const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; +const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant"; const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu"; const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu"; const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu"; @@ -34,6 +35,7 @@ const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add"; const char *G_OP_TYPE_LRN = "lrn"; const char *G_OP_TYPE_MUL = "mul"; const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms"; +const char *G_OP_TYPE_POLYGON_BOX_TRANSFORM = "polygon_box_transform"; const char *G_OP_TYPE_POOL2D = "pool2d"; const char *G_OP_TYPE_PRIOR_BOX = "prior_box"; const char *G_OP_TYPE_RELU = "relu"; @@ -94,9 +96,11 @@ std::unordered_map< {G_OP_TYPE_FUSION_CONV_BN_ADD_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}}, {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, + {G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, + {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
diff --git a/src/common/variant.h b/src/common/variant.h index ca2fcc090769bc49603176dc361d5f8c8e22890c..4aa4f47c628caec438ecd00522d90ebf299da6a0 100644 --- a/src/common/variant.h +++ b/src/common/variant.h @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#pragma once +#include +#include +#include #include "common/enforce.h" #include "common/log.h" -#pragma once - namespace paddle_mobile { + template struct IDToType { typedef Type type_t;
diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp index 725895ae6a3da161af545646c2a74bda16be532f..d3f473a7f43714592779de941ed1a6ea53baea83 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/api.cpp @@ -22,7 +22,7 @@ limitations under the License.
*/ #include "fpga/filter.h" #include "fpga/image.h" #define FPGA_TEST_MODE -//#define PADDLE_MOBILE_OS_LINUX +#define PADDLE_MOBILE_OS_LINUX namespace paddle_mobile { namespace fpga { @@ -149,7 +149,7 @@ int ComputeBasicConv(const struct ConvArgs &args) { return do_ioctl(IOCTL_CONFIG_CONV, &args); } -int ComputeFpgaConv(const struct WrapperConvArgs &args) { +int ComputeFpgaConv(const struct SplitConvArgs &args) { #ifdef FPGA_TEST_MODE DLOG << "=============ComputeFPGAConv==========="; DLOG << " filter_num:" << args.filter_num @@ -194,8 +194,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { #ifdef FPGA_TEST_MODE DLOG << "=============ComputeFpgaEWAdd==========="; DLOG << " relu_enabled:" << args.relu_enabled - << " const0:" << fp16_2_fp32(short(args.const0)) - << " const1:" << fp16_2_fp32(short(args.const1)); + << " const0:" << fp16_2_fp32(int16_t(args.const0)) + << " const1:" << fp16_2_fp32(int16_t(args.const1)); DLOG << " image0_address:" << args.image0.address << " image0_scale_address:" << args.image0.scale_address << " image0_channels:" << args.image0.channels @@ -383,10 +383,10 @@ void format_concat_output(framework::Tensor *out, int height, int width, out->reset_data_ptr(data_ptr); } -void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, - framework::Tensor *out, framework::Tensor *filter, - bool relu_enabled, int group_num, int stride_h, int stride_w, - int padding_h, int padding_w, float *bs_ptr) { +void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, + framework::Tensor *out, framework::Tensor *filter, + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float *bs_ptr) { auto input_ptr = input->data(); auto filter_ptr = filter->data(); auto out_ptr = out->data(); diff --git a/src/fpga/api.h b/src/fpga/api.h index a4f71e119c83de40771f321abfc8bb2821e4523a..f535975a35ecc3c454bbac597b31d8c3670cbf91 100644 --- a/src/fpga/api.h +++ b/src/fpga/api.h @@ -89,7 +89,7 @@ struct ConcatArgs { uint32_t width; }; -struct WrapperConvArgs { +struct SplitConvArgs { uint32_t split_num; uint32_t group_num; uint32_t filter_num; @@ -98,6 +98,14 @@ struct WrapperConvArgs { struct ConcatArgs concat_arg; }; +struct GroupConvArgs { + uint32_t group_num; + uint32_t filter_num; + struct ImageOutputArgs output; + struct SplitConvArgs* conv_args; + struct ConcatArgs concat_arg; +}; + struct PoolingArgs { int16_t mode; // mode: 0:max, 1:avg half kernel_reciprocal; @@ -159,30 +167,6 @@ struct MemoryCacheArgs { #define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs) #define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs) -enum FPGA_ERR_TYPE { - ERR_IOCTL_CMD = -1, - ERR_TIMEOUT = -2, - ERR_COMPLETION_TIMEOUT = -3, - ERR_INVALID_FPGA_ADDR = -4, - ERR_NOMEM = -5, - ERR_NO_RESERVE_MEM = -6, - ERR_COPY_FROM_USER = -7, - ERR_COPY_TO_USER = -8, - ERR_DEL_TIMER = -9, - ERR_ENABLE_MSI = -10, - ERR_REGISTER_IRQ = -11, - ERR_PCIE_REGISTER = -12, - ERR_PCIE_PROBE = -13, - ERR_REGISTER_BLOCK = -14, - ERR_ALLOC_GENDISK = -15, - ERR_INIT_QUEUE = -16, - ERR_WAIT = -17, - ERR_ECC_ERROR = -31, - ERR_FPGA_FAIL_STOP = -64, - ERR_FPGA_DEBUG_STOP = -113, - DEV_TMP_UNAVAILABLE = -128 -}; - //============================== API ============================= int open_device(); @@ -195,7 +179,7 @@ int fpga_flush(void* address, size_t size); int fpga_invalidate(void* address, size_t size); int PerformBypass(const struct BypassArgs& args); -int ComputeFpgaConv(const struct WrapperConvArgs& args); +int 
ComputeFpgaConv(const struct SplitConvArgs& args); int ComputeFpgaPool(const struct PoolingArgs& args); int ComputeFpgaEWAdd(const struct EWAddArgs& args); int ComputeFPGAConcat(const struct ConcatArgs& args); @@ -220,10 +204,10 @@ void format_bias_scale_array(float** bias_scale_array, void format_concat_output(framework::Tensor* out, int height, int width, int image_num, uint32_t* channel_num); -void fill_conv_arg(struct WrapperConvArgs* arg, framework::Tensor* input, - framework::Tensor* out, framework::Tensor* filter, - bool relu_enabled, int group_num, int stride_h, int stride_w, - int padding_h, int padding_w, float* bs_ptr); +void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, + framework::Tensor* out, framework::Tensor* filter, + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float* bs_ptr); half fp32_2_fp16(float fp32_num); float fp16_2_fp32(half fp16_num); diff --git a/src/fpga/bias_scale.cpp b/src/fpga/bias_scale.cpp index 23889d5b1fee3d8cb9e4673f42b18574366411eb..50f1ed03f0121b5afdc41d427e5b52675994bd1e 100644 --- a/src/fpga/bias_scale.cpp +++ b/src/fpga/bias_scale.cpp @@ -27,9 +27,6 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; int num_per_div_after_alignment = align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); - if (num_per_div_before_alignment == num_per_div_after_alignment) { - return; - } int num_element = 2 * div_num * num_per_div_after_alignment; // including bias & scale float *ptr_aligned = diff --git a/src/fpga/filter.cpp b/src/fpga/filter.cpp index c824b446ce3a4c3f13ad788780997a3920a1484c..db851b926bbbd549205ee5d75bc46a6c04888098 100644 --- a/src/fpga/filter.cpp +++ b/src/fpga/filter.cpp @@ -21,7 +21,10 @@ namespace paddle_mobile { namespace fpga { namespace filter { -int calc_division_capacity(int chw) { return 2048 / ((chw + 15) / 16) * 32; } +int calc_division_capacity(int chw) { + int n = 2048 / ((chw + 15) / 16) * 32; + return n < 2048 ? n : 2048; +} int calc_split_num(int num, int division_capacity) { return (num + division_capacity - 1) / division_capacity; diff --git a/src/framework/attribute.h b/src/framework/attribute.h index ff9e1204a1e32f3ffe6271d4d2d76b8e3cf24d63..a94346bc7ab321b0f5710a98fb3cc60198f148b0 100644 --- a/src/framework/attribute.h +++ b/src/framework/attribute.h @@ -156,7 +156,7 @@ class AttrReader { template inline T Get(const string &name) const { PADDLE_MOBILE_ENFORCE(attrs_.count(name) != 0, - "%s should be in AttributeMap", name); + "%s should be in AttributeMap", name.c_str()); return ((Attribute)attrs_.at(name)).Get(); } diff --git a/src/framework/data_type.cpp b/src/framework/data_type.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0bcf7d9f67dae28db5a316476778b4132b39b274 --- /dev/null +++ b/src/framework/data_type.cpp @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "framework/data_type.h" +#include +#include +#include + +namespace paddle_mobile { +namespace framework { + +struct DataTypeMap { + std::unordered_map + cpp_to_proto_; + std::unordered_map proto_to_cpp_; + std::unordered_map proto_to_str_; + std::unordered_map cpp_to_size_; +}; + +static DataTypeMap* InitDataTypeMap(); +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. +// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex +static DataTypeMap& gDataTypeMap() { + static DataTypeMap* g_data_type_map_ = InitDataTypeMap(); + return *g_data_type_map_; +} + +template +static inline void RegisterType( + DataTypeMap* map, _PaddleMobile__Framework__Proto__VarType__Type proto_type, + const std::string& name) { + map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); + map->cpp_to_proto_.emplace(typeid(T), proto_type); + map->proto_to_str_.emplace(static_cast(proto_type), name); + map->cpp_to_size_.emplace(typeid(T), sizeof(T)); +} + +static DataTypeMap* InitDataTypeMap() { + auto retv = new DataTypeMap(); + +#define RegType(cc_type, proto_type) \ + RegisterType(retv, proto_type, #cc_type) + + // NOTE: Add your customize type here. + // RegType(float16, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16); + RegType(float, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32); + RegType(double, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64); + RegType(int, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32); + RegType(int64_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64); + RegType(bool, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL); + RegType(size_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T); + RegType(int16_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16); + RegType(uint8_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8); + RegType(int8_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8); + +#undef RegType + return retv; +} + +_PaddleMobile__Framework__Proto__VarType__Type ToDataType( + std::type_index type) { + auto it = gDataTypeMap().cpp_to_proto_.find(type); + if (it != gDataTypeMap().cpp_to_proto_.end()) { + return it->second; + } + PADDLE_MOBILE_THROW_EXCEPTION("Not support %s as tensor type", type.name()); +} + +std::type_index ToTypeIndex( + _PaddleMobile__Framework__Proto__VarType__Type type) { + auto it = gDataTypeMap().proto_to_cpp_.find(static_cast(type)); + if (it != gDataTypeMap().proto_to_cpp_.end()) { + return it->second; + } + PADDLE_MOBILE_THROW_EXCEPTION( + "Not support _PaddleMobile__Framework__Proto__VarType__Type(%d) as " + "tensor type", + static_cast(type)); +} + +std::string DataTypeToString( + const _PaddleMobile__Framework__Proto__VarType__Type type) { + auto it = gDataTypeMap().proto_to_str_.find(static_cast(type)); + if (it != gDataTypeMap().proto_to_str_.end()) { + return it->second; + } + PADDLE_MOBILE_THROW_EXCEPTION( + "Not support _PaddleMobile__Framework__Proto__VarType__Type(%d) as " + "tensor type", + static_cast(type)); +} + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/data_type.h b/src/framework/data_type.h new file mode 100644 index 0000000000000000000000000000000000000000..2e3623fdedcb527cb0c85bbb7a2eaf04d91a2193 --- /dev/null +++ b/src/framework/data_type.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "common/enforce.h" +#include "framework/framework.pb-c.h" + +namespace paddle_mobile { + +namespace framework { + +extern _PaddleMobile__Framework__Proto__VarType__Type ToDataType( + std::type_index type); +extern std::type_index ToTypeIndex( + _PaddleMobile__Framework__Proto__VarType__Type type); + +template +inline void VisitDataType(_PaddleMobile__Framework__Proto__VarType__Type type, + Visitor visitor) { + switch (type) { + // case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16: + // visitor.template apply(); + // break; + case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32: + visitor.template apply(); + break; + case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64: + visitor.template apply(); + break; + case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32: + visitor.template apply(); + break; + case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64: + visitor.template apply(); + break; + case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL: + visitor.template apply(); + break; + case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8: + visitor.template apply(); + break; + case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16: + visitor.template apply(); + break; + case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8: + visitor.template apply(); + break; + default: + PADDLE_MOBILE_THROW_EXCEPTION("Not supported %d", type); + } +} + +extern std::string DataTypeToString( + const _PaddleMobile__Framework__Proto__VarType__Type type); +inline std::ostream& operator<<( + std::ostream& out, + const _PaddleMobile__Framework__Proto__VarType__Type& type) { + out << DataTypeToString(type); + return out; +} + +} // namespace framework +} // namespace paddle_mobile
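VisitDataType above dispatches on the runtime proto type by invoking the visitor's templated apply() for the matching C++ type. A minimal sketch of a conforming visitor; FillVisitor and its use are illustrative only, not part of this patch:

// Illustrative visitor: fills a tensor with a constant regardless of its
// element type; VisitDataType picks the apply<T>() instantiation.
struct FillVisitor {
  framework::Tensor *tensor;
  float value;
  template <typename T>
  void apply() {
    T *data = tensor->mutable_data<T>();
    for (int64_t i = 0; i < tensor->numel(); ++i) {
      data[i] = static_cast<T>(value);
    }
  }
};
// framework::VisitDataType(framework::ToDataType(tensor->type()),
//                          FillVisitor{tensor, 1.0f});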
diff --git a/src/framework/load_ops.h b/src/framework/load_ops.h index 03fdd8d433cd40aa7ba4786f02221bd24bd3a050..2b76b0158fe06e8678208f6f98fcdb71f8d91e51 100644 --- a/src/framework/load_ops.h +++ b/src/framework/load_ops.h @@ -64,6 +64,9 @@ limitations under the License. */ // load required ops LOAD_OP(feed) LOAD_OP(fetch) +#ifdef FILL_CONSTANT_OP +LOAD_OP(fill_constant) +#endif #ifdef BATCHNORM_OP LOAD_OP2(batch_norm, CPU, MALI_GPU); #endif @@ -199,6 +202,9 @@ LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA); #ifdef MULTICLASSNMS_OP LOAD_OP1(multiclass_nms, CPU); #endif +#ifdef POLYGONBOXTRANSFORM_OP +LOAD_OP1(polygon_box_transform, CPU); +#endif #ifdef SUM_OP LOAD_OP1(sum, CPU); #endif
diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index dd865fb27d4345f16ddca8005463986787d681be..21b14dfcac682e7d310dcf4e8c47afaa0fb68fb3 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -32,7 +32,7 @@ template vector OperatorBase::GetInputKeys() const { auto it = op_input_output_key.find(type_); if (it == op_input_output_key.end()) { - DLOG << type_ << " has no outputs"; + DLOG << type_ << " has no inputs"; return {}; } return it->second.first;
diff --git a/src/framework/selected_rows.h b/src/framework/selected_rows.h index 9c8176285278afa69679ac3471f7a4adb0aeea3f..db49bd91159116883e5fcb148ef3ed012ec42e71 100644 --- a/src/framework/selected_rows.h +++ b/src/framework/selected_rows.h @@ -18,9 +18,9 @@ limitations under the License. */ #include #include "framework/lod_tensor.h" +#include "framework/mixed_vector.h" #include "framework/tensor.h" #include "memory/t_malloc.h" -#include "mixed_vector.h" namespace paddle_mobile { namespace framework {
diff --git a/src/framework/tensor.h b/src/framework/tensor.h index 909819c145e2a5388ec42d2609f82929ed337d7d..496cde98e57561ca048f356fa397f5447b9050f5 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -338,10 +338,14 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) { for (int i = 0; i < tensor.numel(); i += stride) { if (tensor.type() == typeid(float)) { printer << tensor.data()[i] << " "; + } else if (tensor.type() == typeid(int32_t)) { + printer << tensor.data()[i] << " "; } else if (tensor.type() == typeid(int64_t)) { printer << tensor.data()[i] << " "; } else if (tensor.type() == typeid(int8_t)) { - printer << tensor.data()[i] << " "; + printer << static_cast(tensor.data()[i]) << " "; + } else if (tensor.type() == typeid(int32_t)) { + printer << tensor.data()[i] << " "; } } #endif
diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc index b07232867c0c66a9d064469f279dffe55b4b75bb..6a7dff597af7fa5de06c90304136e81390fe06af 100644 --- a/src/io/api_paddle_mobile.cc +++ b/src/io/api_paddle_mobile.cc @@ -29,7 +29,14 @@ PaddleMobilePredictor::PaddleMobilePredictor( template bool PaddleMobilePredictor::Init(const PaddleMobileConfig &config) { paddle_mobile_.reset(new PaddleMobile()); - if (!config.model_dir.empty()) { + + if (config.memory_pack.from_memory) { + DLOG << "load from memory!"; + paddle_mobile_->LoadCombinedMemory(config.memory_pack.model_size, + config.memory_pack.model_buf, + config.memory_pack.combined_params_size, + config.memory_pack.combined_params_buf); + } else if (!config.model_dir.empty()) { paddle_mobile_->Load(config.model_dir, config.optimize, config.quantification, config.batch_size); } else if (!config.prog_file.empty() && !config.param_file.empty()) {
diff --git a/src/io/executor.cpp b/src/io/executor.cpp index 100a774054035285d0e8b14ca195ad9c627a7ff7..9efec27c9df3d51a3411db87faee924b374d2ac7 100644 --- a/src/io/executor.cpp +++ b/src/io/executor.cpp @@ -80,12 +80,13 @@ Executor::Executor(const framework::Program p, int batch_size, } template -void LoadMemInternal(void **data, framework::LoDTensor *tensor) { +static void
LoadMemInternal(void **data, framework::LoDTensor *tensor, + bool quant_uint8 = false) { char **data_buf = reinterpret_cast(data); int64_t size = tensor->numel(); Dtype *tensor_data = tensor->mutable_data(); - if (0) { - // TODO(hjchen2) should be moved into operator init function + if (quant_uint8) { + // should be moved into operator init function float min_value; float max_value; memcpy(&min_value, data_buf, sizeof(float)); @@ -141,7 +142,8 @@ void Executor::LoadMemory( // parse tensor from stream switch (tensor_desc.DataType()) { case framework::VARTYPE_TYPE_FP32: - LoadMemInternal(reinterpret_cast(data_buf), tensor); + LoadMemInternal(reinterpret_cast(data_buf), tensor, + program_.quantification); break; case framework::VARTYPE_TYPE_INT8: LoadMemInternal(reinterpret_cast(data_buf), tensor);
diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h index 104ba11153cdb9b3bb5e249a771a2cd27ad7dbac..16756a61bf3265a0b6d7c2ec731d2c3d17bf9c3c 100644 --- a/src/io/paddle_inference_api.h +++ b/src/io/paddle_inference_api.h @@ -111,6 +111,14 @@ class PaddlePredictor { PaddlePredictor() = default; }; +struct PaddleModelMemoryPack { + bool from_memory = false; + size_t model_size = 0; + uint8_t* model_buf = nullptr; + size_t combined_params_size = 0; + uint8_t* combined_params_buf = nullptr; +}; + struct PaddleMobileConfig : public PaddlePredictor::Config { enum Precision { FP32 = 0 }; enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; @@ -124,6 +132,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config { int thread_num = 1; std::string prog_file; std::string param_file; + struct PaddleModelMemoryPack memory_pack; }; // A factory to help create different predictors.
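The new PaddleModelMemoryPack lets PaddleMobilePredictor::Init (above) build a predictor from buffers already in memory via LoadCombinedMemory, instead of reading model_dir or prog_file/param_file from disk. A usage sketch under the assumption of a combined-format model whose two buffers the caller has read and keeps alive; how they are obtained, and the factory call, are application-specific:

#include "io/paddle_inference_api.h"

// Sketch only (not part of the patch): route loading through the new
// in-memory path. The caller owns the buffers and must keep them alive.
paddle_mobile::PaddleMobileConfig MakeInMemoryConfig(uint8_t* model_buf,
                                                     size_t model_size,
                                                     uint8_t* params_buf,
                                                     size_t params_size) {
  paddle_mobile::PaddleMobileConfig config;
  config.precision = paddle_mobile::PaddleMobileConfig::FP32;
  config.device = paddle_mobile::PaddleMobileConfig::kCPU;
  config.memory_pack.from_memory = true;
  config.memory_pack.model_buf = model_buf;
  config.memory_pack.model_size = model_size;
  config.memory_pack.combined_params_buf = params_buf;
  config.memory_pack.combined_params_size = params_size;
  // With from_memory set, Init() takes the LoadCombinedMemory branch.
  return config;
}

The resulting config is then handed to the predictor factory this header declares.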
diff --git a/src/operators/dequantize_op.cpp b/src/operators/dequantize_op.cpp index df835e3007fe90a5540d420077099a60023c913a..21cd96368c4938d309f08d036b172607a5afee8c 100644 --- a/src/operators/dequantize_op.cpp +++ b/src/operators/dequantize_op.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef DEQUANT_OP + #include "operators/dequantize_op.h" namespace paddle_mobile { @@ -30,3 +32,5 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp); #endif + +#endif
diff --git a/src/operators/dequantize_op.h b/src/operators/dequantize_op.h index 4855f27fc84cc4ef5acd7a4f9cbe7ad8a70b9c75..906167a9a2f3d0e4dfa4ccf02c0d819108cd3493 100644 --- a/src/operators/dequantize_op.h +++ b/src/operators/dequantize_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef DEQUANT_OP + #pragma once #include @@ -41,3 +43,5 @@ class DequantizeOp } // namespace operators } // namespace paddle_mobile + +#endif
diff --git a/src/operators/elementwise_mul_op.cpp b/src/operators/elementwise_mul_op.cpp index 920a9a546f5ea6d5ef4f41de361ba43cb9c1a7b1..335a908ace54664f0bcbca37bdcde30047edee5d 100644 --- a/src/operators/elementwise_mul_op.cpp +++ b/src/operators/elementwise_mul_op.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef ELEMENTWISEMUL_OP -#include "elementwise_mul_op.h" +#include "operators/elementwise_mul_op.h" namespace paddle_mobile { namespace operators {
diff --git a/src/operators/fill_constant_op.cpp b/src/operators/fill_constant_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6d7c4f44f1b769c47d6f741d139118158292a40f --- /dev/null +++ b/src/operators/fill_constant_op.cpp @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FILL_CONSTANT_OP + +#include "operators/fill_constant_op.h" + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fill_constant, ops::FillConstantOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(fill_constant, ops::FillConstantOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fill_constant, ops::FillConstantOp); +#endif + +#endif
diff --git a/src/operators/fill_constant_op.h b/src/operators/fill_constant_op.h new file mode 100644 index 0000000000000000000000000000000000000000..78eb162efc8ccd42b9fba363d49d1dbc4052f6b2 --- /dev/null +++ b/src/operators/fill_constant_op.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#ifdef FILL_CONSTANT_OP + +#pragma once + +#include +#include "framework/data_type.h" +#include "framework/operator.h" +#include "framework/selected_rows.h" +#include "operators/math/math_function.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +using std::string; + +template +class FillConstantOp : public framework::OperatorBase { + public: + FillConstantOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap attrs, + std::shared_ptr scope) + : framework::OperatorBase(type, inputs, outputs, attrs, + scope), + param_(inputs, outputs, attrs, *scope) {} + void RunImpl() const { + auto data_type = + static_cast<_PaddleMobile__Framework__Proto__VarType__Type>( + param_.DataDtype()); + framework::Tensor *tensor = nullptr; + auto value = param_.Value(); + auto *outvar = param_.OutVar(); + + if (outvar->template IsType()) { + tensor = outvar->template GetMutable(); + } else if (outvar->template IsType()) { + tensor = outvar->template GetMutable() + ->mutable_value(); + } else { + PADDLE_MOBILE_THROW_EXCEPTION( + "fill constant op's output only " + "supports SelectedRows and LoDTensor"); + } + tensor->Resize(framework::make_ddim(param_.Shape())); + tensor->mutable_data(framework::ToTypeIndex(data_type)); + + math::set_constant(tensor, value); + } + + void Init() {} + + void InferShape() const { + PADDLE_MOBILE_ENFORCE( + param_.Out() != nullptr, + "Output (Out) of fill_constant op should not be null."); + framework::DDim ddim = framework::make_ddim(param_.Shape()); + param_.Out()->Resize(ddim); + } + + protected: + FillConstantParam param_; +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif
diff --git a/src/operators/kernel/arm/dequantize_kernel.cpp b/src/operators/kernel/arm/dequantize_kernel.cpp index 3033c16c747855455e43454b204fef8e4a345818..cd6c8d17f1ea05e3df6f8f364c2d3d5c9976e46b 100644 --- a/src/operators/kernel/arm/dequantize_kernel.cpp +++ b/src/operators/kernel/arm/dequantize_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#ifdef PADDLE_MOBILE_CPU +#ifdef DEQUANT_OP #include "operators/kernel/dequantize_kernel.h" @@ -38,7 +38,8 @@ void DequantizeKernel::Compute( const int32_t *x = input->data(); float *y = output->mutable_data(); size_t size = output->numel(); - float scale = 1.f / (activation_scale * weight_scale); + // float scale = 1.f / (activation_scale * weight_scale); + float scale = activation_scale / weight_scale; #if defined(__ARM_NEON__) || defined(__ARM_NEON) size_t loop = size >> 4; size_t remain = size & 0xF; diff --git a/src/operators/kernel/arm/mul_kernel.cpp b/src/operators/kernel/arm/mul_kernel.cpp index aa3ee7077eb7db440c8493eae5b95f03a42196a4..276281f963e449af9d55f7c5ca58ef5da17e6f93 100644 --- a/src/operators/kernel/arm/mul_kernel.cpp +++ b/src/operators/kernel/arm/mul_kernel.cpp @@ -31,6 +31,8 @@ void MulKernel::Compute(const MulParam ¶m) const { param.Out()->set_lod(param.InputX()->lod()); } +template class MulKernel; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/polygon_box_transform_kernel.cpp b/src/operators/kernel/arm/polygon_box_transform_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e72c29135e9898d3b5342d1c4b4f0176f105a62a --- /dev/null +++ b/src/operators/kernel/arm/polygon_box_transform_kernel.cpp @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef POLYGONBOXTRANSFORM_OP + +#include "operators/kernel/polygon_box_transform_kernel.h" +#include "operators/kernel/central-arm-func/polygon_box_transform_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool PolygonBoxTransformKernel::Init( + PolygonBoxTransformParam *param) { + return true; +} + +template <> +void PolygonBoxTransformKernel::Compute( + const PolygonBoxTransformParam ¶m) const { + PolygonBoxTransformCompute(param); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp index e2c8efc299c858a3cbb907ce0e98b1c2f96d2bc1..e7552d2602b31f9a5c10e3d81122babae8fcf1a8 100644 --- a/src/operators/kernel/arm/quantize_kernel.cpp +++ b/src/operators/kernel/arm/quantize_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_MOBILE_CPU +#ifdef QUANT_OP #include "operators/kernel/quantize_kernel.h" #include @@ -225,7 +225,7 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale, const float *x = input->data(); int8_t *y = output->mutable_data(); size_t size = input->numel(); -#ifdef defined(__ARM_NEON__) || defined(__ARM_NEON) +#if defined(__ARM_NEON__) || defined(__ARM_NEON) size_t loop = size >> 4; size_t remain = size & 0xF; for (size_t i = 0; i < loop; ++i) { @@ -280,17 +280,18 @@ void QuantizeKernel::Compute( } max_abs = std::max(max_abs, 1e-6f); // only support int8 currently - float online_scale = 127 / max_abs; - param.online_scale_->mutable_data()[0] = online_scale; + float scale = 127 / max_abs; + param.online_scale_->mutable_data()[0] = max_abs; switch (param.round_type_) { case ROUND_NEAREST_TO_EVEN: - quantize_round_to_even(input, online_scale, output); + quantize_round_to_even(input, scale, output); break; case ROUND_NEAREST_TOWARDS_ZERO: - quantize_round_to_zero(input, online_scale, output); + quantize_round_to_zero(input, scale, output); break; case ROUND_NEAREST_AWAY_ZERO: - quantize_round_to_nearest(input, online_scale, output); + quantize_round_to_nearest(input, scale, output); + break; default: LOG(kLOG_ERROR) << "round type is not supported."; break; diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h index a3e21e4b4b702630f7942f2a5171a3401f29a431..f80a8f944139566483c47daf10f9decac49650dc 100644 --- a/src/operators/kernel/central-arm-func/conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_arm_func.h @@ -16,24 +16,27 @@ limitations under the License. */ #pragma once #include +#include "operators/math/conv_arm_int8.h" #include "operators/math/conv_func.h" #include "operators/math/depthwise_conv_3x3.h" #include "operators/math/im2col.h" #include "operators/math/math_function.h" +#include "operators/math/pad.h" #include "operators/math/vol2col.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { + +template inline void ConvBasic(const ConvParam ¶m) { const Tensor *input = param.Input(); Tensor filter = *param.Filter(); Tensor *output = param.Output(); - output->mutable_data(); int groups = param.Groups(); - std::vector strides = param.Strides(); - std::vector paddings = param.Paddings(); - std::vector dilations = param.Dilations(); + const std::vector strides = param.Strides(); + const std::vector paddings = param.Paddings(); + const std::vector dilations = param.Dilations(); const int batch_size = static_cast(input->dims()[0]); @@ -57,7 +60,7 @@ inline void ConvBasic(const ConvParam ¶m) { Tensor col; Tensor col_matrix; if (is_expand) { - col.mutable_data(col_shape); + col.mutable_data(col_shape); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } @@ -76,8 +79,8 @@ inline void ConvBasic(const ConvParam ¶m) { int in_step = static_cast(input->dims()[1]) / groups; int out_step = static_cast(output->dims()[1]) / groups; - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); @@ -96,6 +99,7 @@ inline void ConvBasic(const ConvParam ¶m) { std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); + } else if (data_dim == 3U) { // vol2col vol2col(in_slice, dilations, strides, paddings, &col); @@ -104,29 +108,85 @@ inline void ConvBasic(const 
ConvParam ¶m) { // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, + + math::matmul(filter_slice, false, col_matrix, false, static_cast(1), &out_slice, static_cast(0)); } } } +inline void ConvCompute_int8(const ConvParam ¶m) { + typedef void (*ConvFunc)(const Tensor &input, const Tensor &kernel, + Tensor *output); + static ConvFunc conv_funcs_table[7][5] = { + {0, 0, 0, 0, 0}, // k = 1 + {0, 0, 0, 0, 0}, {conv3x3s1_int8, 0, 0, 0, 0}, // k = 3 + {0, 0, 0, 0, 0}, {conv5x5s1_int8, 0, 0, 0, 0}, // k = 5 + {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, // k = 7 + }; + const Tensor *input = param.Input(); + Tensor *filter = param.Filter(); + Tensor *output = param.Output(); + int groups = param.Groups(); + const std::vector &strides = param.Strides(); + const std::vector &paddings = param.Paddings(); + const std::vector &dilations = param.Dilations(); + int kernel_h = filter->dims()[2]; + int kernel_w = filter->dims()[3]; + output->mutable_data(); + + ConvFunc conv_func = 0; + if (strides[1] == strides[0] && strides[1] < 6 && kernel_h == kernel_w && + kernel_h < 8 && groups == 1 && dilations[0] == dilations[1] && + dilations[1] == 1) { + conv_func = conv_funcs_table[kernel_h - 1][strides[0] - 1]; + } + if (conv_func) { + int batch_size = input->dims()[0]; + math::PadFunctor pad; + + Tensor input_pad; + for (int i = 0; i < batch_size; ++i) { + Tensor in_batch = input->Slice(i, i + 1); + Tensor out_batch = output->Slice(i, i + 1); + if (paddings[0] == 0 && paddings[1] == 0) { + input_pad = in_batch; + } else { + framework::DDim pad_shape = in_batch.dims(); + pad_shape[2] += 2 * paddings[0]; + pad_shape[3] += 2 * paddings[1]; + input_pad.mutable_data(pad_shape); + pad(in_batch, paddings[0], paddings[1], &input_pad); + } + conv_func(input_pad, *filter, &out_batch); + } + } else { + ConvBasic(param); + } +} + template void ConvCompute(const ConvParam ¶m) { - if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { - math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), - nullptr, false); - } else if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3) { - math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), - param.Filter(), nullptr, param.Output(), false); + if (param.Input()->type() == typeid(int8_t)) { + ConvCompute_int8(param); } else { - ConvBasic(param); + param.Output()->mutable_data(); + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + nullptr, false); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3) { + math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), + param.Filter(), nullptr, param.Output(), false); + } else { + ConvBasic(param); + } } } diff --git 
a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h index 2a1afb3cf6fdbdc0a80cec5558c2b42fec6699f3..ff5d5d4b2a351d075fcecce209063aa66e026754 100644 --- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h @@ -44,7 +44,7 @@ void DepthwiseConvCompute(const ConvParam ¶m) { Bias, false); } else { - ConvBasic(param); + ConvBasic(param); } } diff --git a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h index ace72b6faddb04ee3547f1b2bc01461d8c9f2e98..0c01ef0072444479d2d2e2f7676b842d89e432ec 100644 --- a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h +++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h @@ -15,8 +15,12 @@ limitations under the License. */ #ifdef ELEMENTWISEADD_OP #pragma once + #include "operators/math/elementwise_op_function.h" #include "operators/op_param.h" +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include +#endif namespace paddle_mobile { namespace operators { @@ -33,8 +37,61 @@ void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { Tensor *Out = param.Out(); Out->mutable_data(); int axis = param.Axis(); +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + const auto &x_dims = input_x->dims(); + const auto &y_dims = input_y->dims(); + /// axis = -1 represent the last dimensions. + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + size_t batch = 1; + size_t channels = 1; + size_t elementwise_num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + elementwise_num *= x_dims[i]; + } + const float *bias_data = input_y->data(); + const float *input_data = input_x->data(); + float *output_data = Out->mutable_data(); + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + size_t offset = (i * channels + j) * elementwise_num; + const float *input = input_data + offset; + const float *bias = bias_data + j; + float *output = output_data + offset; + + int loop = elementwise_num >> 0x4; + int remain = elementwise_num & 0xF; + for (int k = 0; k < loop; ++k) { + float32x4_t rb = vdupq_n_f32(*bias); + float32x4_t r0 = vld1q_f32(input); + float32x4_t r1 = vld1q_f32(input + 4); + float32x4_t r2 = vld1q_f32(input + 8); + float32x4_t r3 = vld1q_f32(input + 12); + r0 = vaddq_f32(r0, rb); + r1 = vaddq_f32(r1, rb); + r2 = vaddq_f32(r2, rb); + r3 = vaddq_f32(r3, rb); + vst1q_f32(output, r0); + vst1q_f32(output + 4, r1); + vst1q_f32(output + 8, r2); + vst1q_f32(output + 12, r3); + input += 16; + output += 16; + } + for (int k = 0; k < remain; ++k) { + output[k] = input[k] + *bias; + } + } + } +#else ElementwiseComputeEx, float>(input_x, input_y, axis, AddFunctor(), Out); +#endif } template class ElementwiseAddKernel; diff --git a/src/operators/kernel/central-arm-func/mul_arm_func.h b/src/operators/kernel/central-arm-func/mul_arm_func.h index dd6df54da5a81c2c4d1030103b6bb9811a54246a..07e634e3be9648520357871d91d6677aec6b5c0e 100644 --- a/src/operators/kernel/central-arm-func/mul_arm_func.h +++ b/src/operators/kernel/central-arm-func/mul_arm_func.h @@ -58,7 +58,7 @@ void MulCompute(const MulParam ¶m) { const Tensor *input_x = param.InputX(); const Tensor *input_y = param.InputY(); Tensor *out = param.Out(); - out->mutable_data(); + 
const Tensor x_matrix = input_x->dims().size() > 2 ? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) @@ -71,15 +71,21 @@ void MulCompute(const MulParam ¶m) { if (out_dim.size() != 2) { out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); } - math::matmul(x_matrix, false, y_matrix, false, static_cast(1), - out, static_cast(0)); + if (param.InputX()->type() == typeid(int8_t)) { + out->mutable_data(); + math::matmul(x_matrix, false, y_matrix, false, + static_cast(1), out, static_cast(0)); + + } else { + out->mutable_data(); + math::matmul(x_matrix, false, y_matrix, false, static_cast(1), + out, static_cast(0)); + } if (out_dim.size() != 2) { out->Resize(out_dim); } } -template class MulKernel; - } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h b/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..6db4297046fba8cbb8028f1c70d8214b703158b6 --- /dev/null +++ b/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef POLYGONBOXTRANSFORM_OP +#pragma once + +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void PolygonBoxTransformCompute(const PolygonBoxTransformParam& param) { + const auto* input = param.Input(); + const auto& input_dims = input->dims(); + const auto* input_data = input->data(); + auto* output = param.Output(); + auto* output_data = output->mutable_data(); + + int64_t batch_size = input_dims[0]; + int64_t geo_channel = input_dims[1]; + int64_t height = input_dims[2]; + int64_t width = input_dims[3]; + int64_t id = 0; + for (int64_t id_n = 0; id_n < batch_size * geo_channel; ++id_n) { + for (int64_t id_h = 0; id_h < height; ++id_h) { + for (int64_t id_w = 0; id_w < width; ++id_w) { + id = id_n * height * width + width * id_h + id_w; + if (id_n % 2 == 0) { + output_data[id] = id_w * 4 - input_data[id]; + } else { + output_data[id] = id_h * 4 - input_data[id]; + } + } + } + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/relu_arm_func.h b/src/operators/kernel/central-arm-func/relu_arm_func.h index d68569c0a5c0730d96a89cd534b2a89c0d3a9bff..38b2e6f334b4b24460f72450b01e4bdc2a6ff616 100644 --- a/src/operators/kernel/central-arm-func/relu_arm_func.h +++ b/src/operators/kernel/central-arm-func/relu_arm_func.h @@ -17,6 +17,9 @@ limitations under the License. 
*/ #include #include "operators/op_param.h" +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#include +#endif namespace paddle_mobile { namespace operators { @@ -37,71 +40,100 @@ void ReluCompute(const ReluParam ¶m) { auto *out_ptr = out->mutable_data(); int numel = input_x->numel(); - // if (numel > 64) { - // asm volatile( - // "pld [%[input_x_ptr], #0] \n\t" - // "vmov.f32 q8, #0.0 \n\t" - // "subs %[num], %[num], #32 \n\t" - // "blt end_num_%= \n\t" - // "loop_num_%=: \n\t" - // "pld [%[input_x_ptr], #1024] \n\t" - // - // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" - // - // "vmax.f32 q0, q0, q8 \n\t" - // "vmax.f32 q1, q1, q8 \n\t" - // "vmax.f32 q2, q2, q8 \n\t" - // "vmax.f32 q3, q3, q8 \n\t" - // "vmax.f32 q4, q4, q8 \n\t" - // "vmax.f32 q5, q5, q8 \n\t" - // "vmax.f32 q6, q6, q8 \n\t" - // "vmax.f32 q7, q7, q8 \n\t" - // - // "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" - // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" - // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" - // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" - // - // "subs %[num], %[num], #32 \n\t" - // "bge loop_num_%= \n\t" - // "end_num_%=: \n\t" - // "cmp %[num], #0 \n\t" - // "bge end_%= \n\t" - // "mov r6, #4 \n\t" - // "mul r5, %[num], r6 \n\t" - // "add %[input_x_ptr], %[input_x_ptr], r5 \n\t" - // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" - // "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" - // "vmax.f32 q0, q0, q8 \n\t" - // "vmax.f32 q1, q1, q8 \n\t" - // "vmax.f32 q2, q2, q8 \n\t" - // "vmax.f32 q3, q3, q8 \n\t" - // "vmax.f32 q4, q4, q8 \n\t" - // "vmax.f32 q5, q5, q8 \n\t" - // "vmax.f32 q6, q6, q8 \n\t" - // "vmax.f32 q7, q7, q8 \n\t" - // "add %[out_ptr], %[out_ptr], r5 \n\t" - // "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" - // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" - // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" - // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" - // "end_%=: \n\t" - // : - // : - // [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] - // "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - // "q7", "q8", "r5", - // "r6"); - // } else { - ReluFunctor func_; - math::Transform trans; - trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_); - // } +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#if __aarch64__ + if (numel > 0) { + int loop = numel >> 0x4; + int remain = numel & 0xF; + float32x4_t zero = vdupq_n_f32(0.f); + for (int i = 0; i < loop; ++i) { + float32x4_t r0 = vld1q_f32(input_x_ptr); + float32x4_t r1 = vld1q_f32(input_x_ptr + 4); + float32x4_t r2 = vld1q_f32(input_x_ptr + 8); + float32x4_t r3 = vld1q_f32(input_x_ptr + 12); + r0 = vmaxq_f32(r0, zero); + r1 = vmaxq_f32(r1, zero); + r2 = vmaxq_f32(r2, zero); + r3 = vmaxq_f32(r3, zero); + vst1q_f32(out_ptr, r0); + vst1q_f32(out_ptr + 4, r1); + vst1q_f32(out_ptr + 8, r2); + vst1q_f32(out_ptr + 12, r3); + input_x_ptr += 16; + out_ptr += 16; + } + for (int i = 0; i < remain; ++i) { + out_ptr[i] = (input_x_ptr[i] > 0) * input_x_ptr[i]; + } +#else + if (numel > 64) { + asm volatile( + "pld [%[input_x_ptr], #0] \n\t" + "vmov.f32 q8, #0.0 \n\t" + "subs %[num], %[num], #32 \n\t" + "blt end_num_%= \n\t" + "loop_num_%=: \n\t" + "pld [%[input_x_ptr], #1024] \n\t" + + "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[input_x_ptr]]! 
\n\t" + "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" + + "vmax.f32 q0, q0, q8 \n\t" + "vmax.f32 q1, q1, q8 \n\t" + "vmax.f32 q2, q2, q8 \n\t" + "vmax.f32 q3, q3, q8 \n\t" + "vmax.f32 q4, q4, q8 \n\t" + "vmax.f32 q5, q5, q8 \n\t" + "vmax.f32 q6, q6, q8 \n\t" + "vmax.f32 q7, q7, q8 \n\t" + + "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" + "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" + "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" + "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" + + "subs %[num], %[num], #32 \n\t" + "bge loop_num_%= \n\t" + "end_num_%=: \n\t" + "cmp %[num], #0 \n\t" + "bge end_%= \n\t" + "mov r6, #4 \n\t" + "mul r5, %[num], r6 \n\t" + "add %[input_x_ptr], %[input_x_ptr], r5 \n\t" + "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" + "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" + "vmax.f32 q0, q0, q8 \n\t" + "vmax.f32 q1, q1, q8 \n\t" + "vmax.f32 q2, q2, q8 \n\t" + "vmax.f32 q3, q3, q8 \n\t" + "vmax.f32 q4, q4, q8 \n\t" + "vmax.f32 q5, q5, q8 \n\t" + "vmax.f32 q6, q6, q8 \n\t" + "vmax.f32 q7, q7, q8 \n\t" + "add %[out_ptr], %[out_ptr], r5 \n\t" + "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" + "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" + "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" + "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" + "end_%=: \n\t" + : + : + [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel) + : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5", + "r6"); +#endif + } else { +#endif + ReluFunctor func_; + math::Transform trans; + trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_); +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + } +#endif } } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/central-arm-func/sum_arm_func.h b/src/operators/kernel/central-arm-func/sum_arm_func.h index 25c1c51c7abd62a900665197ab4e221b76a3fa04..36c7ac9694bde85fbf702ad8adf5ffda8744da1d 100644 --- a/src/operators/kernel/central-arm-func/sum_arm_func.h +++ b/src/operators/kernel/central-arm-func/sum_arm_func.h @@ -15,11 +15,14 @@ limitations under the License. 
*/ #ifdef SUM_OP #pragma once +#include #include "operators/math/selected_rows_functor.h" namespace paddle_mobile { namespace operators { + using LoDTensorArray = std::vector; + template void SumCompute(const SumParam ¶m) { auto inputsvars = param.InputsVars(); @@ -63,31 +66,21 @@ void SumCompute(const SumParam ¶m) { std::unique_ptr in0; if (in_place) { // If is in_place, we store the input[0] to in0 - auto *in_sel0 = inputsvars[0]->Get(); + auto *in_sel0 = inputsvars[0]->Get(); auto &rows = in_sel0->rows(); - //#ifdef PADDLE_WITH_CUDA - // std::vector rows_in_cpu; - // rows_in_cpu.reserve(rows.size()); - // for (auto item : rows) { - // rows_in_cpu.push_back(item); - // } - // in0.reset(new framework::SelectedRows(rows_in_cpu, - // in_sel0.height())); - //#else in0.reset(new framework::SelectedRows(rows, in_sel0->height())); - //#endif in0->mutable_value()->ShareDataWith(in_sel0->value()); } - auto get_selected_row = [&](size_t i) -> const SelectedRows & { + auto get_selected_row = [&](size_t i) -> const framework::SelectedRows & { if (i == 0 && in0) { return *in0.get(); } else { - return *(inputsvars[i]->Get()); + return *(inputsvars[i]->Get()); } }; - auto *out = outvar->GetMutable(); + auto *out = outvar->GetMutable(); out->mutable_rows()->clear(); auto *out_value = out->mutable_value(); @@ -150,8 +143,6 @@ void SumCompute(const SumParam ¶m) { } } } else { - if (outvar->IsType()) { - } PADDLE_MOBILE_THROW_EXCEPTION( "Unexpected branch, output variable type is %s", outvar->Type().name()); } diff --git a/src/operators/kernel/dequantize_kernel.h b/src/operators/kernel/dequantize_kernel.h index 3d0437875bb64a0d32948a05725214d666ebfa01..d147e3f94ab87165cceac886289e74747906e047 100644 --- a/src/operators/kernel/dequantize_kernel.h +++ b/src/operators/kernel/dequantize_kernel.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef DEQUANT_OP + #pragma once #include "framework/operator.h" @@ -30,3 +32,5 @@ class DequantizeKernel } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/elementwise_mul_kernel.h b/src/operators/kernel/elementwise_mul_kernel.h index d1e326c6c4e7830c11c387dca03da9858c9a37dd..63f0df4815dc143e482140a855eb254bd016d50c 100644 --- a/src/operators/kernel/elementwise_mul_kernel.h +++ b/src/operators/kernel/elementwise_mul_kernel.h @@ -23,8 +23,6 @@ limitations under the License. 
*/ namespace paddle_mobile { namespace operators { -using namespace framework; - template class ElementwiseMulKernel : public framework::OpKernelBase::Init(FusionConvAddBNParam *param) { fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, - param->Groups(), param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); return true; diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp index ea01245f1207739d4234ea3509451a2de1d321f4..83f74e97d04eda29f3aaa6a0cc16ed7d194321d8 100644 --- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp @@ -65,10 +65,11 @@ bool ConvAddBNReluKernel::Init( fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, - param->Groups(), param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp index 928b73e4d30144cdf1128a018628b6208fcfd5f0..4975f2a905dcd76c5b7f013eafaa376dd2bb1646 100644 --- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp @@ -47,10 +47,11 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, - param->Groups(), param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp index fea211af74b634fc0dd8dcee1db7c2c004145561..276e71b6a44e9a7beba0d5db2f51472a9927d8da 100644 --- a/src/operators/kernel/fpga/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp @@ -59,10 +59,11 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, - param->Groups(), param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git 
a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp index 87fe12664e75717c78d79ec50821a9bb6201c5a0..f519a37cb57378a603969adae255f88ae8a5df2a 100644 --- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp @@ -59,10 +59,11 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, - param->Groups(), param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp index 7c7bceaaee82617122da9c0fd2a5fa6b688f1153..52d7c0a4e69080e11f86d1507829e7e779a69228 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -53,9 +53,9 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, - 0, bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, + 0, 0, bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp index d543e1ea46bea09ee7331d03760633ee240454d5..407e14238d542604e876ced624d5a0db698a6101 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp @@ -54,9 +54,9 @@ bool FusionFcKernel::Init(FusionFcParam *param) { fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, - 0, bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, + 0, 0, bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git a/src/operators/kernel/fpga/mul_kernel.cpp b/src/operators/kernel/fpga/mul_kernel.cpp deleted file mode 100644 index 9e282bd27b744cb48fccdc8e4602ae2fc9a1ad79..0000000000000000000000000000000000000000 --- a/src/operators/kernel/fpga/mul_kernel.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#ifdef MUL_OP
-
-#include "operators/kernel/mul_kernel.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <>
-bool MulKernel<FPGA, float>::Init(MulParam<FPGA> *param) {
-  bool relu_enabled = false;
-  auto input_x = const_cast<Tensor *>(param->InputX());
-  auto filter = const_cast<Tensor *>(param->InputY());
-  auto out = param->Out();
-
-  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
-                        "Image channel should be equal to weight number");
-  int channel = (uint32_t)out->dims()[1];
-  auto bs_ptr =
-      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
-  for (int i = 0; i < channel; i++) {
-    bs_ptr[i + channel] = 1;
-    bs_ptr[i] = 0;
-  }
-  int num = (uint32_t)filter->dims()[1];
-  int chw = (uint32_t)filter->dims()[0];
-  PADDLE_MOBILE_ENFORCE(
-      chw == input_x->numel(),
-      "Filter element num should be equal to IFM element num");
-  int height = (uint32_t)input_x->dims()[2];
-  int width = (uint32_t)input_x->dims()[3];
-  int filter_channel = chw / height / width;
-
-  out->Resize(framework::make_ddim({1, channel, 1, 1}));
-  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
-  float max_value = fpga::filter_find_max(filter);
-  fpga::format_fc_filter(filter, max_value);
-
-  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
-  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-  fpga::format_fp16_ofm(out);
-
-  fpga::WrapperConvArgs conv_arg = {0};
-  fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
-                      0, bs_ptr);
-  param->SetFpgaArgs(conv_arg);
-  return true;
-}
-
-template <>
-void MulKernel<FPGA, float>::Compute(const MulParam<FPGA> &param) const {
-  fpga::ComputeFpgaConv(param.FpgaArgs());
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
diff --git a/src/operators/kernel/polygon_box_transform_kernel.h b/src/operators/kernel/polygon_box_transform_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5baf32cc7dca0aee1eb0b7c13895e806f70320a
--- /dev/null
+++ b/src/operators/kernel/polygon_box_transform_kernel.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef POLYGONBOXTRANSFORM_OP
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class PolygonBoxTransformKernel
+    : public framework::OpKernelBase<DeviceType,
+                                     PolygonBoxTransformParam<DeviceType>> {
+ public:
+  void Compute(const PolygonBoxTransformParam<DeviceType>& param) const;
+  bool Init(PolygonBoxTransformParam<DeviceType>* param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/quantize_kernel.h b/src/operators/kernel/quantize_kernel.h
index 7a35d03ba76651df935fd9c32b13377767f3c439..c55ca2182acd0f459c785f29d359ea9039a7350a 100644
--- a/src/operators/kernel/quantize_kernel.h
+++ b/src/operators/kernel/quantize_kernel.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
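The removed FPGA mul kernel above, like the fc_relu and fusion_fc kernels before it, drives the same convolution engine as the fused conv kernels: the weight matrix is reshaped into a conv filter and the fill call is made with groups = 1, 1x1 strides, and zero padding, so the FC reduces to one output pixel per channel. All of these kernels also build the same per-channel bias/scale buffer; for a plain FC/mul with no fused batch-norm it is the identity transform. A hedged sketch of that layout (plain malloc stands in for fpga::fpga_malloc; the bias-first/scale-second split is inferred from the zero/one pattern in the removed kernel):

```cpp
#include <cstdlib>

// Sketch of the bs_ptr buffer handed to format_bias_scale_array above:
// `channel` biases followed by `channel` scales. Identity means bias 0,
// scale 1 -- matching bs_ptr[i] = 0 and bs_ptr[i + channel] = 1 above.
float *MakeIdentityBiasScale(int channel) {
  auto *bs_ptr =
      static_cast<float *>(std::malloc(2 * channel * sizeof(float)));
  for (int i = 0; i < channel; ++i) {
    bs_ptr[i] = 0.f;            // bias, first half of the buffer
    bs_ptr[i + channel] = 1.f;  // scale, second half
  }
  return bs_ptr;  // real code allocates with fpga::fpga_malloc
}
```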
See the License for the specific language governing permissions and limitations under the License. */ +#ifdef QUANT_OP + #pragma once #include "framework/operator.h" @@ -30,3 +32,5 @@ class QuantizeKernel } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/sum_kernel.h b/src/operators/kernel/sum_kernel.h index 669db899b542a5231d685e098cf907e0b1b650ff..ed337432e0fd4bf4035b67d4099379ce29918547 100644 --- a/src/operators/kernel/sum_kernel.h +++ b/src/operators/kernel/sum_kernel.h @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle_mobile { namespace operators { -using namespace framework; - template class SumKernel : public framework::OpKernelBase> { diff --git a/src/operators/math/conv3x3_arm_int8.cpp b/src/operators/math/conv3x3_arm_int8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..283dcb2255b43052dcaf2d622ad629e923810a82 --- /dev/null +++ b/src/operators/math/conv3x3_arm_int8.cpp @@ -0,0 +1,761 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CONV_OP + +#include "operators/math/conv_arm_int8.h" + +namespace paddle_mobile { +namespace operators { + +void conv3x3s1_int8(const framework::Tensor& input, + const framework::Tensor& weight, + framework::Tensor* output) { +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + const int8_t* in_data = input.data(); + const int8_t* w_data = weight.data(); + int32_t* out_data = output->mutable_data(); + // make sure that batch size is 1 + int input_c = input.dims()[1]; + int input_h = input.dims()[2]; + int input_w = input.dims()[3]; + int output_c = output->dims()[1]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + int image_size = input_h * input_w; + int out_image_size = output_h * output_w; + memset(out_data, 0, output_c * out_image_size * sizeof(int32_t)); +#if __aarch64__ + // TODO(hjchen2) +#else + int oc = 0; + #pragma omp parallel for + for (; oc < output_c - 1; oc += 2) { + for (int ic = 0; ic < input_c; ++ic) { + const int8_t* kernel0 = w_data + (oc * input_c + ic) * 9; + const int8_t* kernel1 = w_data + ((oc + 1) * input_c + ic) * 9; + int32_t* output0 = out_data + oc * out_image_size; + int32_t* output0n = output0 + output_w; + int32_t* output1 = out_data + (oc + 1) * out_image_size; + int32_t* output1n = output1 + output_w; + + int oh = 0; + for (; oh < output_h - 1; oh += 2) { + const int8_t* r0 = in_data + ic * image_size + oh * input_w; + const int8_t* r1 = r0 + input_w; + const int8_t* r2 = r1 + input_w; + const int8_t* r3 = r2 + input_w; + + int ow = output_w >> 3; + int remain = output_w & 0x7; + if (ow > 0) { + asm volatile( + "vld1.8 {d0}, [%[kernel0]] \n" + "ldr r5, [%[kernel0], #8] \n" + "vld1.8 {d1}, [%[kernel1]] \n" + "ldr r6, [%[kernel1], #8] \n" + + "0: \n" + "vld1.8 {d2-d3}, [%[r0]] \n" // r0 + "add %[r0], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + "vdup.s8 d6, d0[0] \n" + "vdup.s8 d7, d0[1] \n" + "vdup.s8 d8, d0[2] \n" + "vdup.s8 
d9, d1[0] \n" + "vdup.s8 d10, d1[1] \n" + "vdup.s8 d11, d1[2] \n" + "vmull.s8 q6, d2, d6 \n" + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddl.s16 q12, d12, d14 \n" + "vaddl.s16 q13, d13, d15 \n" + "vmull.s8 q6, d2, d9 \n" + "vmull.s8 q7, d4, d10 \n" + "vmlal.s8 q6, d5, d11 \n" + "vaddl.s16 q14, d12, d14 \n" + "vaddl.s16 q15, d13, d15 \n" + + "vld1.8 {d2-d3}, [%[r1]] \n" // r1 + "add %[r1], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + + "vmull.s8 q6, d2, d6 \n" // next row + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddl.s16 q8, d12, d14 \n" + "vaddl.s16 q9, d13, d15 \n" + "vmull.s8 q6, d2, d9 \n" + "vmull.s8 q7, d4, d10 \n" + "vmlal.s8 q6, d5, d11 \n" + "vaddl.s16 q10, d12, d14 \n" + "vaddl.s16 q11, d13, d15 \n" + + "vdup.s8 d6, d0[3] \n" + "vdup.s8 d7, d0[4] \n" + "vdup.s8 d8, d0[5] \n" + "vdup.s8 d9, d1[3] \n" + "vdup.s8 d10, d1[4] \n" + "vdup.s8 d11, d1[5] \n" + "vmull.s8 q6, d2, d6 \n" + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddw.s16 q12, q12, d12 \n" + "vaddw.s16 q13, q13, d13 \n" + "vaddw.s16 q12, q12, d14 \n" + "vaddw.s16 q13, q13, d15 \n" + "vmull.s8 q6, d2, d9 \n" + "vmull.s8 q7, d4, d10 \n" + "vmlal.s8 q6, d5, d11 \n" + "vaddw.s16 q14, q14, d12 \n" + "vaddw.s16 q15, q15, d13 \n" + "vaddw.s16 q14, q14, d14 \n" + "vaddw.s16 q15, q15, d15 \n" + + "vld1.8 {d2-d3}, [%[r2]] \n" // r2 + "add %[r2], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + + "vmull.s8 q6, d2, d6 \n" // next row + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddw.s16 q8, q8, d12 \n" + "vaddw.s16 q8, q8, d14 \n" + "vaddw.s16 q9, q9, d13 \n" + "vaddw.s16 q9, q9, d15 \n" + "vmull.s8 q6, d2, d9 \n" + "vmull.s8 q7, d4, d10 \n" + "vmlal.s8 q6, d5, d11 \n" + "vaddw.s16 q10, q10, d12 \n" + "vaddw.s16 q11, q11, d13 \n" + "vaddw.s16 q10, q10, d14 \n" + "vaddw.s16 q11, q11, d15 \n" + + "vdup.s8 d6, d0[6] \n" + "vdup.s8 d7, d0[7] \n" + "vdup.s8 d8, r5 \n" + "vdup.s8 d9, d1[6] \n" + "vdup.s8 d10, d1[7] \n" + "vdup.s8 d11, r6 \n" + "vmull.s8 q6, d2, d6 \n" + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddw.s16 q12, q12, d12 \n" + "vaddw.s16 q13, q13, d13 \n" + "vaddw.s16 q12, q12, d14 \n" + "vaddw.s16 q13, q13, d15 \n" + + "vld1.32 {d12-d15}, [%[output0]] \n" + "vadd.s32 q6, q6, q12 \n" + "vadd.s32 q7, q7, q13 \n" + "vst1.32 {d12-d15}, [%[output0]]! \n" + + "vmull.s8 q6, d2, d9 \n" + "vmull.s8 q7, d4, d10 \n" + "vmlal.s8 q6, d5, d11 \n" + "vaddw.s16 q14, q14, d12 \n" + "vaddw.s16 q15, q15, d13 \n" + "vaddw.s16 q14, q14, d14 \n" + "vaddw.s16 q15, q15, d15 \n" + + "vld1.32 {d12-d15}, [%[output1]] \n" + "vadd.s32 q6, q6, q14 \n" + "vadd.s32 q7, q7, q15 \n" + "vst1.32 {d12-d15}, [%[output1]]! \n" + + "vld1.8 {d2-d3}, [%[r3]] \n" // r3 + "add %[r3], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + + "vmull.s8 q6, d2, d6 \n" // next row + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddw.s16 q8, q8, d12 \n" + "vaddw.s16 q9, q9, d15 \n" + "vaddw.s16 q8, q8, d14 \n" + "vaddw.s16 q9, q9, d13 \n" + + "vld1.32 {d12-d15}, [%[output0n]] \n" + "vadd.s32 q6, q6, q8 \n" + "vadd.s32 q7, q7, q9 \n" + "vst1.32 {d12-d15}, [%[output0n]]! \n" + + "vmull.s8 q6, d2, d9 \n" + "vmull.s8 q7, d4, d10 \n" + "vmlal.s8 q6, d5, d11 \n" + "vaddw.s16 q10, q10, d12 \n" + "vaddw.s16 q11, q11, d15 \n" + "vaddw.s16 q10, q10, d14 \n" + "vaddw.s16 q11, q11, d13 \n" + + "vld1.32 {d12-d15}, [%[output1n]] \n" + "vadd.s32 q6, q6, q10 \n" + "vadd.s32 q7, q7, q11 \n" + "vst1.32 {d12-d15}, [%[output1n]]! 
\n" + + "subs %[ow], #1 \n" + "bne 0b \n" + : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), + [ow] "+r"(ow), [output0] "+r"(output0), [output1] "+r"(output1), + [output0n] "+r"(output0n), [output1n] "+r"(output1n) + : [kernel0] "r"(kernel0), [kernel1] "r"(kernel1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r5", + "r6"); + } + if (remain > 0) { + asm volatile( + "vld1.8 {d0}, [%[kernel0]] \n" + "ldr r5, [%[kernel0], #8] \n" + "vld1.8 {d1}, [%[kernel1]] \n" + "ldr r6, [%[kernel1], #8] \n" + + "0: \n" + "vld1.8 d4, [%[r0]] \n" + "vld1.8 d5, [%[r1]] \n" + "vld1.8 d6, [%[r2]] \n" + "vld1.8 d7, [%[r3]] \n" + "add %[r0], #1 \n" + "add %[r1], #1 \n" + "add %[r2], #1 \n" + "add %[r3], #1 \n" + "vdup.s8 d2, r5 \n" + "vdup.s8 d3, r6 \n" + "vext.8 d8, d0, d2, #3 \n" + "vext.8 d9, d0, d2, #6 \n" + "vext.8 d10, d1, d3, #3 \n" + "vext.8 d11, d1, d3, #6 \n" + + "vmull.s8 q6, d4, d0 \n" + "vmull.s8 q7, d5, d8 \n" + "vmlal.s8 q6, d6, d9 \n" + "vaddl.s16 q12, d12, d14 \n" + "vdup.s32 d2, d24[1] \n" + "vadd.s32 d24, d24, d2 \n" + "vadd.s32 d24, d24, d25 \n" + "vmull.s8 q6, d4, d1 \n" + "vmull.s8 q7, d5, d10 \n" + "vmlal.s8 q6, d6, d11 \n" + "vaddl.s16 q13, d12, d14 \n" + "vdup.s32 d2, d26[1] \n" + "vadd.s32 d26, d26, d2 \n" + "vadd.s32 d26, d26, d27 \n" + + "ldr r7, [%[output0]] \n" + "vdup.s32 d14, r7 \n" + "vadd.s32 d14, d14, d24 \n" + "vst1.32 d14[0], [%[output0]]! \n" + "ldr r7, [%[output1]] \n" + "vdup.s32 d14, r7 \n" + "vadd.s32 d14, d14, d26 \n" + "vst1.32 d14[0], [%[output1]]! \n" + + "vmull.s8 q6, d5, d0 \n" + "vmull.s8 q7, d6, d8 \n" + "vmlal.s8 q6, d7, d9 \n" + "vaddl.s16 q12, d12, d14 \n" + "vdup.s32 d2, d24[1] \n" + "vadd.s32 d24, d24, d2 \n" + "vadd.s32 d24, d24, d25 \n" + "vmull.s8 q6, d5, d1 \n" + "vmull.s8 q7, d6, d10 \n" + "vmlal.s8 q6, d7, d11 \n" + "vaddl.s16 q13, d12, d14 \n" + "vdup.s32 d2, d26[1] \n" + "vadd.s32 d26, d26, d2 \n" + "vadd.s32 d26, d26, d27 \n" + + "ldr r7, [%[output0n]] \n" + "vdup.s32 d14, r7 \n" + "vadd.s32 d14, d14, d24 \n" + "vst1.32 d14[0], [%[output0n]]! \n" + "ldr r7, [%[output1n]] \n" + "vdup.s32 d14, r7 \n" + "vadd.s32 d14, d14, d26 \n" + "vst1.32 d14[0], [%[output1n]]! 
\n" + + "subs %[remain], #1 \n" + "bne 0b \n" + : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), + [remain] "+r"(remain), [output0] "+r"(output0), + [output1] "+r"(output1), [output0n] "+r"(output0n), + [output1n] "+r"(output1n) + : [kernel0] "r"(kernel0), [kernel1] "r"(kernel1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "r5", "r6", "r7"); + } + output0 += output_w; + output1 += output_w; + output0n += output_w; + output1n += output_w; + } + // remain output height + for (; oh < output_h; ++oh) { + const int8_t* r0 = in_data + ic * image_size + oh * input_w; + const int8_t* r1 = r0 + input_w; + const int8_t* r2 = r1 + input_w; + const int8_t* r3 = r2 + input_w; + const int8_t* r4 = r3 + input_w; + + int ow = output_w >> 3; + int remain = output_w & 0x7; + if (ow > 0) { + asm volatile( + "vld1.8 {d0}, [%[kernel0]] \n" + "ldr r5, [%[kernel0], #8] \n" + "vld1.8 {d1}, [%[kernel1]] \n" + "ldr r6, [%[kernel1], #8] \n" + + "0: \n" + "vld1.8 {d2-d3}, [%[r0]] \n" // r0 + "add %[r0], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + "vdup.s8 d6, d0[0] \n" + "vdup.s8 d7, d0[1] \n" + "vdup.s8 d8, d0[2] \n" + "vdup.s8 d9, d1[0] \n" + "vdup.s8 d10, d1[1] \n" + "vdup.s8 d11, d1[2] \n" + "vmull.s8 q6, d2, d6 \n" + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddl.s16 q12, d12, d14 \n" + "vaddl.s16 q13, d13, d15 \n" + "vmull.s8 q6, d2, d9 \n" + "vmull.s8 q7, d4, d10 \n" + "vmlal.s8 q6, d5, d11 \n" + "vaddl.s16 q14, d12, d14 \n" + "vaddl.s16 q15, d13, d15 \n" + + "vld1.8 {d2-d3}, [%[r1]] \n" // r1 + "add %[r1], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + "vdup.s8 d6, d0[3] \n" + "vdup.s8 d7, d0[4] \n" + "vdup.s8 d8, d0[5] \n" + "vdup.s8 d9, d1[3] \n" + "vdup.s8 d10, d1[4] \n" + "vdup.s8 d11, d1[5] \n" + "vmull.s8 q6, d2, d6 \n" + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddw.s16 q12, q12, d12 \n" + "vaddw.s16 q12, q12, d14 \n" + "vaddw.s16 q13, q13, d13 \n" + "vaddw.s16 q13, q13, d15 \n" + "vmull.s8 q6, d2, d9 \n" + "vmull.s8 q7, d4, d10 \n" + "vmlal.s8 q6, d5, d11 \n" + "vaddw.s16 q14, q14, d12 \n" + "vaddw.s16 q14, q14, d14 \n" + "vaddw.s16 q15, q15, d13 \n" + "vaddw.s16 q15, q15, d15 \n" + + "vld1.8 {d2-d3}, [%[r2]] \n" // r2 + "add %[r2], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + "vdup.s8 d6, d0[6] \n" + "vdup.s8 d7, d0[7] \n" + "vdup.s8 d8, r5 \n" + "vdup.s8 d9, d1[6] \n" + "vdup.s8 d10, d1[7] \n" + "vdup.s8 d11, r6 \n" + "vmull.s8 q6, d2, d6 \n" + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddw.s16 q12, q12, d12 \n" + "vaddw.s16 q12, q12, d14 \n" + "vaddw.s16 q13, q13, d13 \n" + "vaddw.s16 q13, q13, d15 \n" + "vmull.s8 q6, d2, d9 \n" + "vmull.s8 q7, d4, d10 \n" + "vmlal.s8 q6, d5, d11 \n" + "vaddw.s16 q14, q14, d12 \n" + "vaddw.s16 q14, q14, d14 \n" + "vaddw.s16 q15, q15, d13 \n" + "vaddw.s16 q15, q15, d15 \n" + + "vld1.32 {d12-d15}, [%[output0]] \n" + "vadd.s32 q6, q6, q12 \n" + "vadd.s32 q7, q7, q13 \n" + "vst1.32 {d12-d15}, [%[output0]]! \n" + "vld1.32 {d12-d15}, [%[output1]] \n" + "vadd.s32 q6, q6, q14 \n" + "vadd.s32 q7, q7, q15 \n" + "vst1.32 {d12-d15}, [%[output1]]! 
\n" + + "subs %[ow], #1 \n" + "bne 0b \n" + : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [ow] "+r"(ow), + [output0] "+r"(output0), [output1] "+r"(output1) + : [kernel0] "r"(kernel0), [kernel1] "r"(kernel1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r5", + "r6"); + } + + if (remain > 0) { + asm volatile( + "vld1.8 {d0}, [%[kernel0]] \n" + "ldr r5, [%[kernel0], #8] \n" + "vld1.8 {d1}, [%[kernel1]] \n" + "ldr r6, [%[kernel1], #8] \n" + + "0: \n" + "vld1.8 d4, [%[r0]] \n" + "vld1.8 d5, [%[r1]] \n" + "vld1.8 d6, [%[r2]] \n" + "add %[r0], #1 \n" + "add %[r1], #1 \n" + "add %[r2], #1 \n" + "vdup.s8 d2, r5 \n" + "vdup.s8 d3, r6 \n" + "vext.8 d8, d0, d2, #3 \n" + "vext.8 d9, d0, d2, #6 \n" + "vext.8 d10, d1, d3, #3 \n" + "vext.8 d11, d1, d3, #6 \n" + + "vmull.s8 q6, d4, d0 \n" + "vmull.s8 q7, d5, d8 \n" + "vmlal.s8 q6, d6, d9 \n" + "vaddl.s16 q12, d12, d14 \n" + "vdup.s32 d2, d24[1] \n" + "vadd.s32 d24, d24, d2 \n" + "vadd.s32 d24, d24, d25 \n" + "vmull.s8 q6, d4, d1 \n" + "vmull.s8 q7, d5, d10 \n" + "vmlal.s8 q6, d6, d11 \n" + "vaddl.s16 q13, d12, d14 \n" + "vdup.s32 d2, d26[1] \n" + "vadd.s32 d26, d26, d2 \n" + "vadd.s32 d26, d26, d27 \n" + + "ldr r7, [%[output0]] \n" + "vdup.s32 d14, r7 \n" + "vadd.s32 d14, d14, d24 \n" + "vst1.32 d14[0], [%[output0]]! \n" + "ldr r7, [%[output1]] \n" + "vdup.s32 d14, r7 \n" + "vadd.s32 d14, d14, d26 \n" + "vst1.32 d14[0], [%[output1]]! \n" + + "subs %[remain], #1 \n" + "bne 0b \n" + : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), + [remain] "+r"(remain), [output0] "+r"(output0), + [output1] "+r"(output1) + : [kernel0] "r"(kernel0), [kernel1] "r"(kernel1) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "r5", "r6", "r7"); + } + } + } + } + + for (; oc < output_c; ++oc) { + for (int ic = 0; ic < input_c; ++ic) { + const int8_t* kernel0 = w_data + (oc * input_c + ic) * 9; + int32_t* output0 = out_data + oc * out_image_size; + int32_t* output0n = output0 + output_w; + + int oh = 0; + for (; oh < output_h - 1; oh += 2) { + const int8_t* r0 = in_data + ic * image_size + oh * input_w; + const int8_t* r1 = r0 + input_w; + const int8_t* r2 = r1 + input_w; + const int8_t* r3 = r2 + input_w; + + int ow = output_w >> 3; + int remain = output_w & 0x7; + if (ow > 0) { + asm volatile( + "vld1.8 {d0}, [%[kernel0]] \n" + "ldr r5, [%[kernel0], #8] \n" + + "0: \n" + "vld1.8 {d2-d3}, [%[r0]] \n" // r0 + "add %[r0], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + "vdup.s8 d6, d0[0] \n" + "vdup.s8 d7, d0[1] \n" + "vdup.s8 d8, d0[2] \n" + "vmull.s8 q6, d2, d6 \n" + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddl.s16 q12, d12, d14 \n" + "vaddl.s16 q13, d13, d15 \n" + + "vld1.8 {d2-d3}, [%[r1]] \n" // r1 + "add %[r1], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + + "vmull.s8 q6, d2, d6 \n" // next row + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddl.s16 q8, d12, d14 \n" + "vaddl.s16 q9, d13, d15 \n" + + "vdup.s8 d6, d0[3] \n" + "vdup.s8 d7, d0[4] \n" + "vdup.s8 d8, d0[5] \n" + "vmull.s8 q6, d2, d6 \n" + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddw.s16 q12, q12, d12 \n" + "vaddw.s16 q12, q12, d14 \n" + "vaddw.s16 q13, q13, d13 \n" + "vaddw.s16 q13, q13, d15 \n" + + "vld1.8 {d2-d3}, [%[r2]] \n" // r2 + "add %[r2], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + + "vmull.s8 q6, d2, d6 \n" // next row + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddw.s16 
q8, q8, d12 \n" + "vaddw.s16 q8, q8, d14 \n" + "vaddw.s16 q9, q9, d13 \n" + "vaddw.s16 q9, q9, d15 \n" + + "vdup.s8 d6, d0[6] \n" + "vdup.s8 d7, d0[7] \n" + "vdup.s8 d8, r5 \n" + "vmull.s8 q6, d2, d6 \n" + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddw.s16 q12, q12, d12 \n" + "vaddw.s16 q12, q12, d14 \n" + "vaddw.s16 q13, q13, d13 \n" + "vaddw.s16 q13, q13, d15 \n" + + "vld1.32 {d12-d15}, [%[output0]] \n" + "vadd.s32 q6, q6, q12 \n" + "vadd.s32 q7, q7, q13 \n" + "vst1.32 {d12-d15}, [%[output0]]! \n" + + "vld1.8 {d2-d3}, [%[r3]] \n" // r3 + "add %[r3], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + + "vmull.s8 q6, d2, d6 \n" // next row + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddw.s16 q8, q8, d12 \n" + "vaddw.s16 q8, q8, d14 \n" + "vaddw.s16 q9, q9, d13 \n" + "vaddw.s16 q9, q9, d15 \n" + + "vld1.32 {d12-d15}, [%[output0n]] \n" + "vadd.s32 q6, q6, q8 \n" + "vadd.s32 q7, q7, q9 \n" + "vst1.32 {d12-d15}, [%[output0n]]! \n" + + "subs %[ow], #1 \n" + "bne 0b \n" + : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), + [ow] "+r"(ow), [output0] "+r"(output0), + [output0n] "+r"(output0n) + : [kernel0] "r"(kernel0) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r5"); + } + if (remain > 0) { + asm volatile( + "vld1.8 {d0}, [%[kernel0]] \n" + "ldr r5, [%[kernel0], #8] \n" + + "0: \n" + "vld1.8 d4, [%[r0]] \n" + "vld1.8 d5, [%[r1]] \n" + "vld1.8 d6, [%[r2]] \n" + "vld1.8 d7, [%[r3]] \n" + "add %[r0], #1 \n" + "add %[r1], #1 \n" + "add %[r2], #1 \n" + "add %[r3], #1 \n" + "vdup.s8 d2, r5 \n" + "vext.8 d8, d0, d2, #3 \n" + "vext.8 d9, d0, d2, #6 \n" + + "vmull.s8 q6, d4, d0 \n" + "vmull.s8 q7, d5, d8 \n" + "vmlal.s8 q6, d6, d9 \n" + "vaddl.s16 q12, d12, d14 \n" + "vdup.s32 d2, d24[1] \n" + "vadd.s32 d24, d24, d2 \n" + "vadd.s32 d24, d24, d25 \n" + + "ldr r7, [%[output0]] \n" + "vdup.s32 d14, r7 \n" + "vadd.s32 d14, d14, d24 \n" + "vst1.32 d14[0], [%[output0]]! \n" + + "vmull.s8 q6, d5, d0 \n" + "vmull.s8 q7, d6, d8 \n" + "vmlal.s8 q6, d7, d9 \n" + "vaddl.s16 q12, d12, d14 \n" + "vdup.s32 d2, d24[1] \n" + "vadd.s32 d24, d24, d2 \n" + "vadd.s32 d24, d24, d25 \n" + + "ldr r7, [%[output0n]] \n" + "vdup.s32 d14, r7 \n" + "vadd.s32 d14, d14, d24 \n" + "vst1.32 d14[0], [%[output0n]]! 
\n" + + "subs %[remain], #1 \n" + "bne 0b \n" + : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), + [remain] "+r"(remain), [output0] "+r"(output0), + [output0n] "+r"(output0n) + : [kernel0] "r"(kernel0) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "r5", "r7"); + } + output0 += output_w; + output0n += output_w; + } + // remain output height + for (; oh < output_h; ++oh) { + const int8_t* r0 = in_data + ic * image_size + oh * input_w; + const int8_t* r1 = r0 + input_w; + const int8_t* r2 = r1 + input_w; + + int ow = output_w >> 3; + int remain = output_w & 0x7; + if (ow > 0) { + asm volatile( + "vld1.8 {d0}, [%[kernel0]] \n" + "ldr r5, [%[kernel0], #8] \n" + + "0: \n" + "vld1.8 {d2-d3}, [%[r0]] \n" // r0 + "add %[r0], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + "vdup.s8 d6, d0[0] \n" + "vdup.s8 d7, d0[1] \n" + "vdup.s8 d8, d0[2] \n" + "vmull.s8 q6, d2, d6 \n" + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddl.s16 q12, d12, d14 \n" + "vaddl.s16 q13, d13, d15 \n" + + "vld1.8 {d2-d3}, [%[r1]] \n" // r1 + "add %[r1], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + "vdup.s8 d6, d0[3] \n" + "vdup.s8 d7, d0[4] \n" + "vdup.s8 d8, d0[5] \n" + "vmull.s8 q6, d2, d6 \n" + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddw.s16 q12, q12, d12 \n" + "vaddw.s16 q12, q12, d14 \n" + "vaddw.s16 q13, q13, d13 \n" + "vaddw.s16 q13, q13, d15 \n" + + "vld1.8 {d2-d3}, [%[r2]] \n" // r2 + "add %[r2], #8 \n" + "vext.8 d4, d2, d3, #1 \n" + "vext.8 d5, d2, d3, #2 \n" + "vdup.s8 d6, d0[6] \n" + "vdup.s8 d7, d0[7] \n" + "vdup.s8 d8, r5 \n" + "vmull.s8 q6, d2, d6 \n" + "vmull.s8 q7, d4, d7 \n" + "vmlal.s8 q6, d5, d8 \n" + "vaddw.s16 q12, q12, d12 \n" + "vaddw.s16 q12, q12, d14 \n" + "vaddw.s16 q13, q13, d13 \n" + "vaddw.s16 q13, q13, d15 \n" + + "vld1.32 {d12-d15}, [%[output0]] \n" + "vadd.s32 q6, q6, q12 \n" + "vadd.s32 q7, q7, q13 \n" + "vst1.32 {d12-d15}, [%[output0]]! \n" + + "subs %[ow], #1 \n" + "bne 0b \n" + : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [ow] "+r"(ow), + [output0] "+r"(output0) + : [kernel0] "r"(kernel0) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r5"); + } + + if (remain > 0) { + asm volatile( + "vld1.8 {d0}, [%[kernel0]] \n" + "ldr r5, [%[kernel0], #8] \n" + + "0: \n" + "vld1.8 d4, [%[r0]] \n" + "vld1.8 d5, [%[r1]] \n" + "vld1.8 d6, [%[r2]] \n" + "add %[r0], #1 \n" + "add %[r1], #1 \n" + "add %[r2], #1 \n" + "vdup.s8 d2, r5 \n" + "vext.8 d8, d0, d2, #3 \n" + "vext.8 d9, d0, d2, #6 \n" + + "vmull.s8 q6, d4, d0 \n" + "vmull.s8 q7, d5, d8 \n" + "vmlal.s8 q6, d6, d9 \n" + "vaddl.s16 q12, d12, d14 \n" + "vdup.s32 d2, d24[1] \n" + "vadd.s32 d24, d24, d2 \n" + "vadd.s32 d24, d24, d25 \n" + + "ldr r7, [%[output0]] \n" + "vdup.s32 d14, r7 \n" + "vadd.s32 d14, d14, d24 \n" + "vst1.32 d14[0], [%[output0]]! 
\n" + + "subs %[remain], #1 \n" + "bne 0b \n" + : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), + [remain] "+r"(remain), [output0] "+r"(output0) + : [kernel0] "r"(kernel0) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "r5", "r7"); + } + } + } + } +#endif +#else +// TODO(hjchen2) +#endif +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/math/conv5x5_arm_int8.cpp b/src/operators/math/conv5x5_arm_int8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c861c22d184d5428f3ab9c8f3a69b9aca5b697bd --- /dev/null +++ b/src/operators/math/conv5x5_arm_int8.cpp @@ -0,0 +1,551 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CONV_OP + +#include "operators/math/conv_arm_int8.h" + +namespace paddle_mobile { +namespace operators { + +void conv5x5s1_int8(const framework::Tensor& input, + const framework::Tensor& weight, + framework::Tensor* output) { +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + const int8_t* in_data = input.data(); + const int8_t* w_data = weight.data(); + int32_t* out_data = output->mutable_data(); + // make sure that batch size is 1 + int input_c = input.dims()[1]; + int input_h = input.dims()[2]; + int input_w = input.dims()[3]; + int output_c = output->dims()[1]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + int image_size = input_h * input_w; + int out_image_size = output_h * output_w; + memset(out_data, 0, output_c * out_image_size * sizeof(int32_t)); +#if __aarch64__ + // TODO(hjchen2) +#else + #pragma omp parallel for + for (int oc = 0; oc < output_c; ++oc) { + for (int ic = 0; ic < input_c; ++ic) { + const int8_t* kernel = w_data + (oc * input_c + ic) * 25; + int32_t* output0 = out_data + oc * out_image_size; + int32_t* output1 = output0 + output_w; + int oh = 0; + for (; oh < output_h - 1; oh += 2) { + const int8_t* r0 = in_data + ic * image_size + oh * input_w; + const int8_t* r1 = r0 + input_w; + const int8_t* r2 = r1 + input_w; + const int8_t* r3 = r2 + input_w; + const int8_t* r4 = r3 + input_w; + const int8_t* r5 = r4 + input_w; + + int ow = output_w >> 3; + int remain = output_w & 0x7; + if (ow > 0) { + asm volatile("vld1.8 {d0-d3}, [%[kernel]] \n" + : [kernel] "+r"(kernel) + : + : "cc", "memory", "q0", "q1"); + asm volatile( + "0: \n" + "vld1.8 {d4-d5}, [%[r0]] \n" // r0 + "add %[r0], #8 \n" + "vext.8 d6, d4, d5, #1 \n" + "vext.8 d7, d4, d5, #2 \n" + "vext.8 d8, d4, d5, #3 \n" + "vext.8 d9, d4, d5, #4 \n" + "vdup.s8 d10, d0[0] \n" + "vdup.s8 d11, d0[1] \n" + "vdup.s8 d12, d0[2] \n" + "vdup.s8 d13, d0[3] \n" + "vdup.s8 d14, d0[4] \n" + "vmull.s8 q8, d4, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q14, d16, d18 \n" + "vaddl.s16 q15, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q14, q14, d16 \n" + "vaddw.s16 q15, q15, d17 \n" + + "vld1.8 {d4-d5}, [%[r1]] \n" // r1 + "add %[r1], #8 \n" + "vext.8 d6, d4, d5, #1 
\n" + "vext.8 d7, d4, d5, #2 \n" + "vext.8 d8, d4, d5, #3 \n" + "vext.8 d9, d4, d5, #4 \n" + + "vmull.s8 q8, d4, d10 \n" // next row + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q10, d16, d18 \n" + "vaddl.s16 q11, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q10, q10, d16 \n" + "vaddw.s16 q11, q11, d17 \n" + + "vdup.s8 d10, d0[5] \n" + "vdup.s8 d11, d0[6] \n" + "vdup.s8 d12, d0[7] \n" + "vdup.s8 d13, d1[0] \n" + "vdup.s8 d14, d1[1] \n" + "vmull.s8 q8, d4, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q12, d16, d18 \n" + "vaddl.s16 q13, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q12, q12, d16 \n" + "vaddw.s16 q13, q13, d17 \n" + "vadd.s32 q14, q14, q12 \n" + "vadd.s32 q15, q15, q13 \n" + + "vld1.8 {d4-d5}, [%[r2]] \n" // r2 + "add %[r2], #8 \n" + "vext.8 d6, d4, d5, #1 \n" + "vext.8 d7, d4, d5, #2 \n" + "vext.8 d8, d4, d5, #3 \n" + "vext.8 d9, d4, d5, #4 \n" + + "vmull.s8 q8, d4, d10 \n" // next row + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q12, d16, d18 \n" + "vaddl.s16 q13, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q12, q12, d16 \n" + "vaddw.s16 q13, q13, d17 \n" + "vadd.s32 q10, q10, q12 \n" + "vadd.s32 q11, q11, q13 \n" + + "vdup.s8 d10, d1[2] \n" + "vdup.s8 d11, d1[3] \n" + "vdup.s8 d12, d1[4] \n" + "vdup.s8 d13, d1[5] \n" + "vdup.s8 d14, d1[6] \n" + "vmull.s8 q8, d4, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q12, d16, d18 \n" + "vaddl.s16 q13, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q12, q12, d16 \n" + "vaddw.s16 q13, q13, d17 \n" + "vadd.s32 q14, q14, q12 \n" + "vadd.s32 q15, q15, q13 \n" + + "vld1.8 {d4-d5}, [%[r3]] \n" // r3 + "add %[r3], #8 \n" + "vext.8 d6, d4, d5, #1 \n" + "vext.8 d7, d4, d5, #2 \n" + "vext.8 d8, d4, d5, #3 \n" + "vext.8 d9, d4, d5, #4 \n" + + "vmull.s8 q8, d4, d10 \n" // next row + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q12, d16, d18 \n" + "vaddl.s16 q13, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q12, q12, d16 \n" + "vaddw.s16 q13, q13, d17 \n" + "vadd.s32 q10, q10, q12 \n" + "vadd.s32 q11, q11, q13 \n" + + "vdup.s8 d10, d1[7] \n" + "vdup.s8 d11, d2[0] \n" + "vdup.s8 d12, d2[1] \n" + "vdup.s8 d13, d2[2] \n" + "vdup.s8 d14, d2[3] \n" + "vmull.s8 q8, d4, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q12, d16, d18 \n" + "vaddl.s16 q13, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q12, q12, d16 \n" + "vaddw.s16 q13, q13, d17 \n" + "vadd.s32 q14, q14, q12 \n" + "vadd.s32 q15, q15, q13 \n" + + "vld1.8 {d4-d5}, [%[r4]] \n" // r4 + "add %[r4], #8 \n" + "vext.8 d6, d4, d5, #1 \n" + "vext.8 d7, d4, d5, #2 \n" + "vext.8 d8, d4, d5, #3 \n" + "vext.8 d9, d4, d5, #4 \n" + + "vmull.s8 q8, d4, d10 \n" // next row + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q12, d16, d18 \n" + "vaddl.s16 q13, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q12, q12, d16 \n" + "vaddw.s16 q13, q13, d17 \n" + "vadd.s32 q10, q10, q12 \n" + "vadd.s32 q11, q11, q13 \n" + + "vdup.s8 d10, d2[4] \n" + "vdup.s8 d11, d2[5] \n" + "vdup.s8 d12, d2[6] \n" + "vdup.s8 d13, d2[7] \n" + "vdup.s8 d14, d3[0] \n" + "vmull.s8 q8, d4, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q12, d16, d18 \n" 
+ "vaddl.s16 q13, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q12, q12, d16 \n" + "vaddw.s16 q13, q13, d17 \n" + "vadd.s32 q14, q14, q12 \n" + "vadd.s32 q15, q15, q13 \n" + + "vld1.32 {d24-d27}, [%[output0]] \n" + "vadd.s32 q12, q12, q14 \n" + "vadd.s32 q13, q13, q15 \n" + "vst1.32 {d24-d27}, [%[output0]]! \n" + + "vld1.8 {d4-d5}, [%[r5]] \n" // row 5 + "add %[r5], #8 \n" + "vext.8 d6, d4, d5, #1 \n" + "vext.8 d7, d4, d5, #2 \n" + "vext.8 d8, d4, d5, #3 \n" + "vext.8 d9, d4, d5, #4 \n" + "vmull.s8 q8, d4, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q12, d16, d18 \n" + "vaddl.s16 q13, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q12, q12, d16 \n" + "vaddw.s16 q13, q13, d17 \n" + "vadd.s32 q10, q10, q12 \n" + "vadd.s32 q11, q11, q13 \n" + + "vld1.32 {d24-d27}, [%[output1]] \n" + "vadd.s32 q12, q12, q10 \n" + "vadd.s32 q13, q13, q11 \n" + "vst1.32 {d24-d27}, [%[output1]]! \n" + + "subs %[ow], #1 \n" + "bne 0b \n" + : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), + [r4] "+r"(r4), [r5] "+r"(r5), [ow] "+r"(ow), + [output0] "+r"(output0), [output1] "+r"(output1) + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } + if (remain > 0) { + asm volatile("vld1.8 {d0-d3}, [%[kernel]] \n" + : [kernel] "+r"(kernel) + : + : "cc", "memory", "q0", "q1"); + asm volatile( + "0: \n" + "vld1.8 d4, [%[r0]] \n" + "vld1.8 d5, [%[r1]] \n" + "vld1.8 d6, [%[r2]] \n" + "vld1.8 d7, [%[r3]] \n" + "vld1.8 d8, [%[r4]] \n" + "vld1.8 d9, [%[r5]] \n" + "add %[r0], #1 \n" + "add %[r1], #1 \n" + "add %[r2], #1 \n" + "add %[r3], #1 \n" + "add %[r4], #1 \n" + "add %[r5], #1 \n" + "vext.8 d10, d0, d1, #5 \n" + "vext.8 d11, d1, d2, #2 \n" + "vext.8 d12, d1, d2, #7 \n" + "vext.8 d13, d2, d3, #4 \n" + + "vmull.s8 q7, d4, d0 \n" + "vmull.s8 q8, d5, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q10, d14, d16 \n" + "vaddw.s16 q10, q10, d18 \n" + "vadd.s32 d4, d20, d21 \n" + "vaddl.s16 q10, d15, d17 \n" + "vaddw.s16 q10, q10, d19 \n" + "vdup.s32 d14, d4[0] \n" + "vdup.s32 d15, d4[1] \n" + "vadd.s32 d15, d15, d14 \n" + "vdup.s32 d14, d20[0] \n" + "vadd.s32 d15, d15, d14 \n" + + "ldr r6, [%[output0]] \n" + "vdup.s32 d14, r6 \n" + "vadd.s32 d15, d15, d14 \n" + "vst1.32 d15[0], [%[output0]]! \n" + + "vmull.s8 q7, d5, d0 \n" + "vmull.s8 q8, d6, d10 \n" + "vmull.s8 q9, d7, d11 \n" + "vmlal.s8 q8, d8, d12 \n" + "vmlal.s8 q9, d9, d13 \n" + "vaddl.s16 q10, d14, d16 \n" + "vaddw.s16 q10, q10, d18 \n" + "vadd.s32 d4, d20, d21 \n" + "vaddl.s16 q10, d15, d17 \n" + "vaddw.s16 q10, q10, d19 \n" + "vdup.s32 d14, d4[0] \n" + "vdup.s32 d15, d4[1] \n" + "vadd.s32 d15, d15, d14 \n" + "vdup.s32 d14, d20[0] \n" + "vadd.s32 d15, d15, d14 \n" + + "ldr r6, [%[output1]] \n" + "vdup.s32 d14, r6 \n" + "vadd.s32 d15, d15, d14 \n" + "vst1.32 d15[0], [%[output1]]! 
\n" + + "subs %[remain], #1 \n" + "bne 0b \n" + : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), + [r4] "+r"(r4), [r5] "+r"(r5), [remain] "+r"(remain), + [output0] "+r"(output0), [output1] "+r"(output1) + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "r6"); + } + output0 += output_w; + output1 += output_w; + } + // remain output height + for (; oh < output_h; ++oh) { + const int8_t* r0 = in_data + ic * image_size + oh * input_w; + const int8_t* r1 = r0 + input_w; + const int8_t* r2 = r1 + input_w; + const int8_t* r3 = r2 + input_w; + const int8_t* r4 = r3 + input_w; + + int ow = output_w >> 3; + int remain = output_w & 0x7; + if (ow > 0) { + asm volatile("vld1.8 {d0-d3}, [%[kernel]] \n" + : [kernel] "+r"(kernel) + : + : "cc", "memory", "q0", "q1"); + asm volatile( + "0: \n" + "vld1.8 {d4-d5}, [%[r0]] \n" // r0 + "add %[r0], #8 \n" + "vext.8 d6, d4, d5, #1 \n" + "vext.8 d7, d4, d5, #2 \n" + "vext.8 d8, d4, d5, #3 \n" + "vext.8 d9, d4, d5, #4 \n" + "vdup.s8 d10, d0[0] \n" + "vdup.s8 d11, d0[1] \n" + "vdup.s8 d12, d0[2] \n" + "vdup.s8 d13, d0[3] \n" + "vdup.s8 d14, d0[4] \n" + "vmull.s8 q8, d4, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q14, d16, d18 \n" + "vaddl.s16 q15, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q14, q14, d16 \n" + "vaddw.s16 q15, q15, d17 \n" + + "vld1.8 {d4-d5}, [%[r1]] \n" // r1 + "add %[r1], #8 \n" + "vext.8 d6, d4, d5, #1 \n" + "vext.8 d7, d4, d5, #2 \n" + "vext.8 d8, d4, d5, #3 \n" + "vext.8 d9, d4, d5, #4 \n" + "vdup.s8 d10, d0[5] \n" + "vdup.s8 d11, d0[6] \n" + "vdup.s8 d12, d0[7] \n" + "vdup.s8 d13, d1[0] \n" + "vdup.s8 d14, d1[1] \n" + "vmull.s8 q8, d4, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q12, d16, d18 \n" + "vaddl.s16 q13, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q12, q12, d16 \n" + "vaddw.s16 q13, q13, d17 \n" + "vadd.s32 q14, q14, q12 \n" + "vadd.s32 q15, q15, q13 \n" + + "vld1.8 {d4-d5}, [%[r2]] \n" // r2 + "add %[r2], #8 \n" + "vext.8 d6, d4, d5, #1 \n" + "vext.8 d7, d4, d5, #2 \n" + "vext.8 d8, d4, d5, #3 \n" + "vext.8 d9, d4, d5, #4 \n" + "vdup.s8 d10, d1[2] \n" + "vdup.s8 d11, d1[3] \n" + "vdup.s8 d12, d1[4] \n" + "vdup.s8 d13, d1[5] \n" + "vdup.s8 d14, d1[6] \n" + "vmull.s8 q8, d4, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q12, d16, d18 \n" + "vaddl.s16 q13, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q12, q12, d16 \n" + "vaddw.s16 q13, q13, d17 \n" + "vadd.s32 q14, q14, q12 \n" + "vadd.s32 q15, q15, q13 \n" + + "vld1.8 {d4-d5}, [%[r3]] \n" // r3 + "add %[r3], #8 \n" + "vext.8 d6, d4, d5, #1 \n" + "vext.8 d7, d4, d5, #2 \n" + "vext.8 d8, d4, d5, #3 \n" + "vext.8 d9, d4, d5, #4 \n" + "vdup.s8 d10, d1[7] \n" + "vdup.s8 d11, d2[0] \n" + "vdup.s8 d12, d2[1] \n" + "vdup.s8 d13, d2[2] \n" + "vdup.s8 d14, d2[3] \n" + "vmull.s8 q8, d4, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q12, d16, d18 \n" + "vaddl.s16 q13, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q12, q12, d16 \n" + "vaddw.s16 q13, q13, d17 \n" + "vadd.s32 q14, q14, q12 \n" + "vadd.s32 q15, q15, q13 \n" + + "vld1.8 {d4-d5}, [%[r4]] \n" // r4 + "add %[r4], #8 \n" + "vext.8 d6, d4, d5, #1 \n" + "vext.8 d7, d4, d5, #2 \n" + "vext.8 d8, d4, d5, #3 \n" + "vext.8 d9, d4, d5, #4 \n" + "vdup.s8 d10, d2[4] \n" + "vdup.s8 d11, d2[5] \n" + "vdup.s8 d12, d2[6] \n" 
+ "vdup.s8 d13, d2[7] \n" + "vdup.s8 d14, d3[0] \n" + "vmull.s8 q8, d4, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q12, d16, d18 \n" + "vaddl.s16 q13, d17, d19 \n" + "vmull.s8 q8, d9, d14 \n" + "vaddw.s16 q12, q12, d16 \n" + "vaddw.s16 q13, q13, d17 \n" + "vadd.s32 q14, q14, q12 \n" + "vadd.s32 q15, q15, q13 \n" + + "vld1.32 {d24-d27}, [%[output0]] \n" + "vadd.s32 q12, q12, q14 \n" + "vadd.s32 q13, q13, q15 \n" + "vst1.32 {d24-d27}, [%[output0]]! \n" + + "subs %[ow], #1 \n" + "bne 0b \n" + : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), + [r4] "+r"(r4), [ow] "+r"(ow), [output0] "+r"(output0) + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } + + if (remain > 0) { + asm volatile("vld1.8 {d0-d3}, [%[kernel]] \n" + : [kernel] "+r"(kernel) + : + : "cc", "memory", "q0", "q1"); + asm volatile( + "0: \n" + "vld1.8 d4, [%[r0]] \n" + "vld1.8 d5, [%[r1]] \n" + "vld1.8 d6, [%[r2]] \n" + "vld1.8 d7, [%[r3]] \n" + "vld1.8 d8, [%[r4]] \n" + "add %[r0], #1 \n" + "add %[r1], #1 \n" + "add %[r2], #1 \n" + "add %[r3], #1 \n" + "add %[r4], #1 \n" + "vext.8 d10, d0, d1, #5 \n" + "vext.8 d11, d1, d2, #2 \n" + "vext.8 d12, d1, d2, #7 \n" + "vext.8 d13, d2, d3, #4 \n" + + "vmull.s8 q7, d4, d0 \n" + "vmull.s8 q8, d5, d10 \n" + "vmull.s8 q9, d6, d11 \n" + "vmlal.s8 q8, d7, d12 \n" + "vmlal.s8 q9, d8, d13 \n" + "vaddl.s16 q10, d14, d16 \n" + "vaddw.s16 q10, q10, d18 \n" + "vadd.s32 d4, d20, d21 \n" + "vaddl.s16 q10, d15, d17 \n" + "vaddw.s16 q10, q10, d19 \n" + "vdup.s32 d14, d4[0] \n" + "vdup.s32 d15, d4[1] \n" + "vadd.s32 d15, d15, d14 \n" + "vdup.s32 d14, d20[0] \n" + "vadd.s32 d15, d15, d14 \n" + + "ldr r6, [%[output0]] \n" + "vdup.s32 d14, r6 \n" + "vadd.s32 d15, d15, d14 \n" + "vst1.32 d15[0], [%[output0]]! \n" + + "subs %[remain], #1 \n" + "bne 0b \n" + : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), + [r4] "+r"(r4), [remain] "+r"(remain), [output0] "+r"(output0) + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "r6"); + } + } + } + } +#endif +#else +// TODO(hjchen2) +#endif +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/math/conv_arm_int8.h b/src/operators/math/conv_arm_int8.h new file mode 100644 index 0000000000000000000000000000000000000000..98843e6158bb0f9816bf49a1cbced5a2ea731446 --- /dev/null +++ b/src/operators/math/conv_arm_int8.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_OP + +#pragma once + +#include "framework/tensor.h" + +namespace paddle_mobile { +namespace operators { + +void conv3x3s1_int8(const framework::Tensor& input, + const framework::Tensor& weight, framework::Tensor* output); + +void conv3x3s1_int8_4c(const framework::Tensor& input, + const framework::Tensor& weight, + framework::Tensor* output); + +void conv5x5s1_int8(const framework::Tensor& input, + const framework::Tensor& weight, framework::Tensor* output); + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index 1fcfc5f98a5279cc4a93da596edbd63c693bd488..44621ba99a92a3ed456b8d7d0959e3580662d910 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -3379,7 +3379,7 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, // 对 B 分块 NC = L1 / (KC * sizeof(float)); if (NC == 0) { - NC == NR; + NC = NR; } else { int nblock_num = (n + NC - 1) / NC; NC = (n + nblock_num - 1) / nblock_num; @@ -3662,7 +3662,7 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { b_ptr = b; int kc1 = k / 8; int kc2 = k % 8; - int step = 4 * ldc; + int step = sizeof(float) * ldc; asm volatile( "pld [%[a_ptr]] \n\t" "pld [%[a_ptr], #64] \n\t" @@ -3866,11 +3866,10 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { : : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), [kc2] "r"(kc2), [step] "r"(step) - : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); #endif // __aarch64__ -#else #endif // __ARM_NEON } diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index d7f5b2249ad20f4e2d242ce68b6069ae71a23e28..ea023bc134033aee6577ebf06c95f2a762d08bca 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -22,9 +22,11 @@ limitations under the License. 
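Among the gemm.cpp fixes above, `NC == NR;` was a comparison whose result was discarded, so NC stayed at 0 whenever L1 / (KC * sizeof(float)) truncated to zero, and the blocking code that follows then worked with a zero block width. A sketch of the corrected logic (l1_bytes, kc, and nr stand for the tuning constants in gemm.cpp):

```cpp
// Column-block size for packing B against the L1 cache.
int ComputeColumnBlock(int l1_bytes, int kc, int n, int nr) {
  int nc = l1_bytes / (kc * static_cast<int>(sizeof(float)));
  if (nc == 0) {
    nc = nr;  // corrected: assign (was `NC == NR;`, a no-op comparison)
  } else {
    int nblock_num = (n + nc - 1) / nc;      // how many column blocks
    nc = (n + nblock_num - 1) / nblock_num;  // balance the block widths
  }
  return nc;
}
```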
*/ #define C(i, j) C[(i)*ldc + (j)] #if __aarch64__ +#define MR_INT8 4 #define MR 6 #define NR 16 #else +#define MR_INT8 4 #define MR 6 #define NR 8 #endif @@ -96,6 +98,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, float *c, float *C, int ldc, float *p, std::string mode, float *bias, float *bias1); + /* // 向量矩阵乘法 (M = 1) void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, @@ -139,6 +142,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, float *new_scale, float *new_bias); void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, float *new_scale, float *new_bias, float *bias1); + /* // 向量矩阵乘法结果回写 // C = A * B @@ -185,15 +189,67 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, const float *B, int ldb, float *C, int ldc, float *p, std::string mode, float *bias, float *bias1); + // 8 bits function cluster begins + // 8 bits int small block inner product + void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, + int32_t ldc); + void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, + int32_t ldc); + + // 8 bits int inner product + void InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha, + const int8_t *a, const int8_t *b, int8_t beta, + int32_t *c, int32_t *C, int32_t ldc, bool relu, + int8_t *bias); + + // 8 bits int pack function + void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, + int32_t lda, int8_t *buffer); + void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, + int32_t lda, int8_t *buffer); + void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, + int32_t ldb, int8_t *buffer); + + // 8 bits int matrix product + void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, + int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C, + int32_t ldc, bool relu, int8_t *bias); + + // 8 bits int write back + // C = alpha * A * B + beta * C + void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc); + // C = A * B + void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc); + // C = A * B + C + void WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc); + // C = A * B + bias + void WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc, int8_t *bias); + // C = A * B + C, relu(C) + void WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc); + // C = A * B + bias, relu(C) + void WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc, int8_t *bias); + private: int MC = 0; int KC = 0; int NC = 0; + // 32位 float float *packedA; float *packedB; float *packedC; float *zero; + + // 8 bits int + int8_t *packedA_int8; + int8_t *packedB_int8; + int32_t *packedC_int8; + int8_t *zero_int8; }; } // namespace math diff --git a/src/operators/math/gemm_int8.cpp b/src/operators/math/gemm_int8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5dd8a7c3131543f426f32e258efb3181be9b2f61 --- /dev/null +++ b/src/operators/math/gemm_int8.cpp @@ -0,0 +1,870 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
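gemm.h above declares the int8 GEMM entry points; the micro-kernel AddDot4x8, implemented in the new gemm_int8.cpp that follows, produces one MR_INT8 x NR = 4x8 tile of int32 outputs from packed int8 panels, widening int8 to int16 (vmovl.s8) and multiply-accumulating into int32 lanes (vmlal.s16). A scalar model of that arithmetic, assuming the panel layouts produced by PackMatrixA_4r and PackMatrixB_8c (4 A values, then 8 B values, per k step; ldc here is in elements, where the asm uses a byte step):

```cpp
#include <cstdint>

// Scalar model of AddDot4x8: C tile = packed A panel (4 x k) times packed
// B panel (k x 8), int32 accumulation, overwriting the 4x8 tile in C.
void AddDot4x8_ref(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                   int32_t ldc) {
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 8; ++j) {
      int32_t acc = 0;
      for (int32_t p = 0; p < k; ++p) {
        // int8 -> int16 widen, multiply, accumulate in int32 (vmlal.s16)
        acc += static_cast<int16_t>(a[p * 4 + i]) *
               static_cast<int16_t>(b[p * 8 + j]);
      }
      c[i * ldc + j] = acc;  // the asm stores q8-q15 row by row with vst1.32
    }
  }
}
```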
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "common/log.h" +#include "memory/t_malloc.h" +#include "operators/math/gemm.h" +#if __ARM_NEON +#include +#endif +#ifdef _OPENMP +#include +#endif + +namespace paddle_mobile { +namespace operators { +namespace math { +void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, + int32_t ldc) { +#if __ARM_NEON +#if __aarch64__ +// TODO +#else + const int8_t *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int32_t kc1 = k >> 3; + int32_t kc2 = k & 7; + int32_t kc3 = kc2 >> 2; + int32_t kc4 = kc2 & 3; + int32_t kc5 = kc4 >> 1; + int32_t kc6 = kc4 & 1; + int32_t step = sizeof(int32_t) * ldc; + asm volatile( + // q8-q15: save 32 results + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" + "pld [%[b_ptr], #64] \n\t" + "vmov.s32 q8, #0 \n\t" + "vmov.s32 q9, q8 \n\t" + "vmov.s32 q10, q8 \n\t" + "vmov.s32 q11, q8 \n\t" + "vmov.s32 q12, q8 \n\t" + "vmov.s32 q13, q8 \n\t" + "vmov.s32 q14, q8 \n\t" + "vmov.s32 q15, q8 \n\t" + "subs %[kc1], %[kc1], #1 \n\t" + "blt 1f \n\t" + "0: \n\t" + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #128] \n\t" + "vld1.s8 {d0-d3}, [%[a_ptr]]! \n\t" // load A 8 cols + "vld1.s8 {d8-d11}, [%[b_ptr]]! \n\t" // load B first 4 rows + "vmovl.s8 q2, d0 \n\t" // process B first 4 + // rows + "vmovl.s8 q3, d8 \n\t" + "vmlal.s16 q8, d6, d4[0]\n\t" + "vmlal.s16 q9, d7, d4[0]\n\t" + "vmlal.s16 q10, d6, d4[1]\n\t" + "vmlal.s16 q11, d7, d4[1]\n\t" + "vmlal.s16 q12, d6, d4[2]\n\t" + "vmlal.s16 q13, d7, d4[2]\n\t" + "vmlal.s16 q14, d6, d4[3]\n\t" + "vmlal.s16 q15, d7, d4[3]\n\t" + "vmovl.s8 q3, d9 \n\t" + "vmlal.s16 q8, d6, d5[0]\n\t" + "vmlal.s16 q9, d7, d5[0]\n\t" + "vmlal.s16 q10, d6, d5[1]\n\t" + "vmlal.s16 q11, d7, d5[1]\n\t" + "vmlal.s16 q12, d6, d5[2]\n\t" + "vmlal.s16 q13, d7, d5[2]\n\t" + "vmlal.s16 q14, d6, d5[3]\n\t" + "vmlal.s16 q15, d7, d5[3]\n\t" + "vld1.s8 {d12-d15}, [%[b_ptr]]! 
\n\t" // load B second 4 + // rows + "vmovl.s8 q2, d1 \n\t" + "vmovl.s8 q3, d10 \n\t" + "vmlal.s16 q8, d6, d4[0]\n\t" + "vmlal.s16 q9, d7, d4[0]\n\t" + "vmlal.s16 q10, d6, d4[1]\n\t" + "vmlal.s16 q11, d7, d4[1]\n\t" + "vmlal.s16 q12, d6, d4[2]\n\t" + "vmlal.s16 q13, d7, d4[2]\n\t" + "vmlal.s16 q14, d6, d4[3]\n\t" + "vmlal.s16 q15, d7, d4[3]\n\t" + "vmovl.s8 q3, d11 \n\t" + "vmlal.s16 q8, d6, d5[0]\n\t" + "vmlal.s16 q9, d7, d5[0]\n\t" + "vmlal.s16 q10, d6, d5[1]\n\t" + "vmlal.s16 q11, d7, d5[1]\n\t" + "vmlal.s16 q12, d6, d5[2]\n\t" + "vmlal.s16 q13, d7, d5[2]\n\t" + "vmlal.s16 q14, d6, d5[3]\n\t" + "vmlal.s16 q15, d7, d5[3]\n\t" + "vmovl.s8 q2, d2 \n\t" // process B second 4 + // rows + "vmovl.s8 q3, d12 \n\t" + "vmlal.s16 q8, d6, d4[0]\n\t" + "vmlal.s16 q9, d7, d4[0]\n\t" + "vmlal.s16 q10, d6, d4[1]\n\t" + "vmlal.s16 q11, d7, d4[1]\n\t" + "vmlal.s16 q12, d6, d4[2]\n\t" + "vmlal.s16 q13, d7, d4[2]\n\t" + "vmlal.s16 q14, d6, d4[3]\n\t" + "vmlal.s16 q15, d7, d4[3]\n\t" + "vmovl.s8 q3, d13 \n\t" + "vmlal.s16 q8, d6, d5[0]\n\t" + "vmlal.s16 q9, d7, d5[0]\n\t" + "vmlal.s16 q10, d6, d5[1]\n\t" + "vmlal.s16 q11, d7, d5[1]\n\t" + "vmlal.s16 q12, d6, d5[2]\n\t" + "vmlal.s16 q13, d7, d5[2]\n\t" + "vmlal.s16 q14, d6, d5[3]\n\t" + "vmlal.s16 q15, d7, d5[3]\n\t" + "vmovl.s8 q2, d3 \n\t" + "vmovl.s8 q3, d14 \n\t" + "vmlal.s16 q8, d6, d4[0]\n\t" + "vmlal.s16 q9, d7, d4[0]\n\t" + "vmlal.s16 q10, d6, d4[1]\n\t" + "vmlal.s16 q11, d7, d4[1]\n\t" + "vmlal.s16 q12, d6, d4[2]\n\t" + "vmlal.s16 q13, d7, d4[2]\n\t" + "vmlal.s16 q14, d6, d4[3]\n\t" + "vmlal.s16 q15, d7, d4[3]\n\t" + "vmovl.s8 q3, d15 \n\t" + "vmlal.s16 q8, d6, d5[0]\n\t" + "vmlal.s16 q9, d7, d5[0]\n\t" + "vmlal.s16 q10, d6, d5[1]\n\t" + "vmlal.s16 q11, d7, d5[1]\n\t" + "vmlal.s16 q12, d6, d5[2]\n\t" + "vmlal.s16 q13, d7, d5[2]\n\t" + "vmlal.s16 q14, d6, d5[3]\n\t" + "vmlal.s16 q15, d7, d5[3]\n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge 0b \n\t" + "1: \n\t" // last 4 rows + "subs %[kc3], %[kc3], #1 \n\t" + "blt 2f \n\t" + "vld1.s8 {d0-d1}, [%[a_ptr]]! \n\t" // load A 4 cols + "vld1.s8 {d8-d11}, [%[b_ptr]]! \n\t" // load B 4 rows + "vmovl.s8 q2, d0 \n\t" + "vmovl.s8 q3, d8 \n\t" + "vmlal.s16 q8, d6, d4[0]\n\t" + "vmlal.s16 q9, d7, d4[0]\n\t" + "vmlal.s16 q10, d6, d4[1]\n\t" + "vmlal.s16 q11, d7, d4[1]\n\t" + "vmlal.s16 q12, d6, d4[2]\n\t" + "vmlal.s16 q13, d7, d4[2]\n\t" + "vmlal.s16 q14, d6, d4[3]\n\t" + "vmlal.s16 q15, d7, d4[3]\n\t" + "vmovl.s8 q3, d9 \n\t" + "vmlal.s16 q8, d6, d5[0]\n\t" + "vmlal.s16 q9, d7, d5[0]\n\t" + "vmlal.s16 q10, d6, d5[1]\n\t" + "vmlal.s16 q11, d7, d5[1]\n\t" + "vmlal.s16 q12, d6, d5[2]\n\t" + "vmlal.s16 q13, d7, d5[2]\n\t" + "vmlal.s16 q14, d6, d5[3]\n\t" + "vmlal.s16 q15, d7, d5[3]\n\t" + "vmovl.s8 q2, d1 \n\t" + "vmovl.s8 q3, d10 \n\t" + "vmlal.s16 q8, d6, d4[0]\n\t" + "vmlal.s16 q9, d7, d4[0]\n\t" + "vmlal.s16 q10, d6, d4[1]\n\t" + "vmlal.s16 q11, d7, d4[1]\n\t" + "vmlal.s16 q12, d6, d4[2]\n\t" + "vmlal.s16 q13, d7, d4[2]\n\t" + "vmlal.s16 q14, d6, d4[3]\n\t" + "vmlal.s16 q15, d7, d4[3]\n\t" + "vmovl.s8 q3, d11 \n\t" + "vmlal.s16 q8, d6, d5[0]\n\t" + "vmlal.s16 q9, d7, d5[0]\n\t" + "vmlal.s16 q10, d6, d5[1]\n\t" + "vmlal.s16 q11, d7, d5[1]\n\t" + "vmlal.s16 q12, d6, d5[2]\n\t" + "vmlal.s16 q13, d7, d5[2]\n\t" + "vmlal.s16 q14, d6, d5[3]\n\t" + "vmlal.s16 q15, d7, d5[3]\n\t" + "2: \n\t" // last 2 rows + "subs %[kc5], %[kc5], #1 \n\t" + "blt 3f \n\t" + "vld1.s8 {d0}, [%[a_ptr]]! \n\t" // load A 2 cols + "vld1.s8 {d8-d9}, [%[b_ptr]]! 
\n\t" // load B 2 rows + "vmovl.s8 q2, d0 \n\t" + "vmovl.s8 q3, d8 \n\t" + "vmlal.s16 q8, d6, d4[0]\n\t" + "vmlal.s16 q9, d7, d4[0]\n\t" + "vmlal.s16 q10, d6, d4[1]\n\t" + "vmlal.s16 q11, d7, d4[1]\n\t" + "vmlal.s16 q12, d6, d4[2]\n\t" + "vmlal.s16 q13, d7, d4[2]\n\t" + "vmlal.s16 q14, d6, d4[3]\n\t" + "vmlal.s16 q15, d7, d4[3]\n\t" + "vmovl.s8 q3, d9 \n\t" + "vmlal.s16 q8, d6, d5[0]\n\t" + "vmlal.s16 q9, d7, d5[0]\n\t" + "vmlal.s16 q10, d6, d5[1]\n\t" + "vmlal.s16 q11, d7, d5[1]\n\t" + "vmlal.s16 q12, d6, d5[2]\n\t" + "vmlal.s16 q13, d7, d5[2]\n\t" + "vmlal.s16 q14, d6, d5[3]\n\t" + "vmlal.s16 q15, d7, d5[3]\n\t" + "3: \n\t" // last 1 row + "subs %[kc6], %[kc6], #1 \n\t" + "blt 4f \n\t" + "vld1.s8 {d0}, [%[a_ptr]] \n\t" // load A 1 col + "vld1.s8 {d8}, [%[b_ptr]] \n\t" // load B 1 row + "vmovl.s8 q2, d0 \n\t" + "vmovl.s8 q3, d8 \n\t" + "vmlal.s16 q8, d6, d4[0]\n\t" + "vmlal.s16 q9, d7, d4[0]\n\t" + "vmlal.s16 q10, d6, d4[1]\n\t" + "vmlal.s16 q11, d7, d4[1]\n\t" + "vmlal.s16 q12, d6, d4[2]\n\t" + "vmlal.s16 q13, d7, d4[2]\n\t" + "vmlal.s16 q14, d6, d4[3]\n\t" + "vmlal.s16 q15, d7, d4[3]\n\t" + "4: \n\t" + "vst1.32 {q8, q9}, [%[c]], %[step] \n\t" + "vst1.32 {q10, q11}, [%[c]], %[step] \n\t" + "vst1.32 {q12, q13}, [%[c]], %[step] \n\t" + "vst1.32 {q14, q15}, [%[c]] \n\t" + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc3] "r"(kc3), [kc5] "r"(kc5), [kc6] "r"(kc6), [step] "r"(step) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +#endif // __aarch64__ +#endif // __ARM_NEON +} + +// 8 bits int small block inner product +void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, + int32_t ldc) { +#if __ARM_NEON +#if __aarch64__ +// TODO +#else + const int8_t *a_ptr, *b_ptr; + a_ptr = a; + b_ptr = b; + int32_t kc1 = k >> 3; + int32_t kc2 = k & 7; + int32_t kc3 = kc2 >> 2; + int32_t kc4 = kc2 & 3; + int32_t kc5 = kc4 >> 1; + int32_t kc6 = kc4 & 1; + int32_t step = sizeof(int32_t) * ldc; + asm volatile( + // q4-q15: save 48 results + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" + "pld [%[b_ptr], #64] \n\t" + "vmov.s32 q4, #0 \n\t" + "vmov.s32 q5, q4 \n\t" + "vmov.s32 q6, q4 \n\t" + "vmov.s32 q7, q4 \n\t" + "vmov.s32 q8, q4 \n\t" + "vmov.s32 q9, q4 \n\t" + "vmov.s32 q10, q4 \n\t" + "vmov.s32 q11, q4 \n\t" + "vmov.s32 q12, q4 \n\t" + "vmov.s32 q13, q4 \n\t" + "vmov.s32 q14, q4 \n\t" + "vmov.s32 q15, q4 \n\t" + "mov r0, #12 \n\t" + "subs %[kc1], %[kc1], #1 \n\t" + "blt 1f \n\t" + "0: \n\t" + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #128] \n\t" + "vld1.s8 {d0-d2}, [%[a_ptr]]! \n\t" // A 4 cols + "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 1st row + "vmovl.s8 q2, d0 \n\t" + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d4[0]\n\t" + "vmlal.s16 q5, d7, d4[0]\n\t" + "vmlal.s16 q6, d6, d4[1]\n\t" + "vmlal.s16 q7, d7, d4[1]\n\t" + "vmlal.s16 q8, d6, d4[2]\n\t" + "vmlal.s16 q9, d7, d4[2]\n\t" + "vmlal.s16 q10, d6, d4[3]\n\t" + "vmlal.s16 q11, d7, d4[3]\n\t" + "vmlal.s16 q12, d6, d5[0]\n\t" + "vmlal.s16 q13, d7, d5[0]\n\t" + "vmlal.s16 q14, d6, d5[1]\n\t" + "vmlal.s16 q15, d7, d5[1]\n\t" + "vld1.s8 {d3}, [%[b_ptr]]! 
\n\t" // B 2nd row + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d5[2]\n\t" + "vmlal.s16 q5, d7, d5[2]\n\t" + "vmlal.s16 q6, d6, d5[3]\n\t" + "vmlal.s16 q7, d7, d5[3]\n\t" + "vmovl.s8 q2, d1 \n\t" + "vmlal.s16 q8, d6, d4[0]\n\t" + "vmlal.s16 q9, d7, d4[0]\n\t" + "vmlal.s16 q10, d6, d4[1]\n\t" + "vmlal.s16 q11, d7, d4[1]\n\t" + "vmlal.s16 q12, d6, d4[2]\n\t" + "vmlal.s16 q13, d7, d4[2]\n\t" + "vmlal.s16 q14, d6, d4[3]\n\t" + "vmlal.s16 q15, d7, d4[3]\n\t" + "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 3th row + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d5[0]\n\t" + "vmlal.s16 q5, d7, d5[0]\n\t" + "vmlal.s16 q6, d6, d5[1]\n\t" + "vmlal.s16 q7, d7, d5[1]\n\t" + "vmlal.s16 q8, d6, d5[2]\n\t" + "vmlal.s16 q9, d7, d5[2]\n\t" + "vmlal.s16 q10, d6, d5[3]\n\t" + "vmlal.s16 q11, d7, d5[3]\n\t" + "vmovl.s8 q2, d2 \n\t" + "vmlal.s16 q12, d6, d4[0]\n\t" + "vmlal.s16 q13, d7, d4[0]\n\t" + "vmlal.s16 q14, d6, d4[1]\n\t" + "vmlal.s16 q15, d7, d4[1]\n\t" + "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 4th row + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d4[2]\n\t" + "vmlal.s16 q5, d7, d4[2]\n\t" + "vmlal.s16 q6, d6, d4[3]\n\t" + "vmlal.s16 q7, d7, d4[3]\n\t" + "vmlal.s16 q8, d6, d5[0]\n\t" + "vmlal.s16 q9, d7, d5[0]\n\t" + "vmlal.s16 q10, d6, d5[1]\n\t" + "vmlal.s16 q11, d7, d5[1]\n\t" + "vmlal.s16 q12, d6, d5[2]\n\t" + "vmlal.s16 q13, d7, d5[2]\n\t" + "vmlal.s16 q14, d6, d5[3]\n\t" + "vmlal.s16 q15, d7, d5[3]\n\t" + + "vld1.s8 {d0-d2}, [%[a_ptr]]! \n\t" // A 4 cols + "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 1st row + "vmovl.s8 q2, d0 \n\t" + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d4[0]\n\t" + "vmlal.s16 q5, d7, d4[0]\n\t" + "vmlal.s16 q6, d6, d4[1]\n\t" + "vmlal.s16 q7, d7, d4[1]\n\t" + "vmlal.s16 q8, d6, d4[2]\n\t" + "vmlal.s16 q9, d7, d4[2]\n\t" + "vmlal.s16 q10, d6, d4[3]\n\t" + "vmlal.s16 q11, d7, d4[3]\n\t" + "vmlal.s16 q12, d6, d5[0]\n\t" + "vmlal.s16 q13, d7, d5[0]\n\t" + "vmlal.s16 q14, d6, d5[1]\n\t" + "vmlal.s16 q15, d7, d5[1]\n\t" + "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 2nd row + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d5[2]\n\t" + "vmlal.s16 q5, d7, d5[2]\n\t" + "vmlal.s16 q6, d6, d5[3]\n\t" + "vmlal.s16 q7, d7, d5[3]\n\t" + "vmovl.s8 q2, d1 \n\t" + "vmlal.s16 q8, d6, d4[0]\n\t" + "vmlal.s16 q9, d7, d4[0]\n\t" + "vmlal.s16 q10, d6, d4[1]\n\t" + "vmlal.s16 q11, d7, d4[1]\n\t" + "vmlal.s16 q12, d6, d4[2]\n\t" + "vmlal.s16 q13, d7, d4[2]\n\t" + "vmlal.s16 q14, d6, d4[3]\n\t" + "vmlal.s16 q15, d7, d4[3]\n\t" + "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 3th row + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d5[0]\n\t" + "vmlal.s16 q5, d7, d5[0]\n\t" + "vmlal.s16 q6, d6, d5[1]\n\t" + "vmlal.s16 q7, d7, d5[1]\n\t" + "vmlal.s16 q8, d6, d5[2]\n\t" + "vmlal.s16 q9, d7, d5[2]\n\t" + "vmlal.s16 q10, d6, d5[3]\n\t" + "vmlal.s16 q11, d7, d5[3]\n\t" + "vmovl.s8 q2, d2 \n\t" + "vmlal.s16 q12, d6, d4[0]\n\t" + "vmlal.s16 q13, d7, d4[0]\n\t" + "vmlal.s16 q14, d6, d4[1]\n\t" + "vmlal.s16 q15, d7, d4[1]\n\t" + "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 4th row + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d4[2]\n\t" + "vmlal.s16 q5, d7, d4[2]\n\t" + "vmlal.s16 q6, d6, d4[3]\n\t" + "vmlal.s16 q7, d7, d4[3]\n\t" + "vmlal.s16 q8, d6, d5[0]\n\t" + "vmlal.s16 q9, d7, d5[0]\n\t" + "vmlal.s16 q10, d6, d5[1]\n\t" + "vmlal.s16 q11, d7, d5[1]\n\t" + "vmlal.s16 q12, d6, d5[2]\n\t" + "vmlal.s16 q13, d7, d5[2]\n\t" + "vmlal.s16 q14, d6, d5[3]\n\t" + "vmlal.s16 q15, d7, d5[3]\n\t" + + "subs %[kc1], %[kc1], #1 \n\t" + "bge 0b \n\t" + "1: \n\t" // last <8 rows + "subs %[kc3], %[kc3], #1 \n\t" + "blt 2f \n\t" + "vld1.s8 {d0-d2}, [%[a_ptr]]! 
\n\t" // A 4 cols + "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 1st row + "vmovl.s8 q2, d0 \n\t" + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d4[0]\n\t" + "vmlal.s16 q5, d7, d4[0]\n\t" + "vmlal.s16 q6, d6, d4[1]\n\t" + "vmlal.s16 q7, d7, d4[1]\n\t" + "vmlal.s16 q8, d6, d4[2]\n\t" + "vmlal.s16 q9, d7, d4[2]\n\t" + "vmlal.s16 q10, d6, d4[3]\n\t" + "vmlal.s16 q11, d7, d4[3]\n\t" + "vmlal.s16 q12, d6, d5[0]\n\t" + "vmlal.s16 q13, d7, d5[0]\n\t" + "vmlal.s16 q14, d6, d5[1]\n\t" + "vmlal.s16 q15, d7, d5[1]\n\t" + "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 2nd row + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d5[2]\n\t" + "vmlal.s16 q5, d7, d5[2]\n\t" + "vmlal.s16 q6, d6, d5[3]\n\t" + "vmlal.s16 q7, d7, d5[3]\n\t" + "vmovl.s8 q2, d1 \n\t" + "vmlal.s16 q8, d6, d4[0]\n\t" + "vmlal.s16 q9, d7, d4[0]\n\t" + "vmlal.s16 q10, d6, d4[1]\n\t" + "vmlal.s16 q11, d7, d4[1]\n\t" + "vmlal.s16 q12, d6, d4[2]\n\t" + "vmlal.s16 q13, d7, d4[2]\n\t" + "vmlal.s16 q14, d6, d4[3]\n\t" + "vmlal.s16 q15, d7, d4[3]\n\t" + "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 3th row + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d5[0]\n\t" + "vmlal.s16 q5, d7, d5[0]\n\t" + "vmlal.s16 q6, d6, d5[1]\n\t" + "vmlal.s16 q7, d7, d5[1]\n\t" + "vmlal.s16 q8, d6, d5[2]\n\t" + "vmlal.s16 q9, d7, d5[2]\n\t" + "vmlal.s16 q10, d6, d5[3]\n\t" + "vmlal.s16 q11, d7, d5[3]\n\t" + "vmovl.s8 q2, d2 \n\t" + "vmlal.s16 q12, d6, d4[0]\n\t" + "vmlal.s16 q13, d7, d4[0]\n\t" + "vmlal.s16 q14, d6, d4[1]\n\t" + "vmlal.s16 q15, d7, d4[1]\n\t" + "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 4th row + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d4[2]\n\t" + "vmlal.s16 q5, d7, d4[2]\n\t" + "vmlal.s16 q6, d6, d4[3]\n\t" + "vmlal.s16 q7, d7, d4[3]\n\t" + "vmlal.s16 q8, d6, d5[0]\n\t" + "vmlal.s16 q9, d7, d5[0]\n\t" + "vmlal.s16 q10, d6, d5[1]\n\t" + "vmlal.s16 q11, d7, d5[1]\n\t" + "vmlal.s16 q12, d6, d5[2]\n\t" + "vmlal.s16 q13, d7, d5[2]\n\t" + "vmlal.s16 q14, d6, d5[3]\n\t" + "vmlal.s16 q15, d7, d5[3]\n\t" + + "2: \n\t" // last <4 rows + "subs %[kc5], %[kc5], #1 \n\t" + "blt 3f \n\t" + "vld1.s8 {d0, d1}, [%[a_ptr]], r0 \n\t" + "vld1.s8 {d3}, [%[b_ptr]]! \n\t" // B 1st row + "vmovl.s8 q2, d0 \n\t" + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d4[0]\n\t" + "vmlal.s16 q5, d7, d4[0]\n\t" + "vmlal.s16 q6, d6, d4[1]\n\t" + "vmlal.s16 q7, d7, d4[1]\n\t" + "vmlal.s16 q8, d6, d4[2]\n\t" + "vmlal.s16 q9, d7, d4[2]\n\t" + "vmlal.s16 q10, d6, d4[3]\n\t" + "vmlal.s16 q11, d7, d4[3]\n\t" + "vmlal.s16 q12, d6, d5[0]\n\t" + "vmlal.s16 q13, d7, d5[0]\n\t" + "vmlal.s16 q14, d6, d5[1]\n\t" + "vmlal.s16 q15, d7, d5[1]\n\t" + "vld1.s8 {d3}, [%[b_ptr]]! 
\n\t" // B 2nd row + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d5[2]\n\t" + "vmlal.s16 q5, d7, d5[2]\n\t" + "vmlal.s16 q6, d6, d5[3]\n\t" + "vmlal.s16 q7, d7, d5[3]\n\t" + "vmovl.s8 q2, d1 \n\t" + "vmlal.s16 q8, d6, d4[0]\n\t" + "vmlal.s16 q9, d7, d4[0]\n\t" + "vmlal.s16 q10, d6, d4[1]\n\t" + "vmlal.s16 q11, d7, d4[1]\n\t" + "vmlal.s16 q12, d6, d4[2]\n\t" + "vmlal.s16 q13, d7, d4[2]\n\t" + "vmlal.s16 q14, d6, d4[3]\n\t" + "vmlal.s16 q15, d7, d4[3]\n\t" + "3: \n\t" // last <2 rows + "subs %[kc6], %[kc6], #1 \n\t" + "blt 4f \n\t" + "vld1.s8 {d0}, [%[a_ptr]] \n\t" + "vld1.s8 {d3}, [%[b_ptr]] \n\t" + "vmovl.s8 q2, d0 \n\t" + "vmovl.s8 q3, d3 \n\t" + "vmlal.s16 q4, d6, d4[0]\n\t" + "vmlal.s16 q5, d7, d4[0]\n\t" + "vmlal.s16 q6, d6, d4[1]\n\t" + "vmlal.s16 q7, d7, d4[1]\n\t" + "vmlal.s16 q8, d6, d4[2]\n\t" + "vmlal.s16 q9, d7, d4[2]\n\t" + "vmlal.s16 q10, d6, d4[3]\n\t" + "vmlal.s16 q11, d7, d4[3]\n\t" + "vmlal.s16 q12, d6, d5[0]\n\t" + "vmlal.s16 q13, d7, d5[0]\n\t" + "vmlal.s16 q14, d6, d5[1]\n\t" + "vmlal.s16 q15, d7, d5[1]\n\t" + "4: \n\t" + "vst1.32 {q4, q5}, [%[c]], %[step] \n\t" + "vst1.32 {q6, q7}, [%[c]], %[step] \n\t" + "vst1.32 {q8, q9}, [%[c]], %[step] \n\t" + "vst1.32 {q10, q11}, [%[c]], %[step] \n\t" + "vst1.32 {q12, q13}, [%[c]], %[step] \n\t" + "vst1.32 {q14, q15}, [%[c]] \n\t" + : + : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), + [kc3] "r"(kc3), [kc5] "r"(kc5), [kc6] "r"(kc6), [step] "r"(step) + : "cc", "memory", "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +#endif // __aarch64__ +#endif // __ARM_NEON +} + +// 8 bits int inner product +void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha, + const int8_t *a, const int8_t *b, int8_t beta, + int32_t *c, int32_t *C, int32_t ldc, bool relu, + int8_t *bias) { +#pragma omp parallel for + for (int32_t j = 0; j < nc; j += NR) { + for (int32_t i = 0; i < mc; i += MR_INT8) { + // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + } + } + if (alpha != 1) { + WriteWithAlphaBeta(mc, nc, c, C, ldc); + return; + } + if (beta == 0) { + WriteBasic(mc, nc, c, C, ldc); + return; + } + if (beta == 1 && !relu) { + if (bias == nullptr) { + WriteWithAdd(mc, nc, c, C, ldc); + } else { + WriteWithAddV1(mc, nc, c, C, ldc, bias); + } + return; + } + if (beta == 1 && relu) { + if (bias == nullptr) { + WriteWithAddRelu(mc, nc, c, C, ldc); + } else { + WriteWithAddReluV1(mc, nc, c, C, ldc, bias); + } + return; + } +} +// 8 bits int PackMatrixA_4r +void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, + int32_t lda, int8_t *buffer) { + const int8_t *a0, *a1, *a2, *a3; + for (int32_t i = 0; i < m - m_tail; i += MR_INT8) { + a0 = A + i * lda; + a1 = A + (i + 1) * lda; + a2 = A + (i + 2) * lda; + a3 = A + (i + 3) * lda; + for (int32_t j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; + } + } + + if (m_tail != 0) { + a0 = &A(m - m_tail, 0); + a1 = a0 + lda; + a2 = a0 + 2 * lda; + a3 = a0 + 3 * lda; + switch (m_tail) { + case 1: + a1 = zero_int8; + case 2: + a2 = zero_int8; + case 3: + a3 = zero_int8; + break; + default: + break; + } + for (int j = 0; j < k; ++j) { + *buffer++ = *a0++; + *buffer++ = *a1++; + *buffer++ = *a2++; + *buffer++ = *a3++; + } + } +} + +// 8 bits int PackMatrixA_6r +void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, + int32_t lda, int8_t *buffer) { + const 
+ +// 8 bits int PackMatrixA_6r +void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, + int32_t lda, int8_t *buffer) { + const int32_t i_length = m - m_tail; + for (int32_t i = 0; i < i_length; i += MR_INT8) { + const int8_t *a0 = A + i * lda; + const int8_t *a1 = A + (i + 1) * lda; + const int8_t *a2 = A + (i + 2) * lda; + const int8_t *a3 = A + (i + 3) * lda; + const int8_t *a4 = A + (i + 4) * lda; + const int8_t *a5 = A + (i + 5) * lda; + int8_t *local_buffer = buffer + i * k; + for (int32_t j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + } + } + if (m_tail != 0) { + const int8_t *a0 = &A(i_length, 0); + const int8_t *a1 = a0 + lda; + const int8_t *a2 = a0 + 2 * lda; + const int8_t *a3 = a0 + 3 * lda; + const int8_t *a4 = a0 + 4 * lda; + const int8_t *a5 = a0 + 5 * lda; + int8_t *local_buffer = buffer + i_length * k; + switch (m_tail) { + case 1: + a1 = zero_int8; // fall through + case 2: + a2 = zero_int8; // fall through + case 3: + a3 = zero_int8; // fall through + case 4: + a4 = zero_int8; // fall through + case 5: + a5 = zero_int8; + break; + default: + break; + } + for (int32_t j = 0; j < k; ++j) { + *local_buffer++ = *a0++; + *local_buffer++ = *a1++; + *local_buffer++ = *a2++; + *local_buffer++ = *a3++; + *local_buffer++ = *a4++; + *local_buffer++ = *a5++; + } + } +} + +// 8 bits int PackMatrixB +void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, + int32_t ldb, int8_t *buffer) { + const int32_t j_length = n - n_tail; + for (int32_t j = 0; j < j_length; j += NR) { + int8_t *local_buffer = buffer + j * k; + for (int32_t i = 0; i < k; ++i) { + const int8_t *b0 = &B(i, j); +#if __ARM_NEON +#if __aarch64__ + // TODO +#else + asm volatile( + // "pld [%[b0]] \n\t" + "vld1.s8 {d0}, [%[b0]] \n\t" + "vst1.s8 {d0}, [%[local_buffer]]! \n\t" + : [local_buffer] "+r"(local_buffer) + : [b0] "r"(b0) + : "memory", "q0"); +#endif // __aarch64__ +#else + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; + *local_buffer++ = *b0++; +#endif // __ARM_NEON + } + } + if (n_tail != 0) { + int8_t *local_buffer = buffer + j_length * k; + for (int32_t i = 0; i < k; ++i) { + const int8_t *b0 = &B(i, j_length); + for (int32_t j = j_length; j < n; ++j) { + *local_buffer++ = *b0++; + } + for (int32_t j = n; j < j_length + NR; ++j) { + *local_buffer++ = 0; + } + } + } +}
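+ +// Blocking note: Sgemm below sizes the panels so a packed A panel +// (MC x KC int8) fits in L1 and a packed B panel (KC x NC int8) fits in L2, +// then rounds MC up to a multiple of MR_INT8 and NC up to a multiple of NR. +// As a worked example, k = 256 gives MC = 32768 / 256 = 128 rows and +// NC = 524288 / 256 = 2048 columns before rounding.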
+ +// 8 bits int matrix product (m*k x k*n) +void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, + int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, + int32_t *C, int32_t ldc, bool relu, int8_t *bias) { + // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73) + // L2 cache is 0.5~4 MiB (Cortex-A72 cluster) + int32_t L1 = 32 * 1024; + int32_t L2 = 512 * 1024; + + KC = k; + MC = L1 / (KC * sizeof(int8_t)); + NC = L2 / (KC * sizeof(int8_t)); + + // make sure MC is multiple of MR_INT8, and NC is multiple of NR + if (MC == 0) { + MC = MR_INT8; + } else { + int32_t mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8; + } + // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; + if (NC == 0) { + NC = NR; + } else { + int32_t nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + NR - 1) / NR * NR; + } + // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; + packedA_int8 = static_cast<int8_t *>( + paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC)); + packedB_int8 = static_cast<int8_t *>( + paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC)); + packedC_int8 = static_cast<int32_t *>( + paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC)); + zero_int8 = + static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC)); + + memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC); + int32_t mc, nc; + for (int32_t j = 0; j < n; j += NC) { + nc = s_min(n - j, NC); + PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB_int8); + for (int32_t i = 0; i < m; i += MC) { + mc = s_min(m - i, MC); + // PackMatrixA_6r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8); + PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8); + if (bias == nullptr) { + InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta, + packedC_int8, &C(i, j), ldc, relu, nullptr); + } else { + InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta, + packedC_int8, &C(i, j), ldc, relu, bias + i); + } + } + } + + paddle_mobile::memory::Free(packedA_int8); + paddle_mobile::memory::Free(packedB_int8); + paddle_mobile::memory::Free(packedC_int8); + paddle_mobile::memory::Free(zero_int8); +} + +// 8 bits int write back +// C = alpha * A * B + beta * C +void Gemm::WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc) {} +// C = A * B, 8-bit int input, int32_t output +void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc) { +#if __ARM_NEON +#if __aarch64__ +// TODO +#else + int32_t nc1 = nc >> 4; + int32_t _nc1 = nc & 15; + int32_t step = sizeof(int32_t) * ldc; + int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 4)); + int32_t volatile m = mc; + + int32_t *volatile c_ptr, *volatile C_ptr; + int32_t *C0, *c0; + c_ptr = c; + C_ptr = C; + if (nc1 > 0) { + asm volatile( + "subs %[mc], %[mc], #1 \n\t" + "blt end_mc_%= \n\t" + "loop_mc_%=: \n\t" + + "mov r6, %[C_ptr] \n\t" + "mov r5, %[nc1] \n\t" + "subs r5, r5, #1 \n\t" + "blt end_nc1_%= \n\t" + "loop_nc1_%=: \n\t" + + "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" + "vst1.32 {q0, q1}, [r6]! \n\t" + + "vld1.32 {q2, q3}, [%[c_ptr]]! \n\t" + "vst1.32 {q2, q3}, [r6]! 
\n\t" + + "subs r5, r5, #1 \n\t" + "bge loop_nc1_%= \n\t" + "end_nc1_%=: \n\t" + + "add %[C_ptr], %[C_ptr], %[step] \n\t" + "add %[c_ptr], %[c_ptr], %[step1] \n\t" + "subs %[mc], %[mc], #1 \n\t" + "bge loop_mc_%= \n\t" + "end_mc_%=: \n\t" + + : + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), + [step] "r"(step), [step1] "r"(step1) + : "memory", "r5", "r6", "q0", "q1", "q2", "q3"); + } + + if (_nc1 != 0) { + for (int32_t i = 0; i < mc; i++) { + C0 = C_ptr + nc1 * 16 + i * ldc; + c0 = c_ptr + nc1 * 16 + i * NC; + for (int32_t j = 0; j < _nc1; j++) { + *C0++ = *c0++; + } + } + } +#endif // __aarch64__ +#endif // __ARM_NEON +} + +// C = A * B + C +void Gemm::WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc) {} + +// C = A * B + bias +void Gemm::WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc, int8_t *bias) {} +// C = A * B + C, relu(C) +void Gemm::WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc) {} + +// C = A * B + bias, relu(C) +void Gemm::WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc, int8_t *bias) {} + +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/im2col.cpp b/src/operators/math/im2col.cpp index e23b02486bec81c4685420d451dc093dd420ac97..47055ec4f24e5b5b226c1f084bb2253d2ebb77c7 100644 --- a/src/operators/math/im2col.cpp +++ b/src/operators/math/im2col.cpp @@ -28,91 +28,240 @@ namespace math { * [input_channels, filter_height, filter_width, output_height, * output_width] */ -template -class Im2ColFunctor { - public: - void operator()(const framework::Tensor &im, const std::vector &dilation, - const std::vector &stride, - const std::vector &padding, framework::Tensor *col) { - // PADDLE_ENFORCE(im.dims().size() == 3); - // PADDLE_ENFORCE(col->dims().size() == 5); +template <> +void Im2ColFunctor::operator()( + const framework::Tensor &im, const std::vector &dilation, + const std::vector &stride, const std::vector &padding, + framework::Tensor *col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int col_height = col->dims()[3]; + int col_width = col->dims()[4]; + + int channels_col = im_channels * filter_height * filter_width; + const float *im_data = im.data(); + float *col_data = col->data(); +#if __ARM_NEON + const int osize = col_height; + const int isize = im_height; + bool pad1 = padding[0] > 0; + bool pad2 = + (pad1 && padding[1] && + (((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 
1 : 0)); + int fill = isize % 2; + if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 && + dilation[0] == 1 && im_height > 2 && im_height == im_width) { + for (int c = 0; c < im_channels; ++c) { + int oosize = osize * osize; + int nk4 = osize / 4; + int mk4 = osize % 4; + + float *col0 = col_data + 0 * oosize + 2 * osize + 2; + float *col1 = col_data + 1 * oosize + 2 * osize + 1; + float *col2 = col_data + 2 * oosize + 2 * osize; + + float *col3 = col_data + 3 * oosize + osize + 2; + float *col4 = col_data + 4 * oosize + osize + 1; + float *col5 = col_data + 5 * oosize + osize; + + float *col6 = col_data + 6 * oosize + 2; + float *col7 = col_data + 7 * oosize + 1; + float *col8 = col_data + 8 * oosize; + + float32x4_t im1; + const float *im_tmp_data = im_data + osize + 1; + + int rrsize = oosize - osize - 1; + int nr4 = rrsize / 4; + int mr4 = rrsize % 4; + for (int i = 0; i < nr4; ++i) { + im1 = vld1q_f32(im_tmp_data); + vst1q_f32(col0, im1); + vst1q_f32(col1, im1); + vst1q_f32(col2, im1); + vst1q_f32(col3, im1); + vst1q_f32(col4, im1); + vst1q_f32(col5, im1); + vst1q_f32(col6, im1); + vst1q_f32(col7, im1); + vst1q_f32(col8, im1); + + col0 += 4; + col1 += 4; + col2 += 4; + col3 += 4; + col4 += 4; + col5 += 4; + col6 += 4; + col7 += 4; + col8 += 4; + + im_tmp_data += 4; + } + for (int i = 0; i < mr4; ++i) { + *col0 = *im_tmp_data; + *col1 = *im_tmp_data; + *col2 = *im_tmp_data; + *col3 = *im_tmp_data; + *col4 = *im_tmp_data; + *col5 = *im_tmp_data; + *col6 = *im_tmp_data; + *col7 = *im_tmp_data; + *col8 = *im_tmp_data; + + col0++; + col1++; + col2++; + col3++; + col4++; + col5++; + col6++; + col7++; + col8++; + + im_tmp_data++; + } - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int col_height = col->dims()[3]; - int col_width = col->dims()[4]; - - // PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - // - - // ((dilation[0] * (filter_height - 1) - // + 1))) / - // stride[0] + - // 1, - // col_height, - // "Output_height and - // padding(padding_up, padding_down) - // are " "inconsistent."); - // PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - // - - // ((dilation[1] * (filter_width - 1) - // + 1))) / - // stride[1] + - // 1, - // col_width, - // "Output_height and - // padding(padding_up, padding_down) - // are " "inconsistent."); + im_tmp_data = im_data + 1; + col0 = col_data + 0 * oosize + osize + 2; + col1 = col_data + 1 * oosize + osize + 1; + col2 = col_data + 2 * oosize + osize; + + col3 = col_data + 3 * oosize + 2; + col4 = col_data + 4 * oosize + 1; + col5 = col_data + 5 * oosize; + + for (int i = 0; i < nk4; i++) { + im1 = vld1q_f32(im_tmp_data); + vst1q_f32(col0, im1); + vst1q_f32(col1, im1); + vst1q_f32(col2, im1); + vst1q_f32(col3, im1); + vst1q_f32(col4, im1); + vst1q_f32(col5, im1); + + col0 += 4; + col1 += 4; + col2 += 4; + col3 += 4; + col4 += 4; + col5 += 4; + im_tmp_data += 4; + } - int channels_col = im_channels * filter_height * filter_width; - const T *im_data = im.data(); - T *col_data = col->data(); -#if __ARM_NEON - const int osize = col_height; - const int isize = im_height; - bool pad1 = padding[0] > 0; - bool pad2 = - (pad1 && padding[1] && - (((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 
1 : 0)); - int fill = isize % 2; - if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 && - dilation[0] == 1 && im_height > 2 && im_height == im_width) { - for (int c = 0; c < im_channels; ++c) { - int oosize = osize * osize; - int nk4 = osize / 4; - int mk4 = osize % 4; - - float *col0 = col_data + 0 * oosize + 2 * osize + 2; - float *col1 = col_data + 1 * oosize + 2 * osize + 1; - float *col2 = col_data + 2 * oosize + 2 * osize; - - float *col3 = col_data + 3 * oosize + osize + 2; - float *col4 = col_data + 4 * oosize + osize + 1; - float *col5 = col_data + 5 * oosize + osize; - - float *col6 = col_data + 6 * oosize + 2; - float *col7 = col_data + 7 * oosize + 1; - float *col8 = col_data + 8 * oosize; - - float32x4_t im1; - const float *im_tmp_data = im_data + osize + 1; - - int rrsize = oosize - osize - 1; - int nr4 = rrsize / 4; - int mr4 = rrsize % 4; - for (int i = 0; i < nr4; ++i) { - im1 = vld1q_f32(im_tmp_data); - vst1q_f32(col0, im1); - vst1q_f32(col1, im1); - vst1q_f32(col2, im1); - vst1q_f32(col3, im1); - vst1q_f32(col4, im1); - vst1q_f32(col5, im1); - vst1q_f32(col6, im1); - vst1q_f32(col7, im1); - vst1q_f32(col8, im1); + for (int i = 0; i < mk4; i++) { + *col0 = *im_tmp_data; + *col1 = *im_tmp_data; + *col2 = *im_tmp_data; + *col3 = *im_tmp_data; + *col4 = *im_tmp_data; + *col5 = *im_tmp_data; + col0++; + col1++; + col2++; + col3++; + col4++; + col5++; + + im_tmp_data++; + } + + // fill 0 1 11; + for (int i = 0; i < osize; ++i) { + col_data[0 * oosize + i * osize] = 0.0; + col_data[3 * oosize + i * osize] = 0.0; + col_data[6 * oosize + i * osize] = 0.0; + + col_data[2 * oosize + osize - 1 + i * osize] = 0.0; + col_data[5 * oosize + osize - 1 + i * osize] = 0.0; + col_data[8 * oosize + osize - 1 + i * osize] = 0.0; + } + + col_data[0 * oosize + osize + 1] = im_data[0]; + col_data[3 * oosize + 1] = im_data[0]; + col_data[6 * oosize + 1] = im_data[osize]; + + col_data[1 * oosize + osize] = im_data[0]; + col_data[4 * oosize] = im_data[0]; + col_data[7 * oosize] = im_data[osize]; + + float32x4_t zero4; + zero4 = vdupq_n_f32(0.0); + auto col_z0 = col_data; + auto col_z1 = col_data + oosize; + auto col_z2 = col_data + 2 * oosize; + auto col_z6 = col_data + 6 * oosize + osize * (osize - 1); + auto col_z7 = col_data + 7 * oosize + osize * (osize - 1); + auto col_z8 = col_data + 8 * oosize + osize * (osize - 1); + + for (int i = 0; i < nk4; ++i) { + vst1q_f32(col_z0, zero4); + vst1q_f32(col_z1, zero4); + vst1q_f32(col_z2, zero4); + vst1q_f32(col_z6, zero4); + vst1q_f32(col_z7, zero4); + vst1q_f32(col_z8, zero4); + + col_z0 += 4; + col_z1 += 4; + col_z2 += 4; + col_z6 += 4; + col_z7 += 4; + col_z8 += 4; + } + + for (int i = 0; i < mk4; ++i) { + col_z0[i] = 0.0; + col_z1[i] = 0.0; + col_z2[i] = 0.0; + col_z6[i] = 0.0; + col_z7[i] = 0.0; + col_z8[i] = 0.0; + } + col_data += 9 * oosize; + im_data += isize * isize; + } + } else if (stride[0] == 2 && filter_height == 3 && pad1 && dilation[0] == 1 && + im_height > 2 && im_height == im_width) { + for (int c = 0; c < im_channels; ++c) { + int oosize = osize * osize; + int nk4 = osize / 4; + int mk4 = osize % 4; + + // 3 2 3 1 0 1 3 2 3 + float *col0 = col_data + 0 * oosize + osize + 1; + float *col1 = col_data + 1 * oosize + osize; + float *col2 = col_data + 2 * oosize + osize; + + float *col3 = col_data + 3 * oosize + 1; + float *col4 = col_data + 4 * oosize; + float *col5 = col_data + 5 * oosize; + + float *col6 = col_data + 6 * oosize + 1; + float *col7 = col_data + 7 * oosize; + float *col8 = col_data + 8 * oosize; + + float32x4x2_t 
im01; + float32x4x2_t im23; + const float *im_tmp_data0 = im_data; + const float *im_tmp_data2 = im_data + isize; + + for (int j = 0; j < osize; ++j) { + for (int i = 0; i < nk4; ++i) { + im01 = vld2q_f32(im_tmp_data0); + im23 = vld2q_f32(im_tmp_data2); + vst1q_f32(col0, im23.val[1]); + vst1q_f32(col1, im23.val[0]); + vst1q_f32(col2, im23.val[1]); + vst1q_f32(col3, im01.val[1]); + vst1q_f32(col4, im01.val[0]); + vst1q_f32(col5, im01.val[1]); + vst1q_f32(col6, im23.val[1]); + vst1q_f32(col7, im23.val[0]); + vst1q_f32(col8, im23.val[1]); col0 += 4; col1 += 4; @@ -124,18 +273,21 @@ class Im2ColFunctor { col7 += 4; col8 += 4; - im_tmp_data += 4; + im_tmp_data0 += 8; + im_tmp_data2 += 8; } - for (int i = 0; i < mr4; ++i) { - *col0 = *im_tmp_data; - *col1 = *im_tmp_data; - *col2 = *im_tmp_data; - *col3 = *im_tmp_data; - *col4 = *im_tmp_data; - *col5 = *im_tmp_data; - *col6 = *im_tmp_data; - *col7 = *im_tmp_data; - *col8 = *im_tmp_data; + const float *im_tmp_data1 = im_tmp_data0 + 1; + const float *im_tmp_data3 = im_tmp_data2 + 1; + for (int i = 0; i < mk4; ++i) { + *col0 = *im_tmp_data3; + *col1 = *im_tmp_data2; + *col2 = *im_tmp_data3; + *col3 = *im_tmp_data1; + *col4 = *im_tmp_data0; + *col5 = *im_tmp_data1; + *col6 = *im_tmp_data3; + *col7 = *im_tmp_data2; + *col8 = *im_tmp_data3; col0++; col1++; @@ -146,271 +298,215 @@ class Im2ColFunctor { col6++; col7++; col8++; - - im_tmp_data++; + im_tmp_data0 += 2; + im_tmp_data1 += 2; + im_tmp_data2 += 2; + im_tmp_data3 += 2; } - - im_tmp_data = im_data + 1; - col0 = col_data + 0 * oosize + osize + 2; - col1 = col_data + 1 * oosize + osize + 1; - col2 = col_data + 2 * oosize + osize; - - col3 = col_data + 3 * oosize + 2; - col4 = col_data + 4 * oosize + 1; - col5 = col_data + 5 * oosize; - - for (int i = 0; i < nk4; i++) { - im1 = vld1q_f32(im_tmp_data); - vst1q_f32(col0, im1); - vst1q_f32(col1, im1); - vst1q_f32(col2, im1); - vst1q_f32(col3, im1); - vst1q_f32(col4, im1); - vst1q_f32(col5, im1); - - col0 += 4; - col1 += 4; - col2 += 4; - col3 += 4; - col4 += 4; - col5 += 4; - im_tmp_data += 4; - } - - for (int i = 0; i < mk4; i++) { - *col0 = *im_tmp_data; - *col1 = *im_tmp_data; - *col2 = *im_tmp_data; - *col3 = *im_tmp_data; - *col4 = *im_tmp_data; - *col5 = *im_tmp_data; - col0++; - col1++; - col2++; - col3++; - col4++; - col5++; - - im_tmp_data++; - } - - // fill 0 1 11; - for (int i = 0; i < osize; ++i) { - col_data[0 * oosize + i * osize] = 0.0; - col_data[3 * oosize + i * osize] = 0.0; - col_data[6 * oosize + i * osize] = 0.0; - + im_tmp_data0 += (isize - fill); + im_tmp_data2 += (isize - fill); + } + for (int i = 0; i < osize; ++i) { + col_data[0 * oosize + i * osize] = 0.0; + col_data[3 * oosize + i * osize] = 0.0; + col_data[6 * oosize + i * osize] = 0.0; + if (pad2) { col_data[2 * oosize + osize - 1 + i * osize] = 0.0; col_data[5 * oosize + osize - 1 + i * osize] = 0.0; col_data[8 * oosize + osize - 1 + i * osize] = 0.0; } - - col_data[0 * oosize + osize + 1] = im_data[0]; - col_data[3 * oosize + 1] = im_data[0]; - col_data[6 * oosize + 1] = im_data[osize]; - - col_data[1 * oosize + osize] = im_data[0]; - col_data[4 * oosize] = im_data[0]; - col_data[7 * oosize] = im_data[osize]; - - float32x4_t zero4; - zero4 = vdupq_n_f32(0.0); - auto col_z0 = col_data; - auto col_z1 = col_data + oosize; - auto col_z2 = col_data + 2 * oosize; - auto col_z6 = col_data + 6 * oosize + osize * (osize - 1); - auto col_z7 = col_data + 7 * oosize + osize * (osize - 1); - auto col_z8 = col_data + 8 * oosize + osize * (osize - 1); - - for (int i = 0; i < nk4; ++i) 
{ - vst1q_f32(col_z0, zero4); - vst1q_f32(col_z1, zero4); - vst1q_f32(col_z2, zero4); + } + float32x4_t zero4; + zero4 = vdupq_n_f32(0.0); + auto col_z0 = col_data; + auto col_z1 = col_data + oosize; + auto col_z2 = col_data + 2 * oosize; + auto col_z6 = col_data + 6 * oosize + osize * (osize - 1); + auto col_z7 = col_data + 7 * oosize + osize * (osize - 1); + auto col_z8 = col_data + 8 * oosize + osize * (osize - 1); + + for (int i = 0; i < nk4; ++i) { + vst1q_f32(col_z0, zero4); + vst1q_f32(col_z1, zero4); + vst1q_f32(col_z2, zero4); + if (pad2) { vst1q_f32(col_z6, zero4); vst1q_f32(col_z7, zero4); vst1q_f32(col_z8, zero4); - - col_z0 += 4; - col_z1 += 4; - col_z2 += 4; - col_z6 += 4; - col_z7 += 4; - col_z8 += 4; } + col_z0 += 4; + col_z1 += 4; + col_z2 += 4; + col_z6 += 4; + col_z7 += 4; + col_z8 += 4; + } - for (int i = 0; i < mk4; ++i) { - col_z0[i] = 0.0; - col_z1[i] = 0.0; - col_z2[i] = 0.0; + for (int i = 0; i < mk4; ++i) { + col_z0[i] = 0.0; + col_z1[i] = 0.0; + col_z2[i] = 0.0; + if (pad2) { col_z6[i] = 0.0; col_z7[i] = 0.0; col_z8[i] = 0.0; } - col_data += 9 * oosize; - im_data += isize * isize; } - } else if (stride[0] == 2 && filter_height == 3 && pad1 && - dilation[0] == 1 && im_height > 2 && im_height == im_width) { - for (int c = 0; c < im_channels; ++c) { - int oosize = osize * osize; - int nk4 = osize / 4; - int mk4 = osize % 4; - - // 3 2 3 1 0 1 3 2 3 - float *col0 = col_data + 0 * oosize + osize + 1; - float *col1 = col_data + 1 * oosize + osize; - float *col2 = col_data + 2 * oosize + osize; - - float *col3 = col_data + 3 * oosize + 1; - float *col4 = col_data + 4 * oosize; - float *col5 = col_data + 5 * oosize; - - float *col6 = col_data + 6 * oosize + 1; - float *col7 = col_data + 7 * oosize; - float *col8 = col_data + 8 * oosize; - - float32x4x2_t im01; - float32x4x2_t im23; - const float *im_tmp_data0 = im_data; - const float *im_tmp_data2 = im_data + isize; - - for (int j = 0; j < osize; ++j) { - for (int i = 0; i < nk4; ++i) { - im01 = vld2q_f32(im_tmp_data0); - im23 = vld2q_f32(im_tmp_data2); - vst1q_f32(col0, im23.val[1]); - vst1q_f32(col1, im23.val[0]); - vst1q_f32(col2, im23.val[1]); - vst1q_f32(col3, im01.val[1]); - vst1q_f32(col4, im01.val[0]); - vst1q_f32(col5, im01.val[1]); - vst1q_f32(col6, im23.val[1]); - vst1q_f32(col7, im23.val[0]); - vst1q_f32(col8, im23.val[1]); - - col0 += 4; - col1 += 4; - col2 += 4; - col3 += 4; - col4 += 4; - col5 += 4; - col6 += 4; - col7 += 4; - col8 += 4; - - im_tmp_data0 += 8; - im_tmp_data2 += 8; - } - const float *im_tmp_data1 = im_tmp_data0 + 1; - const float *im_tmp_data3 = im_tmp_data2 + 1; - for (int i = 0; i < mk4; ++i) { - *col0 = *im_tmp_data3; - *col1 = *im_tmp_data2; - *col2 = *im_tmp_data3; - *col3 = *im_tmp_data1; - *col4 = *im_tmp_data0; - *col5 = *im_tmp_data1; - *col6 = *im_tmp_data3; - *col7 = *im_tmp_data2; - *col8 = *im_tmp_data3; - - col0++; - col1++; - col2++; - col3++; - col4++; - col5++; - col6++; - col7++; - col8++; - im_tmp_data0 += 2; - im_tmp_data1 += 2; - im_tmp_data2 += 2; - im_tmp_data3 += 2; - } - im_tmp_data0 += (isize - fill); - im_tmp_data2 += (isize - fill); - } - for (int i = 0; i < osize; ++i) { - col_data[0 * oosize + i * osize] = 0.0; - col_data[3 * oosize + i * osize] = 0.0; - col_data[6 * oosize + i * osize] = 0.0; - if (pad2) { - col_data[2 * oosize + osize - 1 + i * osize] = 0.0; - col_data[5 * oosize + osize - 1 + i * osize] = 0.0; - col_data[8 * oosize + osize - 1 + i * osize] = 0.0; - } - } - float32x4_t zero4; - zero4 = vdupq_n_f32(0.0); - auto col_z0 = col_data; - auto col_z1 
= col_data + oosize; - auto col_z2 = col_data + 2 * oosize; - auto col_z6 = col_data + 6 * oosize + osize * (osize - 1); - auto col_z7 = col_data + 7 * oosize + osize * (osize - 1); - auto col_z8 = col_data + 8 * oosize + osize * (osize - 1); - for (int i = 0; i < nk4; ++i) { - vst1q_f32(col_z0, zero4); - vst1q_f32(col_z1, zero4); - vst1q_f32(col_z2, zero4); - if (pad2) { - vst1q_f32(col_z6, zero4); - vst1q_f32(col_z7, zero4); - vst1q_f32(col_z8, zero4); - } - col_z0 += 4; - col_z1 += 4; - col_z2 += 4; - col_z6 += 4; - col_z7 += 4; - col_z8 += 4; - } + col_data[1 * oosize + osize] = im_data[isize]; + for (int i = 1; i < osize; ++i) { + col_data[3 * oosize + i] = im_data[(i - 1) * stride[0] + 1]; + } + col_data[4 * oosize] = im_data[0]; + col_data[7 * oosize] = im_data[isize]; - for (int i = 0; i < mk4; ++i) { - col_z0[i] = 0.0; - col_z1[i] = 0.0; - col_z2[i] = 0.0; - if (pad2) { - col_z6[i] = 0.0; - col_z7[i] = 0.0; - col_z8[i] = 0.0; - } - } + col_data += 9 * oosize; + im_data += isize * isize; + } + } else { + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < col_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * col_height + h) * col_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - col_data[1 * oosize + osize] = im_data[isize]; - for (int i = 1; i < osize; ++i) { - col_data[3 * oosize + i] = im_data[(i - 1) * stride[0] + 1]; + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? static_cast(0) + : im_data[im_idx]; } - col_data[4 * oosize] = im_data[0]; - col_data[7 * oosize] = im_data[isize]; - - col_data += 9 * oosize; - im_data += isize * isize; + } + } + } +#else + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < col_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < col_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * col_height + h) * col_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? static_cast(0) + : im_data[im_idx]; + } + } + } +#endif +} + +void ExtractToImg(const int8_t *im_data, int8_t *col_data, const int im_height, + const int im_width, const int col_height, const int col_width, + const int padding_h, const int padding_w, const int stride_h, + const int stride_w, const int kh, const int kw) { + int h = padding_h - kh; + int w = padding_w - kw; + int col_start_height = h > 0 ? (h + stride_h - 1) / stride_h : 0; + int col_start_width = w > 0 ? (w + stride_w - 1) / stride_w : 0; + int start_height = kh + col_start_height * stride_h - padding_h; + int start_width = kw + col_start_width * stride_w - padding_w; + + int end_height = (col_height - col_start_height) * stride_h + start_height; + end_height = end_height > im_height ? 
im_height : end_height; + int end_width = (col_width - col_start_width) * stride_w + start_width; + end_width = end_width > im_width ? im_width : end_width; + int extract = (end_width - start_width + stride_w - 1) / stride_w; + + im_data += start_height * im_width + start_width; + col_data += col_start_height * col_width + col_start_width; + for (int i = start_height; i < end_height; i += stride_h) { + if (stride_w == 1) { + memcpy(col_data, im_data, extract * sizeof(int8_t)); + } else if (stride_w == 2) { + int s = 0; +#if __ARM_NEON + for (; s < extract - 15; s += 16) { + int8x16x2_t img = vld2q_s8(im_data + s * 2); + vst1q_s8(col_data + s, img.val[0]); + } +#endif + for (; s < extract; ++s) { + col_data[s] = im_data[s * 2]; + } + } else if (stride_w == 3) { + int s = 0; +#if __ARM_NEON + for (; s < extract - 15; s += 16) { + int8x16x3_t img = vld3q_s8(im_data + s * 3); + vst1q_s8(col_data + s, img.val[0]); + } +#endif + for (; s < extract; ++s) { + col_data[s] = im_data[s * 3]; + } + } else if (stride_w == 4) { + int s = 0; +#if __ARM_NEON + for (; s < extract - 15; s += 16) { + int8x16x4_t img = vld4q_s8(im_data + s * 4); + vst1q_s8(col_data + s, img.val[0]); + } +#endif + for (; s < extract; ++s) { + col_data[s] = im_data[s * 4]; } } else { - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < col_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < col_width; ++w) { - int im_col_idx = - w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * col_height + h) * col_width + w; - int im_idx = - (im_row_idx + c_im * im_height) * im_width + im_col_idx; - - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? 
static_cast(0) - : im_data[im_idx]; - } + PADDLE_MOBILE_THROW_EXCEPTION("stride_w must be one of 1, 2, 3 and 4."); + } + im_data += im_width * stride_h; + col_data += col_width; + } +} + +/* + * im = [input_channels, input_height, input_width] + * col = + * [input_channels, filter_height, filter_width, output_height, + * output_width] + */ +template <> +void Im2ColFunctor::operator()( + const framework::Tensor &im, const std::vector &dilation, + const std::vector &stride, const std::vector &padding, + framework::Tensor *col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int col_height = col->dims()[3]; + int col_width = col->dims()[4]; + + int channels_col = im_channels * filter_height * filter_width; + const int8_t *im_data = im.data(); + int8_t *col_data = col->data(); +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + if (stride[0] <= 4 && dilation[0] == 1 && dilation[0] == dilation[1]) { + // pad 0 + memset(col_data, 0, col->numel() * sizeof(int8_t)); + for (int ic = 0; ic < im_channels; ++ic) { + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + ExtractToImg(im_data, col_data, im_height, im_width, col_height, + col_width, padding[0], padding[1], stride[0], stride[1], + kh, kw); + col_data += col_height * col_width; } } + im_data += im_height * im_width; } -#else + } else { +#endif for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; @@ -424,14 +520,15 @@ class Im2ColFunctor { col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || im_col_idx < 0 || im_col_idx >= im_width) - ? static_cast(0) + ? 
static_cast(0) : im_data[im_idx]; } } } -#endif +#if defined(__ARM_NEON__) || defined(__ARM_NEON) } -}; +#endif +} /* * im = [input_channels, input_height, input_width] @@ -456,27 +553,6 @@ class Col2ImFunctor { int col_height = col.dims()[3]; int col_width = col.dims()[4]; - // PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] - // - - // ((dilation[0] * (filter_height - 1) - // + 1))) / - // stride[0] + - // 1, - // col_height, - // "Output_height and - // padding(padding_up, padding_down) - // are " "inconsistent."); - // PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] - // - - // ((dilation[1] * (filter_width - 1) - // + 1))) / - // stride[1] + - // 1, - // col_width, - // "Output_height and - // padding(padding_up, padding_down) - // are " "inconsistent."); - int channels_col = im_channels * filter_height * filter_width; T *im_data = im->data(); @@ -503,9 +579,9 @@ class Col2ImFunctor { }; template class Im2ColFunctor; -// template class Im2ColFunctor; +template class Im2ColFunctor; template class Col2ImFunctor; -template class Col2ImFunctor; +template class Col2ImFunctor; /* * im = [input_channels, input_height, input_width] @@ -519,8 +595,6 @@ class Im2ColFunctor { void operator()(const framework::Tensor &im, const std::vector &dilation, const std::vector &stride, const std::vector &padding, framework::Tensor *col) { - // PADDLE_ENFORCE(im.dims().size() == 3); - // PADDLE_ENFORCE(col->dims().size() == 5); int im_channels = im.dims()[0]; int im_height = im.dims()[1]; int im_width = im.dims()[2]; @@ -528,18 +602,6 @@ class Im2ColFunctor { int filter_width = col->dims()[4]; int col_height = col->dims()[0]; int col_width = col->dims()[1]; - // PADDLE_ENFORCE_EQ( - // (im_height + padding[0] + padding[2] - - // filter_height) / stride[0] - // + 1, col_height, "Output_height and - // padding(padding_up, - // padding_down) are " "inconsistent."); - // PADDLE_ENFORCE_EQ( - // (im_width + padding[1] + padding[3] - - // filter_width) / stride[1] + - // 1, col_width, "col_width and padding(padding_left, - // padding_right) - // are " "inconsistent."); const T *im_data = im.data(); T *col_data = col->data(); @@ -589,8 +651,6 @@ class Col2ImFunctor { const std::vector &dilation, const std::vector &stride, const std::vector &padding, framework::Tensor *im) { - // PADDLE_ENFORCE(im->dims().size() == 3); - // PADDLE_ENFORCE(col.dims().size() == 5); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; int im_width = im->dims()[2]; @@ -599,19 +659,6 @@ class Col2ImFunctor { int col_height = col.dims()[0]; int col_width = col.dims()[1]; - // PADDLE_ENFORCE_EQ( - // (im_height + padding[0] + padding[2] - - // filter_height) / stride[0] - // + 1, col_height, "Output_height and - // padding(padding_up, - // padding_down) are " "inconsistent."); - // PADDLE_ENFORCE_EQ( - // (im_width + padding[1] + padding[3] - - // filter_width) / stride[1] + - // 1, col_width, "col_width and padding(padding_left, - // padding_right) - // are " "inconsistent."); - T *im_data = im->data(); const T *col_data = col.data(); @@ -651,9 +698,7 @@ class Col2ImFunctor { }; template class Im2ColFunctor; -template class Im2ColFunctor; template class Col2ImFunctor; -template class Col2ImFunctor; } // namespace math } // namespace operators diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp index 9d39f89b04ebcef93fa9d122d629bdf6f4586c66..4365bf5716b8b5811f6ac66217b2fe74ae116f52 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ 
-15,12 +15,31 @@ limitations under the License. */ #include "operators/math/math_function.h" #include #include +#include "framework/data_type.h" +#include "framework/tensor.h" #include "operators/math/gemm.h" namespace paddle_mobile { namespace operators { namespace math { +struct TensorSetConstant { + TensorSetConstant(framework::Tensor *tensor, float value) + : tensor_(tensor), value_(value) {} + template + void apply() const { + auto *begin = tensor_->mutable_data(); + std::fill(begin, begin + tensor_->numel(), static_cast(value_)); + } + framework::Tensor *tensor_; + float value_; +}; + +void set_constant(framework::Tensor *tensor, float value) { + framework::VisitDataType(framework::ToDataType(tensor->type()), + TensorSetConstant(tensor, value)); +} + template <> void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, float alpha, @@ -135,7 +154,7 @@ template struct ClearTensor { void operator()(framework::Tensor *tensor) { auto size = tensor->numel(); - auto *tensor_data = tensor->data(); + auto *tensor_data = tensor->data(); memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT } }; @@ -151,9 +170,9 @@ struct RowwiseAdd { PADDLE_MOBILE_ENFORCE((output->dims() == in_dims), "output->dims() must be equal to in_dims."); - auto *input_data = input.data(); - auto *out_data = output->data(); - auto *vec_data = vector.data(); + auto *input_data = input.data(); + auto *out_data = output->data(); + auto *vec_data = vector.data(); for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t j = 0; j < size; ++j) { out_data[i * size + j] = input_data[i * size + j] + vec_data[j]; diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h index de19e3df2ab69c8ac490b09af2852bf2fa806c64..b91242c1868398e4541c3727567a905e5b0c8714 100644 --- a/src/operators/math/math_function.h +++ b/src/operators/math/math_function.h @@ -15,17 +15,20 @@ limitations under the License. */ #pragma once #include +#include #include "framework/tensor.h" namespace paddle_mobile { namespace operators { namespace math { +void set_constant(framework::Tensor *tensor, float value); + template void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, T alpha, framework::Tensor *matrix_out, T beta, bool relu = false, - float *bias = nullptr); + T *bias = nullptr); template void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, diff --git a/src/operators/math/math_function_int8.cpp b/src/operators/math/math_function_int8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..70677223d12ded2da07ab53bc371f1e8da9fe293 --- /dev/null +++ b/src/operators/math/math_function_int8.cpp @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include <cstdint> +#include <string> +#include "operators/math/gemm.h" +#include "operators/math/math_function.h" + +namespace paddle_mobile { +namespace operators { +namespace math { +template <> +void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a, + const framework::Tensor &matrix_b, bool trans_b, + int8_t alpha, framework::Tensor *matrix_out, int8_t beta, + bool relu, int8_t *bias) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_MOBILE_ENFORCE( + dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul must be matrices"); + + int32_t M = dim_out[0]; + int32_t N = dim_out[1]; + int32_t K = (!trans_a) ? dim_a[1] : dim_a[0]; + Gemm gemm; + + if (trans_a) { + int32_t numel = matrix_a.numel(); + int32_t m = matrix_a.dims()[0]; + int32_t n = matrix_a.dims()[1]; + int8_t *tmp = (int8_t *)(matrix_a.data<int8_t>()); // NOLINT + int8_t *a = static_cast<int8_t *>( + paddle_mobile::memory::Alloc(sizeof(int8_t) * numel)); + // transpose A into a temporary row-major buffer + int32_t index = 0; + for (int32_t j = 0; j < n; j++) { + for (int32_t i = 0; i < m; i++) { + a[index++] = tmp[i * n + j]; + } + } + + gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta, + matrix_out->data<int32_t>(), N, relu, bias); + paddle_mobile::memory::Free(a); + } else { + gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K, + matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), + N, relu, bias); + } +}
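+ +// Usage sketch (hypothetical shapes): with matrix_a 4x64, matrix_b 64x32 and +// matrix_out 4x32, matmul<int8_t>(a, false, b, false, 1, &out, 0, false, +// nullptr) computes out = a * b through Gemm::Sgemm. Note that alpha == 1 +// with beta == 0 is the only int8 write-back path Gemm::WriteBasic currently +// implements; the alpha/beta/bias variants are still empty stubs.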
+} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/pad.cpp b/src/operators/math/pad.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d8153c445b007e8c5a902301e2724f22c8f6add1 --- /dev/null +++ b/src/operators/math/pad.cpp @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/math/pad.h" + +namespace paddle_mobile { +namespace operators { +namespace math { + +template <typename T> +class PadFunctor<CPU, T> { + public: + void operator()(const framework::Tensor &input, const int pad_h, + const int pad_w, framework::Tensor *output) { + const T *in_data = input.data<T>(); + T *out_data = output->mutable_data<T>(); + const framework::DDim &input_shape = input.dims(); + const framework::DDim &output_shape = output->dims(); + // fill output with 0 + memset(out_data, 0, sizeof(T) * output->numel()); + // the caller should make sure the output shape matches the input shape + // plus 2 * pad_h rows and 2 * pad_w columns + for (int i = 0; i < input_shape[0]; ++i) { + for (int c = 0; c < input_shape[1]; ++c) { + out_data += pad_h * output_shape[3]; + for (int h = 0; h < input_shape[2]; ++h) { + memcpy(out_data + pad_w, in_data, sizeof(T) * input_shape[3]); + out_data += output_shape[3]; + in_data += input_shape[3]; + } + out_data += pad_h * output_shape[3]; + } + } + } +}; + +template class PadFunctor<CPU, float>; +template class PadFunctor<CPU, int8_t>; + +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/pad.h b/src/operators/math/pad.h new file mode 100644 index 0000000000000000000000000000000000000000..0f5a4b89674f92746f75bb1e4f9364d5a16fdba2 --- /dev/null +++ b/src/operators/math/pad.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "framework/tensor.h" + +namespace paddle_mobile { +namespace operators { +namespace math { + +template +class PadFunctor { + public: + void operator()(const framework::Tensor &input, const int pad_h, + const int pad_w, framework::Tensor *output); +}; + +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/math/vol2col.cpp b/src/operators/math/vol2col.cpp index afee3f7f85a6b2b3f84e9c3430211c4d97656d1c..9311e9e2291709631bc8ee07d2cc94f9ca99f4c2 100644 --- a/src/operators/math/vol2col.cpp +++ b/src/operators/math/vol2col.cpp @@ -32,9 +32,6 @@ class Vol2ColFunctor { void operator()(const Tensor &vol, const std::vector &dilations, const std::vector &strides, const std::vector &paddings, Tensor *col) const { - // PADDLE_ENFORCE(vol.dims().size() == 4); - // PADDLE_ENFORCE(col->dims().size() == 7); - int input_channels = vol.dims()[0]; int input_depth = vol.dims()[1]; int input_height = vol.dims()[2]; @@ -48,32 +45,6 @@ class Vol2ColFunctor { int channels_col = input_channels * filter_depth * filter_height * filter_width; - // PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - - // ((dilations[0] * (filter_depth - 1) - // + 1))) / - // strides[0] + - // 1, - // output_depth, - // "input_depth and output_depth are " - // "mismatching."); - // PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - - // ((dilations[1] * (filter_height - - // 1) + 1))) / - // strides[1] + - // 1, - // output_height, - // "input_height and output_height are - // " - // "mismatching."); - // PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - - // ((dilations[2] * (filter_width - 1) - // + 1))) / - // strides[2] + - // 1, - // output_width, - // "input_width and output_width are " - // "mismatching."); - const T *vol_data = vol.data(); T *col_data = col->data(); @@ -119,9 +90,6 @@ class Col2VolFunctor { void operator()(const Tensor &col, const std::vector &dilations, const std::vector &strides, const std::vector &paddings, Tensor *vol) const { - // PADDLE_ENFORCE(vol->dims().size() == 4); - // PADDLE_ENFORCE(col.dims().size() == 7); - int input_channels = vol->dims()[0]; int input_depth = vol->dims()[1]; int input_height = vol->dims()[2]; @@ -135,31 +103,6 @@ class Col2VolFunctor { int channels_col = input_channels * filter_depth * filter_height * filter_width; - // PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - - // ((dilations[0] * (filter_depth - 1) - // + 1))) / - // strides[0] + - // 1, - // output_depth, - // "input_depth and output_depth are " - // "mismatching."); - // PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - - // ((dilations[1] * (filter_height - - // 1) + 1))) / - // strides[1] + - // 1, - // output_height, - // "input_height and output_height are - // " - // "mismatching."); - // PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - - // ((dilations[2] * (filter_width - 1) - // + 1))) / - // strides[2] + - // 1, - // output_width, - // "input_width and output_width are " - // "mismatching."); T *vol_data = vol->data(); const T *col_data = col.data(); @@ -195,9 +138,9 @@ class Col2VolFunctor { }; template class Vol2ColFunctor; -template class Vol2ColFunctor; +template class Vol2ColFunctor; template class Col2VolFunctor; -template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 70562da8f8961daed9c0057f3ebc8e1a1a6e340e..568cf77b8e4e81732cd9a783c1a9ea64d347102b 100644 --- a/src/operators/op_param.h +++ 
b/src/operators/op_param.h @@ -546,11 +546,11 @@ class MulParam : OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -999,6 +999,28 @@ class MultiClassNMSParam : public OpParam { }; #endif +#ifdef POLYGONBOXTRANSFORM_OP +template +class PolygonBoxTransformParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + PolygonBoxTransformParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_ = InputFrom(inputs, scope); + output_ = OutputFrom(outputs, scope); + } + const RType *Input() const { return input_; } + RType *Output() const { return output_; } + + private: + RType *input_; + RType *output_; +}; +#endif + template class FeedParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; @@ -1041,6 +1063,42 @@ class FetchParam : public OpParam { RType *out_; }; +#ifdef FILL_CONSTANT_OP +template +class FillConstantParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FillConstantParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + out_var_ = OutVarFrom(outputs, scope); + out_ = OutFrom(outputs, scope); + dtype_ = GetAttr("dtype", attrs); + shape_ = GetAttr>("shape", attrs); + value_ = GetAttr("value", attrs); + } + + Variable *OutVar() const { return out_var_; } + + RType *Out() const { return out_; } + + const int &DataDtype() const { return dtype_; } + + const vector &Shape() const { return shape_; } + + const float &Value() const { return value_; } + + private: + Variable *out_var_; + RType *out_; + int dtype_; + vector shape_; + float value_; +}; +#endif + #ifdef TRANSPOSE_OP template class TransposeParam : public OpParam { @@ -1401,11 +1459,11 @@ class FusionFcParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; @@ -1441,11 +1499,11 @@ class FusionConvAddParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; @@ -1496,11 +1554,11 @@ class FusionConvAddPReluParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return 
fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1554,11 +1612,11 @@ class FusionConvAddAddPReluParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1629,11 +1687,11 @@ class FusionConvAddBNReluParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1715,11 +1773,11 @@ class FusionConvBNAddReluParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1782,11 +1840,11 @@ class FusionConvBNParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1857,11 +1915,11 @@ class FusionConvAddBNParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1983,11 +2041,11 @@ class FusionConvBNReluParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -2272,6 +2330,7 @@ class ShapeParam : public OpParam { }; #endif +#ifdef QUANT_OP template class QuantizeParam : public OpParam { typedef typename 
DtypeTensorTrait::gtype GType; @@ -2282,14 +2341,12 @@ class QuantizeParam : public OpParam { const AttributeMap &attrs, const Scope &scope) { input_ = InputXFrom(inputs, scope); out_ = OutFrom(outputs, scope); - if (HasAttr("is_static", attrs)) { - is_static_ = GetAttr("is_static", attrs); - } // online // scale = max(abs(x)) online_scale_ = GetVarValue("OutScale", outputs, scope); // offline if (HasAttr("static_scale", attrs)) { + is_static_ = true; static_scale_ = GetAttr("static_scale", attrs); } // x = round(scale * x) @@ -2311,9 +2368,11 @@ class QuantizeParam : public OpParam { float static_scale_ = 1.0f; // round method type // nearest_zero and nearest_even is valid currently - RoundType round_type_ = ROUND_NEAREST_TO_EVEN; + RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO; }; +#endif +#ifdef DEQUANT_OP template class DequantizeParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; @@ -2341,6 +2400,7 @@ class DequantizeParam : public OpParam { RType *activation_scale_; float weight_scale_; }; +#endif } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/polygon_box_transform_op.cpp b/src/operators/polygon_box_transform_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a3eed0e2f30651ea2f7c3250187b30126ba4d283 --- /dev/null +++ b/src/operators/polygon_box_transform_op.cpp @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef POLYGONBOXTRANSFORM_OP + +#include "operators/polygon_box_transform_op.h" +namespace paddle_mobile { +namespace operators { + +template +void PolygonBoxTransformOp::InferShape() const { + PADDLE_MOBILE_ENFORCE(this->param_.Input() != nullptr, + "Input (Input) of polygon_box_transform op should not be null."); + PADDLE_MOBILE_ENFORCE(this->param_.Output() != nullptr, + "Output (Output) of polygon_box_transform op should not be null."); + + auto input_dims = this->param_.Input()->dims(); + + PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "input's rank must be 4."); + PADDLE_MOBILE_ENFORCE(input_dims[1] % 2 == 0, + "input's second dimension must be even."); + + this->param_.Output()->Resize(input_dims); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(polygon_box_transform, ops::PolygonBoxTransformOp); +#endif + +#endif diff --git a/src/operators/polygon_box_transform_op.h b/src/operators/polygon_box_transform_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e20765f715106d4b3c8a182d52e3ab135637c9e9 --- /dev/null +++ b/src/operators/polygon_box_transform_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef POLYGONBOXTRANSFORM_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/polygon_box_transform_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using paddle_mobile::framework::Tensor; + +template +class PolygonBoxTransformOp + : public framework::OperatorWithKernel< + DeviceType, PolygonBoxTransformParam, + operators::PolygonBoxTransformKernel> { + public: + PolygonBoxTransformOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, PolygonBoxTransformParam, + operators::PolygonBoxTransformKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, PolygonBoxTransformParam, + operators::PolygonBoxTransformKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/quantize_op.cpp b/src/operators/quantize_op.cpp index 7958b054de3665132b52582b8bd4126413c0597a..865539d7d26de41b319b4d82ed168b2ec74d722d 100644 --- a/src/operators/quantize_op.cpp +++ b/src/operators/quantize_op.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef QUANT_OP + #include "operators/quantize_op.h" #include @@ -33,3 +35,5 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp); #endif + +#endif diff --git a/src/operators/quantize_op.h b/src/operators/quantize_op.h index 2b0d2f8e321b9e15324e5aa2b38ba50fb4f7aebf..ca04c1213a5cdcb44082848fb45b1ade3f19086f 100644 --- a/src/operators/quantize_op.h +++ b/src/operators/quantize_op.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef QUANT_OP + #pragma once #include @@ -40,3 +42,5 @@ class QuantizeOp : public framework::OperatorWithKernel< } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/sum_op.cpp b/src/operators/sum_op.cpp index 8c0638c63ca7cab01047b757476549cf3832bf8a..2e10363b07498128b5573e27a3d63b59c454d8b6 100644 --- a/src/operators/sum_op.cpp +++ b/src/operators/sum_op.cpp @@ -26,7 +26,7 @@ void SumOp::InferShape() const { auto inputs = this->param_.Inputs(); const size_t n = inputs.size(); - std::vector inputs_dims; + std::vector inputs_dims; inputs_dims.reserve(n); for (int i = 0; i < n; i++) { inputs_dims.push_back(inputs[i]->dims()); @@ -65,7 +65,6 @@ REGISTER_OPERATOR_CPU(sum, ops::SumOp); REGISTER_OPERATOR_MALI_GPU(sum, ops::ConcatOp); #endif #ifdef PADDLE_MOBILE_FPGA -REGISTER_OPERATOR_FPGA(sum, ops::ConcatOp); #endif #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e52db8b8f04bd9e4ae86241542ea2565937c677b..a10088f9b417b628418404b8df3d340b851af383 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -12,6 +12,9 @@ if (CON GREATER -1) ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-googlenet paddle-mobile) + # gen test + ADD_EXECUTABLE(test-googlenet-quali net/test_googlenet_quali.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-googlenet-quali paddle-mobile) set(FOUND_MATCH ON) endif () @@ -61,38 +64,11 @@ endif () list(FIND NET "FPGAnets" CON) if (CON GREATER -1) - ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-resnet paddle-mobile) - ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet50 paddle-mobile) - ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-fpga-EW paddle-mobile) - - ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-fpga-conv paddle-mobile) - - ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-fpga-pooling paddle-mobile) - - ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-fpga-bypass paddle-mobile) - - ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-fpga-softmax paddle-mobile) - - ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-fpga-concat paddle-mobile) - - ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-tensor-quant paddle-mobile) - - ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h) - target_link_libraries(test-fpga-concat-op paddle-mobile) - - ADD_EXECUTABLE(test-format-data fpga/test_format_data.cpp test_helper.h test_include.h) - target_link_libraries(test-format-data paddle-mobile) +# ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) +# target_link_libraries(test-resnet paddle-mobile) set(FOUND_MATCH ON) endif () @@ -160,6 +136,10 @@ if (NOT 
FOUND_MATCH) ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-googlenet paddle-mobile) + # gen test + ADD_EXECUTABLE(test-googlenet-quali net/test_googlenet_quali.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-googlenet-quali paddle-mobile) + # gen test ADD_EXECUTABLE(test-conv-op operators/test_cov_op.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-conv-op paddle-mobile) @@ -208,6 +188,14 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h) target_link_libraries(test-multiclassnms-op paddle-mobile) + # gen test + ADD_EXECUTABLE(test-polygon-box-transform-op operators/test_polygon_box_transform_op.cpp test_helper.h test_include.h) + target_link_libraries(test-polygon-box-transform-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-fill-constant-op operators/test_fill_constant_op.cpp test_helper.h test_include.h) + target_link_libraries(test-fill-constant-op paddle-mobile) + # gen test ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h) target_link_libraries(test-reshape-op paddle-mobile) @@ -232,6 +220,10 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h) target_link_libraries(test-dequantize-op paddle-mobile) + # test int8 conv op + ADD_EXECUTABLE(test-int8-conv-op operators/test_int8_conv_op.cpp test_helper.h test_include.h) + target_link_libraries(test-int8-conv-op paddle-mobile) + # gen test log ADD_EXECUTABLE(test-log common/test_log.cpp) target_link_libraries(test-log paddle-mobile) @@ -244,6 +236,10 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-loadmemory framework/test_load_memory.cpp) target_link_libraries(test-loadmemory paddle-mobile) + # gen test log + ADD_EXECUTABLE(test-loadmemory-inference framework/test_load_memory_inference_api.cpp) + target_link_libraries(test-loadmemory-inference paddle-mobile) + ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp) target_link_libraries(test-inference-api paddle-mobile) @@ -266,6 +262,10 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp) target_link_libraries(test-gemm-accuracy paddle-mobile) + # gen test + ADD_EXECUTABLE(test-gemm-int8-accuracy common/test_gemm_int8_accuracy.cpp) + target_link_libraries(test-gemm-int8-accuracy paddle-mobile) + # gen test ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp) target_link_libraries(test-gemm-perf paddle-mobile) diff --git a/test/common/test_gemm_accuracy.cpp b/test/common/test_gemm_accuracy.cpp index 0967094f6895d35784a9c06344e3473e66fcd370..2a2505a86b1abab5fe6fd8e0b9905ce7ae78f292 100644 --- a/test/common/test_gemm_accuracy.cpp +++ b/test/common/test_gemm_accuracy.cpp @@ -84,7 +84,7 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { } paddle_mobile::operators::math::Gemm gemm; - gemm.SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, + gemm.SgemmWithBn(m, n, k, 1, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, nullptr); int eq = 0; int neq = 0; diff --git a/test/common/test_gemm_int8_accuracy.cpp b/test/common/test_gemm_int8_accuracy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..80ddd40e121c81032c903955bd7116cf52695569 --- /dev/null +++ b/test/common/test_gemm_int8_accuracy.cpp @@ -0,0 +1,131 @@ +/* Copyright (c) 2018 PaddlePaddle 
Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "../test_helper.h" +#include "common/log.h" +#include "memory/t_malloc.h" +#include "operators/math/gemm.h" + +#define a(i, j) a[(i)*lda + (j)] +#define b(i, j) b[(i)*ldb + (j)] +#define c(i, j) c[(i)*ldc + (j)] +#define c1(i, j) c1[(i)*ldc + (j)] + +using std::default_random_engine; +using std::uniform_int_distribution; + +void print_matrix(int m, int n, int ldc, int32_t *c) { + for (int i = 0; i < m; ++i) { + std::cout << c(i, 0); + for (int j = 1; j < n; ++j) { + std::cout << " | " << c(i, j); + } + std::cout << std::endl; + } + std::cout << std::endl; +} + +void print_matrix(int m, int n, int ldc, int8_t *c) { + for (int i = 0; i < m; ++i) { + std::cout << static_cast(c(i, 0)); + for (int j = 1; j < n; ++j) { + std::cout << " | " << static_cast(c(i, j)); + } + std::cout << std::endl; + } + std::cout << std::endl; +} + +int do_sgemm(int m, int n, int k, bool relu, int pr) { + int lda = k; + int ldb = n; + int ldc = n; + default_random_engine e; + uniform_int_distribution pixel(-127, 127); + int8_t *a = static_cast( + paddle_mobile::memory::Alloc(sizeof(int8_t) * m * k)); + int8_t *b = static_cast( + paddle_mobile::memory::Alloc(sizeof(int8_t) * k * n)); + int32_t *c = static_cast( + paddle_mobile::memory::Alloc(sizeof(int32_t) * m * n)); + int32_t *c1 = static_cast( + paddle_mobile::memory::Alloc(sizeof(int32_t) * m * n)); + + for (int i = 0; i < m * k; ++i) { + a[i] = pixel(e); + } + for (int i = 0; i < k * n; ++i) { + b[i] = pixel(e); + } + + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + int32_t r = 0; + for (int p = 0; p < k; p++) { + r += static_cast(a(i, p)) * static_cast(b(p, j)); + } + c1(i, j) = r; + } + } + + paddle_mobile::operators::math::Gemm gemm; + gemm.Sgemm(m, n, k, static_cast(1), a, lda, b, ldb, + static_cast(0), c, ldc, relu, nullptr); + int eq = 0; + int neq = 0; + for (int i = 0; i < m * n; ++i) { + if (c[i] == c1[i]) { + ++eq; + } else { + ++neq; + } + } + + if (pr > 0) { + std::cout << "A:" << std::endl; + print_matrix(m, k, lda, a); + std::cout << "B:" << std::endl; + print_matrix(k, n, ldb, b); + std::cout << "C:" << std::endl; + print_matrix(m, n, ldc, c); + std::cout << "C1:" << std::endl; + print_matrix(m, n, ldc, c1); + } + + std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu + << " eq=" << eq << " neq=" << neq << std::endl; + + paddle_mobile::memory::Free(a); + paddle_mobile::memory::Free(b); + paddle_mobile::memory::Free(c); + paddle_mobile::memory::Free(c1); + + return 0; +} + +int main() { + do_sgemm(9, 9, 9, false, 10); + do_sgemm(10, 6, 12, false, 0); + do_sgemm(512, 256, 384, false, 0); + do_sgemm(1366, 768, 256, false, 0); + do_sgemm(1255, 755, 333, false, 0); + do_sgemm(555, 777, 999, false, 0); + do_sgemm(1024, 1024, 1024, false, 0); + + return 0; +} diff --git a/test/common/test_gemm_perf.cpp b/test/common/test_gemm_perf.cpp index
386c09d71a3d5709842991bffd2e8ea039edc940..89f0012ae8effaab383719c1b85748c24eb2bf73 100644 --- a/test/common/test_gemm_perf.cpp +++ b/test/common/test_gemm_perf.cpp @@ -28,13 +28,11 @@ limitations under the License. */ int main() { paddle_mobile::PaddleMobile paddle_mobile; - paddle_mobile.SetThreadNum(4); - Tensor aa, bb, cc, scale, bias; + paddle_mobile.SetThreadNum(1); + Tensor aa, bb, cc; auto aaptr = aa.mutable_data({m, k}); auto bbptr = bb.mutable_data({k, n}); auto ccptr = cc.mutable_data({m, n}); - auto scaleptr = scale.mutable_data({m}); - auto biasptr = bias.mutable_data({m}); for (int i = 0; i < m * k; ++i) { aaptr[i] = 2; @@ -45,23 +43,55 @@ int main() { for (int i = 0; i < m * n; ++i) { ccptr[i] = 2; } - for (int i = 0; i < m; ++i) { - scaleptr[i] = 1; - biasptr[i] = 0; + + Tensor aa_int8, bb_int8, cc_int8; + auto aaptr_int8 = aa_int8.mutable_data({m, k}); + auto bbptr_int8 = bb_int8.mutable_data({k, n}); + auto ccptr_int8 = cc_int8.mutable_data({m, n}); + + for (int i = 0; i < m * k; ++i) { + aaptr_int8[i] = static_cast(2); + } + for (int i = 0; i < k * n; ++i) { + bbptr_int8[i] = static_cast(2); + } + for (int i = 0; i < m * n; ++i) { + ccptr_int8[i] = static_cast(2); } - auto time1 = time(); + // float + // warm-up 10 times for (int j = 0; j < 10; ++j) { paddle_mobile::operators::math::matmul( aa, false, bb, false, static_cast(1), &cc, static_cast(0), - false, biasptr); + false, nullptr); + } - // paddle_mobile::operators::math::matmulWithBn( - // aa, false, bb, false, static_cast(1), &cc, - // static_cast(0), true, &scale, &bias, 0); + auto time1 = time(); + for (int j = 0; j < 10; ++j) { + paddle_mobile::operators::math::matmul( + aa, false, bb, false, static_cast(1), &cc, static_cast(0), + false, nullptr); } auto time2 = time(); - std::cout << "gemm cost :" << time_diff(time1, time2) / 10 << "ms\n"; + std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n"; + + // int8_t + // warm-up 10 times + for (int j = 0; j < 10; ++j) { + paddle_mobile::operators::math::matmul( + aa_int8, false, bb_int8, false, static_cast(1), &cc_int8, + static_cast(0), false, nullptr); + } + + auto time3 = time(); + for (int j = 0; j < 10; ++j) { + paddle_mobile::operators::math::matmul( + aa_int8, false, bb_int8, false, static_cast(1), &cc_int8, + static_cast(0), false, nullptr); + } + auto time4 = time(); + std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n"; return 0; } diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index f850eb3e5ea3a03fe90d82c1eca2af6c9f8e9106..8a6a9dc8af836010695c6c6dc30e81ba224c7ffd 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -11,30 +11,107 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - +#include #include "../test_include.h" -static const char *g_resnet_combine = "../models/resnet50"; +#include "fpga/api.h" +void readStream(std::string filename, float *buf) { + std::ifstream in; + in.open(filename, std::ios::in); + if (!in.is_open()) { + std::cout << "open File Failed." 
<< std::endl; + return; + } + int i = 0; + // operator>> fails at end of file, so this loop avoids the extra + // iteration that a !in.eof() test would allow + while (in >> buf[i]) { + i++; + } + in.close(); +} + +void convert_to_chw(int16_t **data_in, int channel, int height, int width, + int16_t *data_tmp) { + int64_t amount_per_side = width * height; + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++); + } + } + } +} +void dump(std::string filename, const Tensor input_tensor) { + auto dataptr = input_tensor.data(); + std::ofstream out(filename.c_str()); + float result = 0; + for (int i = 0; i < input_tensor.numel(); ++i) { + result = paddle_mobile::fpga::fp16_2_fp32(dataptr[i]); + out << result << std::endl; + } + out.close(); +} +void dump_stride(std::string filename, const Tensor input_tensor, + const int dumpnum) { + int c = (input_tensor.dims())[1]; + int h = (input_tensor.dims())[2]; + int w = (input_tensor.dims())[3]; + auto data_ptr = input_tensor.data(); + int16_t *data_tmp = (int16_t *)malloc(c * h * w * sizeof(int16_t)); + int16_t *data_ptr_16 = (int16_t *)data_ptr; + convert_to_chw(&data_ptr_16, c, h, w, data_tmp); + // const int16_t *dataptr = input_tensor.data(); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); + out << result << std::endl; + } + out.close(); + free(data_tmp); +} +static const char *g_resnet50 = "../models/resnet50"; +const std::string g_image_src_float = "../images/image_src_float"; int main() { - DLOG << paddle_mobile::fpga::open_device(); + paddle_mobile::fpga::open_device(); paddle_mobile::PaddleMobile paddle_mobile; - // if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model", - // std::string(g_resnet_combine) + "/params", true)) { - if (paddle_mobile.Load(std::string(g_resnet_combine), true)) { - std::vector dims{1, 3, 224, 224}; + if (paddle_mobile.Load(std::string(g_resnet50), true)) { Tensor input_tensor; SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(0), static_cast(1)); - - std::vector input(input_tensor.data(), - input_tensor.data() + input_tensor.numel()); - + readStream(g_image_src_float, + input_tensor.mutable_data({1, 3, 224, 224})); paddle_mobile.FeedData(input_tensor); paddle_mobile.Predict_To(-1); - // paddle_mobile.Predict_From(73); - // paddle_mobile.Predict_From_To(72, 73); + /*for(int i = 0; i < 73; i++) + { + auto tensor_ptr = paddle_mobile.FetchResult(i); + std::string saveName = "resnet50_result_" + std::to_string (i); + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data(), + tensor_ptr->numel()); dump_stride(saveName, (*tensor_ptr), 20); + //dump(saveName, (*tensor_ptr)); + }*/ - DLOG << "Computation done"; + /*std::shared_ptr output_tensor = paddle_mobile.FetchResult(73); + (*output_tensor).dump("resnet50_result_73"); + output_tensor = paddle_mobile.FetchResult(74); + (*output_tensor).dump("resnet50_result_74");*/ + std::shared_ptr output_tensor = paddle_mobile.FetchResult(74); + float max = 0; + auto data_ptr = output_tensor->data(); + int maximumIdx = 0; + for (int i = 0; i < (*output_tensor).numel(); i++) { + if (data_ptr[i] > max) { + maximumIdx = i; + max = data_ptr[i]; + } + } + std::cout << "index : " << maximumIdx << ", value : " << max + << std::endl; + std::cout << "Computation done" << std::endl; return 0; } } diff --git
a/test/framework/test_load_memory.cpp b/test/framework/test_load_memory.cpp index 4be7aaa82f53bd8c5ccfb531339827534b2736ab..162dba372774578952e4c306bb20a6a95c655c94 100644 --- a/test/framework/test_load_memory.cpp +++ b/test/framework/test_load_memory.cpp @@ -58,9 +58,9 @@ int main() { size_t sizeBuf = ReadBuffer(model_path.c_str(), &bufModel); uint8_t *bufParams = nullptr; - DLOG << "sizeBuf: " << sizeBuf; + std::cout << "sizeBuf: " << sizeBuf << std::endl; size_t sizeParams = ReadBuffer(params_path.c_str(), &bufParams); - DLOG << "sizeParams: " << sizeParams; + std::cout << "sizeParams: " << sizeParams << std::endl; paddle_mobile.LoadCombinedMemory(sizeBuf, bufModel, sizeParams, bufParams); return 0; diff --git a/test/framework/test_load_memory_inference_api.cpp b/test/framework/test_load_memory_inference_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..05d51910172547c6dab7adc8231663be55c916bf --- /dev/null +++ b/test/framework/test_load_memory_inference_api.cpp @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include "../test_helper.h" +#include "io/paddle_inference_api.h" + +static size_t ReadBuffer(const char *file_name, uint8_t **out) { + FILE *fp; + fp = fopen(file_name, "rb"); + PADDLE_MOBILE_ENFORCE(fp != nullptr, " %s open failed !", file_name); + fseek(fp, 0, SEEK_END); + auto size = static_cast(ftell(fp)); + rewind(fp); + DLOG << "model size: " << size; + *out = reinterpret_cast(malloc(size)); + size_t cur_len = 0; + size_t nread; + while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) { + cur_len += nread; + } + fclose(fp); + return cur_len; +} + +static char *Get_binary_data(std::string filename) { + FILE *file = fopen(filename.c_str(), "rb"); + PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ", + filename.c_str()); + fseek(file, 0, SEEK_END); + int64_t size = ftell(file); + PADDLE_MOBILE_ENFORCE(size > 0, "size is too small"); + rewind(file); + auto *data = new char[size]; + size_t bytes_read = fread(data, 1, size, file); + PADDLE_MOBILE_ENFORCE(bytes_read == size, + "read binary file bytes do not match with fseek"); + fclose(file); + return data; +} + +paddle_mobile::PaddleMobileConfig GetConfig() { + paddle_mobile::PaddleMobileConfig config; + config.precision = paddle_mobile::PaddleMobileConfig::FP32; + config.device = paddle_mobile::PaddleMobileConfig::kCPU; + const std::shared_ptr &memory_pack = + std::make_shared(); + auto model_path = std::string(g_genet_combine) + "/model"; + auto params_path = std::string(g_genet_combine) + "/params"; + memory_pack->model_size = + ReadBuffer(model_path.c_str(), &memory_pack->model_buf); + std::cout << "sizeBuf: " << memory_pack->model_size << std::endl; + memory_pack->combined_params_size = + ReadBuffer(params_path.c_str(), &memory_pack->combined_params_buf); + std::cout << "sizeParams: " << memory_pack->combined_params_size << std::endl; + memory_pack->from_memory = true; + config.memory_pack 
= *memory_pack; + config.thread_num = 4; + return config; +} +int main() { + paddle_mobile::PaddleMobileConfig config = GetConfig(); + auto predictor = paddle_mobile::CreatePaddlePredictor< + paddle_mobile::PaddleMobileConfig, + paddle_mobile::PaddleEngineKind::kPaddleMobile>(config); + return 0; +} diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp index a2f030eeac5c2584b33fad2b082b9d5513707260..c88a78974c330ec270fbcb3f5c28e368ef16440e 100644 --- a/test/net/test_googlenet.cpp +++ b/test/net/test_googlenet.cpp @@ -25,27 +25,31 @@ int main() { paddle_mobile::PaddleMobile paddle_mobile; #endif - paddle_mobile.SetThreadNum(4); - bool optimize = true; + paddle_mobile.SetThreadNum(1); + bool optimize = false; auto time1 = time(); if (paddle_mobile.Load(g_googlenet, optimize)) { auto time2 = time(); std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; std::vector input; + std::vector output; std::vector dims{1, 3, 224, 224}; GetInput(g_test_image_1x3x224x224, &input, dims); - // Warm up ten times - for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); - } + // // Warm up ten times + // for (int i = 0; i < 10; ++i) { + // output = paddle_mobile.Predict(input, dims); + // } auto time3 = time(); for (int i = 0; i < 10; ++i) { - auto vec_result = paddle_mobile.Predict(input, dims); + output = paddle_mobile.Predict(input, dims); } auto time4 = time(); std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" << std::endl; + for (int i = 0; i < output.size(); ++i) { + DLOG << "result[" << i << "] = " << output[i]; + } } return 0; } diff --git a/test/net/test_googlenet_quali.cpp b/test/net/test_googlenet_quali.cpp new file mode 100644 index 0000000000000000000000000000000000000000..28cb6207d7087939e6265a3fd636d6c2526cff53 --- /dev/null +++ b/test/net/test_googlenet_quali.cpp @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { +#ifdef PADDLE_MOBILE_FPGA + paddle_mobile::PaddleMobile paddle_mobile; +#endif + +#ifdef PADDLE_MOBILE_CPU + paddle_mobile::PaddleMobile paddle_mobile; +#endif + + paddle_mobile.SetThreadNum(4); + bool optimize = true; + bool quali = true; + auto time1 = time(); + auto isok = paddle_mobile.Load(std::string(g_googlenet_quali) + "/model", + std::string(g_googlenet_quali) + "/params", + optimize, quali); + if (isok) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224, &input, dims); + // Warm up ten times + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time4 = time(); + + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } + return 0; +} diff --git a/test/operators/test_dequantize_op.cpp b/test/operators/test_dequantize_op.cpp index 8c61ae32d90169c5f8c6fdced94ce70f29d93b96..8e89d8f7af3694bcc4701c268451f28675db7fc9 100644 --- a/test/operators/test_dequantize_op.cpp +++ b/test/operators/test_dequantize_op.cpp @@ -59,7 +59,7 @@ int TestDequqntizeOp() { framework::Tensor output_cmp; output_cmp.Resize(dim); - float dequant_scale = 1.f / (1.27 * 1.74); + float dequant_scale = 1.27 / 1.74; dequantize(input, dequant_scale, &output_cmp); const float* output_cmp_data = output_cmp.data(); for (int i = 0; i < output->numel(); ++i) { diff --git a/test/operators/test_fill_constant_op.cpp b/test/operators/test_fill_constant_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b099217d1641eb221b3d0d86d780fb6ecfa929bd --- /dev/null +++ b/test/operators/test_fill_constant_op.cpp @@ -0,0 +1,113 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once +#include "../test_include.h" +#include "operators/fill_constant_op.h" + +namespace paddle_mobile { +namespace framework { + +template +class TestFillConstantOp { + public: + explicit TestFillConstantOp(const Program p) : program_(p) { + if (use_optimize_) { + to_predict_program_ = program_.optimizeProgram; + } else { + to_predict_program_ = program_.originProgram; + } + const std::vector> blocks = + to_predict_program_->Blocks(); + for (auto block_desc : blocks) { + std::vector> ops = block_desc->Ops(); + for (auto op : ops) { + if (op->Type() == "fill_constant") { + DLOG << " attr size: " << op->GetAttrMap().size(); + std::unordered_map attrs = op->GetAttrMap(); + for (std::unordered_map::iterator it = + attrs.begin(); + it != attrs.end(); ++it) { + DLOG << " " << it->first << " " << it->second; + } + DLOG << " inputs size: " << op->GetInputs().size(); + DLOG << " outputs size: " << op->GetOutputs().size(); + DLOG << " output is : " << op->Output("Out")[0]; + output_var_name = op->Output("Out")[0]; + std::shared_ptr> op_ptr = + std::make_shared>( + op->Type(), op->GetInputs(), op->GetOutputs(), + op->GetAttrMap(), program_.scope); + ops_of_block_[*block_desc.get()].push_back(op_ptr); + } + } + } + } + + std::shared_ptr predict() { + auto scope = program_.scope; + + Variable *output = scope->Var(output_var_name); + auto *output_tensor = output->GetMutable(); + + std::shared_ptr out_tensor = std::make_shared(); + out_tensor.reset(output_tensor); + + predict(0); + + return out_tensor; + } + + private: + const framework::Program program_; + std::shared_ptr to_predict_program_; + std::map>>> + ops_of_block_; + bool use_optimize_ = false; + string output_var_name; + + void predict(int block_id) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(block_id); + for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { + auto op = ops_of_block_[*to_predict_block.get()][j]; + op->Run(); + } + } +}; + +template class TestFillConstantOp; +} // namespace framework +} // namespace paddle_mobile + +int main() { + DLOG << "----------**********----------"; + DLOG << "begin to run FillConstant Test"; + paddle_mobile::Loader loader; + auto program = loader.Load(std::string(g_ocr) + "/model", + std::string(g_ocr) + "/params"); + + paddle_mobile::framework::TestFillConstantOp + testFillConstantOp(program); + + auto output = testFillConstantOp.predict(); + auto *output_ptr = output->data(); + + DLOG << "output : "; + for (int i = 0; i < output->numel(); ++i) { + DLOG << " index " << i << " : " << output_ptr[i]; + } + return 0; +} diff --git a/test/operators/test_int8_conv_op.cpp b/test/operators/test_int8_conv_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2ab40ba5833939e4456bb13bf4d5f9819a332693 --- /dev/null +++ b/test/operators/test_int8_conv_op.cpp @@ -0,0 +1,279 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "../test_helper.h" +#include "../test_include.h" +#include "operators/conv_op.h" + +namespace paddle_mobile { + +// Reference convolution for checking results: +// accumulate through explicit loops over input, output, and filters. +template +void conv2d(const framework::Tensor *input, const framework::Tensor *filter, + const framework::AttributeMap &attrs, framework::Tensor *output) { + framework::AttrReader attr_reader(attrs); + std::vector paddings = attr_reader.Get>("paddings"); + std::vector strides = attr_reader.Get>("strides"); + std::vector dilations = attr_reader.Get>("dilations"); + int groups = attr_reader.Get("groups"); + int kernel_h = filter->dims()[2]; + int kernel_w = filter->dims()[3]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; + int stride_h = strides[0]; + int stride_w = strides[1]; + int dilation_h = dilations[0]; + int dilation_w = dilations[1]; + auto in_shape = input->dims(); + auto out_shape = output->dims(); + + const bool has_depth = 0; + int kernel_d, pad_d, stride_d, dilation_d; + if (has_depth) { + kernel_d = kernel_h; + stride_d = stride_h; + pad_d = pad_h; + dilation_d = dilation_h; + } else { + kernel_d = stride_d = dilation_d = 1; + pad_d = 0; + } + // Groups + int o_g = out_shape[1] / groups; + int k_g = in_shape[1] / groups; + int o_head, k_head; + // Convolution + vector weight_offset(4 + has_depth); + vector in_offset(4 + has_depth); + vector out_offset(4 + has_depth); + auto offset = [](const framework::Tensor *input, const vector &indics) { + framework::DDim shape = input->dims(); + size_t count = 0; + for (int i = 0; i < indics.size(); ++i) { + count *= shape[i]; + count += indics[i]; + } + return count; + }; + + const Itype *in_data = input->data(); + const Itype *w_data = filter->data(); + Otype *out_data = output->mutable_data(); + memset(out_data, 0, output->numel() * sizeof(Otype)); + for (int n = 0; n < out_shape[0]; n++) { + for (int g = 0; g < groups; g++) { + o_head = o_g * g; + k_head = k_g * g; + for (int o = 0; o < o_g; o++) { + for (int k = 0; k < k_g; k++) { + for (int z = 0; z < (has_depth ? out_shape[2] : 1); z++) { + for (int y = 0; y < out_shape[2 + has_depth]; y++) { + for (int x = 0; x < out_shape[3 + has_depth]; x++) { + for (int r = 0; r < kernel_d; r++) { + for (int p = 0; p < kernel_h; p++) { + for (int q = 0; q < kernel_w; q++) { + int in_z = z * stride_d - pad_d + r * dilation_d; + int in_y = y * stride_h - pad_h + p * dilation_h; + int in_x = x * stride_w - pad_w + q * dilation_w; + if (in_z >= 0 && in_z < (has_depth ? 
in_shape[2] : 1) && + in_y >= 0 && in_y < in_shape[2 + has_depth] && + in_x >= 0 && in_x < in_shape[3 + has_depth]) { + weight_offset[0] = o + o_head; + weight_offset[1] = k; + if (has_depth) { + weight_offset[2] = r; + } + weight_offset[2 + has_depth] = p; + weight_offset[3 + has_depth] = q; + in_offset[0] = n; + in_offset[1] = k + k_head; + if (has_depth) { + in_offset[2] = in_z; + } + in_offset[2 + has_depth] = in_y; + in_offset[3 + has_depth] = in_x; + out_offset[0] = n; + out_offset[1] = o + o_head; + if (has_depth) { + out_offset[2] = z; + } + out_offset[2 + has_depth] = y; + out_offset[3 + has_depth] = x; + + out_data[offset(output, out_offset)] += + in_data[offset(input, in_offset)] * + w_data[offset(filter, weight_offset)]; + } + } + } + } + } + } + } + } + } + } + } +} + +template +int TestConvOp() { + int kernel_h = Kernel; + int kernel_w = Kernel; + int pad_h = Pad; + int pad_w = Pad; + int stride_h = Stride; + int stride_w = Stride; + int dilation_h = 1; + int dilation_w = 1; + + int batch_size = 1; + int input_c = 3; + int input_h = 100; + int input_w = 100; + int output_c = 10; + framework::DDim input_shape = + framework::make_ddim({batch_size, input_c, input_h, input_w}); + framework::DDim filter_shape = + framework::make_ddim({output_c, input_c, kernel_h, kernel_w}); + + VariableNameMap inputs; + VariableNameMap outputs; + auto scope = std::make_shared(); + inputs["Input"] = std::vector({"input"}); + inputs["Filter"] = std::vector({"filter"}); + outputs["Output"] = std::vector({"output"}); + + auto input_var = scope.get()->Var("input"); + auto input = input_var->template GetMutable(); + SetupTensor(input, input_shape, -20, 20); + + auto filter_var = scope.get()->Var("filter"); + auto filter = filter_var->template GetMutable(); + SetupTensor(filter, filter_shape, -20, 20); + + auto output_var = scope.get()->Var("output"); + framework::AttributeMap attrs; + attrs["strides"].Set>(std::vector({stride_h, stride_w})); + attrs["paddings"].Set>(std::vector({pad_h, pad_w})); + attrs["dilations"].Set>( + std::vector({dilation_h, dilation_w})); + attrs["groups"].Set(1); + + auto *op = new operators::ConvOp("conv2d", inputs, outputs, attrs, + scope); + // struct timespec ts_begin, ts_end; + op->InferShape(); + // warmup + // op->Run(); + // clock_gettime(CLOCK_MONOTONIC, &ts_begin); + // for (int i = 0; i < 10; ++i) { + op->Run(); + // } + // clock_gettime(CLOCK_MONOTONIC, &ts_end); + // uint64_t elapsed = (ts_end.tv_sec - ts_begin.tv_sec) * 1e3 + + // (ts_end.tv_nsec - ts_begin.tv_nsec) / 1e6; + // LOG(kLOG_INFO) << "elapsed: " << elapsed / 10.0 << " ms"; + + int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + int output_h = (input_h + 2 * pad_h - kernel_extent_h) / stride_h + 1; + int output_w = (input_w + 2 * pad_w - kernel_extent_w) / stride_w + 1; + auto output_shape = framework::make_ddim( + std::vector({batch_size, output_c, output_h, output_w})); + framework::Tensor output_cmp; + output_cmp.mutable_data(output_shape); + conv2d(input, filter, attrs, &output_cmp); + + // compare results + auto output = output_var->template Get(); + const Otype *output_data = output->data(); + Otype *output_cmp_data = output_cmp.data(); + for (int i = 0; i < output->numel(); ++i) { + PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], + "output[%d] = %d, output_cmp[%d] = %d", i, + output_data[i], i, output_cmp_data[i]); + } + delete op; + return 0; +} + +} // namespace paddle_mobile + +int main() { + // kernel = 7, pad = 0, 
stride = 2 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2"; + paddle_mobile::TestConvOp(); + + // kernel = 7, pad = 1, stride = 2 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2"; + paddle_mobile::TestConvOp(); + + // kernel = 7, pad = 3, stride = 2 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2"; + paddle_mobile::TestConvOp(); + + // kernel = 7, pad = 0, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1"; + paddle_mobile::TestConvOp(); + + // kernel = 7, pad = 1, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1"; + paddle_mobile::TestConvOp(); + + // kernel = 7, pad = 3, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1"; + paddle_mobile::TestConvOp(); + + // kernel = 7, pad = 5, stride = 3 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3"; + paddle_mobile::TestConvOp(); + + // kernel = 7, pad = 3, stride = 4 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4"; + paddle_mobile::TestConvOp(); + LOG(paddle_mobile::kLOG_INFO) << "\n"; + + // kernel = 3, pad = 0, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1"; + paddle_mobile::TestConvOp(); + // kernel = 3, pad = 0, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1"; + paddle_mobile::TestConvOp(); + LOG(paddle_mobile::kLOG_INFO) << "\n"; + + // kernel = 3, pad = 1, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1"; + paddle_mobile::TestConvOp(); + // kernel = 3, pad = 1, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1"; + paddle_mobile::TestConvOp(); + LOG(paddle_mobile::kLOG_INFO) << "\n"; + + // kernel = 5, pad = 0, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1"; + paddle_mobile::TestConvOp(); + // kernel = 5, pad = 0, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1"; + paddle_mobile::TestConvOp(); + LOG(paddle_mobile::kLOG_INFO) << "\n"; + + // kernel = 5, pad = 2, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1"; + paddle_mobile::TestConvOp(); + // kernel = 5, pad = 2, stride = 1 + LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1"; + paddle_mobile::TestConvOp(); +} diff --git a/test/operators/test_mul_op.cpp b/test/operators/test_mul_op.cpp index 8ebf0926890497c0ed622b69f163a9f6f5c8612b..10dab2cda1b3c692f42cf8760eb2b48ae6451f39 100644 --- a/test/operators/test_mul_op.cpp +++ b/test/operators/test_mul_op.cpp @@ -12,80 +12,88 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "../test_helper.h" #include "../test_include.h" #include "operators/mul_op.h" -int main() { - paddle_mobile::Loader loader; - auto program = loader.Load(g_resnet); - PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, - "program file read fail"); - - Executor4Test> - executor(program, "mul"); - - // 1. input_tensors; - vector input_tensors; - - Tensor input1; - auto input1_data = CreateInput(&input1, {3, 2, 1, 1}, 0, 1); - input_tensors.push_back(input1); - Tensor input2; - auto input2_data = CreateInput(&input2, {2, 3}, 0, 1); - input_tensors.push_back(input2); - - // 2. input_names - vector input_names({ - "pool2d_0.tmp_0", - "fc_0.w_0", - }); - - // 3. 
output_names - vector output_names({"fc_0.tmp_0"}); - - // 4. out_dims; - vector out_ddims; - auto out_ddim = paddle_mobile::framework::make_ddim({3, 3}); - out_ddims.push_back(out_ddim); - - auto output = executor.Predict(input_tensors, input_names, - output_names, out_ddims); - - auto output0_data = output[0]->data(); - - auto dim_1 = input1.numel() / input1.dims()[0]; - DLOG << " input1 : "; - for (int i = 0; i < input1.dims()[0]; ++i) { - for (int j = 0; j < dim_1; ++j) { - DLOGF("%f ", input1_data[i * dim_1 + j]); - } - DLOGF("\n"); - } - - auto dim_2 = input2.numel() / input2.dims()[0]; - DLOG << " input2 : "; - for (int i = 0; i < input2.dims()[0]; ++i) { - for (int j = 0; j < dim_2; ++j) { - DLOGF("%f ", input2_data[i * dim_2 + j]); +#define a(i, j) a[(i)*lda + (j)] +#define b(i, j) b[(i)*ldb + (j)] +#define c(i, j) c[(i)*ldc + (j)] + +namespace paddle_mobile { +using framework::AttributeMap; +using framework::DDim; +using framework::Scope; +using framework::make_ddim; +template +int TestMulOP() { + int32_t m = 1024; + int32_t n = 1024; + int32_t k = 1024; + int32_t lda = k; + int32_t ldb = n; + int32_t ldc = n; + DDim inputA_shape = make_ddim({m, k}); + DDim inputB_shape = make_ddim({k, n}); + VariableNameMap inputs; + VariableNameMap outputs; + auto scope = std::make_shared(); + inputs["X"] = std::vector({"inputA"}); + inputs["Y"] = std::vector({"inputB"}); + outputs["Out"] = std::vector({"output"}); + + auto inputA_var = scope.get()->Var("inputA"); + auto inputA = inputA_var->template GetMutable(); + SetupTensor(inputA, inputA_shape, -127, 127); + auto inputB_var = scope.get()->Var("inputB"); + auto inputB = inputB_var->template GetMutable(); + SetupTensor(inputB, inputB_shape, -127, 127); + + auto output_var = scope.get()->Var("output"); + AttributeMap attrs; + attrs["x_num_col_dims"].Set(1); + attrs["y_num_col_dims"].Set(1); + auto *op = + new operators::MulOp("mul", inputs, outputs, attrs, scope); + op->InferShape(); + op->Run(); + auto output = output_var->template Get(); + const O *output_data = output->data(); + // compare + O *c = static_cast(memory::Alloc(sizeof(O) * m * n)); + I *a = inputA->data(); + I *b = inputB->data(); + for (int32_t i = 0; i < m; ++i) { + for (int32_t j = 0; j < n; ++j) { + O r = 0; + for (int32_t p = 0; p < k; p++) { + r += static_cast(a(i, p)) * static_cast(b(p, j)); + } + c(i, j) = r; } - DLOGF("\n"); } - auto dim_output0 = output[0]->numel() / output[0]->dims()[0]; - DLOG << " output : "; - for (int i = 0; i < output[0]->dims()[0]; ++i) { - for (int j = 0; j < dim_output0; ++j) { - DLOGF("%f ", output0_data[i * dim_2 + j]); + int32_t eq = 0; + int32_t neq = 0; + for (int32_t i = 0; i < m * n; ++i) { + PADDLE_MOBILE_ENFORCE( + output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i, + static_cast(output_data[i]), i, static_cast(c[i])); + if (static_cast(output_data[i] == c[i])) { + ++eq; + } else { + ++neq; } - DLOGF("\n"); } + DLOG << "mnk=" << m << " " << n << " " << k << " eq=" << eq + << " neq=" << neq; + delete op; + return 0; +} +} // namespace paddle_mobile - /// output (3,3) - DLOG << "output memory size : " << output[0]->memory_size(); - DLOG << "output numel : " << output[0]->numel(); - - DLOG << input1_data[0] << " x " << input2_data[0] << " + " << input1_data[1] - << " x " << input2_data[0 + 3] << " = " << output0_data[0]; +int main() { + paddle_mobile::TestMulOP(); + paddle_mobile::TestMulOP(); return 0; } diff --git a/test/operators/test_polygon_box_transform_op.cpp b/test/operators/test_polygon_box_transform_op.cpp new 
file mode 100644 index 0000000000000000000000000000000000000000..a71177ddbd8e4d8b0f204fd6ec9c948882499cbd --- /dev/null +++ b/test/operators/test_polygon_box_transform_op.cpp @@ -0,0 +1,126 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "../test_include.h" +#include "operators/polygon_box_transform_op.h" + +namespace paddle_mobile { +namespace framework { + +template +class TestPolygonBoxTransformOp { + public: + explicit TestPolygonBoxTransformOp(const Program p) : program_(p) { + if (use_optimize_) { + to_predict_program_ = program_.optimizeProgram; + } else { + to_predict_program_ = program_.originProgram; + } + const std::vector> blocks = + to_predict_program_->Blocks(); + for (auto block_desc : blocks) { + std::vector> ops = block_desc->Ops(); + for (auto op : ops) { + if (op->Type() == "polygon_box_transform") { + DLOG << " attr size: " << op->GetAttrMap().size(); + DLOG << " inputs size: " << op->GetInputs().size(); + DLOG << " input is : " << op->Input("Input")[0]; + input_var_name = op->Input("Input")[0]; + DLOG << " outputs size: " << op->GetOutputs().size(); + DLOG << " output is : " << op->Output("Output")[0]; + output_var_name = op->Output("Output")[0]; + std::shared_ptr> + op_ptr = std::make_shared< + operators::PolygonBoxTransformOp>( + op->Type(), op->GetInputs(), op->GetOutputs(), + op->GetAttrMap(), program_.scope); + ops_of_block_[*block_desc.get()].push_back(op_ptr); + return; + } + } + } + } + + std::shared_ptr predict(const Tensor &t) { + auto scope = program_.scope; + Variable *input_feed_value = scope->Var(input_var_name); + auto tensor_input = input_feed_value->GetMutable(); + tensor_input->ShareDataWith(t); + + Variable *output = scope->Var(output_var_name); + auto *output_tensor = output->GetMutable(); + + std::shared_ptr out_tensor = std::make_shared(); + out_tensor.reset(output_tensor); + + predict(t, 0); + + return out_tensor; + } + + private: + const framework::Program program_; + std::shared_ptr to_predict_program_; + std::map>>> + ops_of_block_; + bool use_optimize_ = false; + string input_var_name; + string output_var_name; + + void predict(const Tensor &t, int block_id) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(block_id); + for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { + auto op = ops_of_block_[*to_predict_block.get()][j]; + op->Run(); + } + } +}; + +template class TestPolygonBoxTransformOp; +} // namespace framework +} // namespace paddle_mobile + +int main() { + DLOG << "----------**********----------"; + DLOG << "begin to run PolygonBoxTransform Test"; + paddle_mobile::Loader loader; + auto program = loader.Load(std::string(g_ocr)); + + paddle_mobile::framework::Tensor input; + SetupTensor(&input, {1, 8, 1, 2}, static_cast(0), + static_cast(1)); + auto *input_ptr = input.data(); + for (int i = 0; i < 16; ++i) { + *(input_ptr + i) = i; + } + DLOG << "input : "; + for (int i = 0; i < input.numel(); ++i) { + DLOG << " index " << 
i << " : " << input_ptr[i]; + } + + paddle_mobile::framework::TestPolygonBoxTransformOp + testPolygonBoxTransformOp(program); + + auto output = testPolygonBoxTransformOp.predict(input); + auto *output_ptr = output->data(); + + DLOG << "output : "; + for (int i = 0; i < output->numel(); ++i) { + DLOG << " index " << i << " : " << output_ptr[i]; + } + return 0; +} diff --git a/test/operators/test_quantize_op.cpp b/test/operators/test_quantize_op.cpp index c988862f6d91c87f47525fa36b7ee61f253682ab..5b1f276bebb0b956a7907a500645612c5aeaf8f9 100644 --- a/test/operators/test_quantize_op.cpp +++ b/test/operators/test_quantize_op.cpp @@ -18,14 +18,6 @@ limitations under the License. */ namespace paddle_mobile { -// static float g_test_data[50] = { -// -5.55, -5.5, -5.45, -5.0, -4.55, -4.5, -4.45, -4.0, -3.55, -3.5, -// -3.45, -3.01, -2.75, -2.5, -2.501, -2.49, -2.01, -1.75, -1.5, -1.25, -// -1.0, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 0.75, 1.0, 1.25, -// 1.5, 1.75, 2.01, 2.49, 2.501, 2.5, 2.75, 3.01, 3.45, 3.5, -// 3.55, 4.0, 4.45, 4.5, 4.55, 5.0, 5.45, 5.5, 5.55, 6.0, -// }; - static float find_abs_max(const Tensor *input) { float max_abs = 0.f; const float *x = input->data(); @@ -60,6 +52,16 @@ static void quantize_round_to_even(const Tensor *input, const float scale, } } +static void quantize_round_to_nearest(const Tensor *input, const float scale, + Tensor *output) { + const float *x = input->data(); + int8_t *y = output->mutable_data(); + size_t size = input->numel(); + for (size_t i = 0; i < size; ++i) { + y[i] = round(x[i] * scale); + } +} + int TestQuqntizeOp() { framework::DDim dim = framework::make_ddim({1, 3, 224, 224}); @@ -88,15 +90,16 @@ int TestQuqntizeOp() { auto output_scale = output_scale_var->template Get(); const float *output_scale_data = output_scale->data(); - float max_abs = find_abs_max(input); - float output_scale_cmp = 127 / max_abs; + float output_scale_cmp = find_abs_max(input); PADDLE_MOBILE_ENFORCE(output_scale_cmp == output_scale_data[0], "output_scale = %.6f, output_scale_cmp = %.6f", output_scale_cmp, output_scale_data[0]); framework::Tensor output_cmp; output_cmp.Resize(dim); - quantize_round_to_even(input, output_scale_cmp, &output_cmp); + float scale = 127 / output_scale_cmp; + // quantize_round_to_even(input, scale, &output_cmp); + quantize_round_to_nearest(input, scale, &output_cmp); int8_t *output_cmp_data = output_cmp.data(); for (int i = 0; i < output->numel(); ++i) { PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], diff --git a/test/test_helper.h b/test/test_helper.h index 03ee27d71d58eb5c727172a8112aeedfde244d0f..41d6faed5229be8944178ea62786477ceadd6416 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -34,6 +34,7 @@ static const char *g_googlenetv1_combined = "../models/googlenetv1_combine"; static const char *g_mobilenet_detect = "../models/mobilenet-detect"; static const char *g_squeezenet = "../models/squeezenet"; static const char *g_googlenet = "../models/googlenet"; +static const char *g_googlenet_quali = "../models/googlenet_combine_quali"; static const char *g_mobilenet = "../models/mobilenet"; static const char *g_alexnet = "../models/alexnet"; static const char *g_inceptionv4 = "../models/inceptionv4"; diff --git a/tools/op.cmake b/tools/op.cmake index 3abe18bb7c74362bda4d564cea61ba31d61404bd..f7a6ed4b134f78ddb23487cd3a861f244e6a86db 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -118,12 +118,9 @@ if (CON GREATER -1) set(POOL_OP ON) set(CONCAT_OP ON) set(SOFTMAX_OP ON) - set(DROPOUT_OP ON) set(FUSION_CONVBNRELU_OP ON) 
set(FUSION_CONVBN_OP ON) set(FUSION_CONVADD_OP ON) - set(MUL_OP ON) - set(FOUND_MATCH ON) endif() @@ -191,6 +188,7 @@ if(NOT FOUND_MATCH) set(ELEMENTWISEADD_OP ON) set(ELEMENTWISESUB_OP ON) set(IM2SEQUENCE_OP ON) + set(FILL_CONSTANT_OP ON) set(FUSION_CONVADD_OP ON) set(FUSION_CONVADDPRELU_OP ON) set(FUSION_CONVADDRELU_OP ON) @@ -198,6 +196,7 @@ if(NOT FOUND_MATCH) set(LRN_OP ON) set(MUL_OP ON) set(MULTICLASSNMS_OP ON) + set(POLYGONBOXTRANSFORM_OP ON) set(POOL_OP ON) set(PRIORBOX_OP ON) set(RELU_OP ON) @@ -225,6 +224,8 @@ if(NOT FOUND_MATCH) set(SHAPE_OP ON) set(ELEMENTWISEMUL_OP ON) set(SUM_OP ON) + set(QUANT_OP ON) + set(DEQUANT_OP ON) endif() # option(BATCHNORM_OP "" ON) @@ -233,12 +234,14 @@ endif() # option(CONV_OP "" ON) # option(DEPTHWISECONV_OP "" ON) # option(ELEMENTWISEADD_OP "" ON) + # option(FILL_CONSTANT_OP "" ON) # option(FUSION_CONVADD_OP "" ON) # option(FUSION_CONVADDRELU_OP "" ON) # option(FUSION_FC_OP "" ON) # option(LRN_OP "" ON) # option(MUL_OP "" ON) # option(MULTICLASSNMS_OP "" ON) + # option(POLYGONBOXTRANSFORM_OP "" ON) # option(POOL_OP "" ON) # option(PRIORBOX_OP "" ON) # option(RELU_OP "" ON) @@ -269,6 +272,9 @@ endif() if (ELEMENTWISESUB_OP) add_definitions(-DELEMENTWISESUB_OP) endif() +if (FILL_CONSTANT_OP) + add_definitions(-DFILL_CONSTANT_OP) +endif() if (FUSION_CONVADD_OP) add_definitions(-DFUSION_CONVADD_OP) endif() @@ -293,6 +299,9 @@ endif() if (MULTICLASSNMS_OP) add_definitions(-DMULTICLASSNMS_OP) endif() +if (POLYGONBOXTRANSFORM_OP) + add_definitions(-DPOLYGONBOXTRANSFORM_OP) +endif() if (POOL_OP) add_definitions(-DPOOL_OP) endif() @@ -404,3 +413,10 @@ if (SUM_OP) add_definitions(-DSUM_OP) endif() +if (QUANT_OP) + add_definitions(-DQUANT_OP) +endif() +if (DEQUANT_OP) + add_definitions(-DDEQUANT_OP) +endif() +
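
Notes on the patch (illustrative sketches, not part of the diff):

1. Quantization rounding. The QuantizeParam default flips from round-half-to-even (ROUND_NEAREST_TO_EVEN) to round-half-away-from-zero (ROUND_NEAREST_AWAY_ZERO), and test_quantize_op.cpp switches its reference from quantize_round_to_even to quantize_round_to_nearest to match. A minimal standalone sketch of the two modes, assuming scale = 127 / max_abs(x) as in the tests and |x * scale| <= 127:

#include <cfenv>
#include <cmath>
#include <cstdint>

// Halfway cases go away from zero: 2.5 -> 3, -2.5 -> -3 (std::round semantics).
int8_t quantize_away_from_zero(float x, float scale) {
  return static_cast<int8_t>(std::round(x * scale));
}

// Halfway cases go to the nearest even integer: 2.5 -> 2, 3.5 -> 4
// (std::nearbyint under the FE_TONEAREST rounding mode).
int8_t quantize_to_even(float x, float scale) {
  std::fesetround(FE_TONEAREST);
  return static_cast<int8_t>(std::nearbyint(x * scale));
}

The two modes disagree only on exact halfway cases, which is why the test's reference implementation had to change together with the default.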
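2. int32 accumulation in the int8 GEMM tests. test_gemm_int8_accuracy.cpp and the rewritten test_mul_op.cpp both accumulate int8 products into int32_t. The bound is easy to check: with inputs drawn from [-127, 127], a single product is at most 127 * 127 = 16129, and the largest test dimension (k = 1024) sums at most 1024 * 16129 = 16516096, which overflows int16_t (max 32767) but fits comfortably in int32_t. A compile-time check of that arithmetic:

#include <cstdint>

static_assert(1024LL * 127 * 127 == 16516096LL,
              "worst-case int8 GEMM accumulator value for k = 1024");
static_assert(1024LL * 127 * 127 <= INT32_MAX, "sum fits in int32_t");
static_assert(1024LL * 127 * 127 > INT16_MAX, "sum would overflow int16_t");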
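3. fill_constant. FillConstantParam reads three attributes (dtype, shape, value) plus the output variable; the kernel itself is not shown in this diff. A rough sketch of what it has to do, assuming a float-only output and the Tensor/make_ddim API used elsewhere in this patch (the FillConstant helper name is hypothetical, not the actual kernel):

#include <vector>
#include "framework/ddim.h"
#include "framework/tensor.h"

// Resize the output to `shape`, then write `value` into every element.
void FillConstant(const std::vector<int64_t> &shape, float value,
                  paddle_mobile::framework::Tensor *out) {
  out->Resize(paddle_mobile::framework::make_ddim(shape));
  float *data = out->mutable_data<float>();
  for (int64_t i = 0; i < out->numel(); ++i) {
    data[i] = value;
  }
}

In the real kernel the dtype attribute would select the element type; test_fill_constant_op.cpp only exercises the float path.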
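4. polygon_box_transform. The new op only defines InferShape here (rank-4 input, even channel count, output of the same shape). As a hedged sketch, the CPU kernel is expected to mirror the upstream PaddlePaddle polygon_box_transform kernel, which turns EAST-style geometry offsets into absolute quad coordinates; the factor 4 below reflects the geometry maps being at 1/4 of the input-image resolution, and is an assumption carried over from upstream, not confirmed by this diff:

#include "framework/tensor.h"

// Assumes `out` was already resized to in.dims() by InferShape.
void PolygonBoxTransform(const paddle_mobile::framework::Tensor &in,
                         paddle_mobile::framework::Tensor *out) {
  auto dims = in.dims();  // [N, 2*K, H, W]; channel pairs hold (x, y) offsets
  const int batch = dims[0];
  const int channel = dims[1];
  const int height = dims[2];
  const int width = dims[3];
  const float *in_data = in.data<float>();
  float *out_data = out->mutable_data<float>();
  for (int n = 0; n < batch * channel; ++n) {
    for (int h = 0; h < height; ++h) {
      for (int w = 0; w < width; ++w) {
        int id = n * height * width + h * width + w;
        // even channels carry x offsets, odd channels carry y offsets;
        // 4 * (grid index) maps back to input-image pixel coordinates
        out_data[id] = (n % 2 == 0 ? 4.f * w : 4.f * h) - in_data[id];
      }
    }
  }
}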