Unverified commit ef66a4aa, authored by Ray Liu and committed by GitHub

Merge branch 'develop' into develop

......@@ -10,6 +10,7 @@ option(LOG_PROFILE "log profile" OFF)
option(CPU "armv7 with neon" ON)
option(GPU_MALI "mali gpu" OFF)
option(GPU_CL "opencl gpu" OFF)
option(FPGA "fpga" OFF)
if(FPGA)
option(FPGAV1 "fpga v1" ON)
......@@ -144,7 +145,7 @@ if(FPGA)
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if(FPGAV2)
......@@ -156,7 +157,7 @@ if(FPGA)
endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
......
......@@ -7,11 +7,21 @@
<!--[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle-Mobile.svg)](https://github.com/PaddlePaddle/Paddle-Mobile/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)-->
Welcome to the Paddle-Mobile GitHub project. Paddle-Mobile is a project under the PaddlePaddle organization and a deep learning framework dedicated to embedded platforms.
## Features
- High performance on ARM CPU
- Support for Mali GPU
- Support for Adreno GPU
- Support for Metal GPU on Apple devices
- Support for ZU5, ZU9, and other FPGA-based development boards
- Support for Raspberry Pi and other arm-linux development boards
## Features
- High performance on ARM CPU
- Support for Mali GPU
- Support for Adreno GPU
......@@ -19,6 +29,7 @@
- Support for ZU5, ZU9, and other FPGA development boards
- Support for Raspberry Pi and other arm-linux development boards
## Demo
- [ANDROID](https://github.com/xiebaiyuan/paddle-mobile-demo)
......@@ -26,6 +37,27 @@
[https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)
## Documentation
### Design documentation
If you want to learn more about the design of paddle-mobile, please refer to the link below. Many earlier designs and discussions can be found in the [issues](https://github.com/PaddlePaddle/paddle-mobile/issues).
[link to the design documentation](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/design_doc.md)
### Development documentation
The development documentation mainly covers building, running, and related tasks. As a developer, you can complete these tasks with the help of the documents below.
* [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
* [Android_CPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
* [Android_GPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android_GPU.md)
* [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md)
* [ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md)
### How to contribute documentation
- [Tutorial on contributing documentation](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
- The main procedure for contributing code is covered in the document above. If you run into other problems along the way, please file an [issue](https://github.com/PaddlePaddle/paddle-mobile/issues). We will handle it as quickly as possible.
## Documentation
### Design documentation
......@@ -46,6 +78,24 @@
- [Contribution guide](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
- The document above covers the main code-contribution workflow. If you run into other problems in practice, please file an [issue](https://github.com/PaddlePaddle/paddle-mobile/issues). We will handle it as soon as we see it.
## Acquisition of Models
At present, Paddle-Mobile only supports models trained with Paddle Fluid. If your model comes from a different framework, it must be converted before it can run.
### 1. Train directly with Paddle Fluid
This is the most reliable and recommended approach.
### 2. Convert a Caffe model to a Paddle Fluid model
[https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
### 3. ONNX
ONNX stands for Open Neural Network Exchange. The project aims to enable full interoperability between different neural network development frameworks.
Besides directly using Fluid models trained with PaddlePaddle, you can also obtain Paddle Fluid models through ONNX conversion.
At present, ONNX support is also under development at Baidu. The related conversion project can be found here:
[https://github.com/PaddlePaddle/paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)
### 4. Download some test models and test images
[http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)
## Model Acquisition
At present, Paddle-Mobile only supports models trained with Paddle Fluid. If your model comes from a different framework, it must be converted before it can run.
......@@ -64,6 +114,22 @@ ONNX全称为“Open Neural Network Exchange”,即“开放的神经网络切
### 4. Download some test models and test images
[http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)
<!--## Online results in Simple Search
The following gif shows the online main-body detection results of the Simple Search app
![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)-->
## Ask Questions
You are welcome to raise questions or help resolve existing ones. You can post your question in our GitHub issue tracker: [Github Issues](https://github.com/PaddlePaddle/paddle-mobile/issues).
## Copyright and License
Paddle-Mobile is provided under the relatively permissive Apache-2.0 open source license: [Apache-2.0 license](LICENSE).
## Old version Mobile-Deep-Learning
The original MDL (Mobile-Deep-Learning) project has been moved to [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
<!--## Online results in Simple Search
The following gif shows the online main-body detection results of the Simple Search app
......
......@@ -16,9 +16,9 @@ limitations under the License. */
#ifdef ENABLE_EXCEPTION
#include <stdio.h>
#include <stdlib.h>
#include <exception>
#include <string>
#endif
namespace paddle_mobile {
......
......@@ -22,9 +22,10 @@ const char *G_OP_TYPE_BATCHNORM = "batch_norm";
const char *G_OP_TYPE_BOX_CODER = "box_coder";
const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_ELEMENTWISE_SUB = "elementwise_sub";
const char *G_OP_TYPE_ELEMENTWISE_MUL = "elementwise_mul";
const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8 = "fusion_conv_add_relu_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
......@@ -32,7 +33,6 @@ const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const char *G_OP_TYPE_FC = "fusion_fc";
const char *G_OP_TYPE_FC_INT8 = "fusion_fc_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul";
......@@ -41,6 +41,7 @@ const char *G_OP_TYPE_POLYGON_BOX_TRANSFORM = "polygon_box_transform";
const char *G_OP_TYPE_POOL2D = "pool2d";
const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
const char *G_OP_TYPE_RELU = "relu";
const char *G_OP_TYPE_RELU6 = "relu6";
const char *G_OP_TYPE_RESHAPE = "reshape";
const char *G_OP_TYPE_RESHAPE2 = "reshape2";
const char *G_OP_TYPE_SIGMOID = "sigmoid";
......@@ -68,14 +69,20 @@ const char *G_OP_TYPE_CRF = "crf_decoding";
const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp";
const char *G_OP_TYPE_FLATTEN = "flatten";
const char *G_OP_TYPE_SHAPE = "shape";
const char *G_OP_TYPE_ELEMENTWISE_MUL = "elementwise_mul";
const char *G_OP_TYPE_SUM = "sum";
const char *G_OP_TYPE_TOP_K = "top_k";
const char *G_OP_TYPE_CAST = "cast";
const char *G_OP_TYPE_QUANTIZE = "quantize";
const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
const char *G_OP_TYPE_FUSION_DEQUANT_BN = "fusion_dequant_bn";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN = "fusion_dequant_add_bn";
const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU = "fusion_dequant_bn_relu";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU = "fusion_dequant_add_bn_relu";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT =
"fusion_dequant_add_bn_quant";
const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT =
"fusion_dequant_add_bn_relu_quant";
const char *G_OP_TYPE_TANH = "tanh";
const char *G_OP_TYPE_FUSION_DECONV_RELU = "fusion_deconv_relu";
......@@ -91,10 +98,13 @@ std::unordered_map<
{G_OP_TYPE_PRELU, {{"X", "Alpha"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_RELU6, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
{G_OP_TYPE_SIGMOID, {{"X"}, {"Out"}}},
{G_OP_TYPE_MUL, {{"X"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_ADD, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_SUB, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
{G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
{G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
......@@ -112,13 +122,11 @@ std::unordered_map<
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_FC_INT8, {{"X", "Y", "Z", "Scale"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
......@@ -139,12 +147,18 @@ std::unordered_map<
{G_OP_TYPE_SHAPE, {{"Input"}, {"Out"}}},
{G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}},
{G_OP_TYPE_SUM, {{"X"}, {"Out"}}},
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_TOP_K, {{"X"}, {"Out", "Indices"}}},
{G_OP_TYPE_CAST, {{"X"}, {"Out"}}},
{G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
{G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Y"}}},
{G_OP_TYPE_FUSION_DEQUANT_BN, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_BN_RELU, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT,
{{"X", "Scale"}, {"Out", "OutScale"}}},
{G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT,
{{"X", "Scale"}, {"Out", "OutScale"}}},
{G_OP_TYPE_TANH, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_DECONV_ADD, {{"Input"}, {"Out"}}},
......
......@@ -87,10 +87,24 @@ enum PMStatus {
};
enum RoundType {
ROUND_UNK = 0,
ROUND_NEAREST_AWAY_ZERO = 1,
ROUND_NEAREST_TOWARDS_ZERO = 2,
ROUND_NEAREST_TO_EVEN = 3
ROUND_NEAREST_AWAY_ZERO = 0,
ROUND_NEAREST_TOWARDS_ZERO = 1,
ROUND_NEAREST_TO_EVEN = 2,
};
enum ActivationType {
IDENTITY = 0,
RELU = 1,
RELU6 = 2,
PRELU = 3,
LEAKY_RELU = 4,
TANH = 5,
SIGMOID = 6,
};
enum PoolingType {
MAX = 0,
AVG = 1,
};
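// Illustrative sketch (added for clarity, not part of the original commit): one
// plausible reading of the three RoundType values when rounding a float to an
// integer during quantization, inferred from the enum names only:
//   ROUND_NEAREST_AWAY_ZERO    : 2.5 ->  3, -2.5 -> -3
//   ROUND_NEAREST_TOWARDS_ZERO : 2.5 ->  2, -2.5 -> -2
//   ROUND_NEAREST_TO_EVEN      : 2.5 ->  2,  3.5 ->  4
// (requires <cmath>)
inline int round_with_mode(float x, RoundType mode) {
  switch (mode) {
    case ROUND_NEAREST_AWAY_ZERO:
      return static_cast<int>(std::round(x));  // halves rounded away from zero
    case ROUND_NEAREST_TOWARDS_ZERO:
      return static_cast<int>(x > 0 ? std::ceil(x - 0.5f) : std::floor(x + 0.5f));
    case ROUND_NEAREST_TO_EVEN:
    default:
      return static_cast<int>(std::nearbyint(x));  // default FP mode: ties to even
  }
}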
extern const char *G_OP_TYPE_CONV;
......@@ -98,12 +112,12 @@ extern const char *G_OP_TYPE_BATCHNORM;
extern const char *G_OP_TYPE_BOX_CODER;
extern const char *G_OP_TYPE_CONCAT;
extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
extern const char *G_OP_TYPE_ELEMENTWISE_SUB;
extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU;
extern const char *G_OP_TYPE_FC;
extern const char *G_OP_TYPE_FC_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU;
......@@ -116,6 +130,7 @@ extern const char *G_OP_TYPE_MULTICLASS_NMS;
extern const char *G_OP_TYPE_POOL2D;
extern const char *G_OP_TYPE_PRIOR_BOX;
extern const char *G_OP_TYPE_RELU;
extern const char *G_OP_TYPE_RELU6;
extern const char *G_OP_TYPE_RESHAPE;
extern const char *G_OP_TYPE_SIGMOID;
extern const char *G_OP_TYPE_SOFTMAX;
......@@ -136,13 +151,17 @@ extern const char *G_OP_TYPE_FUSION_CONV_BN;
extern const char *G_OP_TYPE_CONV_TRANSPOSE;
extern const char *G_OP_TYPE_PRELU;
extern const char *G_OP_TYPE_SUM;
extern const char *G_OP_TYPE_ELEMENTWISE_MUL;
extern const char *G_OP_TYPE_TOP_K;
extern const char *G_OP_TYPE_CAST;
extern const char *G_OP_TYPE_QUANTIZE;
extern const char *G_OP_TYPE_DEQUANTIZE;
extern const char *G_OP_TYPE_FUSION_DEQUANT_BN;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN;
extern const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT;
extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT;
extern const char *G_OP_TYPE_TANH;
extern const char *G_OP_TYPE_FUSION_DECONV_RELU;
......
......@@ -24,8 +24,6 @@ namespace fpga {
#define USE_RELU 1
#define USE_BIAS 2
int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); }
void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3];
......@@ -83,6 +81,13 @@ int get_plit_num(framework::Tensor *filter_tensor) {
int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_split_num(num, div_capacity);
}
int get_deconv_plit_num(framework::Tensor *filter_tensor, int stride) {
auto dims = filter_tensor->dims();
auto chw = dims[1] * dims[2] / stride * dims[3] / stride;
auto num = dims[0] * stride;
int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_split_num(num, div_capacity);
}
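// Illustrative arithmetic (added note, not in the original commit): a deconv with
// stride s is run as s sub-convolutions, so each sub-filter shrinks spatially by s
// while the effective filter count grows by s. For a 64 x 3 x 4 x 4 deconv filter
// with stride 2: chw = 3 * (4 / 2) * (4 / 2) = 12 and num = 64 * 2 = 128, which is
// what get_deconv_plit_num feeds into the division-capacity computation.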
int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
auto dims = filter_tensor->dims();
......@@ -92,12 +97,17 @@ int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
return filter::calc_num_per_div(num, group_num, div_capacity);
}
int get_aligned_filter_element_num(int chw) {
return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
int get_deconv_filter_num_per_div(framework::Tensor *filter_tensor,
int group_num, int stride) {
auto dims = filter_tensor->dims();
auto chw = dims[1] * dims[2] / stride * dims[3] / stride;
auto num = dims[0] * stride;
int div_capacity = filter::calc_division_capacity(chw);
return filter::calc_num_per_div(num, group_num, div_capacity);
}
int get_aligned_filter_num(int num) {
return align_to_x(num, FILTER_NUM_ALIGNMENT);
int get_aligned_filter_element_num(int chw) {
return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
}
void format_filter(framework::Tensor *filter_tensor, float max_value,
......@@ -177,46 +187,37 @@ void format_concat_output(framework::Tensor *out, int height, int width,
void expand_conv_arg(ConvArgs *arg) {
ConvArgs args = *arg;
uint64_t filterlen = (uint64_t)args.kernel.width *
(uint64_t)args.kernel.height *
(uint64_t)args.image.channels;
filterlen = align_to_x(filterlen, FILTER_ELEMENT_ALIGNMENT);
filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGNMENT);
uint64_t fpga_bias_scale_len =
auto fpga_bias_scale_len =
align_to_x(args.filter_num / args.group_num, 8) * args.group_num;
uint64_t output_height =
auto output_height =
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h +
1;
uint64_t output_width =
auto output_width =
(args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1;
uint64_t output_size =
output_height * output_width * (uint64_t)args.filter_num;
auto filter_per_group = (uint64_t)(args.filter_num / args.group_num);
auto channel_per_group = (uint64_t)(args.image.channels / args.group_num);
uint64_t image_row_count = ((uint64_t)args.image.width) *
((uint64_t)args.image.channels); // without align
uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row =
align_to_x(image_row_count, IMAGE_ALIGNMENT) +
((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels);
uint64_t filter_amount_all =
align_to_x(((uint64_t)args.kernel.height) *
((uint64_t)args.kernel.width) * channel_per_group,
auto filter_per_group = args.filter_num / args.group_num;
auto channel_per_group = args.image.channels / args.group_num;
auto image_row_count = args.image.width * args.image.channels;
auto image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT);
auto image_one_pad_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT) +
args.image.pad_width * args.image.channels;
auto filter_amount_all =
align_to_x(args.kernel.height * args.kernel.width * channel_per_group,
FILTER_ELEMENT_ALIGNMENT);
uint64_t output_amount_per_row =
align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGNMENT);
auto output_amount_per_row =
align_to_x(output_width * args.filter_num, IMAGE_ALIGNMENT);
// find the opt partition strategy
uint64_t res_win;
uint64_t res_fit = 0;
for (res_win = 1; res_win <= output_width; res_win = res_win + 1) {
for (res_win = 1; res_win <= output_width; res_win++) {
if ((align_to_x(
(args.image.channels *
(args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
......@@ -238,48 +239,48 @@ void expand_conv_arg(ConvArgs *arg) {
}
res_fit = res_win;
uint64_t block_num = (output_width + res_fit - 1) / res_fit;
uint64_t block_len = res_fit;
uint64_t block_last = output_width - res_fit * (block_num - 1);
auto block_num = (output_width + res_fit - 1) / res_fit;
auto block_len = res_fit;
auto block_last = output_width - res_fit * (block_num - 1);
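// Illustrative arithmetic (added note, not in the original commit): if
// output_width = 100 and the search above settles on res_fit = 14, then
// block_num = (100 + 14 - 1) / 14 = 8, block_len = 14 and
// block_last = 100 - 14 * 7 = 2, i.e. each output row is computed in 8 windows
// of 14 columns with a final partial window of 2 columns.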
uint64_t res_amount_per_row = output_width * args.filter_num;
uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;
auto res_amount_per_row = output_width * args.filter_num;
auto res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;
uint64_t image_block_amount_per_row =
args.kernel.stride_w * (res_fit)*args.image.channels;
uint64_t filter_pad_width_mul_channel =
auto image_block_amount_per_row =
args.kernel.stride_w * res_fit * args.image.channels;
auto filter_pad_width_mul_channel =
args.image.pad_width * args.image.channels;
uint64_t image_amount_per_row_multi_win_first =
auto image_amount_per_row_multi_win_first =
image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height);
uint64_t image_amount_per_row_multi_win =
auto image_amount_per_row_multi_win =
image_amount_per_row * (4 * args.kernel.stride_h);
uint64_t image_block_num = block_num;
uint64_t image_block_len =
auto image_block_num = block_num;
auto image_block_len =
align_to_x((args.image.channels *
(args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) /
16 +
1;
uint64_t image_block_len_last =
auto image_block_len_last =
align_to_x(
(args.image.channels *
(args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) /
16 +
1;
uint64_t image_win_cnt = block_len;
uint64_t image_win_cnt_last = block_last;
uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8;
uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
auto image_win_cnt = block_len;
auto image_win_cnt_last = block_last;
auto res_row_data_align4_pad = res_amount_per_row_pad / 8;
auto prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
if (prog_full_cnt == 1023) {
prog_full_cnt--;
}
uint64_t post_prog_full_cnt =
auto post_prog_full_cnt =
(512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
: 0;
uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
(*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
(*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
......@@ -449,7 +450,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->sub_conv_num = (uint32_t)stride_h;
arg->filter_num = (uint32_t)filter->dims()[0];
int sub_conv_num = arg->sub_conv_num;
int sub_stride = 1;
int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],
padding_w, stride_w);
int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis(
......@@ -464,7 +464,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->sub_output_height = (uint32_t)sub_output_height;
arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
stride_w, (int)filter->dims()[3], padding_w);
arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs));
arg->output.address = out_ptr;
arg->output.scale_address = out->scale;
int sub_channels = (int)input->dims()[1];
int omit_size = arg->omit_size;
......@@ -484,50 +486,160 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
int align_conv_sub_filter_count =
align_sub_filter_count * align_sub_filter_num;
int split_num =
group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1;
arg->split_conv_args =
(SplitConvArgs *)fpga_malloc(sub_conv_num * sizeof(SplitConvArgs));
for (int i = 0; i < sub_conv_num; ++i) {
arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num);
arg->conv_args[i].group_num = (uint32_t)group_num;
arg->conv_args[i].filter_scale_address = filter->scale;
arg->conv_args[i].relu_enabled = relu_enabled;
arg->conv_args[i].kernel.width = (uint32_t)sub_filter_width;
arg->conv_args[i].kernel.height = (uint32_t)sub_filter_width;
arg->conv_args[i].kernel.stride_w = 1;
arg->conv_args[i].kernel.stride_h = 1;
// DeconvParam.conv_args[i].image.address = (void*)ptr_image;
arg->conv_args[i].image.scale_address = input->scale;
arg->conv_args[i].image.channels = (uint32_t)sub_channels;
arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
arg->conv_args[i].image.pad_width = (uint32_t)sub_pad;
arg->conv_args[i].image.pad_height = (uint32_t)sub_pad;
arg->conv_args[i].image.address = input_ptr;
arg->conv_args[i].sb_address = (void *)bs_ptr;
auto filter_sub_space =
(char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char));
fpga_copy(filter_sub_space,
(char *)filter_ptr + i * align_conv_sub_filter_count,
(size_t)align_conv_sub_filter_count);
arg->conv_args[i].filter_address = (void *)(filter_sub_space);
fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count);
arg->split_conv_args[i].filter_num =
(arg->sub_conv_num) * (arg->filter_num);
arg->split_conv_args[i].group_num = (uint32_t)group_num;
arg->split_conv_args[i].split_num = split_num;
arg->split_conv_args[i].conv_arg =
(ConvArgs *)fpga_malloc(split_num * sizeof(ConvArgs));
arg->split_conv_args[i].concat_arg.height = sub_output_height;
arg->split_conv_args[i].concat_arg.width = sub_output_width;
arg->split_conv_args[i].concat_arg.image_num = split_num;
arg->split_conv_args[i].concat_arg.images_in =
(half **)fpga_malloc(split_num * sizeof(half *));
arg->split_conv_args[i].concat_arg.scales_in =
(float **)fpga_malloc(split_num * sizeof(float *));
arg->split_conv_args[i].concat_arg.channel_num =
(uint32_t *)fpga_malloc(split_num * sizeof(uint32_t));
// arg->split_conv_args[i].concat_arg.image_out =
// fpga_malloc(conv_output_size * sizeof(half));
// arg->split_conv_args[i].concat_arg.scale_out = fpga_malloc(2 *
// sizeof(float));
}
int filter_num_per_div =
get_deconv_filter_num_per_div(filter, group_num, stride_w);
int element_num = get_aligned_filter_element_num(
(int)(sub_channels * sub_filter_width * sub_filter_width));
int chw = sub_channels * sub_filter_width * sub_filter_width;
int division_capacity = filter::calc_division_capacity(chw);
int num_per_div_before_alignment =
filter::calc_num_per_div(sub_filter_num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num = (sub_filter_num + num_per_div_before_alignment - 1) /
num_per_div_before_alignment;
int residual = sub_filter_num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
int filter_sub_conv_offset = element_num * num_after_alignment;
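// Illustrative arithmetic (added note, not in the original commit), assuming
// FILTER_NUM_ALIGNMENT is 32: with sub_filter_num = 144 and
// num_per_div_before_alignment = 48, we get num_per_div_after_alignment = 64,
// div_num = 3, residual = 0 and num_after_alignment = 64 * 3 = 192, so
// filter_sub_conv_offset is the int8 element offset between the filter blocks
// of consecutive sub-convolutions.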
for (int i = 0; i < sub_conv_num; ++i) {
if (sub_conv_num == 1) {
arg->conv_args[i].output.address = out_ptr;
arg->conv_args[i].output.scale_address = out->scale;
arg->split_conv_args[i].output.address = arg->output.address;
arg->split_conv_args[i].output.scale_address = arg->output.scale_address;
} else {
auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
arg->conv_args[i].output.address = (void *)((half *)ptr_output);
arg->split_conv_args[i].output.address = (void *)((half *)ptr_output);
auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
arg->conv_args[i].output.scale_address = ptr_output_scale;
arg->split_conv_args[i].output.scale_address = ptr_output_scale;
}
for (int j = 0; j < split_num; ++j) {
arg->split_conv_args[i].conv_arg[j].relu_enabled = relu_enabled;
arg->split_conv_args[i].conv_arg[j].group_num = (uint32_t)group_num;
arg->split_conv_args[i].conv_arg[j].kernel.width =
(uint32_t)sub_filter_width;
arg->split_conv_args[i].conv_arg[j].kernel.height =
(uint32_t)sub_filter_width;
arg->split_conv_args[i].conv_arg[j].kernel.stride_w = 1;
arg->split_conv_args[i].conv_arg[j].kernel.stride_h = 1;
arg->split_conv_args[i].conv_arg[j].image.scale_address = input->scale;
arg->split_conv_args[i].conv_arg[j].image.channels =
(uint32_t)sub_channels;
arg->split_conv_args[i].conv_arg[j].image.width =
(uint32_t)input->dims()[3];
arg->split_conv_args[i].conv_arg[j].image.height =
(uint32_t)input->dims()[2];
arg->split_conv_args[i].conv_arg[j].image.pad_width = (uint32_t)sub_pad;
arg->split_conv_args[i].conv_arg[j].image.pad_height = (uint32_t)sub_pad;
arg->split_conv_args[i].conv_arg[j].image.address = input_ptr;
arg->split_conv_args[i].conv_arg[j].filter_scale_address = filter->scale;
arg->split_conv_args[i].conv_arg[j].filter_num = (uint32_t)(
j == split_num - 1
? sub_filter_num - (split_num - 1) * filter_num_per_div // NOLINT
: filter_num_per_div);
size_t filter_size =
element_num *
align_to_x(arg->split_conv_args[i].conv_arg[j].filter_num,
FILTER_NUM_ALIGNMENT) *
sizeof(int8_t);
auto filter_head =
&((int8_t *)filter_ptr)[j * element_num * filter_num_per_div +
i * filter_sub_conv_offset];
arg->split_conv_args[i].conv_arg[j].filter_address =
fpga_malloc(filter_size);
memcpy(arg->split_conv_args[i].conv_arg[j].filter_address, filter_head,
filter_size);
fpga_flush(arg->split_conv_args[i].conv_arg[j].filter_address,
filter_size);
{
static int test_cnt = 0;
signed char result = 0;
if (test_cnt <= 1) {
std::string filename = "deconv_split_flt" + std::to_string(test_cnt);
fpga::savefile<signed char>(
filename, arg->split_conv_args[i].conv_arg[j].filter_address,
filter_size, result);
test_cnt++;
}
}
arg->output.address = out_ptr;
arg->output.scale_address = out->scale;
// fpga_free(filter_ptr);
size_t bs_align_num = align_to_x(
arg->split_conv_args[i].conv_arg[j].filter_num, BS_NUM_ALIGNMENT);
size_t bs_size = 2 * bs_align_num * sizeof(float);
auto bs_head = &bs_ptr[j * filter_num_per_div * 2];
arg->split_conv_args[i].conv_arg[j].sb_address = fpga_malloc(bs_size);
memcpy(arg->split_conv_args[i].conv_arg[j].sb_address, bs_head, bs_size);
fpga_flush(arg->split_conv_args[i].conv_arg[j].sb_address, bs_size);
if (split_num == 1) {
arg->split_conv_args[i].conv_arg[j].output.address =
arg->split_conv_args[i].output.address;
arg->split_conv_args[i].conv_arg[j].output.scale_address =
arg->split_conv_args[i].output.scale_address;
} else {
auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
arg->split_conv_args[i].conv_arg[j].output.address =
(void *)((half *)ptr_output);
auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
arg->split_conv_args[i].conv_arg[j].output.scale_address =
ptr_output_scale;
}
arg->split_conv_args[i].concat_arg.images_in[j] =
(half *)arg->split_conv_args[i].conv_arg[j].output.address; // NOLINT
arg->split_conv_args[i].concat_arg.scales_in[j] =
arg->split_conv_args[i].conv_arg[j].output.scale_address;
arg->split_conv_args[i].concat_arg.channel_num[j] =
arg->split_conv_args[i].conv_arg[j].filter_num;
expand_conv_arg(&(arg->split_conv_args[i].conv_arg[j]));
}
arg->split_conv_args[i].concat_arg.image_out =
arg->split_conv_args[i].output.address;
arg->split_conv_args[i].concat_arg.scale_out =
arg->split_conv_args[i].output.scale_address;
}
filter->reset_data_ptr(nullptr);
fpga_free(bs_ptr);
} // fill_deconv_arg
} // namespace fpga
......
......@@ -21,16 +21,19 @@ limitations under the License. */
namespace paddle_mobile {
namespace fpga {
int get_align_image_cw(int cw);
void format_image(framework::Tensor* image_tensor);
void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_fp32_ofm(framework::Tensor* ofm_tensor);
float filter_find_max(framework::Tensor* filter_tensor);
int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num);
int get_deconv_filter_num_per_div(framework::Tensor* filter_tensor,
int group_num, int stride);
int get_plit_num(framework::Tensor* filter_tensor);
int get_deconv_plit_num(framework::Tensor* filter_tensor, int stride);
int get_aligned_filter_element_num(int chw);
int get_aligned_filter_num(int num);
void format_filter(framework::Tensor* filter_tensor, float max_value,
int group_num);
void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
......
......@@ -40,10 +40,9 @@ inverse kernel weights of each channel for every filter
void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height) {
float* tmp = *data_in;
// float fix_range = 127;// float scale = fix_range / max;
int data_size = num * channel * width * height;
int hw_len = height * width;
float* tmp_data = (float*)fpga_malloc(data_size * sizeof(float));
auto tmp_data = (float*)fpga_malloc(data_size * sizeof(float));
for (int i = 0; i < num; ++i) {
for (int j = 0; j < channel; ++j) {
for (int k = 0; k < hw_len; ++k) {
......@@ -52,7 +51,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
}
}
}
*data_in = (float*)tmp_data; //
*data_in = tmp_data;
fpga_free(tmp);
}
......@@ -61,8 +60,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
*/
int deconv_calc_sub_pad(int filter_axis, int pad, int stride) {
if (stride == 0 || ((filter_axis - pad - 1) < 0)) {
// error
return 0;
PADDLE_MOBILE_ENFORCE(false, "Wrong deconv parameters");
}
return (filter_axis - pad - 1) / stride;
}
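// Worked example (added note, not in the original commit): for a 5x5 deconv
// filter with pad = 1 and stride = 2, each sub-convolution pads by
// (filter_axis - pad - 1) / stride = (5 - 1 - 1) / 2 = 1. The ENFORCE above now
// reports invalid parameters instead of silently returning 0.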
......@@ -79,11 +77,8 @@ int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) {
position. so the omit rows or columns is (stride - )
*/
int deconv_get_omit(int stride, int filter_width, int pad) {
if (((filter_width - pad) <= 0)) { // ((filter_width-pad) > stride) ||
// error
return 0;
}
int idx = 1;
PADDLE_MOBILE_ENFORCE(filter_width > pad, "Wrong deconv parameters");
int idx;
bool flag = false;
for (idx = 1; idx <= stride; ++idx) {
int j = idx;
......@@ -102,10 +97,6 @@ int deconv_get_omit(int stride, int filter_width, int pad) {
return (stride - idx);
}
int deconv_get_sub_filter_num(int filter_num, int stride) {
return filter_num * stride;
}
void deconv_get_sub_filter(char** data_in, int height, int width,
int sub_conv_n, int kernel_num, int channel) {
char* ptr_tmp = *data_in;
......@@ -245,7 +236,6 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset *
sizeof(char)); // continuous space
for (int i = 0; i < sub_conv_n; ++i) {
int offset = i * origin_offset;
char* ptr_tmp = (ptr_ptr_data)[i];
filter::align_element(&ptr_tmp, sub_num, sub_chw);
......
......@@ -21,7 +21,6 @@ namespace deconv_filter {
void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height);
int deconv_calc_sub_pad(int filter_axis, int pad, int stride);
int deconv_get_sub_filter_num(int filter_num, int stride);
int deconv_get_sub_filter_axis(int filter_axis, int stride);
int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis);
int deconv_get_omit(int stride, int filter_width, int pad);
......
......@@ -13,23 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/common/pe.h"
#include <unistd.h>
#include <iomanip>
#include <iostream>
#include "common/types.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
#include "fpga/common/config.h"
#include "fpga/common/driver.h"
using namespace std;
using namespace paddle_mobile::fpga::driver; // NOLINT
#ifdef COST_TIME_PRINT
#include <sys/time.h>
#include <time.h>
#include <iomanip>
#include <iostream>
//#include <iostream>
#endif
namespace paddle_mobile {
namespace fpga {
#define IMAGE_ALIGN 16
#define FILTER_ALIGN 16
#define FILTER_NUM_ALIGN 32
using namespace driver; // NOLINT
using namespace std;
#define USE_RELU 1
#define USE_BIAS 2
......@@ -170,15 +172,17 @@ int ComputeFpgaConv(const struct SplitConvArgs &args) {
<< " group_num:" << args.group_num
<< " split_num:" << args.split_num;
#endif
int ret = 0;
int split_num = args.split_num;
for (int i = 0; i < split_num; i++) {
ComputeBasicConv(args.conv_arg[i]);
ret |= ComputeBasicConv(args.conv_arg[i]);
}
if (split_num > 1) {
ComputeFPGAConcat(args.concat_arg);
}
return ret;
}
int ComputeBasicConv(const struct ConvArgs &args) {
......@@ -207,129 +211,6 @@ int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef PADDLE_MOBILE_ZU5
int ret = 0;
uint64_t output_scale = 0;
/*
uint64_t output_scale;
uint64_t image_scale;
uint64_t filter_scale;
uint64_t image_address_phy = 0;
uint64_t sb_address_phy = 0;
uint64_t filter_address_phy = 0;
uint64_t output_address_phy = 0;
fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float));
fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float));
uint64_t filterlen = (uint64_t)args.kernel.width *
(uint64_t)args.kernel.height *
(uint64_t)args.image.channels;
filterlen = align_to_x(filterlen, FILTER_ALIGN);
filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGN);
uint64_t fpga_bias_scale_len =
align_to_x(args.filter_num / args.group_num, 8) * args.group_num;
uint64_t output_height =
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h +
1;
uint64_t output_width =
(args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1;
uint64_t output_size =
output_height * output_width * (uint64_t)args.filter_num;
uint64_t filter_per_group = (uint64_t)(args.filter_num / args.group_num);
uint64_t channel_per_group = (uint64_t)(args.image.channels / args.group_num);
uint64_t image_row_count = ((uint64_t)args.image.width) *
((uint64_t)args.image.channels); // without align
uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGN);
uint64_t image_one_pad_per_row =
align_to_x(image_row_count, IMAGE_ALIGN) +
((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels);
uint64_t filter_amount_all =
align_to_x(((uint64_t)args.kernel.height) *
((uint64_t)args.kernel.width) * channel_per_group,
FILTER_ALIGN);
uint64_t output_amount_per_row =
align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGN);
// find the opt partition strategy
uint64_t res_win;
uint64_t res_fit = 0;
for (res_win = 1; res_win <= output_width; res_win = res_win + 1) {
if ((align_to_x(
(args.image.channels *
(args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
IMAGE_ALIGN) /
16 +
1) *
args.kernel.height >
2048) {
break;
}
}
if (res_win != output_width) {
res_win -= 1;
}
if (((res_win % 2) != 0) && (res_win != 1)) {
res_win = res_win - 1;
}
res_fit = res_win;
uint64_t block_num = (output_width + res_fit - 1) / res_fit;
uint64_t block_len = res_fit;
uint64_t block_last = output_width - res_fit * (block_num - 1);
uint64_t res_amount_per_row = output_width * args.filter_num;
uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;
uint64_t image_block_amount_per_row =
args.kernel.stride_w * (res_fit)*args.image.channels;
uint64_t filter_pad_width_mul_channel =
args.image.pad_width * args.image.channels;
uint64_t image_amount_per_row_multi_win_first =
image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height);
uint64_t image_amount_per_row_multi_win =
image_amount_per_row * (4 * args.kernel.stride_h);
uint64_t image_block_num = block_num;
uint64_t image_block_len =
align_to_x((args.image.channels *
(args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
IMAGE_ALIGN) /
16 +
1;
uint64_t image_block_len_last =
align_to_x(
(args.image.channels *
(args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
IMAGE_ALIGN) /
16 +
1;
uint64_t image_win_cnt = block_len;
uint64_t image_win_cnt_last = block_last;
uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8;
uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
if (prog_full_cnt == 1023) {
prog_full_cnt--;
}
uint64_t post_prog_full_cnt =
(512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
: 0;
image_address_phy = vaddr_to_paddr(args.image.address);
sb_address_phy = vaddr_to_paddr(args.sb_address);
filter_address_phy = vaddr_to_paddr(args.filter_address);
output_address_phy = vaddr_to_paddr(args.output.address);
uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
*/
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {
ret = -EIO;
......@@ -357,7 +238,6 @@ int ComputeBasicConv(const struct ConvArgs &args) {
reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE);
reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE);
reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR);
......@@ -381,14 +261,14 @@ int ComputeBasicConv(const struct ConvArgs &args) {
reg_writeq(args.driver.prog_full_cnt, 0xd08);
reg_writeq(args.driver.post_prog_full_cnt, 0xd10);
reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
reg_writeq(args.driver.cmd, REG_CONV_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
ret = -EIO;
DLOG << "Conv Wait Irq Timeout!";
}
DLOG << "after reg poll";
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
......@@ -398,9 +278,9 @@ int ComputeBasicConv(const struct ConvArgs &args) {
return ret;
#endif
return 0;
}
} // ComputeBasicConv
int ComputeFpgaPool(const struct PoolingArgs &args) {
#ifdef FPGA_PRINT_MODE
......@@ -428,13 +308,11 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
uint64_t timer_cnt = 0;
int ret = 0;
uint64_t cmd = 0;
uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0;
image_physical_address = vaddr_to_paddr(args.image.address);
output_physical_address = vaddr_to_paddr(args.output.address);
image_physical_address = vaddr_to_paddr_driver(args.image.address);
output_physical_address = vaddr_to_paddr_driver(args.output.address);
uint32_t output_height = (uint32_t)(
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h +
......@@ -443,37 +321,35 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
(args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1);
uint64_t image_amount_per_row = align_to_x(
(uint64_t)args.image.width * (uint64_t)args.image.channels, IMAGE_ALIGN);
uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
FILTER_ALIGN) +
FILTER_ELEMENT_ALIGNMENT) +
(uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t image_two_pad_per_row = align_to_x(
((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) *
(uint64_t)args.image.channels,
IMAGE_ALIGN);
IMAGE_ALIGNMENT);
uint64_t image_row_mul_pooling_hight =
image_amount_per_row * (uint64_t)args.kernel.height;
uint64_t image_row_mul_pad_hight =
image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_row_mul_step_hight =
image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t result_amount_align_32 = align_to_x(
(uint64_t)output_width * (uint64_t)args.image.channels, FILTER_ALIGN);
uint64_t result_amount_align_32 =
align_to_x((uint64_t)output_width * (uint64_t)args.image.channels,
FILTER_ELEMENT_ALIGNMENT);
uint64_t result_amount_align_64 = align_to_x(
(uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGN);
(uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT);
uint64_t image_calcu_height =
(uint64_t)args.kernel.height +
((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t image_pad_left = args.image.channels * args.image.pad_width;
uint64_t image_skip_window = args.image.channels * args.kernel.stride_w;
uint64_t image_padleft_skipwindow =
(image_skip_window << 32) | image_pad_left;
uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 |
(((uint64_t)args.kernel_reciprocal));
......@@ -485,50 +361,36 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
return ret;
}
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
reg_writeq(
((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
REG_POOLING_IMAGE_PIXEL);
reg_writeq(
((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
REG_POOLING_WINDOW_SIZE);
reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32),
REG_POOLING_RESULT_PIXEL);
reg_writeq(((uint64_t)args.image.pad_height) |
(((uint64_t)args.image.pad_width) << 32),
REG_POOLING_PAD_PIXEL);
reg_writeq(((uint64_t)args.kernel.stride_h) |
(((uint64_t)args.kernel.stride_w) << 32),
REG_POOLING_STEP_PIXEL);
reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER);
reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW);
reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW);
reg_writeq(image_row_mul_pooling_hight,
REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT);
reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT);
reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT);
reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32);
reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64);
reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT);
reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW);
reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL);
/* The SDK flushes the cache to ensure data consistency */
reg_writeq(cmd, REG_POOLING_CMD);
DLOG << "before reg poll";
......@@ -549,7 +411,8 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
return ret;
#endif
return 0;
}
} // ComputeFpgaPool
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_PRINT_MODE
......@@ -577,27 +440,6 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef PADDLE_MOBILE_ZU5
int ret = 0;
uint64_t output_scale = 0;
/*uint64_t timer_cnt = 0;
uint64_t image0_address_phy = 0;
uint64_t image1_address_phy = 0;
uint64_t output_address_phy = 0;
uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
uint64_t datalen = (uint64_t)args.image0.width *
(uint64_t)args.image0.height *
(uint64_t)args.image0.channels;
uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
image0_address_phy = vaddr_to_paddr(args.image0.address);
image1_address_phy = vaddr_to_paddr(args.image1.address);
output_address_phy = vaddr_to_paddr(args.output.address);
uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
IMAGE_ALIGN);
uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
((uint64_t)args.image0.width << 16) |
(uint64_t)args.image0.height;*/
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
ret = -EIO;
......@@ -631,7 +473,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
return ret;
#endif
return 0;
}
} // ComputeFpgaEWAdd
int PerformBypass(const struct BypassArgs &args) {
#ifdef FPGA_PRINT_MODE
......@@ -651,9 +494,6 @@ int PerformBypass(const struct BypassArgs &args) {
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifdef PADDLE_MOBILE_ZU5
DLOG << "Bypass";
// return 0;
struct fpga_pe *pe;
uint64_t output_scale = 0;
uint64_t timer_cnt = 0;
uint64_t cmd = 0;
......@@ -662,15 +502,12 @@ int PerformBypass(const struct BypassArgs &args) {
uint64_t output_address_phy = 0;
uint8_t data_cell_in = 0;
uint8_t data_cell_out = 0;
int ret = 0;
datalen = (uint64_t)args.image.width * (uint64_t)args.image.height *
(uint64_t)args.image.channels;
datalen = align_to_x(datalen, 16);
input_address_phy = vaddr_to_paddr(args.image.address);
output_address_phy = vaddr_to_paddr(args.output.address);
input_address_phy = vaddr_to_paddr_driver(args.image.address);
output_address_phy = vaddr_to_paddr_driver(args.output.address);
DLOG << "input_phy:" << input_address_phy;
DLOG << "output_phy:" << output_address_phy;
......@@ -733,14 +570,10 @@ int PerformBypass(const struct BypassArgs &args) {
return ret;
}
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(input_address_phy, REG_CONVERT_SRC_ADDR);
reg_writeq(output_address_phy, REG_CONVERT_DST_ADDR);
reg_writeq(datalen, REG_CONVERT_LENGTH);
/* The SDK flushes the cache to ensure data consistency */
reg_writeq(cmd, REG_CONVERT_CMD);
DLOG << "before reg poll";
......@@ -754,15 +587,14 @@ int PerformBypass(const struct BypassArgs &args) {
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
return 0;
}
} // PerformBypass
int ComputeFPGAConcat(const struct ConcatArgs &args) {
#ifdef FPGA_PRINT_MODE
......@@ -776,7 +608,7 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) {
DLOG << " " << i << "th: ";
DLOG << " channel_num:"
<< args.channel_num[i]
// << " aligned_channel_num:" << args.aligned_channel_num[i]
//<< " aligned_channel_num:" << args.aligned_channel_num[i]
<< " image_address:" << args.images_in[i]
<< " image_scale_address:" << args.scales_in[i];
}
......@@ -786,10 +618,15 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) {
args.scale_out, args.image_num, args.channel_num,
args.height, args.width);
return 0;
}
void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel,
int sub_height, int sub_width, int omit_size) {
} // ComputeFPGAConcat
void deconv_post_process(const struct DeconvArgs &args) {
int sub_conv_n = args.sub_conv_num;
int sub_height = args.sub_output_height;
int sub_width = args.sub_output_width;
int omit_size = args.omit_size;
int channel = args.filter_num;
int num = 1;
int origin_h = sub_height * sub_conv_n;
int origin_w = sub_width * sub_conv_n;
int align_origin_w = align_to_x(origin_w * channel, 16);
......@@ -797,73 +634,110 @@ void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel,
int deconv_w = origin_w - 2 * omit_size;
int deconv_row_len = deconv_w * channel;
int align_deconv_row_len = align_to_x(deconv_row_len, 16);
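// Added note (not in the original commit): the loop below interleaves the
// sub_conv_n sub-convolution outputs row by row (row hh of the stitched image is
// taken from sub-output index sub_conv_n - (hh % sub_conv_n) - 1) and drops
// omit_size rows and columns on every border, shrinking origin_h x origin_w down
// to deconv_h x deconv_w.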
half *ptr_tmp = *data_in;
half *ptr_deconv =
(half *)fpga_malloc(num * align_deconv_row_len * deconv_h * sizeof(half));
memset(ptr_deconv, 0, num * align_deconv_row_len * deconv_h * sizeof(half));
for (int idx = 0; idx < sub_conv_n; ++idx) {
paddle_mobile::fpga::fpga_invalidate(
args.split_conv_args[idx].output.address,
align_origin_w * origin_h * sizeof(int16_t));
}
int deconv_idx = 0;
for (int nn = 0; nn < num; ++nn) {
for (int hh = 0; hh < origin_h; ++hh) {
int hx = (hh % sub_conv_n);
half *sub_t = ptr_tmp + hx * sub_height * align_origin_w; // sub(hx,:);
auto sub_t =
(int16_t *)(args.split_conv_args[sub_conv_n - hx - 1].output.address);
int hi = (hh / sub_conv_n);
if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
// for (int ww = 0; ww < origin_w; ++ww){
// if((ww < omit_size) )// || (ww >= (origin_w-omit_size))
// continue;
int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
omit_size * channel);
fpga_copy(ptr_deconv + deconv_idx, sub_t + sidx,
sizeof(half) * deconv_row_len);
fpga_copy((int16_t *)(args.output.address) + deconv_idx, sub_t + sidx,
sizeof(int16_t) * deconv_row_len);
deconv_idx += align_deconv_row_len;
//}
}
}
*data_in = ptr_deconv;
fpga_free(ptr_tmp);
fpga_flush(args.output.address,
num * align_deconv_row_len * deconv_h * sizeof(int16_t));
}
int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFPGADeConv===========";
DLOG << " filter_num:" << args.filter_num
<< " group_num:" << args.group_num
<< " group_num:" << args.group_num << "omit_size:" << args.omit_size
<< "sub_output_width: " << args.sub_output_width
<< "sub_output_height: " << args.sub_output_height
<< " sub_conv_num:" << args.sub_conv_num;
DLOG << "args.output.address: " << args.output.address
<< "args.output.scale_address: " << args.output.scale_address;
#endif
int sub_conv_num = args.sub_conv_num;
#ifdef COST_TIME_PRINT
timeval start, end;
long dif_sec, dif_usec;
#endif
for (int i = 0; i < sub_conv_num; i++) {
//#if CPU_SIMULATE
#ifdef COST_TIME_PRINT
gettimeofday(&start, NULL);
#endif
//#else
ComputeBasicConv(args.conv_args[i]);
//#endif
ComputeFpgaConv(args.split_conv_args[i]);
#ifdef COST_TIME_PRINT
gettimeofday(&end, NULL);
dif_sec = end.tv_sec - start.tv_sec;
dif_usec = end.tv_usec - start.tv_usec;
std::cout << "deconv basic_conv: " << i << " times: "
<< " cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
<< std::endl;
#endif
}
if (sub_conv_num > 1) {
float max_scale = -1.0;
float max_scale = -1.0f;
#ifdef COST_TIME_PRINT
gettimeofday(&start, NULL);
#endif
for (int i = 0; i < sub_conv_num; i++) {
float ptr_scale = (args.conv_args[i].output.scale_address)[0];
paddle_mobile::fpga::fpga_invalidate(
args.split_conv_args[i].output.scale_address, 2 * sizeof(float));
float ptr_scale = (args.split_conv_args[i].output.scale_address)[0];
if (ptr_scale > max_scale) {
args.output.scale_address[0] = ptr_scale;
args.output.scale_address[1] =
(args.conv_args[i].output.scale_address)[1];
(args.split_conv_args[i].output.scale_address)[1];
}
}
deconv_post_process((half **)(&(args.output.address)), args.sub_conv_num, 1,
args.filter_num, (args.sub_output_height),
(args.sub_output_width), args.omit_size);
#ifdef COST_TIME_PRINT
gettimeofday(&end, NULL);
dif_sec = end.tv_sec - start.tv_sec;
dif_usec = end.tv_usec - start.tv_usec;
std::cout << "deconv scale "
<< " cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
<< std::endl;
#endif
// fpga_flush(args.output.scale_address, 2 * sizeof(float));
#ifdef COST_TIME_PRINT
gettimeofday(&start, NULL);
#endif
deconv_post_process(args);
#ifdef COST_TIME_PRINT
gettimeofday(&end, NULL);
dif_sec = end.tv_sec - start.tv_sec;
dif_usec = end.tv_usec - start.tv_usec;
std::cout << "deconv_post_process "
<< " cost time: " << (dif_sec * 1000000 + dif_usec) << "us"
<< std::endl;
#endif
}
return 0;
}
} // ComputeFpgaDeconv
int ComputeFPGASplit(const struct SplitArgs &args) {
#ifdef FPGA_PRINT_MODE
......@@ -883,6 +757,7 @@ int ComputeFPGASplit(const struct SplitArgs &args) {
args.scales_out, args.image_num, args.out_channel_nums,
args.height, args.width);
return 0;
}
} // ComputeFPGASplit
} // namespace fpga
} // namespace paddle_mobile
......@@ -153,10 +153,6 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
unsigned int nr = (unsigned int)_nr;
int ret = 0;
DLOG << size;
DLOG << _nr;
DLOG << nr;
uint64_t a_size = FPGA_PAGE_SIZE * nr;
DLOG << a_size;
......@@ -283,7 +279,7 @@ int fpga_memory_add() {
return 0;
}
uint64_t vaddr_to_paddr(void *address) {
uint64_t vaddr_to_paddr_driver(void *address) {
uint64_t paddr = 0;
auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address);
if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
......@@ -315,7 +311,7 @@ void *fpga_reg_free(void *ptr) {
g_fpgainfo.fpga_addr2size_map.erase(iter);
munmap(ptr, size);
} else {
DLOG << "Invalid pointer";
DLOG << "Invalid pointer" << ptr;
}
}
......@@ -347,7 +343,7 @@ void fpga_free_driver(void *ptr) {
g_fpgainfo.fpga_addr2size_map.erase(iter);
munmap(ptr, size);
p_addr = vaddr_to_paddr(ptr);
p_addr = vaddr_to_paddr_driver(ptr);
pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
/*clear bitmap*/
......@@ -361,7 +357,7 @@ void fpga_free_driver(void *ptr) {
g_fpgainfo.fpga_vaddr2paddr_map.erase(iter);
}
} else {
DLOG << "Invalid pointer";
DLOG << "Invalid pointer" << ptr;
}
}
......@@ -373,7 +369,7 @@ int fpga_flush_driver(void *address, size_t size) {
struct MemoryCacheArgs args;
uint64_t p_addr;
p_addr = vaddr_to_paddr(address);
p_addr = vaddr_to_paddr_driver(address);
args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT
args.size = size;
......@@ -385,7 +381,7 @@ int fpga_invalidate_driver(void *address, size_t size) {
struct MemoryCacheArgs args;
uint64_t p_addr;
p_addr = vaddr_to_paddr(address);
p_addr = vaddr_to_paddr_driver(address);
args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT
args.size = size;
......
......@@ -31,8 +31,8 @@ namespace driver {
#define FPGA_REG_PHY_ADDR 0xa0000000
#define FPGA_REG_SIZE 0x1000
#define FPGA_MEM_PHY_ADDR 0x20000000
#define FPGA_MEM_SIZE 0x20000000
#define FPGA_MEM_PHY_ADDR 0x40000000
#define FPGA_MEM_SIZE 0x80000000
#define FPGA_PAGE_SIZE (16UL * 1024UL)
......@@ -122,15 +122,11 @@ void *fpga_malloc_driver(size_t size);
void fpga_free_driver(void *ptr);
void fpga_copy_driver(void *dest, const void *src, size_t num);
int fpga_flush_driver(void *address, size_t size);
int fpga_invalidate_driver(void *address, size_t size);
/*pe*/
uint64_t vaddr_to_paddr(void *address);
uint64_t vaddr_to_paddr_driver(void *address);
int fpga_regpoll(uint64_t reg, uint64_t val, int time);
......
......@@ -59,6 +59,9 @@ int close_device() {
void *fpga_malloc(size_t size) {
static uint64_t counter = 0;
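// Added note (not in the original commit): zero-sized requests are clamped to
// one byte below so the caller always receives a valid, freeable pointer.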
if (size <= 0) {
size = 1;
}
#ifdef PADDLE_MOBILE_ZU5
auto ptr = driver::fpga_malloc_driver(size);
#else
......@@ -115,7 +118,7 @@ int fpga_invalidate(void *address, size_t size) {
}
uint64_t vaddr_to_paddr(void *address) {
#ifdef PADDLE_MOBILE_ZU5
return driver::vaddr_to_paddr(address);
return driver::vaddr_to_paddr_driver(address);
#else
return 0;
#endif
......
......@@ -37,6 +37,18 @@ enum LayoutType {
LAYOUT_HWC = 0,
};
enum ActivationType {
NONE = 0,
LEAKYRELU = 1,
SIGMOID = 2,
TANH = 3,
};
struct ActivationArgs {
enum ActivationType activation_type;
int16_t leaky_relu_negative_slope;
};
struct KernelArgs {
uint32_t width;
uint32_t height;
......@@ -58,7 +70,10 @@ struct ImageOutputArgs {
void* address; // output result address;
float* scale_address; // output scale address;
uint64_t timer_cnt; // time counter for FPGA computation
struct ActivationArgs
activation; // To select activation and specify (Leaky)Relu parameter.
};
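// Hypothetical usage sketch (added for illustration, not part of the original
// commit): a kernel that wants an activation fused into its output could fill
// the new field roughly as below; how the int16_t slope is encoded (e.g. fp16)
// is not specified here and is left to the caller.
inline void set_output_activation(ImageOutputArgs* out, ActivationType type,
                                  int16_t negative_slope) {
  out->activation.activation_type = type;
  out->activation.leaky_relu_negative_slope = negative_slope;  // LEAKYRELU only
}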
#ifdef PADDLE_MOBILE_FPGA_V1
struct ConvDriverParam {
uint64_t image_address_phy;
......@@ -195,10 +210,14 @@ struct DeconvArgs {
uint32_t sub_output_width;
uint32_t sub_output_height;
struct ImageOutputArgs output;
struct ConvArgs* conv_args;
struct SplitConvArgs* split_conv_args;
};
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
// }
static inline uint32_t align_to_x(int64_t num, int64_t x) {
return ((uint32_t)(num + x) - 1) / (uint32_t)x * (uint32_t)x;
}
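// Added note (not in the original commit): align_to_x rounds num up to the next
// multiple of x, e.g. align_to_x(37, 16) == 48 and align_to_x(64, 32) == 64; it
// is used to pad image rows and filter counts to the alignment the FPGA expects.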
int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num);
......
......@@ -28,6 +28,10 @@ extern _PaddleMobile__Framework__Proto__VarType__Type ToDataType(
extern std::type_index ToTypeIndex(
_PaddleMobile__Framework__Proto__VarType__Type type);
inline _PaddleMobile__Framework__Proto__VarType__Type ToDataType(int type) {
return static_cast<_PaddleMobile__Framework__Proto__VarType__Type>(type);
}
template <typename Visitor>
inline void VisitDataType(_PaddleMobile__Framework__Proto__VarType__Type type,
Visitor visitor) {
......
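A small illustration of the added overload, with an illustrative raw type code; VisitDataType is the dispatcher declared above.

```cpp
int raw_type = 5;                       // illustrative value read from a VarDesc
auto data_type = ToDataType(raw_type);  // now a proto VarType enum, ready for VisitDataType
```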
......@@ -28,11 +28,6 @@ limitations under the License. */
#include "framework/tensor.h"
#include "memory/t_malloc.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <queue>
#include "common/threadpool.h"
#endif
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif
......@@ -40,66 +35,67 @@ limitations under the License. */
namespace paddle_mobile {
namespace framework {
using framework::Variable;
using framework::Variable;
#pragma mark - executor
template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
const bool use_optimize, const bool loddable)
: program_(p),
template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program, int batch_size,
const bool use_optimize, const bool lod_mode)
: program_(program),
batch_size_(batch_size),
use_optimize_(use_optimize),
loddable_(loddable) {
lod_mode_(lod_mode) {
DLOG << "executor in lod mode: " << lod_mode_;
Variable *variable_ptr = program_.scope->Var("batch_size");
variable_ptr->SetValue<int>(batch_size);
to_predict_program_ =
program_desc_ =
use_optimize_ ? program_.optimizeProgram : program_.originProgram;
PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr,
"to_predict_program_ == NULL!");
const std::vector<std::shared_ptr<framework::BlockDesc>> &blocks =
to_predict_program_->Blocks();
PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
"program_desc_ should not be nullptr");
const auto &blocks = program_desc_->Blocks();
ops_of_block_.resize(blocks.size());
DLOG << "executor in loaddable mode: " << loddable_;
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<framework::OpDesc> op = ops[j];
DLOG << "create op: " << op->Type();
auto op_base = framework::OpRegistry<Dtype>::CreateOp(
op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
program_.scope);
// infer shape to reshape tensor before predict,
// but for lod tensor, it will still need to reshape in runtime
if (!loddable_) {
op_base->InferShape();
std::shared_ptr<OpDesc> op_desc = ops[j];
DLOG << "create op: " << op_desc->Type();
auto op_handler = OpRegistry<Device>::CreateOp(
op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
op_desc->GetAttrMap(), program_.scope);
// infer shape to reshape inputs and outputs before predict,
// but for lod mode, it still needs to infer shape at runtime
if (!lod_mode) {
op_handler->InferShape();
}
ops_of_block_[*block_desc.get()].push_back(op_base);
ops_of_block_[i].push_back(op_handler);
}
}
if (program_.combined) {
InitCombineMemory();
} else {
InitMemory();
}
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
int i = 0;
auto &ops = ops_of_block_[*to_predict_block.get()];
for (const auto &op : ops) {
DLOG << "Initialize op[" << i++ << "]: " << op->Type();
op->Init();
int count = 0;
for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
for (auto &op_handler : ops_of_block_[block_id]) {
DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
op_handler->Init();
ops_list_.push_back(op_handler);
}
}
}
template <typename Dtype>
static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
template <typename Device>
static void LoadMemInternal(void **data, LoDTensor *tensor,
bool quant_uint8 = false) {
char **data_buf = reinterpret_cast<char **>(data);
int64_t size = tensor->numel();
Dtype *tensor_data = tensor->mutable_data<Dtype>();
Device *tensor_data = tensor->mutable_data<Device>();
if (quant_uint8) {
// should be moved into operator init function
float min_value;
......@@ -114,15 +110,15 @@ static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
}
data_buf += size * sizeof(uint8_t);
} else {
memory::Copy(tensor_data, *data_buf, size * sizeof(Dtype));
*data_buf += size * sizeof(Dtype);
memory::Copy(tensor_data, *data_buf, size * sizeof(Device));
*data_buf += size * sizeof(Device);
}
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(
void **data, const std::shared_ptr<framework::VarDesc> var_desc,
framework::LoDTensor *tensor) {
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
const std::shared_ptr<VarDesc> var_desc,
LoDTensor *tensor) {
char **data_buf = reinterpret_cast<char **>(data);
// version
uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
......@@ -152,18 +148,18 @@ void Executor<Dtype, P>::LoadMemory(
// skip tensor desc
*data_buf += tensor_desc_size;
const framework::TensorDesc &tensor_desc = var_desc->Tensor_desc();
tensor->Resize(framework::make_ddim(tensor_desc.Dims()));
const TensorDesc &tensor_desc = var_desc->Tensor_desc();
tensor->Resize(make_ddim(tensor_desc.Dims()));
// parse tensor from stream
switch (tensor_desc.DataType()) {
case framework::VARTYPE_TYPE_FP32:
case VARTYPE_TYPE_FP32:
LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
program_.quantification);
break;
case framework::VARTYPE_TYPE_INT8:
case VARTYPE_TYPE_INT8:
LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
break;
case framework::VARTYPE_TYPE_INT32:
case VARTYPE_TYPE_INT32:
LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
break;
default:
......@@ -171,12 +167,12 @@ void Executor<Dtype, P>::LoadMemory(
}
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitMemory() {
for (const auto &block : to_predict_program_->Blocks()) {
template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
for (const auto &block : program_desc_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
auto tensor = var->template GetMutable<framework::LoDTensor>();
auto tensor = var->template GetMutable<LoDTensor>();
if (var_desc->Persistable()) {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
......@@ -187,7 +183,7 @@ void Executor<Dtype, P>::InitMemory() {
LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
delete[] origin_data;
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
varInputMemory(var_desc, var, tensor);
}
}
......@@ -195,8 +191,8 @@ void Executor<Dtype, P>::InitMemory() {
}
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitCombineMemory() {
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
char *origin_data = nullptr;
bool self_alloc = false;
if (program_.combined_params_buf && program_.combined_params_len) {
......@@ -208,17 +204,17 @@ void Executor<Dtype, P>::InitCombineMemory() {
}
PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
char *data = origin_data;
for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &block : program_desc_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
auto tensor = var->template GetMutable<framework::LoDTensor>();
auto tensor = var->template GetMutable<LoDTensor>();
if (var_desc->Persistable()) {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
varInputMemory(var_desc, var, tensor);
}
}
......@@ -230,152 +226,132 @@ void Executor<Dtype, P>::InitCombineMemory() {
LOG(kLOG_INFO) << "init combine memory finish";
}
template <typename Dtype, Precision P>
bool Executor<Dtype, P>::varInputMemory(
const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
framework::LoDTensor *tensor) const {
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
const std::shared_ptr<VarDesc> &var_desc, Variable *var,
LoDTensor *tensor) const {
auto type = var_desc->Tensor_desc().DataType();
switch (type) {
case framework::VARTYPE_TYPE_FP32:
case VARTYPE_TYPE_FP32:
tensor->mutable_data<float>();
break;
case framework::VARTYPE_TYPE_INT8:
case VARTYPE_TYPE_INT8:
tensor->mutable_data<int8_t>();
break;
case framework::VARTYPE_TYPE_INT32:
case VARTYPE_TYPE_INT32:
tensor->mutable_data<int32_t>();
break;
case framework::VARTYPE_TYPE_INT64:
case VARTYPE_TYPE_INT64:
tensor->mutable_data<int64_t>();
break;
default:
break;
}
bool is_mute_match = (type == framework::VARTYPE_TYPE_FP32) ||
(type == framework::VARTYPE_TYPE_INT8) ||
(type == framework::VARTYPE_TYPE_INT32) ||
(type == framework::VARTYPE_TYPE_INT64);
bool is_mute_match =
(type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
(type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
return is_mute_match;
}
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
const framework::Tensor &t) {
framework::Variable *g_feed_value = program_.scope->Var("feed");
framework::Tensor *feed_tensor =
g_feed_value->GetMutable<framework::LoDTensor>();
feed_tensor->Resize(t.dims());
feed_tensor->ShareDataWith(t);
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
#ifdef PADDLE_MOBILE_PROFILE
std::vector<ProfInfo> profile(ops.size());
#endif
for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
if (loddable_) {
ops[i]->InferShape();
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
const std::vector<std::pair<std::string, Tensor>> &inputs) {
for (const auto &input : inputs) {
SetInput(input.second, input.first);
}
// to Run
ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
}
auto last_op = ops.rbegin();
auto output_map = (*last_op)->Outputs();
std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
framework::LoDTensor *output_tensor =
framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
*(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
std::unordered_map<std::string, uint64_t> _tp;
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost;
}
printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>;
std::vector<prof_t> _tv(_tp.begin(), _tp.end());
uint64_t _ptotal = 0;
for (auto const &p : _tv) {
_ptotal += p.second;
}
auto compf = [](const prof_t &a, const prof_t &b) {
return a.second > b.second;
};
std::sort(_tv.begin(), _tv.end(), compf);
_tv.push_back(std::make_pair("total", _ptotal));
for (auto const &p : _tv) {
printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
static_cast<float>(p.second),
static_cast<float>(p.second) / _ptotal * 100.0);
return this->Predict();
}
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
for (const auto &input : inputs) {
SetInput(input.second, input.first);
}
printf("====================[---------]======================\n");
#endif
return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
return this->Predict();
}
template <typename Dtype, Precision P>
std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
const framework::LoDTensor &t) {
framework::Variable *g_feed_value = program_.scope->Var("feed");
framework::LoDTensor *feed_tensor =
g_feed_value->GetMutable<framework::LoDTensor>();
feed_tensor->Resize(t.dims());
feed_tensor->ShareDataWith(t);
feed_tensor->set_lod(t.lod());
template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
const std::vector<int64_t> &dims) {
Tensor feed_tensor(input, make_ddim(dims));
SetInput(feed_tensor, "feed");
std::vector<T> output;
if (this->Predict() == PMSuccess) {
const auto output_tensor = GetOutput("fetch");
output.resize(output_tensor->numel());
memcpy(output.data(), output_tensor->template data<T>(),
output.size() * sizeof(T));
}
return output;
}
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
const std::string &var_name) {
auto *target_var = program_.scope->FindVar(var_name);
PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
var_name.c_str());
auto *target_tensor = target_var->template GetMutable<LoDTensor>();
target_tensor->Resize(input.dims());
target_tensor->ShareDataWith(input);
}
auto &ops = ops_of_block_[*to_predict_block.get()];
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
const std::string &var_name) {
auto *target_var = program_.scope->FindVar(var_name);
PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
var_name.c_str());
auto *target_tensor = target_var->template GetMutable<LoDTensor>();
target_tensor->Resize(input.dims());
target_tensor->ShareDataWith(input);
target_tensor->set_lod(input.lod());
}
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
std::vector<ProfInfo> profile(ops.size());
std::vector<ProfInfo> profile(ops_list_.size());
struct timespec ts;
int op_index = 0;
#endif
for (int i = 0; i < ops.size(); i++) {
for (auto &block : ops_of_block_) {
for (auto &op_handler : block) {
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
if (loddable_) {
ops[i]->InferShape();
if (lod_mode_) {
op_handler->InferShape();
}
ops[i]->Run();
op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
++op_index;
#endif
}
auto last_op = ops.rbegin();
auto output_map = (*last_op)->Outputs();
std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
framework::LoDTensor *output_tensor =
framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
*(program_.scope));
}
#ifdef PADDLE_MOBILE_PROFILE
std::unordered_map<std::string, uint64_t> _tp;
for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost;
if (ops_list_[i]->Type() == "conv2d" ||
ops_list_[i]->Type() == "depthwise_conv2d") {
auto inputs = ops_list_[i]->Inputs();
auto *filter =
GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
int kernel_size = filter->dims()[2];
_tp[ops_list_[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
} else {
_tp[ops_list_[i]->Type()] += timeCost;
}
}
printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>;
typedef std::pair<std::string, uint64_t> prof_t;
std::vector<prof_t> _tv(_tp.begin(), _tp.end());
uint64_t _ptotal = 0;
for (auto const &p : _tv) {
......@@ -393,72 +369,51 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
}
printf("====================[---------]======================\n");
#endif
return std::make_shared<framework::LoDTensor>(
framework::LoDTensor(*output_tensor));
return PMSuccess;
}
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
const framework::Tensor &t, int block_id) {
return Predict(t);
}
template <typename Dtype, Precision P>
std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
framework::Tensor tensor(input, framework::make_ddim(dims));
std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
if (output_tensor != nullptr) {
Executor<Dtype, P>::Ptype *output_ptr =
output_tensor->data<typename Executor<Dtype, P>::Ptype>();
std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
for (int j = 0; j < output_tensor->numel(); ++j) {
result_vector.push_back(output_ptr[j]);
}
return result_vector;
} else {
DLOG << "return empty vector";
return {};
}
template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
const std::string &var_name) {
auto *target_var = program_.scope->FindVar(var_name);
PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
var_name.c_str());
auto *output_tensor = target_var->template GetMutable<LoDTensor>();
return std::make_shared<LoDTensor>(*output_tensor);
}
#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
std::string var_name) {
framework::Variable *g_feed_value = program_.scope->Var(var_name);
framework::Tensor *feed_tensor =
g_feed_value->GetMutable<framework::LoDTensor>();
Variable *g_feed_value = program_.scope->Var(var_name);
Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
feed_tensor->Resize(t.dims());
feed_tensor->ShareDataWith(t);
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
InjectVariable(t, "feed");
}
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
auto &ops = ops_of_block_[0];
PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
auto output_map = op->Outputs();
std::vector<std::string> out_keys = op->GetOutKeys();
PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
out_keys[0], output_map, *(program_.scope));
return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
auto *output_tensor =
GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
return std::make_shared<Tensor>(Tensor(*output_tensor));
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From_To(int start, int end) {
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
auto &ops = ops_of_block_[0];
end = end < 0 ? static_cast<int>(ops.size()) : end;
PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
"start or end parameter is wrong");
......@@ -482,25 +437,25 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) {
}
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From(int start) {
template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
Predict_From_To(start);
}
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_To(int end) {
template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
Predict_From_To(0, end);
}
#endif
#ifdef PADDLE_MOBILE_CL
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
float *tensorInput, char **data) {}
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
char **data) {}
template <>
void Executor<GPU_CL, Precision::FP32>::LoadMemory(
const framework::VarDesc var_desc, float *tensorInput, char **data) {
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
float *tensorInput, char **data) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(*data);
......@@ -538,38 +493,13 @@ void Executor<GPU_CL, Precision::FP32>::LoadMemory(
}
(*data) += (sizeof(char) * size);
const framework::TensorDesc &desc = var_desc.Tensor_desc();
const TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
void *memory = nullptr;
// int type_size = 0;
// switch (desc.DataType()) {
// case framework::VARTYPE_TYPE_FP16:
// type_size = 2;
// break;
// case framework::VARTYPE_TYPE_FP32:
// type_size = 4;
// memory = tensor->mutable_data<float>();
// break;
// case framework::VARTYPE_TYPE_FP64:
// type_size = 8;
// break;
// case framework::VARTYPE_TYPE_INT32:
// memory = tensor->mutable_data<int32_t>();
// type_size = 4;
// break;
// case framework::VARTYPE_TYPE_INT64:
// type_size = 8;
// break;
// case framework::VARTYPE_TYPE_BOOL:
// type_size = 1;
// break;
// default:
// break;
// }
int type_size = 4;
memory = tensorInput;
if (program_.quantification) {
......@@ -600,24 +530,24 @@ void Executor<GPU_CL, Precision::FP32>::LoadMemory(
}
template <>
void Executor<GPU_CL, Precision::FP32>::InitMemory() {
for (const auto &block : to_predict_program_->Blocks()) {
void Executor<GPU_CL, float>::InitMemory() {
for (const auto &block : program_desc_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
CLImage *cl_image = nullptr;
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
var->template GetMutable<framework::LoDTensor>();
var->template GetMutable<LoDTensor>();
continue;
} else {
cl_image = var->template GetMutable<framework::CLImage>();
cl_image = var->template GetMutable<CLImage>();
}
char *origin_data =
ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
char *data = origin_data;
cl_context context = program_.scope->GetCLScpoe()->Context();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
const TensorDesc &desc = var_desc->Tensor_desc();
int numel = 1;
for (auto l : desc.Dims()) {
numel *= l;
......@@ -627,7 +557,7 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
paddle_mobile::memory::Alloc(sizeof(float) * numel));
LoadMemory(*var_desc, tensorInput, &data);
framework::DDim ddim = framework::make_ddim(desc.Dims());
DDim ddim = make_ddim(desc.Dims());
// has not init
cl_image->SetTensorData(tensorInput, ddim);
......@@ -635,15 +565,15 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
delete origin_data;
paddle_mobile::memory::Free(tensorInput);
} else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
auto cl_image = var->template GetMutable<framework::CLImage>();
if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
auto cl_image = var->template GetMutable<CLImage>();
cl_context context = program_.scope->GetCLScpoe()->Context();
cl_command_queue command_queue =
program_.scope->GetCLScpoe()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
framework::DDim ddim = cl_image->dims();
const TensorDesc &desc = var_desc->Tensor_desc();
// DDim ddim = make_ddim(desc.Dims());
DDim ddim = cl_image->dims();
DLOG << var_desc->Name();
cl_image->InitEmptyImage(context, command_queue, ddim);
}
......@@ -653,7 +583,7 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
}
template <>
void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
void Executor<GPU_CL, float>::InitCombineMemory() {
char *origin_data = nullptr;
bool self_alloc = false;
if (program_.combined_params_buf && program_.combined_params_len) {
......@@ -667,22 +597,22 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
float *data = reinterpret_cast<float *>(origin_data);
for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &block : program_desc_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
CLImage *cl_image = nullptr;
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
var->template GetMutable<framework::LoDTensor>();
var->template GetMutable<LoDTensor>();
continue;
} else {
cl_image = var->template GetMutable<framework::CLImage>();
cl_image = var->template GetMutable<CLImage>();
}
cl_context context = program_.scope->GetCLScpoe()->Context();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = framework::make_ddim(desc.Dims());
const TensorDesc &desc = var_desc->Tensor_desc();
DDim ddim = make_ddim(desc.Dims());
int numel = 1;
for (int i = 0; i < ddim.size(); i++) {
......@@ -697,13 +627,13 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
paddle_mobile::memory::Free(tensorInput);
} else {
auto cl_image = var->template GetMutable<framework::CLImage>();
auto cl_image = var->template GetMutable<CLImage>();
cl_context context = program_.scope->GetCLScpoe()->Context();
cl_command_queue command_queue =
program_.scope->GetCLScpoe()->CommandQueue();
const framework::TensorDesc &desc = var_desc->Tensor_desc();
framework::DDim ddim = cl_image->dims();
// framework::DDim ddim = framework::make_ddim(desc.Dims());
const TensorDesc &desc = var_desc->Tensor_desc();
DDim ddim = cl_image->dims();
// DDim ddim = make_ddim(desc.Dims());
cl_image->InitEmptyImage(context, command_queue, ddim);
}
}
......@@ -716,13 +646,13 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
#endif
template class Executor<CPU, Precision::FP32>;
template class Executor<CPU, float>;
template class Executor<FPGA, Precision::FP32>;
template class Executor<FPGA, float>;
template class Executor<GPU_CL, Precision::FP32>;
template class Executor<GPU_CL, float>;
template class Executor<GPU_MALI, Precision::FP32>;
template class Executor<GPU_MALI, float>;
} // namespace framework
} // namespace paddle_mobile
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "common/types.h"
#include "common/util.h"
......@@ -28,41 +29,29 @@ limitations under the License. */
namespace paddle_mobile {
namespace framework {
template <typename Dtype = CPU, Precision P = Precision::FP32>
template <typename Device, typename T = float>
class Executor {
public:
typedef typename PrecisionTrait<P>::ptype Ptype;
// exector constructor
// @param program program converted from proto program in PaddlePaddle
// @param use_optimize bool whether use operator fusion to speed up or not
// @param loddable bool
Executor(const framework::Program<Dtype> program, int batch_size = 1,
const bool use_optimize = true, const bool loddable = false);
// predict with tensor input
// @param t input tensor to do prediction
// @return predicted tensor
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
// predict with lod tensor input
// @param t input lod tensor to do prediction
// @return predicted lod tensor
std::shared_ptr<framework::LoDTensor> PredictLod(
const framework::LoDTensor &t);
// predict with vector input and dims
// @param input vector whose elements will be formed
// @param input lod tensor to do prediction
// @param dims vector whose elements will be formed
// @param input tensor shape
// @return vector which is flatted from predicted tensor
std::vector<Ptype> Predict(const std::vector<Ptype> &input,
Executor(const Program<Device> &program, int batch_size = 1,
const bool use_optimize = true, const bool lod_mode = false);
PMStatus Predict(const std::vector<std::pair<std::string, Tensor>> &inputs);
PMStatus Predict(
const std::vector<std::pair<std::string, LoDTensor>> &inputs);
std::vector<T> Predict(const std::vector<T> &input,
const std::vector<int64_t> &dims);
PMStatus Predict();
void SetInput(const Tensor &input, const std::string &var_name);
void SetInput(const LoDTensor &input, const std::string &var_name);
std::shared_ptr<LoDTensor> GetOutput(const std::string &var_name);
#ifdef PADDLE_MOBILE_FPGA
void InjectVariable(const framework::Tensor &t, std::string var_name);
void FeedData(const framework::Tensor &t);
std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
void InjectVariable(const Tensor &t, std::string var_name);
void FeedData(const Tensor &t);
std::shared_ptr<Tensor> FetchResult(int id = -1);
void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start);
void Predict_To(int end);
......@@ -70,26 +59,28 @@ class Executor {
protected:
Executor() = default;
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t,
int block_id);
bool varInputMemory(const std::shared_ptr<framework::VarDesc> &var_desc,
framework::Variable *var,
framework::LoDTensor *tensor) const;
bool varInputMemory(const std::shared_ptr<VarDesc> &var_desc, Variable *var,
LoDTensor *tensor) const;
void InitMemory();
void InitCombineMemory();
void LoadMemory(void **data,
const std::shared_ptr<framework::VarDesc> var_desc,
framework::LoDTensor *tensor);
void LoadMemory(void **data, const std::shared_ptr<VarDesc> var_desc,
LoDTensor *tensor);
#ifdef PADDLE_MOBILE_CL
void LoadMemory(const framework::VarDesc var_desc, float *tensorInput,
char **data);
void LoadMemory(const VarDesc var_desc, float *tensorInput, char **data);
#endif
framework::Program<Dtype> program_;
int batch_size_ = 1;
std::shared_ptr<framework::ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>>
ops_of_block_;
int batch_size_;
bool use_optimize_;
bool lod_mode_;
Program<Device> program_;
std::shared_ptr<ProgramDesc> program_desc_;
typedef std::shared_ptr<OperatorBase<Device>> OperatorBasePtr;
std::vector<std::vector<OperatorBasePtr>> ops_of_block_;
// operators list
std::vector<OperatorBasePtr> ops_list_;
#ifdef PADDLE_MOBILE_PROFILE
struct ProfInfo {
int tid = 0;
......@@ -97,8 +88,6 @@ class Executor {
uint64_t runEnd = 0UL;
};
#endif
bool use_optimize_ = false;
bool loddable_ = false;
};
} // namespace framework
......
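For reference, a minimal sketch of driving the reworked Executor directly (it is normally wrapped by PaddleMobile); program is assumed to come from Loader<CPU, float>::Load and input to be an already filled Tensor.

```cpp
paddle_mobile::framework::Executor<paddle_mobile::CPU, float> executor(
    program, /*batch_size=*/1, /*use_optimize=*/true, /*lod_mode=*/false);
executor.SetInput(input, "feed");            // Tensor or LoDTensor overload
executor.Predict();                          // runs every op of every block
auto output = executor.GetOutput("fetch");   // std::shared_ptr<LoDTensor>
```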
......@@ -191,6 +191,7 @@ LOAD_OP2(mul, CPU, MALI_GPU);
#endif
#ifdef RELU_OP
LOAD_OP2(relu, CPU, MALI_GPU);
LOAD_OP1(relu6, CPU);
#endif
#ifdef IM2SEQUENCE_OP
LOAD_OP1(im2sequence, CPU);
......@@ -227,12 +228,22 @@ LOAD_FUSION_MATCHER(fusion_conv_bn);
#ifdef ELEMENTWISESUB_OP
LOAD_OP1(elementwise_sub, CPU)
#endif
#ifdef TOP_K_OP
LOAD_OP1(top_k, CPU)
#endif
#ifdef CAST_OP
LOAD_OP1(cast, CPU)
#endif
#ifdef QUANT_OP
LOAD_OP1(quantize, CPU);
#endif
#ifdef DEQUANT_OP
LOAD_OP1(dequantize, CPU);
#endif
#ifdef FUSION_DEQUANT_BN_OP
LOAD_OP1(fusion_dequant_bn, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_bn);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_OP
LOAD_OP1(fusion_dequant_add_bn, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn);
......@@ -245,3 +256,11 @@ LOAD_FUSION_MATCHER(fusion_dequant_bn_relu);
LOAD_OP1(fusion_dequant_add_bn_relu, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
LOAD_OP1(fusion_dequant_add_bn_quant, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn_quant);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
LOAD_OP1(fusion_dequant_add_bn_relu_quant, CPU);
LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu_quant);
#endif
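A hypothetical addition following the same pattern, for an operator that only ships a CPU kernel; both the MY_OP switch and the op name are placeholders, not part of this patch.

```cpp
#ifdef MY_OP
LOAD_OP1(my_op, CPU);
#endif
```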
......@@ -23,14 +23,8 @@ limitations under the License. */
namespace paddle_mobile {
namespace framework {
/**
* muteandresize tensor as originProgramDesc and scope in loadParams
*
* @param originProgramDesc
* @param scope
*/
template <typename Dtype, Precision P>
void Loader<Dtype, P>::InitMemoryFromProgram(
template <typename Device, typename T>
void Loader<Device, T>::InitMemoryFromProgram(
const std::shared_ptr<ProgramDesc> &originProgramDesc,
const std::shared_ptr<Scope> &scope) {
for (const auto &block : originProgramDesc.get()->Blocks()) {
......@@ -43,8 +37,6 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
tensor->Resize(make_ddim(dim));
} else {
auto dim = var_desc->Tensor_desc().Dims();
// PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
// dim[0] = 1;
if (dim.size() == 0) {
auto tensor = var->GetMutable<LoDTensor>();
framework::DDim dDim = {0};
......@@ -60,7 +52,7 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
}
}
} else {
// TODO(codeWorm): some.
// TODO(codeWorm)
}
}
}
......@@ -68,7 +60,7 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
#ifdef PADDLE_MOBILE_CL
template <>
void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
void Loader<GPU_CL, float>::InitMemoryFromProgram(
const std::shared_ptr<ProgramDesc> &originProgramDesc,
const std::shared_ptr<Scope> &scope) {
for (const auto &block : originProgramDesc.get()->Blocks()) {
......@@ -77,7 +69,6 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
if (var_desc->Persistable()) {
auto dim = var_desc->Tensor_desc().Dims();
// auto tensor = var->GetMutable<LoDTensor>();
auto cl_image = var->GetMutable<framework::CLImage>();
cl_image->Resize(make_ddim(dim));
} else {
......@@ -88,14 +79,13 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
cl_image->Resize(make_ddim(dim));
}
} else {
// TODO(codeWorm): some.
// TODO(codeWorm)
}
}
}
}
template <>
const Program<GPU_CL, Precision::FP32>
Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
const Program<GPU_CL, float> Loader<GPU_CL, float>::LoadCombinedMemory(
size_t read_size, const uint8_t *buf, size_t combined_params_len,
uint8_t *combined_params_buf, bool optimize, bool quantification) {
bool can_add_split = false;
......@@ -113,7 +103,7 @@ Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
Program<GPU_CL, Precision::FP32> program;
Program<GPU_CL, float> program;
program.combined = true;
program.originProgram = originProgramDesc;
program.quantification = quantification;
......@@ -145,16 +135,16 @@ Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
/**
* fusion and print some info
* @tparam Dtype
* @tparam Device
* @tparam P
* @param optimize
* @param can_add_split
* @param program
* @param originProgramDesc
*/
template <typename Dtype, Precision P>
template <typename Device, typename T>
void FusionAndPrintInfos(
bool optimize, bool can_add_split, Program<Dtype, P> *program,
bool optimize, bool can_add_split, Program<Device, T> *program,
const std::shared_ptr<ProgramDesc> &originProgramDesc) {
if (optimize) {
ProgramOptimize program_optimize;
......@@ -193,8 +183,8 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
return cur_len;
}
template <typename Dtype, Precision P>
const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &dirname,
template <typename Device, typename T>
const Program<Device, T> Loader<Device, T>::Load(const std::string &dirname,
bool optimize,
bool quantification,
bool can_add_split) {
......@@ -204,8 +194,8 @@ const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &dirname,
return program;
}
template <typename Dtype, Precision P>
const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &model_path,
template <typename Device, typename T>
const Program<Device, T> Loader<Device, T>::Load(const std::string &model_path,
const std::string &para_path,
bool optimize,
bool quantification) {
......@@ -217,8 +207,8 @@ const Program<Dtype, P> Loader<Dtype, P>::Load(const std::string &model_path,
return program;
}
template <typename Dtype, Precision P>
const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
template <typename Device, typename T>
const Program<Device, T> Loader<Device, T>::LoadProgram(
const std::string &model_path, bool optimize, bool quantification,
bool can_add_split) {
std::string model_filename = model_path;
......@@ -237,7 +227,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
//
auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
Program<Dtype, P> program;
Program<Device, T> program;
program.originProgram = originProgramDesc;
program.quantification = quantification;
program.combined_params_len = 0;
......@@ -254,8 +244,8 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
return program;
}
template <typename Dtype, Precision P>
const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
template <typename Device, typename T>
const Program<Device, T> Loader<Device, T>::LoadCombinedMemory(
size_t read_size, const uint8_t *buf, size_t combined_params_len,
uint8_t *combined_params_buf, bool optimize, bool quantification) {
bool can_add_split = false;
......@@ -273,7 +263,7 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
Program<Dtype, P> program;
Program<Device, T> program;
program.combined = true;
program.originProgram = originProgramDesc;
program.quantification = quantification;
......@@ -289,13 +279,13 @@ const Program<Dtype, P> Loader<Dtype, P>::LoadCombinedMemory(
return program;
}
template class Loader<CPU, Precision::FP32>;
template class Loader<CPU, float>;
template class Loader<FPGA, Precision::FP32>;
template class Loader<FPGA, float>;
template class Loader<GPU_MALI, Precision::FP32>;
template class Loader<GPU_MALI, float>;
template class Loader<GPU_CL, Precision::FP32>;
template class Loader<GPU_CL, float>;
} // namespace framework
} // namespace paddle_mobile
......@@ -22,28 +22,28 @@ limitations under the License. */
namespace paddle_mobile {
namespace framework {
template <typename Dtype = CPU, Precision P = Precision::FP32>
template <typename Device = CPU, typename T = float>
class Loader {
public:
/*
* @b load separate format fluid model
* @b 加载分开形式的 fluid 模型
* @b 加载分开存储的fluid模型
* */
const Program<Dtype, P> Load(const std::string &dirname,
const Program<Device, T> Load(const std::string &dirname,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
/*
* @b load combine format fluid mode
* @b 加载结合在一起格式的模型
* @b 加载统一存储的fluid模型
* */
const Program<Dtype, P> Load(const std::string &model_path,
const Program<Device, T> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false,
bool quantification = false);
const Program<Dtype, P> LoadCombinedMemory(size_t model_len,
const Program<Device, T> LoadCombinedMemory(size_t model_len,
const uint8_t *model_buf,
size_t combined_params_len,
uint8_t *combined_params_buf,
......@@ -51,7 +51,7 @@ class Loader {
bool quantification = false);
private:
const Program<Dtype, P> LoadProgram(const std::string &model_path,
const Program<Device, T> LoadProgram(const std::string &model_path,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
......
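A minimal sketch of the two loading paths declared above, assuming the model files exist on disk (paths are illustrative).

```cpp
paddle_mobile::framework::Loader<paddle_mobile::CPU, float> loader;
// Separate-file format: a directory typically holding __model__ plus one file per parameter.
auto separate = loader.Load("mobilenet_dir", /*optimize=*/true);
// Combined format: a single model file plus a single parameter file.
auto combined = loader.Load("model", "params", /*optimize=*/true);
```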
......@@ -16,12 +16,12 @@ limitations under the License. */
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "tensor.h"
#include "tensor_util.h"
#include "framework/tensor.h"
#include "framework/tensor_util.h"
namespace paddle_mobile {
namespace framework {
/*
......@@ -202,5 +202,29 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor);
void DeserializeFromStream(std::istream &is, LoDTensor *tensor);
#ifdef PADDLE_MOBILE_DEBUG
inline Print &operator<<(Print &printer, const LoDTensor &tensor) {
printer << " dims: " << tensor.dims() << "\n";
int stride = tensor.numel() / 20;
stride = stride > 0 ? stride : 1;
#ifndef PADDLE_MOBILE_FPGA
for (int i = 0; i < tensor.numel(); i += stride) {
if (tensor.type() == typeid(float)) {
printer << tensor.data<float>()[i] << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
} else if (tensor.type() == typeid(int64_t)) {
printer << tensor.data<int64_t>()[i] << " ";
} else if (tensor.type() == typeid(int8_t)) {
printer << static_cast<int>(tensor.data<int8_t>()[i]) << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
}
}
#endif // PADDLE_MOBILE_FPGA
return printer;
}
#endif // PADDLE_MOBILE_DEBUG
} // namespace framework
} // namespace paddle_mobile
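A minimal usage sketch of the new debug printer, assuming a debug build where DLOG expands to a Print sink.

```cpp
paddle_mobile::framework::LoDTensor t;
t.Resize(paddle_mobile::framework::make_ddim({2, 3}));
t.mutable_data<float>();
DLOG << t;  // prints the dims followed by up to ~20 sampled values
```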
......@@ -98,24 +98,6 @@ class OpRegistry {
}
};
#define REGISTER_OPERATOR_INT8(op_type, op_class, device_name, device_type) \
template class op_class<device_type, int8_t>; \
template <typename Dtype, typename T> \
class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> { \
public: \
DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class); \
}; \
static paddle_mobile::framework::OperatorRegistrar< \
device_type, _OpClass_##op_type##_##device_name<device_type, int8_t>> \
__op_registrar_##op_type##_##device_name(#op_type); \
int TouchOpRegistrar_##op_type##_##device_name() { \
__op_registrar_##op_type##_##device_name.Touch(); \
return 0; \
}
#define REGISTER_OPERATOR_CPU_INT8(op_type, op_class) \
REGISTER_OPERATOR_INT8(op_type, op_class, cpu, paddle_mobile::CPU);
#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \
template class op_class<device_type, float>; \
template <typename Dtype, typename T> \
......
......@@ -220,7 +220,16 @@ void Node::Folder(
}
} else {
for (auto &op_output : this->op_desc_->outputs_) {
op_desc->outputs_.emplace(op_output.first, op_output.second);
auto output_key = op_output.first;
if (change->find(this->type_) != change->end()) {
const auto change_pairs = (*change)[this->type_];
for (const auto &target : change_pairs) {
if (target.first == output_key) {
output_key = target.second;
}
}
}
op_desc->outputs_.emplace(output_key, op_output.second);
}
for (auto &output : this->outputs_) {
......
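The change map lets a fusion matcher rename output keys while a node is folded. A hypothetical entry could look like the following; the exact typedef lives in the fusion code, and both the op type and the key names are placeholders.

```cpp
// Assumed shape of the map consumed above: op type -> list of (old key, new key).
std::unordered_map<std::string,
                   std::vector<std::pair<std::string, std::string>>> change;
change["fusion_fc"].emplace_back("Y", "Out");  // rename output key "Y" to "Out"
```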
......@@ -14,16 +14,15 @@ limitations under the License. */
#pragma once
#include <string>
#include "common/types.h"
#include "framework/program/program_desc.h"
#include "framework/scope.h"
#include <string>
namespace paddle_mobile {
namespace framework {
template <typename Dtype, Precision P = Precision::FP32>
template <typename Device, typename T = float>
class Program {
public:
std::shared_ptr<ProgramDesc> originProgram;
......
......@@ -26,6 +26,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace framework {
class Scope {
public:
Scope() = default;
......
......@@ -148,8 +148,8 @@ class Tensor : public TensorBase {
PADDLE_MOBILE_ENFORCE(
(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code()),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
"Tensor holds the wrong type, it holds %s, requested %s",
this->holder_->type().name(), typeid(T).name());
return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
......@@ -162,7 +162,7 @@ class Tensor : public TensorBase {
PADDLE_MOBILE_ENFORCE(
(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code()),
"Tensor holds the wrong type, it holds %s ,requested:%s",
"Tensor holds the wrong type, it holds %s, requested %s",
this->holder_->type().name(), typeid(T).name());
return reinterpret_cast<const T *>(
......@@ -226,7 +226,6 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
}
}
#endif
return printer;
}
......
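Illustrative only: with the extended message, requesting the wrong element type now reports both the held and the requested type names.

```cpp
paddle_mobile::framework::Tensor t;
t.Resize(paddle_mobile::framework::make_ddim({4}));
t.mutable_data<float>();  // the tensor now holds float
// t.data<int8_t>();      // would trip the enforce above, naming both types
```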
......@@ -18,17 +18,17 @@
namespace paddle_mobile {
template <typename Dtype, Precision P>
PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
template <typename Device, typename T>
PaddleMobilePredictor<Device, T>::PaddleMobilePredictor(
const PaddleMobileConfig &config) {
PADDLE_MOBILE_ENFORCE(Init(config) == true,
"paddle mobile predictor init failed!");
config_ = config;
}
template <typename Dtype, Precision P>
bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
template <typename Device, typename T>
bool PaddleMobilePredictor<Device, T>::Init(const PaddleMobileConfig &config) {
paddle_mobile_.reset(new PaddleMobile<Device, T>());
#ifdef PADDLE_MOBILE_CL
paddle_mobile_->SetCLPath(config.cl_path);
#endif
......@@ -52,8 +52,8 @@ bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
paddle_mobile_->SetThreadNum(config.thread_num);
return true;
}
template <typename Dtype, Precision P>
bool PaddleMobilePredictor<Dtype, P>::Run(
template <typename Device, typename T>
bool PaddleMobilePredictor<Device, T>::Run(
const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data, int batch_size) {
if (inputs.empty()) {
......@@ -78,12 +78,12 @@ bool PaddleMobilePredictor<Dtype, P>::Run(
framework::Tensor input_tensor;
input_tensor.Resize(ddim);
int input_length = framework::product(ddim);
typedef typename PrecisionTrait<P>::ptype PType;
auto input_ptr = input_tensor.mutable_data<PType>();
auto input_ptr = input_tensor.mutable_data<T>();
memcpy(input_ptr, static_cast<PType *>(input.data.data()),
input_length * sizeof(PType));
auto output_tensor = paddle_mobile_->Predict(input_tensor);
memcpy(input_ptr, static_cast<T *>(input.data.data()),
input_length * sizeof(T));
paddle_mobile_->Predict(input_tensor);
auto output_tensor = paddle_mobile_->Fetch();
if (output_data->empty()) {
LOG(kLOG_ERROR) << "At least one output should be set with tensors' names.";
......@@ -99,18 +99,18 @@ bool PaddleMobilePredictor<Dtype, P>::Run(
output.shape.push_back(static_cast<int>(d));
}
if (output.data.length() < output_length * sizeof(PType)) {
output.data.Resize(output_length * sizeof(PType));
if (output.data.length() < output_length * sizeof(T)) {
output.data.Resize(output_length * sizeof(T));
}
memcpy(output.data.data(), output_tensor->template data<PType>(),
output_length * sizeof(PType));
memcpy(output.data.data(), output_tensor->template data<T>(),
output_length * sizeof(T));
return true;
}
template <typename Dtype, Precision P>
PaddleMobilePredictor<Dtype, P>::~PaddleMobilePredictor() {
template <typename Device, typename T>
PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() {
paddle_mobile_->Clear();
}
......@@ -122,13 +122,13 @@ CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>(
std::unique_ptr<PaddlePredictor> x;
if (config.precision == PaddleMobileConfig::FP32) {
if (config.device == PaddleMobileConfig::kCPU) {
x.reset(new PaddleMobilePredictor<CPU, Precision::FP32>(config));
x.reset(new PaddleMobilePredictor<CPU, float>(config));
} else if (config.device == PaddleMobileConfig::kFPGA) {
x.reset(new PaddleMobilePredictor<FPGA, Precision::FP32>(config));
x.reset(new PaddleMobilePredictor<FPGA, float>(config));
} else if (config.device == PaddleMobileConfig::kGPU_MALI) {
x.reset(new PaddleMobilePredictor<GPU_MALI, Precision::FP32>(config));
x.reset(new PaddleMobilePredictor<GPU_MALI, float>(config));
} else if (config.device == PaddleMobileConfig::kGPU_CL) {
x.reset(new PaddleMobilePredictor<GPU_CL, Precision::FP32>(config));
x.reset(new PaddleMobilePredictor<GPU_CL, float>(config));
} else {
LOG(kLOG_ERROR) << "unsupport device type!";
return nullptr;
......
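A minimal sketch of creating a CPU predictor through the public API; only fields that appear in this file are set, the model-location fields of PaddleMobileConfig are omitted, and error handling is skipped.

```cpp
PaddleMobileConfig config;
config.precision = PaddleMobileConfig::FP32;
config.device = PaddleMobileConfig::kCPU;
config.thread_num = 1;
// ... model path / memory fields of PaddleMobileConfig set here ...
auto predictor =
    CreatePaddlePredictor<PaddleMobileConfig,
                          PaddleEngineKind::kPaddleMobile>(config);
// predictor->Run(inputs, &outputs);  // inputs/outputs are PaddleTensor vectors
```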
......@@ -29,7 +29,7 @@ limitations under the License. */
namespace paddle_mobile {
template <typename Dtype = CPU, Precision P = Precision::FP32>
template <typename Device = CPU, typename T = float>
class PaddleMobilePredictor : public PaddlePredictor {
public:
PaddleMobilePredictor() = delete;
......@@ -43,7 +43,7 @@ class PaddleMobilePredictor : public PaddlePredictor {
~PaddleMobilePredictor() override;
private:
std::unique_ptr<PaddleMobile<Dtype, P>> paddle_mobile_;
std::unique_ptr<PaddleMobile<Device, T>> paddle_mobile_;
bool Init(const PaddleMobileConfig& config);
PaddleMobileConfig config_;
......
......@@ -59,7 +59,7 @@
@interface PaddleMobileCPU()
{
paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32> *pam_;
paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> *pam_;
BOOL loaded_;
}
......@@ -73,7 +73,7 @@ static std::mutex shared_mutex;
- (instancetype)initWithConfig:(PaddleMobileCPUConfig *)config {
if (self = [super init]) {
pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, paddle_mobile::Precision::FP32>();
pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>();
_config = config;
}
return self;
......@@ -82,6 +82,7 @@ static std::mutex shared_mutex;
-(instancetype)init {
if (self = [super init]) {
_config = [[PaddleMobileCPUConfig alloc] init];
pam_ = new paddle_mobile::PaddleMobile<paddle_mobile::CPU, float>();
}
return self;
}
......@@ -246,7 +247,8 @@ static std::mutex shared_mutex;
memcpy(input_ptr, input,
numel * sizeof(float));
std::shared_ptr<paddle_mobile::framework::Tensor> output = pam_->Predict(input_tensor);
pam_->Predict(input_tensor);
std::shared_ptr<paddle_mobile::framework::Tensor> output = pam_->Fetch();
float *output_pointer = new float[output->numel()];
......
......@@ -16,21 +16,23 @@ limitations under the License. */
#include "paddle_mobile_jni.h"
#include <cmath>
#include <string>
#include <vector>
#include "common/log.h"
#include "framework/tensor.h"
#include "io/paddle_mobile.h"
#ifdef ENABLE_EXCEPTION
#include "common/enforce.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
namespace paddle_mobile {
namespace jni {
using framework::DDim;
using framework::Program;
using framework::Tensor;
......@@ -200,7 +202,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
for (int i = 0; i < length; i++) {
input_ptr[i] = dataPointer[i];
}
auto output = getPaddleMobileInstance()->Predict(input);
getPaddleMobileInstance()->Predict(input);
auto output = getPaddleMobileInstance()->Fetch();
count = output->numel();
result = env->NewFloatArray(count);
env->SetFloatArrayRegion(result, 0, count, output->data<float>());
......@@ -233,7 +236,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
for (int i = 0; i < length; i++) {
input_ptr[i] = dataPointer[i];
}
auto output = getPaddleMobileInstance()->Predict(input);
getPaddleMobileInstance()->Predict(input);
auto output = getPaddleMobileInstance()->Fetch();
count = output->numel();
result = env->NewFloatArray(count);
env->SetFloatArrayRegion(result, 0, count, output->data<float>());
......@@ -328,7 +332,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
for (int i = 0; i < length; i++) {
input_ptr[i] = matrix[i];
}
auto output = getPaddleMobileInstance()->Predict(input);
getPaddleMobileInstance()->Predict(input);
auto output = getPaddleMobileInstance()->Fetch();
count = output->numel();
result = env->NewFloatArray(count);
env->SetFloatArrayRegion(result, 0, count, output->data<float>());
......@@ -363,7 +368,8 @@ JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictYuv(
for (int i = 0; i < length; i++) {
input_ptr[i] = matrix[i];
}
auto output = getPaddleMobileInstance()->Predict(input);
getPaddleMobileInstance()->Predict(input);
auto output = getPaddleMobileInstance()->Fetch();
count = output->numel();
result = env->NewFloatArray(count);
env->SetFloatArrayRegion(result, 0, count, output->data<float>());
......@@ -399,7 +405,8 @@ Java_com_baidu_paddle_PML_predictLod(JNIEnv *env, jclass thiz, jlongArray buf) {
auto *pdata = words.mutable_data<int64_t>();
size_t n = words.numel() * sizeof(int64_t);
memcpy(pdata, ids.data(), n);
auto vec_result = paddle_mobile.PredictLod(words);
paddle_mobile.Predict(words);
auto vec_result = paddle_mobile.Fetch();
int count = vec_result->numel();
jlongArray result = NULL;
ANDROIDLOGE("predict nlp size %d", count);
......
......@@ -13,66 +13,68 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "io/paddle_mobile.h"
#include <utility>
#include "common/common.h"
#ifdef PADDLE_MOBILE_CL
#include <CL/cl.h>
#include "framework/cl/cl_tensor.h"
#endif
#include "common/common.h"
#include "operators/math/gemm.h"
namespace paddle_mobile {
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
template <typename Device, typename T>
void PaddleMobile<Device, T>::SetThreadNum(int num) {
#ifdef _OPENMP
omp_set_num_threads(num);
#endif
}
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
bool quantification, int batch_size,
bool loddable) {
template <typename Device, typename T>
PMStatus PaddleMobile<Device, T>::Load(const std::string &dirname,
bool optimize, bool quantification,
int batch_size, bool loddable) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<framework::Loader<Dtype, P>>();
loader_ = std::make_shared<framework::Loader<Device, T>>();
} else {
LOG(kLOG_INFO) << "loader inited";
}
if (executor_.get() == nullptr) {
executor_ = std::make_shared<framework::Executor<Dtype, P>>(
executor_ = std::make_shared<framework::Executor<Device, T>>(
loader_->Load(dirname, optimize, quantification), batch_size, optimize,
loddable);
} else {
LOG(kLOG_INFO) << "executor inited";
}
return true;
return PMSuccess;
}
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
const std::string &para_path, bool optimize,
bool quantification, int batch_size,
bool loddable) {
template <typename Device, typename T>
PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
const std::string &para_path,
bool optimize, bool quantification,
int batch_size, bool loddable) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<framework::Loader<Dtype, P>>();
loader_ = std::make_shared<framework::Loader<Device, T>>();
} else {
LOG(kLOG_INFO) << "loader inited";
}
if (executor_.get() == nullptr) {
executor_ = std::make_shared<framework::Executor<Dtype, P>>(
executor_ = std::make_shared<framework::Executor<Device, T>>(
loader_->Load(model_path, para_path, optimize, quantification),
batch_size, optimize, loddable);
} else {
LOG(kLOG_INFO) << "executor inited";
}
return true;
return PMSuccess;
}
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
template <typename Dtype, typename T>
bool PaddleMobile<Dtype, T>::LoadCombinedMemory(size_t model_len,
const uint8_t *model_buf,
size_t combined_params_len,
uint8_t *combined_params_buf,
......@@ -80,13 +82,12 @@ bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
bool quantification, int batch_size,
bool loddable) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<framework::Loader<Dtype, P>>();
loader_ = std::make_shared<framework::Loader<Device, T>>();
} else {
LOG(kLOG_INFO) << "loader inited";
}
if (executor_.get() == nullptr) {
executor_ = std::make_shared<framework::Executor<Dtype, P>>(
executor_ = std::make_shared<framework::Executor<Device, T>>(
loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
combined_params_buf, optimize,
quantification),
......@@ -95,38 +96,76 @@ bool PaddleMobile<Dtype, P>::LoadCombinedMemory(size_t model_len,
LOG(kLOG_INFO) << "executor inited";
}
return true;
return PMSuccess;
}
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::Predict(
const framework::Tensor &t) {
return executor_->Predict(t);
template <typename Device, typename T>
PMStatus PaddleMobile<Device, T>::Predict(const framework::Tensor &input) {
std::vector<std::pair<std::string, framework::Tensor>> inputs;
inputs.push_back(std::make_pair("feed", input));
return this->Predict(inputs);
}
template <typename Device, typename T>
PMStatus PaddleMobile<Device, T>::Predict(const framework::LoDTensor &input) {
std::vector<std::pair<std::string, framework::LoDTensor>> inputs;
inputs.push_back(std::make_pair("feed", input));
return this->Predict(inputs);
}
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::PredictLod(
const framework::LoDTensor &t) {
return executor_->PredictLod(t);
template <typename Device, typename T>
PMStatus PaddleMobile<Device, T>::Predict(
const std::vector<std::pair<std::string, framework::Tensor>> &inputs) {
return executor_->Predict(inputs);
}
template <typename Dtype, Precision P>
std::vector<typename PaddleMobile<Dtype, P>::Ptype>
PaddleMobile<Dtype, P>::Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims) {
template <typename Device, typename T>
PMStatus PaddleMobile<Device, T>::Predict(
const std::vector<std::pair<std::string, framework::LoDTensor>> &inputs) {
return executor_->Predict(inputs);
}
template <typename Device, typename T>
std::vector<T> PaddleMobile<Device, T>::Predict(
const std::vector<T> &input, const std::vector<int64_t> &dims) {
return executor_->Predict(input, dims);
}
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Clear() {
template <typename Device, typename T>
PMStatus PaddleMobile<Device, T>::Predict() {
return executor_->Predict();
}
template <typename Device, typename T>
void PaddleMobile<Device, T>::Feed(const framework::Tensor &input,
const std::string &var_name) {
executor_->SetInput(input, var_name);
}
template <typename Device, typename T>
void PaddleMobile<Device, T>::Feed(const framework::LoDTensor &input,
const std::string &var_name) {
executor_->SetInput(input, var_name);
}
typedef std::shared_ptr<framework::LoDTensor> LoDTensorPtr;
template <typename Device, typename T>
LoDTensorPtr PaddleMobile<Device, T>::Fetch(const std::string &var_name) {
return executor_->GetOutput(var_name);
}
template <typename Device, typename T>
void PaddleMobile<Device, T>::Clear() {
executor_ = nullptr;
loader_ = nullptr;
}
template <typename Dtype, Precision P>
double PaddleMobile<Dtype, P>::GetPredictTime() {}
template <typename Device, typename T>
double PaddleMobile<Device, T>::GetPredictTime() {}
#ifdef PADDLE_MOBILE_CPU
template <>
double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
double PaddleMobile<CPU, float>::GetPredictTime() {
int m = 32;
int n = 224 * 224;
int k = 27;
......@@ -142,14 +181,13 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
int t1 = 1;
int t2 = 1;
for (int i = 0; i < m * k; ++i) {
unsigned int seed = 100;
a[i] = t1 + rand_r(&seed) % t2;
a[i] = t1 + rand() % t2; // NOLINT
}
for (int i = 0; i < k * n; ++i) {
unsigned int seed = 200;
b[i] = t1 + rand_r(&seed) % t2;
b[i] = t1 + rand() % t2; // NOLINT
}
paddle_mobile::operators::math::Gemm gemm;
operators::math::Gemm gemm;
auto time1 = paddle_mobile::time();
gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
static_cast<float>(0), c, ldc, false,
......@@ -163,57 +201,51 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
}
#endif
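The CPU specialization above estimates prediction latency by timing one convolution-sized Sgemm (m = 32, n = 224*224, k = 27). A generic, self-contained sketch of the same micro-benchmark pattern, using std::chrono and a naive triple loop in place of gemm.Sgemm (illustrative only, not the library's own timer):

#include <chrono>
#include <cstdio>
#include <vector>

// Time one small GEMM-sized workload and report milliseconds. The matrix
// sizes mirror the 32 x (224*224) x 27 convolution-as-GEMM used above; the
// triple loop stands in for gemm.Sgemm, which is not reproduced here.
int main() {
  const int m = 32, n = 224 * 224, k = 27;
  std::vector<float> a(m * k, 1.0f), b(k * n, 1.0f), c(m * n, 0.0f);
  auto t0 = std::chrono::steady_clock::now();
  for (int i = 0; i < m; ++i)
    for (int p = 0; p < k; ++p)
      for (int j = 0; j < n; ++j) c[i * n + j] += a[i * k + p] * b[p * n + j];
  auto t1 = std::chrono::steady_clock::now();
  double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
  printf("naive sgemm took %.3f ms\n", ms);
}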
template <typename Dtype, Precision P>
PaddleMobile<Dtype, P>::~PaddleMobile() {
executor_ = nullptr;
loader_ = nullptr;
}
#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::InjectVariable(const framework::Tensor &t,
template <typename Device, typename T>
void PaddleMobile<Device, T>::InjectVariable(const framework::Tensor &t,
std::string var_name) {
executor_->InjectVariable(t, var_name);
}
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::FeedData(const framework::Tensor &t) {
template <typename Device, typename T>
void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
executor_->FeedData(t);
}
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::FetchResult(int id) {
template <typename Device, typename T>
std::shared_ptr<framework::Tensor> PaddleMobile<Device, T>::FetchResult(
int id) {
return executor_->FetchResult(id);
}
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Predict_From_To(int start, int end) {
template <typename Device, typename T>
void PaddleMobile<Device, T>::Predict_From_To(int start, int end) {
executor_->Predict_From_To(start, end);
}
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Predict_From(int start) {
template <typename Device, typename T>
void PaddleMobile<Device, T>::Predict_From(int start) {
executor_->Predict_From(start);
}
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Predict_To(int end) {
template <typename Device, typename T>
void PaddleMobile<Device, T>::Predict_To(int end) {
executor_->Predict_To(end);
}
#endif
#ifdef PADDLE_MOBILE_CL
static std::mutex lc;
template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
template <typename Device, typename T>
void PaddleMobile<Device, T>::SetCLPath(std::string path) {
std::lock_guard<std::mutex> lock(lc);
if (framework::CLEngine::Instance()->GetCLPath() == "") {
framework::CLEngine::Instance()->setClPath(path);
}
}
template <>
double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
double PaddleMobile<GPU_CL, float>::GetPredictTime() {
cl_int status;
cl_uint nPlatform;
clGetPlatformIDs(0, NULL, &nPlatform);
......@@ -411,8 +443,8 @@ double PaddleMobile<GPU_CL, Precision::FP32>::GetPredictTime() {
return -1;
}
}
template <typename Dtype, Precision P>
int PaddleMobile<Dtype, P>::readText(
template <typename Device, typename T>
int PaddleMobile<Device, T>::readText(
const char *kernelPath,
char **pcode) {  // read the text file into pcode; returns the string length
FILE *fp;
......@@ -441,13 +473,11 @@ int PaddleMobile<Dtype, P>::readText(
fclose(fp);
return size + 1;
}
#endif
template class PaddleMobile<CPU, Precision::FP32>;
template class PaddleMobile<FPGA, Precision::FP32>;
template class PaddleMobile<GPU_MALI, Precision::FP32>;
template class PaddleMobile<GPU_CL, Precision::FP32>;
template class PaddleMobile<CPU, float>;
template class PaddleMobile<FPGA, float>;
template class PaddleMobile<GPU_MALI, float>;
template class PaddleMobile<GPU_CL, float>;
} // namespace paddle_mobile
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory>
#include <string>
#include <utility>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
......@@ -32,44 +33,53 @@ limitations under the License. */
namespace paddle_mobile {
template <typename Dtype = CPU, Precision P = Precision::FP32>
template <typename Device, typename T = float>
class PaddleMobile {
typedef typename PrecisionTrait<P>::ptype Ptype;
public:
PaddleMobile() {
#ifndef PADDLE_MOBILE_CL
bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Dtype>::value;
PADDLE_MOBILE_ENFORCE(!is_gpu,
"Not Enable GPU in CmakeList but run gpu codes ");
bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Device>::value;
PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL enabled");
#endif
}
bool Load(const std::string &dirname, bool optimize = false,
bool quantification = false, int batch_size = 1,
bool loddable = false);
~PaddleMobile() {}
bool Load(const std::string &model_path, const std::string &para_path,
bool optimize = false, bool quantification = false,
int batch_size = 1, bool loddable = false);
PMStatus Load(const std::string &dirname, const bool optimize = false,
const bool quantification = false, const int batch_size = 1,
const bool lod = false);
PMStatus Load(const std::string &model_path, const std::string &para_path,
const bool optimize = false, const bool quantification = false,
const int batch_size = 1, const bool lod = false);
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
PMStatus Predict(const framework::Tensor &input);
PMStatus Predict(const framework::LoDTensor &input);
std::shared_ptr<framework::Tensor> PredictLod(const framework::LoDTensor &t);
PMStatus Predict(
const std::vector<std::pair<std::string, framework::Tensor>> &inputs);
PMStatus Predict(
const std::vector<std::pair<std::string, framework::LoDTensor>> &inputs);
std::vector<Ptype> Predict(const std::vector<Ptype> &input,
std::vector<T> Predict(const std::vector<T> &input,
const std::vector<int64_t> &dims);
PMStatus Predict();
void Feed(const framework::LoDTensor &input, const std::string &var_name);
void Feed(const framework::Tensor &input, const std::string &var_name);
typedef std::shared_ptr<framework::LoDTensor> LoDTensorPtr;
LoDTensorPtr Fetch(const std::string &var_name);
LoDTensorPtr Fetch() { return Fetch("fetch"); }
bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
size_t combined_params_len,
uint8_t *combined_params_buf, bool optimize = false, bool quantification = false,
int batch_size = 1, bool loddable = false);
void SetThreadNum(int num);
void SetThreadNum(int count);
void Clear();
double GetPredictTime();
~PaddleMobile();
#ifdef PADDLE_MOBILE_FPGA
void InjectVariable(const framework::Tensor &t, std::string var_name);
void FeedData(const framework::Tensor &t);
......@@ -80,15 +90,15 @@ class PaddleMobile {
#endif
#ifdef PADDLE_MOBILE_CL
public:
public: // NOLINT
void SetCLPath(std::string cl_path);
int readText(const char *kernelPath,
char **pcode);  // read the text file into pcode; returns the string length
#endif
private:
std::shared_ptr<framework::Loader<Dtype, P>> loader_;
std::shared_ptr<framework::Executor<Dtype, P>> executor_;
std::shared_ptr<framework::Loader<Device, T>> loader_;
std::shared_ptr<framework::Executor<Device, T>> executor_;
};
} // namespace paddle_mobile
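A minimal usage sketch of the refactored interface declared above (Load / Feed / Predict / Fetch), assuming a CPU build; the model path and input shape are placeholders, not part of this change:

#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU, float> engine;
  engine.SetThreadNum(4);
  if (engine.Load("./mobilenet", /*optimize=*/true) != paddle_mobile::PMSuccess) {
    return -1;
  }
  paddle_mobile::framework::LoDTensor input;
  input.Resize(paddle_mobile::framework::make_ddim({1, 3, 224, 224}));
  input.mutable_data<float>();   // fill with real image data in practice
  engine.Feed(input, "feed");    // stage the named input
  engine.Predict();              // run the whole graph
  auto output = engine.Fetch();  // std::shared_ptr<framework::LoDTensor>
  return output == nullptr ? 1 : 0;
}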
......@@ -14,10 +14,12 @@ limitations under the License. */
#include "io/paddle_test_inference_api.h"
#include "io/paddle_mobile.h"
namespace paddle_mobile {
template <typename Dtype, Precision P>
double PaddleTester<Dtype, P>::CaculatePredictTime(std::string *cl_path) {
PaddleMobile<Dtype, P> paddle_mobile;
template <typename Device, typename T>
double PaddleTester<Device, T>::CaculatePredictTime(std::string *cl_path) {
PaddleMobile<Device, T> paddle_mobile;
#ifdef PADDLE_MOBILE_CL
if (cl_path) {
paddle_mobile.SetCLPath(*cl_path);
......@@ -26,10 +28,10 @@ double PaddleTester<Dtype, P>::CaculatePredictTime(std::string *cl_path) {
#endif
return paddle_mobile.GetPredictTime();
}
template class PaddleTester<CPU, Precision::FP32>;
template class PaddleTester<FPGA, Precision::FP32>;
template class PaddleTester<GPU_MALI, Precision::FP32>;
template class PaddleTester<CPU, float>;
template class PaddleTester<FPGA, float>;
template class PaddleTester<GPU_MALI, float>;
template class PaddleTester<GPU_CL, Precision::FP32>;
template class PaddleTester<GPU_CL, float>;
} // namespace paddle_mobile
......@@ -20,10 +20,13 @@ limitations under the License. */
*/
#pragma once
#include "common/types.h"
#include "string"
namespace paddle_mobile {
template <typename Dtype, Precision P = Precision::FP32>
template <typename Device, typename T = float>
class PaddleTester {
public:
double CaculatePredictTime(std::string *cl_path = nullptr);
......
......@@ -12,26 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef CAST_OP
#ifdef FUSION_DEQUANT_ADD_BN_OP
#include "framework/operator.h"
#include "operators/op_param.h"
#include "operators/cast_op.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionDequantAddBNKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNParam<DeviceType>> {
public:
void Compute(const FusionDequantAddBNParam<DeviceType> &param);
bool Init(FusionDequantAddBNParam<DeviceType> *param);
};
void CastOp<DeviceType, T>::InferShape() const {
const auto &dims = this->param_.input_->dims();
this->param_.output_->Resize(dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(cast, ops::CastOp);
#endif
#endif // CAST_OP
......@@ -12,39 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#ifdef CAST_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/fusion_fc_kernel.h"
#include "operators/kernel/kernels.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionFcInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>> {
class CastOp : public framework::OperatorWithKernel<
DeviceType, CastParam<DeviceType>,
operators::CastKernel<DeviceType, T>> {
public:
FusionFcInt8Op(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
CastOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>>(
: framework::OperatorWithKernel<DeviceType, CastParam<DeviceType>,
operators::CastKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_FC_INT8_OP
#endif // CAST_OP
......@@ -33,4 +33,4 @@ namespace ops = paddle_mobile::operators;
REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp);
#endif
#endif
#endif // DEQUANT_OP
......@@ -44,4 +44,4 @@ class DequantizeOp
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // DEQUANT_OP
......@@ -25,12 +25,11 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
using std::string;
template <typename DeviceType, typename T>
class FillConstantOp : public framework::OperatorBase<DeviceType> {
public:
FillConstantOp(const string &type, const VariableNameMap &inputs,
FillConstantOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope)
......@@ -58,7 +57,7 @@ class FillConstantOp : public framework::OperatorBase<DeviceType> {
tensor->Resize(framework::make_ddim(param_.Shape()));
tensor->mutable_data(framework::ToTypeIndex(data_type));
math::set_constant(tensor, value);
math::SetConstant(tensor, value);
}
void Init() {}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include "operators/fusion_conv_add_relu_int8_op.h"
#include <vector>
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddReluInt8Op<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_conv_add_relu_int8,
ops::FusionConvAddReluInt8Op);
#endif
#endif // FUSION_CONVADDRELU_INT8_OP
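InferShape above derives the output spatial dims through math::ConvOutputSize. Assuming it implements the usual convolution arithmetic, a standalone sketch with a worked example:

#include <cstdio>

// Usual convolution output-size formula; assumed to match the
// math::ConvOutputSize used in InferShape above.
int ConvOutputSize(int input, int filter, int dilation, int padding, int stride) {
  int dkernel = dilation * (filter - 1) + 1;
  return (input + 2 * padding - dkernel) / stride + 1;
}

int main() {
  // 1x3x224x224 input, 32x3x3x3 filter, stride 2, pad 1, dilation 1
  // -> output shape {1, 32, 112, 112}.
  printf("%d\n", ConvOutputSize(224, 3, 1, 1, 2));  // prints 112
}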
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_add_bn_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -43,7 +43,8 @@ class FusionDequantAddBNMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
......@@ -20,7 +20,7 @@ limitations under the License. */
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -44,7 +44,8 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......@@ -54,7 +55,7 @@ class FusionDequantAddBNReluMatcher : public framework::FusionOpMatcher {
template <typename DeviceType, typename T>
class FusionDequantAddBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluParam<DeviceType>,
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>> {
public:
FusionDequantAddBNReluOp(const std::string &type,
......@@ -63,7 +64,7 @@ class FusionDequantAddBNReluOp
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluParam<DeviceType>,
DeviceType, FusionDequantAddBNParam<DeviceType>,
operators::FusionDequantAddBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
......
......@@ -12,50 +12,51 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#include "operators/fusion_fc_int8_op.h"
#include "operators/fusion_dequant_add_bn_relu_quant_op.h"
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionFcInt8Op<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
auto y_dims = this->param_.InputY()->dims();
int x_num_col_dims = this->param_.XNumColDims();
int y_num_col_dims = this->param_.YNumColDims();
assert(x_dims.size() > x_num_col_dims);
assert(y_dims.size() > y_num_col_dims);
/// (1,2,3,4) , x_num_col_dims = 2 -> (2,12)
auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
void FusionDequantAddBNReluQuantOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
assert(x_mat_dims[1] == y_mat_dims[0]);
} // namespace operators
} // namespace paddle_mobile
std::vector<int64_t> output_dims;
output_dims.reserve(
static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_dequant_add_bn_relu_quant,
ops::FusionDequantAddBNReluQuantMatcher);
for (int i = 0; i < x_num_col_dims; ++i) {
output_dims.push_back(x_dims[i]);
}
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_add_bn_relu_quant,
ops::FusionDequantAddBNReluQuantOp);
#endif
#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
output_dims.push_back(y_dims[i]);
}
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
namespace paddle_mobile {
namespace operators {
framework::DDim ddim = framework::make_ddim(output_dims);
this->param_.Out()->Resize(ddim);
template <typename Dtype, typename T>
void FusionDequantAddBNQuantOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_dequant_add_bn_quant,
ops::FusionDequantAddBNQuantMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_fc_int8, ops::FusionFcInt8Op);
REGISTER_OPERATOR_CPU(fusion_dequant_add_bn_quant,
ops::FusionDequantAddBNQuantOp);
#endif
#endif // FUSION_FC_INT8_OP
#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
class FusionDequantAddBNReluQuantMatcher : public framework::FusionOpMatcher {
public:
FusionDequantAddBNReluQuantMatcher() {
node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU) >
std::make_shared<framework::Node>(G_OP_TYPE_QUANTIZE);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU_QUANT; }
};
template <typename DeviceType, typename T>
class FusionDequantAddBNReluQuantOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>,
operators::FusionDequantAddBNReluQuantKernel<DeviceType, T>> {
public:
FusionDequantAddBNReluQuantOp(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNReluQuantParam<DeviceType>,
operators::FusionDequantAddBNReluQuantKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
class FusionDequantAddBNQuantMatcher : public framework::FusionOpMatcher {
public:
FusionDequantAddBNQuantMatcher() {
node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_QUANTIZE);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN_QUANT; }
};
template <typename DeviceType, typename T>
class FusionDequantAddBNQuantOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNQuantParam<DeviceType>,
operators::FusionDequantAddBNQuantKernel<DeviceType, T>> {
public:
FusionDequantAddBNQuantOp(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantAddBNQuantParam<DeviceType>,
operators::FusionDequantAddBNQuantKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP
} // namespace operators
} // namespace paddle_mobile
......@@ -12,28 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DEQUANT_BN_RELU_OP
#include "operators/fusion_dequant_bn_relu_op.h"
#include "operators/fusion_dequant_bn_op.h"
namespace paddle_mobile {
namespace operators {
#ifdef FUSION_DEQUANT_BN_OP
template <typename Dtype, typename T>
void FusionDequantBNOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <typename Dtype, typename T>
void FusionDequantBNReluOp<Dtype, T>::InferShape() const {
const auto& input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
}
#endif // FUSION_DEQUANT_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef FUSION_DEQUANT_BN_OP
REGISTER_FUSION_MATCHER(fusion_dequant_bn, ops::FusionDequantBNMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn, ops::FusionDequantBNOp);
#endif // PADDLE_MOBILE_CPU
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
REGISTER_FUSION_MATCHER(fusion_dequant_bn_relu,
ops::FusionDequantBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dequant_bn_relu, ops::FusionDequantBNReluOp);
#endif
#endif
#endif // PADDLE_MOBILE_CPU
#endif // FUSION_DEQUANT_BN_RELU_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_BN_RELU_OP)
class FusionDequantBNMatcher : public framework::FusionOpMatcher {
public:
FusionDequantBNMatcher() {
node_ = framework::Node(G_OP_TYPE_DEQUANTIZE);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM);
}
virtual void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_BATCHNORM,
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
std::string Type() override { return G_OP_TYPE_FUSION_DEQUANT_BN; }
};
#endif // FUSION_DEQUANT_BN_OP || FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_BN_OP
template <typename DeviceType, typename T>
class FusionDequantBNOp : public framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNKernel<DeviceType, T>> {
public:
FusionDequantBNOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// inference output shape
void InferShape() const override;
};
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
class FusionDequantBNReluMatcher : public FusionDequantBNMatcher {
public:
FusionDequantBNReluMatcher() : FusionDequantBNMatcher() {
node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
virtual std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionDequantBNReluOp
: public framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNReluKernel<DeviceType, T>> {
public:
FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDequantBNParam<DeviceType>,
operators::FusionDequantBNReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
#endif // FUSION_DEQUANT_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
......@@ -42,7 +42,8 @@ class FusionDequantBNReluMatcher : public framework::FusionOpMatcher {
{{"Scale", "BNScale"},
{"Mean", "BNMean"},
{"Bias", "BNBias"},
{"Variance", "BNVariance"}}}},
{"Variance", "BNVariance"},
{"Y", "Out"}}}},
removed_nodes);
}
......
......@@ -14,19 +14,15 @@ limitations under the License. */
#ifdef GRU_OP
#include "operators/gru_op.h"
#include <vector>
#include "common/enforce.h"
#include "operators/gru_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void GruOp<Dtype, T>::InferShape() const {
auto lod_size = this->param_.InputInput()->lod().size();
PADDLE_MOBILE_ENFORCE((lod_size == 1),
"Current LoD only supports one dimension.");
auto input_dims = this->param_.InputInput()->dims();
auto weight_dims = this->param_.InputWeight()->dims();
int input_size = input_dims[1];
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef IM2SEQUENCE_OP
#include "operators/im2sequence_op.h"
#include <vector>
namespace paddle_mobile {
namespace operators {
......@@ -29,20 +30,16 @@ int Im2SequenceOutputSize(int input_size, int kernel, int padding_1,
template <typename Dtype, typename T>
void Im2SequenceOp<Dtype, T>::InferShape() const {
auto in_x_dims = this->param_.Input()->dims();
const std::vector<int> &kernels = this->param_.Kernels();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(Im2SequenceOutputSize(in_x_dims[i + 2], kernels[i],
paddings[i], paddings[i + 2],
strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
......@@ -54,9 +51,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(im2sequence, ops::Im2SequenceOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
#endif // IM2SEQUENCE_OP
......@@ -12,39 +12,63 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#ifdef CAST_OP
#pragma once
#ifdef _OPENMP
#include <omp.h>
#endif
#include <algorithm>
#include <vector>
#include "framework/tensor.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include "framework/data_type.h"
#include "operators/kernel/kernels.h"
namespace paddle_mobile {
namespace operators {
namespace math {
void Pool3x3Avgs1p1(const framework::Tensor *input, framework::Tensor *output);
void Pool3x3Maxs1p1(const framework::Tensor *input, framework::Tensor *output);
void Pool3x3Max(std::vector<int> strides, std::vector<int> paddings,
const framework::Tensor *input, framework::Tensor *output);
void Pool3x3Avg(std::vector<int> strides, std::vector<int> paddings,
const framework::Tensor *in_x, framework::Tensor *out);
void Pool3x3Maxs1_int8(const framework::Tensor *input,
framework::Tensor *output, int32_t pad_h, int32_t pad_w);
void Pool3x3Maxs2_int8(const framework::Tensor *input,
framework::Tensor *output, int32_t pad_h, int32_t pad_w);
void Pool3x3Max_int8(const std::vector<int> &strides,
const std::vector<int> &paddings,
const framework::Tensor *input, framework::Tensor *output);
} // namespace math
template <typename InT>
struct CastOutOpFunctor {
const framework::Tensor* in_;
framework::Tensor* out_;
CastOutOpFunctor(const framework::Tensor* in, framework::Tensor* out)
: in_(in), out_(out) {}
template <typename OutT>
void apply() const {
const InT* input = in_->data<InT>();
OutT* output = out_->mutable_data<OutT>();
size_t numel = in_->numel();
for (int i = 0; i < numel; ++i) {
output[i] = static_cast<OutT>(input[i]);
}
}
};
struct CastOpFunctor {
const framework::Tensor* in_;
framework::Tensor* out_;
int output_type_;
CastOpFunctor(const framework::Tensor* in, framework::Tensor* out,
const int output_type)
: in_(in), out_(out), output_type_(output_type) {}
template <typename InT>
void apply() const {
framework::VisitDataType(framework::ToDataType(output_type_),
CastOutOpFunctor<InT>(in_, out_));
}
};
template <>
bool CastKernel<CPU, float>::Init(CastParam<CPU>* param) {
return true;
}
template <>
void CastKernel<CPU, float>::Compute(const CastParam<CPU>& param) {
const Tensor* input = param.input_;
Tensor* output = param.output_;
framework::VisitDataType(framework::ToDataType(param.input_type_),
CastOpFunctor(input, output, param.output_type_));
}
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // CAST_OP
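The CPU cast kernel above resolves both element types at run time: VisitDataType picks the input type for CastOpFunctor, which in turn dispatches the output type through CastOutOpFunctor. A self-contained sketch of one level of that dispatch, with the framework types replaced by illustrative stand-ins:

#include <cstdint>
#include <cstdio>
#include <vector>

enum class DType { kInt32, kFloat32 };

// One level of runtime-to-compile-time dispatch: pick InT from a type tag,
// then invoke the functor's templated apply<InT>().
template <typename Functor>
void VisitDType(DType type, const Functor &f) {
  switch (type) {
    case DType::kInt32:   f.template apply<int32_t>(); break;
    case DType::kFloat32: f.template apply<float>();   break;
  }
}

// Casts a raw buffer of InT to float, standing in for CastOutOpFunctor.
struct CastToFloat {
  const void *in;
  size_t numel;
  std::vector<float> *out;
  template <typename InT>
  void apply() const {
    const InT *x = static_cast<const InT *>(in);
    out->resize(numel);
    for (size_t i = 0; i < numel; ++i) (*out)[i] = static_cast<float>(x[i]);
  }
};

int main() {
  std::vector<int32_t> raw = {1, 2, 3};
  std::vector<float> casted;
  VisitDType(DType::kInt32, CastToFloat{raw.data(), raw.size(), &casted});
  printf("%.1f\n", casted[2]);  // prints 3.0
}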
......@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#include "operators/kernel/conv_add_bn_relu_kernel.h"
#include <math.h>
#include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h"
namespace paddle_mobile {
......
......@@ -32,20 +32,6 @@ void ConvAddReluKernel<CPU, float>::Compute(
}
template class ConvAddReluKernel<CPU, float>;
#ifdef FUSION_CONVADDRELU_INT8_OP
template <>
bool ConvAddReluKernel<CPU, int8_t>::Init(FusionConvAddReluParam<CPU> *param) {
return true;
}
template <>
void ConvAddReluKernel<CPU, int8_t>::Compute(
const FusionConvAddReluParam<CPU> &param) {
ConvAddReluCompute<int8_t, int32_t>(param);
}
template class ConvAddReluKernel<CPU, int8_t>;
#endif
} // namespace operators
} // namespace paddle_mobile
......
......@@ -55,10 +55,9 @@ bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
param->Input()->dims()[2] <= 140 /* referred from ncnn */) {
param->ExecMode() = ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT;
// transform weight
framework::Tensor transformed_weight;
operators::math::winograd_transform_weight<8, 3>(*param->Filter(),
&transformed_weight);
framework::TensorCopy(transformed_weight, param->Filter());
param->transformed_filter_ = new framework::Tensor;
operators::math::winograd_transform_weight<8, 3>(
*param->Filter(), param->transformed_filter_);
#endif
} else {
param->ExecMode() = ConvParam<CPU>::EXEC_GEMM_FLOAT;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DEQUANT_ADD_BN_OP
#include "operators/kernel/dequant_add_bn_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
FusionDequantAddBNParam<CPU> *param) {
// elementwise add params
const Tensor *bias = param->bias_;
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *bias_ptr = bias->data<float>();
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
bn_bias_ptr[c] = inv_scale * (bias_ptr[c] - mean_ptr[c]) + bn_bias_ptr[c];
}
return true;
}
template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
const FusionDequantAddBNParam<CPU> &param) {
const int32_t *input = param.input_->data<int32_t>();
const float *bn_scale = param.bn_scale_->data<float>();
const float *bn_bias = param.bn_bias_->data<float>();
// dequantize params
const float activation_scale = param.activation_scale_->data<float>()[0];
const float weight_scale = param.weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
float *output = param.output_->mutable_data<float>();
int batch_size = param.input_->dims()[0];
int channels = param.input_->dims()[1];
size_t spatial_size = param.input_->dims()[2] * param.input_->dims()[3];
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
float scale = bn_scale[c] * dequant_scale;
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
float *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = scale * x[k] + bias;
}
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_DEQUANT_ADD_BN_OP
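For reference, the scalar arithmetic that the NEON loop above vectorizes: Init folds the elementwise bias and the batch-norm statistics into one per-channel scale/bias pair, and Compute then applies y = (bn_scale[c] * dequant_scale) * x + bn_bias[c] per element. A plain-C++ sketch of the same math (illustrative names, batch dimension omitted, no framework types):

#include <cmath>
#include <cstdint>
#include <vector>

// Fold elementwise-add bias and BN statistics into per-channel scale/shift,
// mirroring FusionDequantAddBNKernel<CPU, float>::Init above.
void FoldAddBN(const std::vector<float> &add_bias, const std::vector<float> &mean,
               const std::vector<float> &variance, float epsilon,
               std::vector<float> *bn_scale, std::vector<float> *bn_bias) {
  for (size_t c = 0; c < bn_scale->size(); ++c) {
    float inv = (*bn_scale)[c] / std::sqrt(variance[c] + epsilon);
    (*bn_bias)[c] = inv * (add_bias[c] - mean[c]) + (*bn_bias)[c];
    (*bn_scale)[c] = inv;
  }
}

// Scalar reference for Compute: dequantize the int32 activations and apply
// the folded batch norm with a single multiply-add per element.
void DequantAddBNRef(const int32_t *x, float dequant_scale,
                     const std::vector<float> &bn_scale,
                     const std::vector<float> &bn_bias,
                     int channels, int spatial, float *y) {
  for (int c = 0; c < channels; ++c) {
    float scale = bn_scale[c] * dequant_scale;
    for (int i = 0; i < spatial; ++i) {
      y[c * spatial + i] = scale * x[c * spatial + i] + bn_bias[c];
    }
  }
}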
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/dequant_bn_relu_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
// dequantize params
const float activation_scale = param->activation_scale_->data<float>()[0];
const float weight_scale = param->weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
float *output = param->output_->mutable_data<float>();
int batch_size = param->input_->dims()[0];
int channels = param->input_->dims()[1];
size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
float scale = bn_scale[c] * dequant_scale;
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
float *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
float32x4_t __zero = vdupq_n_f32(0.f);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = vmaxq_f32(__zero, f0);
f1 = vmaxq_f32(__zero, f1);
f2 = vmaxq_f32(__zero, f2);
f3 = vmaxq_f32(__zero, f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = std::max(scale * x[k] + bias, 0.f);
}
}
}
}
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <>
bool FusionDequantBNReluKernel<CPU, float>::Init(
FusionDequantBNReluParam<CPU> *param) {
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
bn_bias_ptr[c] = bn_bias_ptr[c] - inv_scale * mean_ptr[c];
}
return true;
}
template <>
void FusionDequantBNReluKernel<CPU, float>::Compute(
const FusionDequantBNReluParam<CPU> &param) {
DequantBNReluCompute(&param);
}
#endif // FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <>
bool FusionDequantAddBNReluKernel<CPU, float>::Init(
FusionDequantAddBNReluParam<CPU> *param) {
// elementwise add params
const Tensor *bias = param->bias_;
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *bias_ptr = bias->data<float>();
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
bn_scale_ptr[c] = inv_scale;
bn_bias_ptr[c] = inv_scale * (bias_ptr[c] - mean_ptr[c]) + bn_bias_ptr[c];
}
return true;
}
template <>
void FusionDequantAddBNReluKernel<CPU, float>::Compute(
const FusionDequantAddBNReluParam<CPU> &param) {
DequantBNReluCompute(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_OP
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include "operators/kernel/dequant_bn_kernel.h"
#include "operators/math/activation.h"
#include "operators/math/quantize.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
void PublicFusionDequantBNInitParam(FusionDequantBNParam<CPU> *param,
const framework::Tensor *bias) {
// batch norm params
const Tensor *bn_mean = param->bn_mean_;
const Tensor *bn_variance = param->bn_variance_;
Tensor *bn_scale = param->bn_scale_;
Tensor *bn_bias = param->bn_bias_;
const float epsilon = param->epsilon_;
const float *mean_ptr = bn_mean->data<float>();
const float *var_ptr = bn_variance->data<float>();
float *bn_scale_ptr = bn_scale->mutable_data<float>();
float *bn_bias_ptr = bn_bias->mutable_data<float>();
for (int c = 0; c < bn_scale->numel(); ++c) {
float inv_scale = 1.f / (std::sqrt(var_ptr[c] + epsilon));
float val = bias ? bias->data<float>()[c] : 0;
bn_bias_ptr[c] =
inv_scale * bn_scale_ptr[c] * (val - mean_ptr[c]) + bn_bias_ptr[c];
bn_scale_ptr[c] = inv_scale * bn_scale_ptr[c];
}
}
#endif
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
template <ActivationType Act>
void DequantBNCompute(const FusionDequantBNParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
// dequantize params
const float activation_scale = param->activation_scale_->data<float>()[0];
const float weight_scale = param->weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
float *output = param->output_->mutable_data<float>();
int batch_size = param->input_->dims()[0];
int channels = param->input_->dims()[1];
size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
// not fuse bn and dequant scale to minimize precision difference
// float scale = bn_scale[c] * dequant_scale;
float scale = bn_scale[c];
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
float *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmulq_f32(__dequant_scale, f0);
f1 = vmulq_f32(__dequant_scale, f1);
f2 = vmulq_f32(__dequant_scale, f2);
f3 = vmulq_f32(__dequant_scale, f3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = math::vActiveq_f32<Act>(f0);
f1 = math::vActiveq_f32<Act>(f1);
f2 = math::vActiveq_f32<Act>(f2);
f3 = math::vActiveq_f32<Act>(f3);
vst1q_f32(y, f0);
vst1q_f32(y + 4, f1);
vst1q_f32(y + 8, f2);
vst1q_f32(y + 12, f3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
}
}
}
}
#endif
#ifdef FUSION_DEQUANT_BN_OP
template <>
bool FusionDequantBNKernel<CPU, float>::Init(FusionDequantBNParam<CPU> *param) {
PublicFusionDequantBNInitParam(param, nullptr);
return true;
}
template <>
void FusionDequantBNKernel<CPU, float>::Compute(
const FusionDequantBNParam<CPU> &param) {
DequantBNCompute<IDENTITY>(&param);
}
#endif // FUSION_DEQUANT_BN_OP
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <>
bool FusionDequantBNReluKernel<CPU, float>::Init(
FusionDequantBNParam<CPU> *param) {
PublicFusionDequantBNInitParam(param, nullptr);
return true;
}
template <>
void FusionDequantBNReluKernel<CPU, float>::Compute(
const FusionDequantBNParam<CPU> &param) {
DequantBNCompute<RELU>(&param);
}
#endif // FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_OP
template <>
bool FusionDequantAddBNKernel<CPU, float>::Init(
FusionDequantAddBNParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNKernel<CPU, float>::Compute(
const FusionDequantAddBNParam<CPU> &param) {
DequantBNCompute<IDENTITY>(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <>
bool FusionDequantAddBNReluKernel<CPU, float>::Init(
FusionDequantAddBNParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNReluKernel<CPU, float>::Compute(
const FusionDequantAddBNParam<CPU> &param) {
DequantBNCompute<RELU>(&param);
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_OP
#if defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
template <Activation Act, RoundType R>
void DequantBNQuantCompute(const FusionDequantAddBNQuantParam<CPU> *param) {
const int32_t *input = param->input_->data<int32_t>();
const float *bn_scale = param->bn_scale_->data<float>();
const float *bn_bias = param->bn_bias_->data<float>();
// dequantize params
const float activation_scale = param->activation_scale_->data<float>()[0];
const float weight_scale = param->weight_scale_;
const float dequant_scale = activation_scale / weight_scale;
// quantize params
Tensor *output_scale = param->online_scale_;
float max_abs = 0.f;
int8_t *output = param->output_->mutable_data<int8_t>();
int batch_size = param->input_->dims()[0];
int channels = param->input_->dims()[1];
size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
// if (param->is_static_) {
if (true) {
max_abs = param->static_scale_;
float quant_scale = 127.f / max_abs;
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
// not fuse bn and dequant scale to minimize precision difference
// float scale = bn_scale[c] * dequant_scale;
float scale = bn_scale[c];
float bias = bn_bias[c];
size_t offset = (batch * channels + c) * spatial_size;
const int32_t *x = input + offset;
int8_t *y = output + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __dequant_scale = vdupq_n_f32(dequant_scale);
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
float32x4_t __quant_scale = vdupq_n_f32(quant_scale);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
int32x4_t r0 = vld1q_s32(x);
int32x4_t r1 = vld1q_s32(x + 4);
int32x4_t r2 = vld1q_s32(x + 8);
int32x4_t r3 = vld1q_s32(x + 12);
float32x4_t f0 = vcvtq_f32_s32(r0);
float32x4_t f1 = vcvtq_f32_s32(r1);
float32x4_t f2 = vcvtq_f32_s32(r2);
float32x4_t f3 = vcvtq_f32_s32(r3);
f0 = vmulq_f32(__dequant_scale, f0);
f1 = vmulq_f32(__dequant_scale, f1);
f2 = vmulq_f32(__dequant_scale, f2);
f3 = vmulq_f32(__dequant_scale, f3);
f0 = vmlaq_f32(__bias, __scale, f0);
f1 = vmlaq_f32(__bias, __scale, f1);
f2 = vmlaq_f32(__bias, __scale, f2);
f3 = vmlaq_f32(__bias, __scale, f3);
f0 = math::vActiveq_f32<Act>(f0);
f1 = math::vActiveq_f32<Act>(f1);
f2 = math::vActiveq_f32<Act>(f2);
f3 = math::vActiveq_f32<Act>(f3);
f0 = vmulq_f32(__quant_scale, f0);
f1 = vmulq_f32(__quant_scale, f1);
f2 = vmulq_f32(__quant_scale, f2);
f3 = vmulq_f32(__quant_scale, f3);
int32x4_t q0 = math::vRoundq_f32<R>(f0);
int32x4_t q1 = math::vRoundq_f32<R>(f1);
int32x4_t q2 = math::vRoundq_f32<R>(f2);
int32x4_t q3 = math::vRoundq_f32<R>(f3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
int16x4_t d3 = vmovn_s32(q3);
int16x8_t q5 = vcombine_s16(d0, d1);
int16x8_t q6 = vcombine_s16(d2, d3);
int8x8_t d5 = vmovn_s16(q5);
int8x8_t d6 = vmovn_s16(q6);
vst1_s8(y, d5);
vst1_s8(y + 8, d6);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
float x_temp =
math::Active<Act>(scale * (dequant_scale * x[k]) + bias);
y[k] = math::Round<R>(x_temp * quant_scale);
}
}
}
} else {
// TODO(hjchen2)
max_abs = std::max(max_abs, 1e-6f);
}
param->online_scale_->mutable_data<float>()[0] = max_abs;
}
template <>
bool FusionDequantAddBNQuantKernel<CPU, float>::Init(
FusionDequantAddBNQuantParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNQuantKernel<CPU, float>::Compute(
const FusionDequantAddBNQuantParam<CPU> &param) {
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
DequantBNQuantCompute<IDENTITY, ROUND_NEAREST_TO_EVEN>(&param);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
DequantBNQuantCompute<IDENTITY, ROUND_NEAREST_TOWARDS_ZERO>(&param);
break;
case ROUND_NEAREST_AWAY_ZERO:
DequantBNQuantCompute<IDENTITY, ROUND_NEAREST_AWAY_ZERO>(&param);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
break;
}
}
#endif // FUSION_DEQUANT_ADD_BN_QUANT_OP
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
template <>
bool FusionDequantAddBNReluQuantKernel<CPU, float>::Init(
FusionDequantAddBNQuantParam<CPU> *param) {
const framework::Tensor *bias = param->bias_;
PublicFusionDequantBNInitParam(param, bias);
return true;
}
template <>
void FusionDequantAddBNReluQuantKernel<CPU, float>::Compute(
const FusionDequantAddBNQuantParam<CPU> &param) {
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
DequantBNQuantCompute<RELU, ROUND_NEAREST_TO_EVEN>(&param);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
DequantBNQuantCompute<RELU, ROUND_NEAREST_TOWARDS_ZERO>(&param);
break;
case ROUND_NEAREST_AWAY_ZERO:
DequantBNQuantCompute<RELU, ROUND_NEAREST_AWAY_ZERO>(&param);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
break;
}
}
#endif // FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
} // namespace operators
} // namespace paddle_mobile
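The *_QUANT variants above re-quantize the float result back to int8 with a static scale of 127 / max_abs, clamping the value to [-max_abs, max_abs] first. A scalar sketch of that step (round mode simplified to nearest-away-from-zero for brevity):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Symmetric int8 quantization: clamp, scale by 127/max_abs, round.
int8_t QuantizeSym(float x, float max_abs) {
  float clamped = std::max(std::min(x, max_abs), -max_abs);
  return static_cast<int8_t>(std::round(clamped * 127.f / max_abs));
}

int main() {
  // With max_abs = 6.0f: 3.0 -> 64 (round(63.5)), -7.5 clamps to -6.0 -> -127.
  printf("%d %d\n", QuantizeSym(3.0f, 6.0f), QuantizeSym(-7.5f, 6.0f));
}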
......@@ -30,8 +30,8 @@ bool DequantizeKernel<CPU, float>::Init(DequantizeParam<CPU> *param) {
template <>
void DequantizeKernel<CPU, float>::Compute(const DequantizeParam<CPU> &param) {
const Tensor *input = param.input_;
Tensor *output = param.output_;
const LoDTensor *input = param.input_;
LoDTensor *output = param.output_;
float activation_scale = param.activation_scale_->data<float>()[0];
float weight_scale = param.weight_scale_;
const int32_t *x = input->data<const int32_t>();
......@@ -72,6 +72,7 @@ void DequantizeKernel<CPU, float>::Compute(const DequantizeParam<CPU> &param) {
for (size_t i = 0; i < size; ++i) {
y[i] = x[i] * scale;
}
output->set_lod(input->lod());
}
} // namespace operators
......
......@@ -29,12 +29,6 @@ template <>
void GruKernel<CPU, float>::Compute(const GruParam<CPU> &param) {
GruCompute<float>(param);
param.OutHidden()->set_lod(param.InputInput()->lod());
// DLOG << "________________" << param.OutHidden()->dims();
// DLOG << "________________" << param.OutHidden()->numel();
// auto *hiden_data = param.OutHidden()->data<float>();
// for (int64_t i = 0; i < 10; i++) {
// DLOG << "****************" << hiden_data[i];
// }
}
template class GruKernel<CPU, float>;
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "operators/kernel/quantize_kernel.h"
#include <cmath>
#include "operators/math/quantize.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
......@@ -32,81 +33,68 @@ inline float32_t vmaxvq_f32(float32x4_t r) {
}
#endif
template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO>
inline int32x4_t vround_f32(float32x4_t r) {
return vcvtq_s32_f32(r);
}
template <>
inline int32x4_t vround_f32<ROUND_NEAREST_AWAY_ZERO>(float32x4_t r) {
float32x4_t plus = vdupq_n_f32(0.5);
float32x4_t minus = vdupq_n_f32(-0.5);
float32x4_t zero = vdupq_n_f32(0);
uint32x4_t more_than_zero = vcgtq_f32(r, zero);
float32x4_t temp = vbslq_f32(more_than_zero, plus, minus);
temp = vaddq_f32(r, temp);
int32x4_t ret = vcvtq_s32_f32(temp);
return ret;
}
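// The specialization below implements round-half-to-even (banker's rounding),
// e.g. 2.5 -> 2, 3.5 -> 4, -2.5 -> -2: it first rounds away from zero, then
// moves lanes whose fraction was exactly 0.5 and whose rounded result is odd
// one step back towards zero.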
template <>
inline int32x4_t vround_f32<ROUND_NEAREST_TO_EVEN>(float32x4_t r) {
float32x4_t point5 = vdupq_n_f32(0.5);
int32x4_t one = vdupq_n_s32(1);
int32x4_t zero = vdupq_n_s32(0);
int32x4_t rnd = vround_f32<ROUND_NEAREST_AWAY_ZERO>(r);
float32x4_t frnd = vcvtq_f32_s32(rnd);
frnd = vsubq_f32(frnd, r);
frnd = vabsq_f32(frnd);
uint32x4_t equal_point5 = vceqq_f32(frnd, point5);
int32x4_t abs_rnd = vabsq_s32(rnd);
abs_rnd = vandq_s32(abs_rnd, one);
uint32x4_t not_mod2 = vreinterpretq_u32_s32(abs_rnd);
uint32x4_t mask = vandq_u32(equal_point5, not_mod2);
uint32x4_t more_than_zero = vcgtq_s32(rnd, zero);
more_than_zero = vandq_u32(more_than_zero, vreinterpretq_u32_s32(one));
mask = veorq_u32(more_than_zero, mask);
more_than_zero = veorq_u32(more_than_zero, vreinterpretq_u32_s32(one));
mask = vaddq_u32(more_than_zero, mask);
int32x4_t smask = vreinterpretq_s32_u32(mask);
smask = vsubq_s32(smask, one);
rnd = vaddq_s32(rnd, smask);
return rnd;
}
#endif
template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO>
inline int8_t Round(const float &x) {
return static_cast<int8_t>(x);
}
template <>
inline int8_t Round<ROUND_NEAREST_AWAY_ZERO>(const float &x) {
return std::round(x);
}
template <>
inline int8_t Round<ROUND_NEAREST_TO_EVEN>(const float &x) {
float v = std::round(x);
int32_t q = static_cast<int32_t>(v);
if (std::abs(std::abs(q - v) - 0.5) <= 0) {
if (std::abs(q) % 2 != 0) {
q = q + ((q > 0) ? -1 : 1);
template <RoundType R>
inline void QuantizeOffline(const Tensor *input, const float scale,
const float max_abs, Tensor *output) {
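// Offline quantization: the scale was fixed ahead of time from max_abs, so the
// input is clamped to [-max_abs, max_abs] before being scaled and rounded to int8.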
const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>();
size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = remain >> 4;
remain = remain & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __postive_max = vdupq_n_f32(max_abs);
float32x4_t __negtive_max = vdupq_n_f32(-max_abs);
#pragma omp parallel for
for (size_t i = 0; i < loop; ++i) {
const float *local_x = x + (i << 4);
int8_t *local_y = y + (i << 4);
float32x4_t r0 = vld1q_f32(local_x);
float32x4_t r1 = vld1q_f32(local_x + 4);
float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t r3 = vld1q_f32(local_x + 12);
r0 = vmaxq_f32(vminq_f32(r0, __postive_max), __negtive_max);
r1 = vmaxq_f32(vminq_f32(r1, __postive_max), __negtive_max);
r2 = vmaxq_f32(vminq_f32(r2, __postive_max), __negtive_max);
r3 = vmaxq_f32(vminq_f32(r3, __postive_max), __negtive_max);
r0 = vmulq_f32(r0, __scale);
r1 = vmulq_f32(r1, __scale);
r2 = vmulq_f32(r2, __scale);
r3 = vmulq_f32(r3, __scale);
int32x4_t q0 = math::vRoundq_f32<R>(r0);
int32x4_t q1 = math::vRoundq_f32<R>(r1);
int32x4_t q2 = math::vRoundq_f32<R>(r2);
int32x4_t q3 = math::vRoundq_f32<R>(r3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
int16x4_t d3 = vmovn_s32(q3);
int16x8_t q5 = vcombine_s16(d0, d1);
int16x8_t q6 = vcombine_s16(d2, d3);
int8x8_t d5 = vmovn_s16(q5);
int8x8_t d6 = vmovn_s16(q6);
vst1_s8(local_y, d5);
vst1_s8(local_y + 8, d6);
}
x += (loop << 4);
y += (loop << 4);
#endif
for (size_t i = 0; i < remain; ++i) {
float x_temp = std::max(std::min(x[i], max_abs), -max_abs);
y[i] = math::Round<R>(x_temp * scale);
}
return static_cast<int8_t>(q);
}
template <RoundType R>
static void Quantize(const Tensor *input, const float scale, Tensor *output) {
inline void QuantizeOnline(const Tensor *input, const float scale,
Tensor *output) {
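// Online quantization: max_abs was measured from this very input, so x * scale
// already lies within [-127, 127] and no clamping is needed.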
const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>();
size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = remain >> 4;
remain = remain & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
#pragma omp parallel for
for (size_t i = 0; i < loop; ++i) {
const float *local_x = x + (i << 4);
......@@ -115,14 +103,14 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) {
float32x4_t r1 = vld1q_f32(local_x + 4);
float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t r3 = vld1q_f32(local_x + 12);
r0 = vmulq_n_f32(r0, scale);
r1 = vmulq_n_f32(r1, scale);
r2 = vmulq_n_f32(r2, scale);
r3 = vmulq_n_f32(r3, scale);
int32x4_t q0 = vround_f32<R>(r0);
int32x4_t q1 = vround_f32<R>(r1);
int32x4_t q2 = vround_f32<R>(r2);
int32x4_t q3 = vround_f32<R>(r3);
r0 = vmulq_f32(r0, __scale);
r1 = vmulq_f32(r1, __scale);
r2 = vmulq_f32(r2, __scale);
r3 = vmulq_f32(r3, __scale);
int32x4_t q0 = math::vRoundq_f32<R>(r0);
int32x4_t q1 = math::vRoundq_f32<R>(r1);
int32x4_t q2 = math::vRoundq_f32<R>(r2);
int32x4_t q3 = math::vRoundq_f32<R>(r3);
int16x4_t d0 = vmovn_s32(q0);
int16x4_t d1 = vmovn_s32(q1);
int16x4_t d2 = vmovn_s32(q2);
......@@ -138,7 +126,18 @@ static void Quantize(const Tensor *input, const float scale, Tensor *output) {
y += (loop << 4);
#endif
for (size_t i = 0; i < remain; ++i) {
y[i] = Round<R>(x[i] * scale);
y[i] = math::Round<R>(x[i] * scale);
}
}
template <RoundType R>
static void Quantize(const Tensor *input, const float max_abs,
const bool offline, Tensor *output) {
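// Only int8 output is supported, so the scale maps max_abs onto 127;
// e.g. max_abs = 6.35 gives scale = 20, and an input of 0.1 becomes round(0.1 * 20) = 2.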
float scale = 127.f / max_abs;
if (offline) {
QuantizeOffline<R>(input, scale, max_abs, output);
} else {
QuantizeOnline<R>(input, scale, output);
}
}
......@@ -173,6 +172,13 @@ float find_abs_max(const Tensor *input) {
return max_abs;
}
} // namespace operators
} // namespace paddle_mobile
#endif // __ARM_NEON__
namespace paddle_mobile {
namespace operators {
template <>
bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) {
return true;
......@@ -180,36 +186,36 @@ bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) {
template <>
void QuantizeKernel<CPU, float>::Compute(const QuantizeParam<CPU> &param) {
const Tensor *input = param.input_;
Tensor *output = param.output_;
const LoDTensor *input = param.input_;
LoDTensor *output = param.output_;
Tensor *output_scale = param.online_scale_;
float max_abs = 0.f;
if (param.is_static_) {
max_abs = param.static_scale_;
if (param.offline_) {
max_abs = param.offline_scale_->data<float>()[0];
} else {
max_abs = find_abs_max(input);
}
max_abs = std::max(max_abs, 1e-6f);
// only support int8 currently
float scale = 127 / max_abs;
param.online_scale_->mutable_data<float>()[0] = max_abs;
switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN:
Quantize<ROUND_NEAREST_TO_EVEN>(input, scale, output);
Quantize<ROUND_NEAREST_TO_EVEN>(input, max_abs, param.offline_, output);
break;
case ROUND_NEAREST_TOWARDS_ZERO:
Quantize<ROUND_NEAREST_TOWARDS_ZERO>(input, scale, output);
Quantize<ROUND_NEAREST_TOWARDS_ZERO>(input, max_abs, param.offline_,
output);
break;
case ROUND_NEAREST_AWAY_ZERO:
Quantize<ROUND_NEAREST_AWAY_ZERO>(input, scale, output);
Quantize<ROUND_NEAREST_AWAY_ZERO>(input, max_abs, param.offline_, output);
break;
default:
LOG(kLOG_ERROR) << "round type is not supported.";
break;
}
output->set_lod(input->lod());
}
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // QUANT_OP
......@@ -15,11 +15,56 @@ limitations under the License. */
#ifdef RELU_OP
#include "operators/kernel/relu_kernel.h"
#include "operators/kernel/central-arm-func/relu_arm_func.h"
#include "common/types.h"
#include "operators/math/activation.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
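// ReluCompute applies the activation element-wise: the float specialization
// processes 16 values per iteration with NEON and falls back to a scalar loop
// for the tail, while the primary template is an empty stub for other types.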
template <typename Dtype, ActivationType Act>
struct ReluCompute {
void operator()(const Tensor *input, Tensor *output) {}
};
template <ActivationType Act>
struct ReluCompute<float, Act> {
void operator()(const Tensor *input, Tensor *output) {
const float *x = input->data<float>();
float *y = output->mutable_data<float>();
size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = remain >> 4;
remain = remain & 0xF;
#pragma omp parallel for
for (size_t i = 0; i < loop; ++i) {
const float *local_x = x + (i << 4);
float *local_y = y + (i << 4);
float32x4_t r0 = vld1q_f32(local_x);
float32x4_t r1 = vld1q_f32(local_x + 4);
float32x4_t r2 = vld1q_f32(local_x + 8);
float32x4_t r3 = vld1q_f32(local_x + 12);
r0 = math::vActiveq_f32<Act>(r0);
r1 = math::vActiveq_f32<Act>(r1);
r2 = math::vActiveq_f32<Act>(r2);
r3 = math::vActiveq_f32<Act>(r3);
vst1q_f32(local_y, r0);
vst1q_f32(local_y + 4, r1);
vst1q_f32(local_y + 8, r2);
vst1q_f32(local_y + 12, r3);
}
x += (loop << 4);
y += (loop << 4);
#endif
for (size_t i = 0; i < remain; ++i) {
y[i] = math::Active<Act>(x[i]);
}
}
};
template <>
bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) {
return true;
......@@ -27,7 +72,21 @@ bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) {
template <>
void ReluKernel<CPU, float>::Compute(const ReluParam<CPU> &param) {
ReluCompute<float>(param);
const Tensor *input = param.InputX();
Tensor *output = param.Out();
ReluCompute<float, RELU>()(input, output);
}
template <>
bool Relu6Kernel<CPU, float>::Init(ReluParam<CPU> *param) {
return true;
}
template <>
void Relu6Kernel<CPU, float>::Compute(const ReluParam<CPU> &param) {
const Tensor *input = param.InputX();
Tensor *output = param.Out();
ReluCompute<float, RELU6>()(input, output);
}
} // namespace operators
......
......@@ -12,54 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE2_OP
#pragma once
#ifdef TOP_K_OP
#include <algorithm>
#include <iostream>
#include <vector>
#include "operators/op_param.h"
#include "operators/kernel/kernels.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void Transpose2Compute(const Transpose2Param<CPU>& param) {
const auto* input_x = param.InputX();
const auto input_x_dims = input_x->dims();
auto* out = param.Out();
const auto axis = param.Axis();
const auto* input_x_data = input_x->data<float>();
auto* out_data = out->mutable_data<float>();
template <>
bool TopKKernel<CPU, float>::Init(TopKParam<CPU> *param) {
return true;
}
size_t ndim = axis.size();
std::vector<int> xdim(ndim);
std::vector<int> xstride(ndim);
std::vector<int> xout(ndim);
for (int i = 0; i < ndim; i++) {
int j = ndim - 1 - i;
xdim[j] = input_x_dims[axis[i]];
xstride[j] = 1;
for (int k = axis[i] + 1; k < ndim; k++) {
xstride[j] *= input_x_dims[k];
}
xout[j] = xstride[j] * xdim[j];
}
template <>
void TopKKernel<CPU, float>::Compute(const TopKParam<CPU> &param) {
const Tensor *input = param.input_;
Tensor *output = param.output_;
Tensor *indices = param.indices_;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int64_t *indices_data = indices->mutable_data<int64_t>();
framework::DDim input_dims = input->dims();
const size_t row = framework::product(
framework::slice_ddim(input_dims, 0, input_dims.size() - 1));
const size_t col = input_dims[input_dims.size() - 1];
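// Treat the input as a (row, col) matrix; for each row, partially sort the col
// elements in descending order to pick the top-k values and their original indices.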
#pragma omp parallel for
for (size_t i = 0; i < row; i++) {
std::vector<std::pair<float, size_t>> vec(col);
const float *input_ptr = input_data + i * col;
float *output_ptr = output_data + i * param.k_;
int64_t *indices_ptr = indices_data + i * param.k_;
auto numel = input_x->numel();
size_t pind = 0;
std::vector<int> ind(ndim);
for (int i = 0; i < numel; i++) {
out_data[i] = input_x_data[pind];
ind[0]++;
pind += xstride[0];
for (int j = 0; j < ndim - 1; j++) {
if (ind[j] == xdim[j]) {
ind[j + 1]++;
ind[j] = 0;
pind += xstride[j + 1];
pind -= xout[j];
} else {
break;
for (size_t j = 0; j < col; j++) {
vec[j] = std::move(std::pair<float, size_t>(input_ptr[j], j));
}
std::partial_sort(
vec.begin(), vec.begin() + param.k_, vec.end(),
[](const std::pair<float, size_t> &l,
const std::pair<float, size_t> &r) { return l.first > r.first; });
for (int j = 0; j < param.k_; ++j) {
output_ptr[j] = vec[j].first;
indices_ptr[j] = static_cast<int64_t>(vec[j].second);
}
}
}
......@@ -67,4 +65,4 @@ void Transpose2Compute(const Transpose2Param<CPU>& param) {
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // TOP_K_OP
......@@ -11,14 +11,111 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE2_OP
#include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
namespace paddle_mobile {
namespace operators {
bool IsShuffleChannel(const std::vector<int> &axis) {
bool is_shuffle_channel = true;
if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) {
for (int i = 3; i < axis.size(); ++i) {
if (axis[i] != i) {
is_shuffle_channel = false;
break;
}
}
} else {
return false;
}
return is_shuffle_channel;
}
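// axis = (0, 2, 1, 3, ...) merely swaps the first two non-batch dimensions,
// which is the ShuffleNet-style channel shuffle; it can be implemented below as
// a strided memcpy instead of a general transpose.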
template <typename Dtype>
void ShuffleChannelCompute(const Transpose2Param<CPU> &param) {
const std::vector<int> &axis = param.Axis();
const Tensor *input = param.InputX();
const Dtype *input_ptr = input->data<Dtype>();
Tensor *output = param.Out();
Dtype *output_ptr = output->mutable_data<Dtype>();
// The shape dimensions of input and output must be >= 2 and <= 6.
const framework::DDim &in_dim = input->dims();
const framework::DDim &out_dim = output->dims();
size_t offset = 1;
for (int i = 3; i < axis.size(); ++i) {
offset *= in_dim[i];
}
#pragma omp parallel for collapse(3)
for (int batch = 0; batch < out_dim[0]; ++batch) {
for (int c1 = 0; c1 < out_dim[1]; ++c1) {
for (int c2 = 0; c2 < out_dim[2]; ++c2) {
size_t out_offset =
((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset;
size_t in_offset = ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset;
memcpy(output_ptr + out_offset, input_ptr + in_offset,
offset * sizeof(Dtype));
}
}
}
}
template <typename Dtype>
void Transpose2Compute(const Transpose2Param<CPU> &param) {
const std::vector<int> &axis = param.Axis();
const Tensor *input = param.InputX();
const Dtype *input_ptr = input->data<Dtype>();
Tensor *output = param.Out();
Dtype *output_ptr = output->mutable_data<Dtype>();
// The shape dimensions of input and output must be >= 2 and <= 6.
const framework::DDim &in_dim = input->dims();
const framework::DDim &out_dim = output->dims();
// precompute inverted output dim and strides
size_t rout_dim[6], strides[6];
int permute = axis.size(); // permute must be >= 2 and <= 6.
for (int i = 0; i < permute; ++i) {
int k = permute - 1 - i;
strides[k] = 1;
for (int j = axis[i] + 1; j < permute; ++j) {
strides[k] *= in_dim[j];
}
rout_dim[k] = out_dim[i];
}
// unroll the first 2 dimensions
int reamin_dim = 1;
for (int i = 2; i < out_dim.size(); ++i) {
reamin_dim *= out_dim[i];
}
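// For each (batch, j) slice, walk the remaining output elements in order and
// advance "offset" through the input with an odometer-style carry over "indics".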
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < out_dim[0]; ++batch) {
for (int j = 0; j < out_dim[1]; ++j) {
size_t offset = batch * strides[permute - 1] + j * strides[permute - 2];
Dtype *out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim;
int indics[4] = {0, 0, 0, 0};
for (int k = 0; k < reamin_dim; ++k) {
out_ptr[k] = input_ptr[offset];
indics[0] += 1;
offset += strides[0];
for (int p = 0; p < permute - 3; ++p) {
if (indics[p] == rout_dim[p]) {
indics[p + 1] += 1;
indics[p] = 0;
offset += strides[p + 1];
offset -= rout_dim[p] * strides[p];
} else {
break;
}
}
}
}
}
}
template <>
bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
return true;
......@@ -26,10 +123,24 @@ bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
template <>
void Transpose2Kernel<CPU, float>::Compute(const Transpose2Param<CPU> &param) {
const std::vector<int> &axis = param.Axis();
bool shuffle_channel = IsShuffleChannel(axis);
if (shuffle_channel) {
if (param.InputX()->type() == typeid(int8_t)) {
ShuffleChannelCompute<int8_t>(param);
} else {
ShuffleChannelCompute<float>(param);
}
} else {
if (param.InputX()->type() == typeid(int8_t)) {
Transpose2Compute<int8_t>(param);
} else {
Transpose2Compute<float>(param);
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // TRANSPOSE2_OP
......@@ -18,283 +18,63 @@ limitations under the License. */
#include <cmath>
#include "operators/op_param.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif // __ARM_NEON__
namespace paddle_mobile {
namespace operators {
template <typename P>
void BatchnormCompute(const BatchNormParam<CPU> &param) {
const Tensor *input_x = param.InputX();
auto input_x_ptr = input_x->data<float>();
const auto &x_dims = input_x->dims();
const int N = x_dims[0];
const int C = x_dims[1];
const int H = x_dims[2];
const int W = x_dims[3];
const int stride0 = C * H * W;
const int stride1 = H * W;
const int stride2 = W;
Tensor *out = param.OutputY();
auto out_ptr = out->mutable_data<float>();
const float epsilon = param.Epsilon();
const Tensor *mean = param.InputMean();
const Tensor *variance = param.InputVariance();
const Tensor *scale = param.InputScale();
const Tensor *bias = param.InputBias();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
// Tensor inv_std;
// auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
PADDLE_MOBILE_ENFORCE(C == variance->numel(),
"C must equal to variance.numel()");
int HXW = H * W;
#if __ARM_NEON
#if __aarch64__
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr = new_scale.mutable_data<float>(framework::make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
/// (x - est_mean) * inv_var * scale + bias is equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
const float *mean_ptr = param.InputMean()->data<float>();
const float *variance_ptr = param.InputVariance()->data<float>();
const float *scale_ptr = param.InputScale()->data<float>();
const float *bias_ptr = param.InputBias()->data<float>();
const framework::Tensor *input = param.InputX();
const float *input_ptr = input->data<float>();
framework::Tensor *output = param.OutputY();
float *output_ptr = output->mutable_data<float>();
size_t spatial_size = output->dims()[2] * output->dims()[3];
int channels = output->dims()[1];
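// Fold mean/variance/gamma/beta into a per-channel affine transform:
// y = x * (gamma / sqrt(var + eps)) + (beta - gamma * mean / sqrt(var + eps)).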
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < output->dims()[0]; ++batch) {
for (int c = 0; c < channels; ++c) {
float inv_scale = 1.f / (std::sqrt(variance_ptr[c] + epsilon));
float bias = bias_ptr[c] - inv_scale * scale_ptr[c] * mean_ptr[c];
float scale = inv_scale * scale_ptr[c];
size_t offset = (batch * channels + c) * spatial_size;
const float *x = input_ptr + offset;
float *y = output_ptr + offset;
size_t remain = spatial_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = spatial_size >> 4;
remain = spatial_size & 0xF;
float32x4_t __scale = vdupq_n_f32(scale);
float32x4_t __bias = vdupq_n_f32(bias);
for (int k = 0; k < loop; ++k, x += 16, y += 16) {
float32x4_t r0 = vld1q_f32(x);
float32x4_t r1 = vld1q_f32(x + 4);
float32x4_t r2 = vld1q_f32(x + 8);
float32x4_t r3 = vld1q_f32(x + 12);
r0 = vmlaq_f32(__bias, __scale, r0);
r1 = vmlaq_f32(__bias, __scale, r1);
r2 = vmlaq_f32(__bias, __scale, r2);
r3 = vmlaq_f32(__bias, __scale, r3);
vst1q_f32(y, r0);
vst1q_f32(y + 4, r1);
vst1q_f32(y + 8, r2);
vst1q_f32(y + 12, r3);
}
#endif // __ARM_NEON__
for (int k = 0; k < remain; ++k) {
y[k] = scale * x[k] + bias;
}
}
}
}
delete[] inv_std_ptr;
#else
if (HXW > 32) {
int NXC = N * C;
float *inv_std_ptr = new float[NXC * 4];
float *volatile new_scale_ptr = new float[NXC * 4];
float *volatile new_bias_ptr = new float[NXC * 4];
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for (int i = 0; i < C * 4; i += 4) {
int index = i / 4;
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[index] + epsilon), 0.5));
inv_std_ptr[i + 1] = inv_std_ptr[i];
inv_std_ptr[i + 2] = inv_std_ptr[i];
inv_std_ptr[i + 3] = inv_std_ptr[i];
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[index];
new_scale_ptr[i + 1] = new_scale_ptr[i];
new_scale_ptr[i + 2] = new_scale_ptr[i];
new_scale_ptr[i + 3] = new_scale_ptr[i];
new_bias_ptr[i] =
bias_ptr[index] - mean_ptr[index] * inv_std_ptr[i] * scale_ptr[index];
new_bias_ptr[i + 1] = new_bias_ptr[i];
new_bias_ptr[i + 2] = new_bias_ptr[i];
new_bias_ptr[i + 3] = new_bias_ptr[i];
}
for (int j = C * 4; j < NXC * 4; ++j) {
new_scale_ptr[j] = new_scale_ptr[j - C * 4];
new_bias_ptr[j] = new_bias_ptr[j - C * 4];
}
asm volatile(
"subs %[N], %[N], #1 \n\t"
"blt end_n_%= \n\t"
"loop_n_%=: \n\t"
"subs %[C], %[C], #1 \n\t"
"blt end_c_%= \n\t"
"loop_c_%=: \n\t"
"vld1.32 {q9}, [%[new_scale_ptr]]! \n\t"
"vld1.32 {q10}, [%[new_bias_ptr]]! \n\t"
"mov r6, %[HXW] \n\t"
"subs r6, r6, #32 \n\t"
"blt end_hw_%= \n\t"
"loop_hw_%=: \n\t"
"vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
"vmul.f32 q1, q1, q9 \n\t"
"vmul.f32 q2, q2, q9 \n\t"
"vmul.f32 q3, q3, q9 \n\t"
"vmul.f32 q4, q4, q9 \n\t"
"vmul.f32 q5, q5, q9 \n\t"
"vmul.f32 q6, q6, q9 \n\t"
"vmul.f32 q7, q7, q9 \n\t"
"vmul.f32 q8, q8, q9 \n\t"
"vadd.f32 q1, q1, q10 \n\t"
"vadd.f32 q2, q2, q10 \n\t"
"vadd.f32 q3, q3, q10 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vadd.f32 q5, q5, q10 \n\t"
"vadd.f32 q6, q6, q10 \n\t"
"vadd.f32 q7, q7, q10 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
"vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
"subs r6, r6, #32 \n\t"
"bge loop_hw_%= \n\t"
"end_hw_%=: \n\t"
"cmp r6, #0 \n\t"
"bge end_remainder_%= \n\t"
"mov r5, #4 \n\t"
"mul r6, r6, r5 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r6 \n\t"
"vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
"vmul.f32 q1, q1, q9 \n\t"
"vmul.f32 q2, q2, q9 \n\t"
"vmul.f32 q3, q3, q9 \n\t"
"vmul.f32 q4, q4, q9 \n\t"
"vmul.f32 q5, q5, q9 \n\t"
"vmul.f32 q6, q6, q9 \n\t"
"vmul.f32 q7, q7, q9 \n\t"
"vmul.f32 q8, q8, q9 \n\t"
"vadd.f32 q1, q1, q10 \n\t"
"vadd.f32 q2, q2, q10 \n\t"
"vadd.f32 q3, q3, q10 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vadd.f32 q5, q5, q10 \n\t"
"vadd.f32 q6, q6, q10 \n\t"
"vadd.f32 q7, q7, q10 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"add %[out_ptr], %[out_ptr], r6 \n\t"
"vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
"vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
"end_remainder_%=: \n\t"
"subs %[C], %[C], #1 \n\t"
"bge loop_c_%= \n\t"
"end_c_%=: \n\t"
"subs %[N], %[N], #1 \n\t"
"bge loop_n_%= \n\t"
"end_n_%=: \n\t"
:
: [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr),
[new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr),
[N] "r"(N), [C] "r"(C), [HXW] "r"(HXW)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "r5", "r6");
delete[] inv_std_ptr;
delete[] new_scale_ptr;
delete[] new_bias_ptr;
} else {
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr =
new_scale.mutable_data<float>(framework::make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
/// (x - est_mean) * inv_var * scale + bias is equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] =
bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
}
}
}
}
delete[] inv_std_ptr;
}
#endif
#else
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr = new_scale.mutable_data<float>(framework::make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
/// (x - est_mean) * inv_var * scale + bias is equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
}
}
}
}
delete[] inv_std_ptr;
#endif
}
} // namespace operators
......
......@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
......@@ -115,20 +116,7 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
Tensor bias1_slice = bias1_batch.Slice(g * out_step, (g + 1) * out_step);
float *biase_data1 = bias1_slice.data<float>();
// int n = bias1_slice.dims()[0];
// int m = bias1_slice.dims()[1];
// for(int i=0;i<n*m;i++){
// if(biase_data1[i]!=0)
// DLOG<<biase_data1[i]<<",yangfei";
// }
// math::matmul<float>(filter_slice, false, col_matrix,
// false,
// static_cast<float>(1),
// &out_slice,
// static_cast<float>(1), true,
// biase_data);
math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
p, mode, biase_data, biase_data1);
}
}
......@@ -137,4 +125,4 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // FUSION_CONVADDADDPRELU_OP
......@@ -25,6 +25,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
......@@ -106,7 +107,7 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
math::MatMul<float, float>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1), false, biase_data);
}
......
......@@ -25,6 +25,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
......@@ -105,12 +106,13 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> &param) {
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>(
filter_slice, false, col_matrix, false, static_cast<float>(1),
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
math::MatMulWithBn(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0), true, &new_scale, &new_bias, g);
}
}
}
template <typename P>
void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> &param) {
Tensor Bias;
......@@ -126,9 +128,6 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> &param) {
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
// math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
// param.Output(), param.NewScale(),
// param.NewBias(), 1);
math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
......
......@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
......@@ -30,8 +31,6 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
// DLOG<<"yangfei";
// DLOG<<bias.dims();
int axis = param.Axis();
Tensor *output = param.Output();
float *biase_data = bias.data<float>();
......@@ -112,13 +111,7 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
// math::matmul<float>(filter_slice, false, col_matrix,
// false,
// static_cast<float>(1),
// &out_slice,
// static_cast<float>(1), true,
// biase_data);
math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
p, mode, biase_data, nullptr);
}
}
......@@ -127,4 +120,4 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // FUSION_CONVADDPRELU_OP
......@@ -25,24 +25,18 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <typename P, typename S>
template <typename Itype, typename Otype>
void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
int32_t axis = param.Axis();
S *bias_data = bias.data<S>();
Otype *bias_data = bias.data<Otype>();
Tensor *output = param.Output();
output->mutable_data<P>();
output->mutable_data<Otype>();
float alpha = 1.0f;
float beta = 1.0f;
#ifdef FUSION_CONVADDRELU_INT8_OP
alpha = param.InputScale()->data<float>()[0];
beta = 0.0f;
#endif
int32_t groups = param.Groups();
std::vector<int32_t> strides = param.Strides();
std::vector<int32_t> paddings = param.Paddings();
......@@ -70,7 +64,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<P>(col_shape);
col.mutable_data<Itype>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
......@@ -89,8 +83,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, P> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, P> im2col;
math::Vol2ColFunctor<CPU, Itype> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Itype> im2col;
for (int32_t i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
......@@ -118,8 +112,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, false, col_matrix, false, alpha, &out_slice,
beta, true, bias_data);
math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false, alpha,
&out_slice, beta, true, bias_data);
}
}
}
......
......@@ -106,9 +106,10 @@ inline void GemmConv(const ConvParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(0),
false, static_cast<Otype *>(nullptr));
math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0), false,
static_cast<Otype *>(nullptr));
}
}
}
......@@ -116,7 +117,7 @@ inline void GemmConv(const ConvParam<CPU> &param) {
template <int tile, int kernel>
inline void WinogradConv3x3(const ConvParam<CPU> &param) {
const Tensor *input = param.Input();
const Tensor *filter = param.Filter();
const Tensor *filter = param.transformed_filter_;
Tensor *output = param.Output();
output->mutable_data<float>();
int batch_size = input->dims()[0];
......
......@@ -108,10 +108,10 @@ void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) {
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>(filter_slice, false, col_matrix, false,
math::MatMulWithBn(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(1), true, &new_scale,
&new_bias, g, bias_data.data<float>());
static_cast<float>(1), true, &new_scale, &new_bias, g,
bias_data.data<float>());
}
}
}
......
......@@ -107,9 +107,9 @@ void ConvBNReluBasic(const FusionConvBNReluParam<CPU> &param) {
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>(
filter_slice, false, col_matrix, false, static_cast<float>(1),
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
math::MatMulWithBn(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0), true, &new_scale, &new_bias, g);
}
}
}
......
......@@ -93,8 +93,8 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> &param) {
Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
math::matmul(filter_slice, true, in_slice, false, static_cast<P>(1.0),
&col_matrix, static_cast<P>(0.0));
math::MatMul<P, P>(filter_slice, true, in_slice, false,
static_cast<P>(1.0), &col_matrix, static_cast<P>(0.0));
if (data_dim == 2U) {
col2im(col, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
......
......@@ -106,9 +106,9 @@ void DWConvBNReluBasic(const FusionDWConvBNReluParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>(
filter_slice, false, col_matrix, false, static_cast<float>(1),
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
math::MatMulWithBn(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice,
static_cast<float>(0), true, &new_scale, &new_bias, g);
}
}
}
......
......@@ -26,18 +26,12 @@ namespace paddle_mobile {
namespace operators {
template <typename T>
struct AddFunctor {
inline T operator()(T a, T b) const { return a + b; }
};
template <typename P>
void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *Out = param.Out();
Out->mutable_data<float>();
inline void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
const framework::Tensor *input_x = param.InputX();
const framework::Tensor *input_y = param.InputY();
framework::Tensor *Out = param.Out();
int axis = param.Axis();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
const auto &x_dims = input_x->dims();
const auto &y_dims = input_y->dims();
/// axis = -1 represent the last dimensions.
......@@ -57,18 +51,20 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
const float *bias_data = input_y->data<float>();
const float *input_data = input_x->data<float>();
float *output_data = Out->mutable_data<float>();
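// Broadcast add: input_y holds one value per channel, which is added to every
// element of the matching (batch, channel) slice of input_x.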
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
#pragma omp parallel for
for (int j = 0; j < channels; ++j) {
size_t offset = (i * channels + j) * elementwise_num;
const float *input = input_data + offset;
const float *bias = bias_data + j;
const float bias = bias_data[j];
float *output = output_data + offset;
int remain = elementwise_num;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = elementwise_num >> 0x4;
int remain = elementwise_num & 0xF;
remain = elementwise_num & 0xF;
for (int k = 0; k < loop; ++k) {
float32x4_t rb = vdupq_n_f32(*bias);
float32x4_t rb = vdupq_n_f32(bias);
float32x4_t r0 = vld1q_f32(input);
float32x4_t r1 = vld1q_f32(input + 4);
float32x4_t r2 = vld1q_f32(input + 8);
......@@ -84,15 +80,12 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
input += 16;
output += 16;
}
#endif
for (int k = 0; k < remain; ++k) {
output[k] = input[k] + *bias;
output[k] = input[k] + bias;
}
}
}
#else
ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
AddFunctor<float>(), Out);
#endif
}
template class ElementwiseAddKernel<CPU, float>;
......
......@@ -23,20 +23,15 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <typename P, typename S>
template <typename Itype, typename Otype>
void FusionFcCompute(const FusionFcParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
Tensor *input_z = param.InputZ();
S *input_z_data = input_z->data<S>();
Otype *input_z_data = input_z->data<Otype>();
int axis = param.Axis();
Tensor *out = param.Out();
// int m = out->dims()[0];
// int n = out->dims()[1];
auto *out_data = out->mutable_data<P>();
float alpha = 1.0f;
float beta = 1.0f;
auto *out_data = out->mutable_data<Itype>();
const Tensor x_matrix =
input_x->dims().size() > 2
......@@ -57,28 +52,14 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) {
axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
if (std::is_same<P, int8_t>::value) {
#ifdef FUSION_FC_INT8_OP
alpha = param.InputScale()->data<float>()[0];
beta = 0.0f;
math::matmul(x_matrix, false, y_matrix, false, alpha, out, beta, false,
input_z_data, true);
#endif
} else {
// The dimension of bias_data matches the second dimension of out.
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data,
sizeof(float) * classes);
memory::Copy(out_data + i * classes, input_z_data, sizeof(Otype) * classes);
}
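// The GEMM below uses beta = 1, so it accumulates onto the bias rows that were
// just copied into the output.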
math::matmul<float>(x_matrix, false, y_matrix, false, alpha, out, beta,
math::MatMul<Itype, Otype>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out, static_cast<float>(1),
false);
}
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
} // namespace operators
......
......@@ -25,18 +25,16 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
template <typename DeviceType, typename T>
template <typename Device, typename T>
inline void ReorderInitState(const framework::Tensor& src,
std::vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceType, T> row_shuffle;
math::CopyMatrixRowsFunctor<Device, T> row_shuffle;
dst->mutable_data<T>(src.dims());
row_shuffle(src, index_lod, dst, indexed_src);
}
template <typename P>
template <typename T>
void GruCompute(const GruParam<CPU>& param) {
auto* input = param.InputInput();
auto* h0 = param.InputH0();
......@@ -57,8 +55,6 @@ void GruCompute(const GruParam<CPU>& param) {
bool is_reverse = param.IsReverse();
math::LoDTensor2BatchFunctor<CPU, float> to_batch;
to_batch(*input, batch_gate, true, is_reverse);
// math::ClearTensor<CPU, float> clearTensor;
// clearTensor(batch_gate);
if (bias) {
math::RowwiseAdd<CPU, float> add_bias;
add_bias(*batch_gate, *bias, batch_gate);
......@@ -68,7 +64,7 @@ void GruCompute(const GruParam<CPU>& param) {
gru_value.gate_weight = const_cast<float*>(weight_data);
gru_value.state_weight =
const_cast<float*>(weight_data + 2 * frame_size * frame_size);
Tensor ordered_h0;
framework::Tensor ordered_h0;
std::vector<size_t> order(batch_gate->lod()[2]);
if (h0) {
// Since the batch computing for GRU reorders the input sequences
......@@ -87,9 +83,10 @@ void GruCompute(const GruParam<CPU>& param) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
int cur_batch_size = bend - bstart;
Tensor gate_t = batch_gate->Slice(bstart, bend); // BUG
Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
Tensor hidden_t = batch_hidden->Slice(bstart, bend);
framework::Tensor gate_t = batch_gate->Slice(bstart, bend);
framework::Tensor reset_hidden_prev_t =
batch_reset_hidden_prev->Slice(bstart, bend);
framework::Tensor hidden_t = batch_hidden->Slice(bstart, bend);
gru_value.output_value = hidden_t.data<float>();
gru_value.gate_value = gate_t.data<float>();
gru_value.reset_output_value = reset_hidden_prev_t.data<float>();
......@@ -105,7 +102,6 @@ void GruCompute(const GruParam<CPU>& param) {
}
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // GRU_OP
......@@ -19,40 +19,6 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
// 1. If x and y are both 2-D,
//    x = [[1,2],  y = [[5,6],
//         [3,4]]       [7,8]]
// the result is an ordinary matrix product:
//    out = [[1*5+2*7, 1*6+2*8], [3*5+4*7, 3*6+4*8]]
//
// 2. If x or y has more than 2 dimensions, e.g. x has shape (2,3,4) and y has shape (4,1,2):
//    x = [[[1,2,3,4],
//          [2,3,4,5],
//          [3,4,5,6]],
//         [[1,2,3,4],
//          [2,3,4,5],
//          [3,4,5,6]]]
//    y = [[[1,2]],
//         [[3,4]],
//         [[5,6]],
//         [[7,8]]]
// x_num_col_dims and y_num_col_dims are needed to flatten x and y to 2-D.
// The model provides x_num_col_dims = 2 and y_num_col_dims = 1 (half-open ranges below).
// (1) For x = (2,3,4): multiply the dims in [0, x_num_col_dims), i.e. 2*3 = 6,
//     and the dims in [x_num_col_dims, xdim.size()), i.e. 4,
//     so the dims of Tensor x are rewritten as (6,4).
// (2) For y = (4,1,2): multiply the dims in [0, y_num_col_dims), i.e. 4,
//     and the dims in [y_num_col_dims, ydim.size()), i.e. 1*2 = 2,
//     so the dims of Tensor y are rewritten as (4,2).
// The in-memory layout of x and y is not affected.
//    x = [[1,2,3,4],           y = [[1,2],
//         [2,3,4,5],                [3,4],
//         [3,4,5,6],   matmul       [5,6],
//         [1,2,3,4],                [7,8]]
//         [2,3,4,5],
//         [3,4,5,6]]
// Multiplying x (6 rows, 4 cols) by y (4 rows, 2 cols) as in case 1 gives out (6 rows, 2 cols).
template <typename P>
void MulCompute(const MulParam<CPU> &param) {
const Tensor *input_x = param.InputX();
......@@ -73,14 +39,14 @@ void MulCompute(const MulParam<CPU> &param) {
}
if (param.InputX()->type() == typeid(int8_t)) {
out->mutable_data<int32_t>();
math::matmul<float, int32_t>(x_matrix, false, y_matrix, false,
math::MatMul<int8_t, int32_t>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out,
static_cast<float>(0));
} else {
out->mutable_data<float>();
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(0));
math::MatMul<float, float>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out,
static_cast<float>(0));
}
if (out_dim.size() != 2) {
out->Resize(out_dim);
......
......@@ -294,11 +294,6 @@ void MultiClassNMSCompute(const MultiClassNMSParam<CPU>& param) {
}
}
}
// framework::LoD lod;
// lod.emplace_back(batch_starts);
//
// outs->set_lod(lod);
}
} // namespace operators
......
......@@ -17,103 +17,53 @@ limitations under the License. */
#include <string>
#include <vector>
#include "common/types.h"
#include "operators/math/pooling.h"
namespace paddle_mobile {
namespace operators {
using framework::Tensor;
template <typename T, typename S>
void PoolBasic(std::string pooling_type, std::vector<int> ksize,
std::vector<int> strides, std::vector<int> paddings,
const Tensor *in_x, Tensor *out) {
if (pooling_type == "max") {
math::PoolFunctor<CPU, math::MaxPool<T>, T> pool2d_forward;
math::MaxPool<T> pool_process;
pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
} else if (pooling_type == "avg") {
math::PoolFunctor<CPU, math::AvgPool<T, S>, T> pool2d_forward;
math::AvgPool<T, S> pool_process;
pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
}
}
template <typename P>
void PoolCompute(const PoolParam<CPU> &param) {
const Tensor *in_x = param.Input();
Tensor *out = param.Output();
std::string pooling_type = param.PoolingType();
const framework::Tensor *input = param.Input();
framework::Tensor *output = param.Output();
const std::string &pooling_type = param.PoolingType();
std::vector<int> ksize = param.Ksize();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
if (ksize.size() != 2) {
LOG(paddle_mobile::LogLevel::kLOG_ERROR)
<< "Pool op only supports 2D and 3D input.";
}
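// Global pooling reduces each feature map to a single value, so the kernel size
// is reset to the full spatial extent and the paddings are cleared.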
if (param.isGlobalPooling()) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
ksize[i] = static_cast<int>(input->dims()[i + 2]);
}
}
if (in_x->type() == typeid(int8_t)) {
if (pooling_type == "max" && ksize[0] == 3 && ksize[0] == ksize[1]) {
if (strides[0] == strides[1] && strides[0] == 1) {
math::Pool3x3Maxs1_int8(in_x, out, paddings[0], paddings[1]);
} else if (strides[0] == strides[1] && strides[0] == 2) {
math::Pool3x3Maxs2_int8(in_x, out, paddings[0], paddings[1]);
if (ksize[0] == 3 && ksize[0] == ksize[1]) {
if (pooling_type == "max" && strides[0] == strides[1]) {
if (strides[0] == 1) {
math::Pooling3x3<MAX, 1>()(*input, paddings, output);
} else if (strides[0] == 2) {
math::Pooling3x3<MAX, 2>()(*input, paddings, output);
} else {
math::Pool3x3Max_int8(strides, paddings, in_x, out);
math::Pooling<MAX>()(*input, ksize, strides, paddings, output);
}
} else if (pooling_type == "avg" && strides[0] == strides[1]) {
if (strides[0] == 1) {
math::Pooling3x3<AVG, 1>()(*input, paddings, output);
} else if (strides[0] == 2) {
math::Pooling3x3<AVG, 2>()(*input, paddings, output);
} else {
PoolBasic<int8_t, int32_t>(pooling_type, ksize, strides, paddings, in_x,
out);
math::Pooling<AVG>()(*input, ksize, strides, paddings, output);
}
} else {
if (ksize[0] == 3 && ksize[0] == ksize[1]) {
if (pooling_type == "max") {
if (strides[0] == strides[1] && strides[0] == 1 &&
paddings[0] == paddings[1] && paddings[1] == 1) {
math::Pool3x3Maxs1p1(in_x, out);
} else {
math::Pool3x3Max(strides, paddings, in_x, out);
// Others
}
} else if (pooling_type == "avg") {
if (strides[0] == strides[1] && strides[0] == 1 &&
paddings[0] == paddings[1] && paddings[1] == 1) {
math::Pool3x3Avgs1p1(in_x, out);
} else {
math::Pool3x3Avg(strides, paddings, in_x, out);
}
}
} else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
strides[0] == strides[1] && paddings[0] == paddings[1] &&
paddings[1] == 0) {
#if __ARM_NEON
#if __aarch64__
PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
out);
#else
/// todo: fix bug in Pool2x2
if (pooling_type == "max") {
math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
math::Pooling<MAX>()(*input, ksize, strides, paddings, output);
} else if (pooling_type == "avg") {
math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
}
#endif
#else
PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
out);
#endif // __ARM_NEON
math::Pooling<AVG>()(*input, ksize, strides, paddings, output);
} else {
PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
out);
// Others
}
}
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef RELU_OP
#pragma once
#include <operators/math/transform.h>
#include "operators/op_param.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
template <typename T>
struct ReluFunctor {
inline T operator()(T in) const { return in > 0 ? in : 0; }
};
/*
* @b Platform-specific implementation; param is passed in from the op layer.
* */
template <typename P>
void ReluCompute(const ReluParam<CPU> &param) {
const auto *input_x = param.InputX();
auto *input_x_ptr = input_x->data<float>();
auto *out = param.Out();
auto *out_ptr = out->mutable_data<float>();
int numel = input_x->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#if __aarch64__
if (numel > 0) {
int loop = numel >> 0x4;
int remain = numel & 0xF;
float32x4_t zero = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t r0 = vld1q_f32(input_x_ptr);
float32x4_t r1 = vld1q_f32(input_x_ptr + 4);
float32x4_t r2 = vld1q_f32(input_x_ptr + 8);
float32x4_t r3 = vld1q_f32(input_x_ptr + 12);
r0 = vmaxq_f32(r0, zero);
r1 = vmaxq_f32(r1, zero);
r2 = vmaxq_f32(r2, zero);
r3 = vmaxq_f32(r3, zero);
vst1q_f32(out_ptr, r0);
vst1q_f32(out_ptr + 4, r1);
vst1q_f32(out_ptr + 8, r2);
vst1q_f32(out_ptr + 12, r3);
input_x_ptr += 16;
out_ptr += 16;
}
for (int i = 0; i < remain; ++i) {
out_ptr[i] = (input_x_ptr[i] > 0) * input_x_ptr[i];
}
#else
if (numel > 64) {
asm volatile(
"pld [%[input_x_ptr], #0] \n\t"
"vmov.f32 q8, #0.0 \n\t"
"subs %[num], %[num], #32 \n\t"
"blt end_num_%= \n\t"
"loop_num_%=: \n\t"
"pld [%[input_x_ptr], #1024] \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"subs %[num], %[num], #32 \n\t"
"bge loop_num_%= \n\t"
"end_num_%=: \n\t"
"cmp %[num], #0 \n\t"
"bge end_%= \n\t"
"mov r6, #4 \n\t"
"mul r5, %[num], r6 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"add %[out_ptr], %[out_ptr], r5 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"end_%=: \n\t"
:
:
[out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
"r6");
#endif
} else {
#endif
ReluFunctor<float> func_;
math::Transform trans;
trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
}
#endif
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -21,23 +21,6 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
// vector<int> pos;
// template <typename T>
// void TransposeFunc(const int numel, const T* input, const vector<int> axis,
// const vector<int> old_strides, const vector<int>
// new_strides, T* output) {
// for (int i = 0; i < numel; ++i) {
// int old_idx = 0;
// int idx = i;
// for (int j = 0; j < axis.size(); ++j) {
// int order = axis[j];
// old_idx += (idx / new_strides[j]) * old_strides[order];
// idx %= new_strides[j];
// }
// output[i] = input[old_idx];
// }
// }
template <typename P>
void TransposeCompute(const TransposeParam<CPU>& param) {
const auto* input_x = param.InputX();
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "operators/kernel/feed_kernel.h"
#include "framework/cl/cl_tensor.h"
namespace paddle_mobile {
namespace operators {
......@@ -43,7 +44,7 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
const int Stride2 = out_C * out_H * out_W;
const int Stride1 = out_H * out_W;
const int Stride0 = out_W;
CLTensor input_cl_tensor(this->cl_helper_.CLContext(),
framework::CLTensor input_cl_tensor(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
input_cl_tensor.Resize(input->dims());
cl_mem inputBuffer = input_cl_tensor.mutable_with_data<float>(input_data);
......
......@@ -94,27 +94,20 @@ void FusionFcCompute(const FusionFcParam<GPU_CL> &param, cl_context context,
memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
}
// for (int i = 0; i < out->numel(); i++) {
// DLOG << out_data[i];
// }
// The dimension of bias_data matches the dimension of out.
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(1), false);
math::MatMul<float, float>(x_matrix, false, y_matrix, false,
static_cast<float>(1), out, static_cast<float>(1),
false);
out_image->InitEmptyImage(context, commandQueue, out->dims());
framework::TensorToCLImage(out, out_image, context, commandQueue, kernel1);
DLOG << *out;
delete (input_x);
delete (input_y);
delete (input_z);
delete (out);
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
template <>
void FusionFcKernel<GPU_CL, float>::Compute(
const FusionFcParam<GPU_CL> &param) {
......
......@@ -12,31 +12,46 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionConvAddReluInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>> {
public:
FusionConvAddReluInt8Op(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
#define DECLARE_KERNEL(KernelClass, KernelParam) \
template <typename DeviceType, typename T> \
class KernelClass \
: public framework::OpKernelBase<DeviceType, KernelParam<DeviceType>> { \
public: \
bool Init(KernelParam<DeviceType> *param); \
void Compute(const KernelParam<DeviceType> &param); \
};
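// DECLARE_KERNEL expands to a standard OpKernelBase subclass exposing Init and
// Compute, so each fusion dequant kernel below only needs its device-specific
// specializations defined elsewhere.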
#ifdef FUSION_DEQUANT_BN_OP
DECLARE_KERNEL(FusionDequantBNKernel, FusionDequantBNParam);
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP
DECLARE_KERNEL(FusionDequantBNReluKernel, FusionDequantBNParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_OP
DECLARE_KERNEL(FusionDequantAddBNKernel, FusionDequantAddBNParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
DECLARE_KERNEL(FusionDequantAddBNReluKernel, FusionDequantAddBNParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
DECLARE_KERNEL(FusionDequantAddBNQuantKernel, FusionDequantAddBNQuantParam);
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP
DECLARE_KERNEL(FusionDequantAddBNReluQuantKernel, FusionDequantAddBNQuantParam);
#endif
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_CONVADDRELU_INT8_OP
......@@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
using namespace framework;
template <typename DeviceType, typename T>
class FeedKernel
: public framework::OpKernelBase<DeviceType, FeedParam<DeviceType>> {
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBN_OP
#include "operators/kernel/conv_bn_kernel.h"
#include <cmath>
namespace paddle_mobile {
namespace operators {
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#include "operators/kernel/conv_bn_relu_kernel.h"
#include <cmath>
namespace paddle_mobile {
namespace operators {
......
......@@ -54,11 +54,11 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
fpga::format_deconv_filter(filter, max_value, param->Groups(),
param->Strides()[0]);
// int element_num_per_div =
// fpga::get_filter_num_per_div(filter, param->Groups());
int element_num_per_div =
fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n);
// deconv only supports group = 1 && no split
fpga::format_bias_scale_array(&bs_ptr, channel * sub_conv_n,
//
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
channel * sub_conv_n);
fpga::format_fp16_ofm(out);
......
......@@ -55,11 +55,10 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
fpga::format_deconv_filter(filter, max_value, param->Groups(),
param->Strides()[0]);
// int element_num_per_div =
// fpga::get_filter_num_per_div(filter, param->Groups());
int element_num_per_div =
fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n);
// deconv only supports group = 1 && no split
fpga::format_bias_scale_array(&bs_ptr, channel * sub_conv_n,
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div,
channel * sub_conv_n);
fpga::format_fp16_ofm(out);
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#ifdef TRANSPOSE2_OP
#include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
namespace paddle_mobile {
namespace operators {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#define DECLARE_KERNEL(KernelClass, KernelParam) \
template <typename DeviceType, typename T> \
class KernelClass \
: public framework::OpKernelBase<DeviceType, KernelParam<DeviceType>> { \
public: \
bool Init(KernelParam<DeviceType> *param); \
void Compute(const KernelParam<DeviceType> &param); \
};
#ifdef TOP_K_OP
DECLARE_KERNEL(TopKKernel, TopKParam)
#endif // TOP_K_OP
#ifdef CAST_OP
DECLARE_KERNEL(CastKernel, CastParam)
#endif // CAST_OP
} // namespace operators
} // namespace paddle_mobile
......@@ -61,7 +61,7 @@ void FusionFcKernel<GPU_MALI, float>::Compute(
for (int i = 0; i < out->numel(); i++) {
DLOG << out_data[i];
}
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
math::MatMul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(1));
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) {
......
......@@ -44,7 +44,7 @@ void MulKernel<GPU_MALI, float>::Compute(const MulParam<GPU_MALI> &param) {
if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
math::MatMul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(0));
if (out_dim.size() != 2) {
out->Resize(out_dim);
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
......@@ -30,6 +29,15 @@ class ReluKernel
void Compute(const ReluParam<DeviceType>& param);
bool Init(ReluParam<DeviceType>* param);
};
template <typename DeviceType, typename T>
class Relu6Kernel
: public framework::OpKernelBase<DeviceType, ReluParam<DeviceType>> {
public:
void Compute(const ReluParam<DeviceType>& param);
bool Init(ReluParam<DeviceType>* param);
};
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -13,9 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <algorithm>
#include <cmath>
#include <string>
#include "common/enforce.h"
#include "common/types.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include "operators/math/math_func_neon.h"
#endif
namespace paddle_mobile {
namespace operators {
namespace math {
......@@ -24,68 +32,92 @@ namespace math {
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
enum ActivationType {
kSigmoid,
kReLU,
kTanh,
kIdentity,
};
inline ActivationType GetActivationType(const std::string &type) {
if (type == "sigmoid") {
return ActivationType::kSigmoid;
return ActivationType::SIGMOID;
} else if (type == "relu") {
return ActivationType::kReLU;
return ActivationType::RELU;
} else if (type == "tanh") {
return ActivationType::kTanh;
return ActivationType::TANH;
} else if (type == "identity" || type == "") {
return ActivationType::kIdentity;
return ActivationType::IDENTITY;
}
PADDLE_MOBILE_THROW_EXCEPTION("Unsupported activation type.");
}
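// Illustrative usage sketch (not part of this diff): the activation attribute
// string is mapped to the enum once, e.g.
//   ActivationType act = GetActivationType("relu");  // -> RELU
//   ActivationType gate = GetActivationType("");     // empty string -> IDENTITY
// any other string triggers PADDLE_MOBILE_THROW_EXCEPTION above.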
namespace forward {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <ActivationType Act = IDENTITY>
inline float32x4_t vActiveq_f32(const float32x4_t &x) {
return x;
}
template <typename T>
T Identity(const T a) {
return a;
template <>
inline float32x4_t vActiveq_f32<RELU>(const float32x4_t &x) {
float32x4_t __zero = vdupq_n_f32(0.f);
return vmaxq_f32(x, __zero);
}
template <typename T>
T Relu(const T a) {
return a > static_cast<T>(0.0) ? a : static_cast<T>(0.0);
template <>
inline float32x4_t vActiveq_f32<RELU6>(const float32x4_t &x) {
float32x4_t __zero = vdupq_n_f32(0.f);
float32x4_t __six = vdupq_n_f32(6.f);
return vminq_f32(vmaxq_f32(x, __zero), __six);
}
template <typename T>
T Sigmoid(const T a) {
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
T tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<T>(1.0) / (static_cast<T>(1.0) + exp(-tmp));
template <>
inline float32x4_t vActiveq_f32<SIGMOID>(const float32x4_t &x) {
float32x4_t __one = vdupq_n_f32(1.f);
float32x4_t __x = vnegq_f32(x);
__x = exp_ps(__x);
__x = vaddq_f32(__x, __one);
float32x4_t __out = vrecpeq_f32(__x);
return vmulq_f32(vrecpsq_f32(__x, __out), __out);
}
template <typename T>
T Tanh(const T a) {
T tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
template <>
inline float32x4_t vActiveq_f32<TANH>(const float32x4_t &x) {
float32x4_t __one = vdupq_n_f32(1.f);
float32x4_t __x = vnegq_f32(x);
__x = vmulq_n_f32(__x, 2.f);
__x = exp_ps(__x);
__x = vaddq_f32(__x, __one);
float32x4_t __out = vrecpeq_f32(__x);
__out = vmulq_f32(vrecpsq_f32(__x, __out), __out);
__out = vmulq_n_f32(__out, 2.f);
return vsubq_f32(__out, __one);
}
#endif
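// Note on the NEON specializations above (explanatory, not part of this diff):
// vrecpeq_f32 only returns a coarse reciprocal estimate, and
//   out = vmulq_f32(vrecpsq_f32(x, out), out)
// performs one Newton-Raphson refinement, because vrecpsq_f32(x, out) computes
// 2 - x * out, so the product is out * (2 - x * out). Both the SIGMOID and TANH
// paths use this to evaluate the 1 / (1 + exp(.)) term without a division.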
} // namespace forward
template <ActivationType Act = IDENTITY>
inline float Active(const float &x) {
return x;
}
template <typename T>
struct Active {
typedef T (*Act)(T);
};
template <>
inline float Active<RELU>(const float &x) {
return std::max(x, 0.f);
}
static Active<float>::Act kActFloat[] = {
&forward::Sigmoid<float>, &forward::Relu<float>, &forward::Tanh<float>,
&forward::Identity<float>};
template <>
inline float Active<RELU6>(const float &x) {
return std::min(std::max(x, 0.f), 6.f);
}
namespace forward {
inline float activation(float a, int index) { return kActFloat[index](a); }
template <>
inline float Active<SIGMOID>(const float &x) {
// float tmp = x > SIGMOID_THRESHOLD_MAX ? SIGMOID_THRESHOLD_MAX : x;
// tmp = x > SIGMOID_THRESHOLD_MIN ? x : SIGMOID_THRESHOLD_MIN;
// return 1.f / (1.f + exp(-tmp));
return 1.f / (1.f + exp(-x));
}
} // namespace forward
template <>
inline float Active<TANH>(const float &x) {
// float tmp = -2.f * x;
// tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
// return (2.f / (1.f + exp(tmp))) - 1.f;
return 2.f / (1.f + exp(-2.f * x)) - 1.f;
}
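// Explanatory note (not part of this diff): the scalar TANH path relies on the
// identity tanh(x) = 2 / (1 + exp(-2x)) - 1 = 2 * sigmoid(2x) - 1, so only one
// exp() is needed per element; the commented-out lines kept the older clamping
// of the exp() argument against EXP_MAX_INPUT.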
} // namespace math
} // namespace operators
......
......@@ -1260,10 +1260,10 @@ void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
"q10", "q11", "q12", "q13");
}
/*
void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A, int
lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu) { float
*bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu) {
float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
const float *a0, *b0, *b1, *b2, *b3;
float *c0, *C0;
......@@ -1482,6 +1482,7 @@ lda, const float *B, int ldb, float beta, float *C, int ldc, bool relu) { float
}
}
/*
void Gemm::VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias) {
......@@ -2579,9 +2580,8 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
}
}
/*
// C = A * B
void Gemm::VecWriteBasic(int n, float *c, float *C, int ldc) {
// C = A * B
void Gemm::VecWriteBasic(int n, float *c, float *C, int ldc) {
int nc1 = n / 16;
int _nc1 = n % 16;
int nc2 = _nc1 / 4;
......@@ -2624,13 +2624,13 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
:
: [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5");
}
}
// C = alpha * A * B + beta * C
void Gemm::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
// C = alpha * A * B + beta * C
void Gemm::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}
// C = A * B + C
void Gemm::VecWriteWithAdd(int n, float *c, float *C, int ldc) {
// C = A * B + C
void Gemm::VecWriteWithAdd(int n, float *c, float *C, int ldc) {
int nc1 = n / 16;
int _nc1 = n % 16;
......@@ -2657,18 +2657,18 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
: [C] "+r"(C), [c] "+r"(c)
: [nc1] "r"(nc1)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10",
"q11", "q12", "q13");
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
"q12", "q13");
if (_nc1 != 0) {
for (int j = 0; j < _nc1; j++) {
*C++ += *c++;
}
}
}
}
// C = A * B + C, relu(C)
void Gemm::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
// C = A * B + C, relu(C)
void Gemm::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
int nc1 = n / 16;
int _nc1 = n % 16;
......@@ -2700,8 +2700,8 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
: [C] "+r"(C), [c] "+r"(c)
: [nc1] "r"(nc1)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10",
"q11", "q12", "q13");
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
"q12", "q13");
if (_nc1 != 0) {
for (int j = 0; j < _nc1; j++) {
......@@ -2713,8 +2713,9 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
c++;
}
}
}
}
/*
// C = A * B, batchnorm(C)
void Gemm::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
float *bias) {
......@@ -3149,13 +3150,18 @@ void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *bias) {
if (m == 1 && bias == nullptr) {
return VectorKernel(m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, relu);
}
#ifdef _OPENMP
int max_threads = omp_get_max_threads();
#else
int max_threads = 1;
#endif
int L1 = 64 / max_threads * 1024;
// int L1 = 64 / max_threads * 1024;
int L = (max_threads > 2) ? 64 : 32;
int L1 = L / max_threads * 1024;
KC = k;
zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
......
......@@ -105,12 +105,11 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
/*
// vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu);
/*
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float
*C, int ldc, bool relu, float *new_scale, float *new_bias);
......@@ -149,7 +148,6 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1);
/*
// write-back of vector-matrix multiplication results
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
......@@ -159,12 +157,13 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
/*
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float
*new_scale, float *new_bias);
*/
// 32-bit float matrix multiplication
......@@ -392,7 +391,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__
// TODO()
// TODO(paddle mobile)
#else
PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8);
#endif
......@@ -414,7 +413,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
#if __aarch64__
// TODO()
// TODO(paddle mobile)
#else
PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8);
#endif
......@@ -438,7 +437,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// TODO()
// TODO(paddle mobile)
#else
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A);
#endif
......@@ -468,7 +467,7 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// TODO()
// TODO(paddle mobile)
#else
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B);
#endif
......
......@@ -11,13 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRU_OP
#include "operators/math/gru_compute.h"
#include "common/types.h"
#include "operators/math/activation_functions.h"
#include "operators/math/activation.h"
#include "operators/math/gemm.h"
#include "operators/math/gru_cpu_kernel.h"
#include "operators/math/gru_kernel.h"
namespace paddle_mobile {
namespace operators {
......@@ -43,8 +44,7 @@ struct GRUUnitFunctor<CPU, T> {
#endif
}
forward_reset_output(forward::gru_resetOutput<T>(), value, frame_size,
batch_size, active_gate);
forward_reset_output(value, frame_size, batch_size, active_gate);
if (value.prev_out_value) {
#ifdef _OPENMP
......@@ -60,8 +60,7 @@ struct GRUUnitFunctor<CPU, T> {
#endif
}
forward_final_output(forward::gru_finalOutput<T>(), value, frame_size,
batch_size, active_node);
forward_final_output(value, frame_size, batch_size, active_node);
}
};
......
......@@ -11,7 +11,7 @@ limitations under the License. */
#ifdef GRU_OP
#pragma once
#include "operators/math/activation_functions.h"
#include "operators/math/activation.h"
namespace paddle_mobile {
namespace operators {
......
......@@ -11,21 +11,22 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRU_OP
#pragma once
#include <type_traits>
#include "operators/math/activation_functions.h"
#include "operators/math/activation.h"
#include "operators/math/gru_compute.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <class OpResetOutput, typename T>
void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
T *gate_value, T *reset_output_value,
T *prev_output_value, int frame_size,
ActivationType active_gate) {
template <typename T, ActivationType Act>
void hl_naive_gru_forward_reset_output(T *gate_value, T *reset_output_value,
T *prev_output_value, int frame_size) {
T r_value_update_gate;
T r_value_reset_gate;
T r_value_reset_output;
......@@ -33,27 +34,57 @@ void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
T *update_gate = gate_value;
T *reset_gate = gate_value + frame_size;
for (int i = 0; i < frame_size; i++) {
int remain = frame_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = remain >> 3;
remain = remain & 0x7;
float32x4_t prev0 = vdupq_n_f32(0.f);
float32x4_t prev1 = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t update0 = vld1q_f32(update_gate);
float32x4_t update1 = vld1q_f32(update_gate + 4);
float32x4_t reset0 = vld1q_f32(reset_gate);
float32x4_t reset1 = vld1q_f32(reset_gate + 4);
if (prev_output_value) {
prev0 = vld1q_f32(prev_output_value);
prev1 = vld1q_f32(prev_output_value + 4);
prev_output_value += 8;
}
update0 = vActiveq_f32<Act>(update0);
update1 = vActiveq_f32<Act>(update1);
reset0 = vActiveq_f32<Act>(reset0);
reset1 = vActiveq_f32<Act>(reset1);
float32x4_t output0 = vmulq_f32(prev0, reset0);
float32x4_t output1 = vmulq_f32(prev1, reset1);
vst1q_f32(update_gate, update0);
vst1q_f32(update_gate + 4, update1);
vst1q_f32(reset_gate, reset0);
vst1q_f32(reset_gate + 4, reset1);
vst1q_f32(reset_output_value, output0);
vst1q_f32(reset_output_value + 4, output1);
update_gate += 8;
reset_gate += 8;
reset_output_value += 8;
}
#endif // __ARM_NEON__
for (int i = 0; i < remain; i++) {
r_value_update_gate = update_gate[i];
r_value_reset_gate = reset_gate[i];
if (prev_output_value) {
r_prev_out = prev_output_value[i];
}
op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
&r_value_reset_output, active_gate);
r_value_update_gate = Active<Act>(r_value_update_gate);
r_value_reset_gate = Active<Act>(r_value_reset_gate);
r_value_reset_output = r_prev_out * r_value_reset_gate;
update_gate[i] = r_value_update_gate;
reset_gate[i] = r_value_reset_gate;
reset_output_value[i] = r_value_reset_output;
}
}
template <class OpFinalOutput, typename T>
void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
T *gate_value, T *prev_output_value,
T *output_value, int frame_size,
ActivationType active_node) {
template <typename T, ActivationType Act>
void hl_naive_gru_forward_final_output(T *gate_value, T *prev_output_value,
T *output_value, int frame_size) {
T r_value_update_gate;
T r_value_frame_state;
T r_prev_out = 0;
......@@ -61,30 +92,73 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
T *update_gate = gate_value;
T *frame_state = gate_value + frame_size * 2;
for (int i = 0; i < frame_size; i++) {
int remain = frame_size;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
int loop = remain >> 3;
remain = remain & 0x7;
float32x4_t prev0 = vdupq_n_f32(0.f);
float32x4_t prev1 = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i) {
float32x4_t update0 = vld1q_f32(update_gate);
float32x4_t update1 = vld1q_f32(update_gate + 4);
float32x4_t state0 = vld1q_f32(frame_state);
float32x4_t state1 = vld1q_f32(frame_state + 4);
if (prev_output_value) {
prev0 = vld1q_f32(prev_output_value);
prev1 = vld1q_f32(prev_output_value + 4);
prev_output_value += 8;
}
state0 = vActiveq_f32<Act>(state0);
state1 = vActiveq_f32<Act>(state1);
float32x4_t output0 = vmlsq_f32(prev0, update0, prev0);
float32x4_t output1 = vmlsq_f32(prev1, update1, prev1);
output0 = vmlaq_f32(output0, update0, state0);
output1 = vmlaq_f32(output1, update1, state1);
vst1q_f32(frame_state, state0);
vst1q_f32(frame_state + 4, state1);
vst1q_f32(output_value, output0);
vst1q_f32(output_value + 4, output1);
update_gate += 8;
frame_state += 8;
output_value += 8;
}
#endif // __ARM_NEON__
for (int i = 0; i < remain; i++) {
r_value_update_gate = update_gate[i];
r_value_frame_state = frame_state[i];
if (prev_output_value) {
r_prev_out = prev_output_value[i];
}
op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
&r_output, active_node);
r_value_frame_state = Active<Act>(r_value_frame_state);
r_output = r_prev_out - r_value_update_gate * r_prev_out +
r_value_update_gate * r_value_frame_state;
frame_state[i] = r_value_frame_state;
output_value[i] = r_output;
}
}
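// Explanatory note (not part of this diff): both the NEON loop and the scalar
// tail above implement the usual GRU update. With the gates already activated
// in the reset phase (update = act_gate(update), reset = act_gate(reset),
// reset_output = prev_out * reset), the final phase computes
//   state  = act_node(frame_state)
//   output = (1 - update) * prev_out + update * state
// where vmlsq_f32(prev, update, prev) yields prev - update * prev and
// vmlaq_f32 accumulates the update * state term.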
template <class OpResetOutput, typename T>
inline void forward_reset_output(OpResetOutput op_reset_output,
GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_gate) {
for (int b = 0; b < batch_size; b++) {
hl_naive_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate);
#define FORWARD_RESET_OUTPUT(active_type, value, frame_size) \
hl_naive_gru_forward_reset_output<float, active_type>( \
value.gate_value, value.reset_output_value, value.prev_out_value, \
frame_size);
template <typename T>
inline void forward_reset_output(GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; ++b) {
switch (active_node) {
case RELU:
FORWARD_RESET_OUTPUT(RELU, value, frame_size);
break;
case SIGMOID:
FORWARD_RESET_OUTPUT(SIGMOID, value, frame_size);
break;
case TANH:
FORWARD_RESET_OUTPUT(TANH, value, frame_size);
break;
default:
FORWARD_RESET_OUTPUT(IDENTITY, value, frame_size);
}
value.gate_value += frame_size * 3;
value.reset_output_value += frame_size;
if (value.prev_out_value) {
......@@ -93,15 +167,27 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
}
}
template <class OpFinalOutput, typename T>
inline void forward_final_output(OpFinalOutput op_final_output,
GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; b++) {
hl_naive_gru_forward_final_output(op_final_output, value.gate_value,
value.prev_out_value, value.output_value,
frame_size, active_node);
#define FORWARD_FINAL_OUTPUT(active_type, value, frame_size) \
hl_naive_gru_forward_final_output<float, active_type>( \
value.gate_value, value.prev_out_value, value.output_value, frame_size)
template <typename T>
inline void forward_final_output(GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; ++b) {
switch (active_node) {
case RELU:
FORWARD_FINAL_OUTPUT(RELU, value, frame_size);
break;
case SIGMOID:
FORWARD_FINAL_OUTPUT(SIGMOID, value, frame_size);
break;
case TANH:
FORWARD_FINAL_OUTPUT(TANH, value, frame_size);
break;
default:
FORWARD_FINAL_OUTPUT(IDENTITY, value, frame_size);
}
value.gate_value += frame_size * 3;
value.output_value += frame_size;
if (value.prev_out_value) {
......@@ -113,4 +199,5 @@ inline void forward_final_output(OpFinalOutput op_final_output,
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRU_OP
#pragma once
#include <type_traits>
#include "operators/math/activation_functions.h"
namespace paddle_mobile {
namespace operators {
namespace math {
namespace forward {
template <typename T>
class gru_resetOutput {
public:
void operator()(T *value_update_gate, T *value_reset_gate, T *prev_out,
T *value_reset_output, ActivationType act_gate) {
*value_update_gate = activation(*value_update_gate, act_gate);
*value_reset_gate = activation(*value_reset_gate, act_gate);
*value_reset_output = (*prev_out) * (*value_reset_gate);
}
};
template <typename T>
class gru_finalOutput {
public:
void operator()(T *value_update_gate, T *value_frame_state, T *prev_out,
T *value_output, ActivationType act_input) {
*value_frame_state = activation(*value_frame_state, act_input);
*value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
((*value_update_gate) * (*value_frame_state));
}
};
} // namespace forward
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -38,7 +38,11 @@ limitations under the License. */
*
* (this is the zlib license)
*/
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#pragma once
#include <arm_neon.h>
#define c_inv_mant_mask ~0x7f800000u
......@@ -316,11 +320,11 @@ static inline float32x4_t cos_ps(float32x4_t x) {
static inline float32x4_t div_ps(float32x4_t a, float32x4_t b) {
float32x4_t reciprocal = vrecpeq_f32(b);
reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
// reciprocal = vmulq_f32(vrecpsq_f32(b, reciprocal), reciprocal);
return vmulq_f32(a, reciprocal);
}
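// Note on div_ps above (explanatory, not part of this diff): with the second
// vrecpsq_f32 refinement left commented out, the reciprocal keeps a single
// Newton-Raphson step. The vrecpeq_f32 estimate is only accurate to roughly
// 8 bits and each refinement step roughly doubles that, so this trades some
// precision of a / b for one fewer multiply per vector.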
static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) {
// pow(x, m) = exp(m * log(x))
return exp_ps(vmulq_f32(b, log_ps(a)));
}
#endif // __ARM_NEON__
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/math_function.h"
#include <cstring>
#include <string>
#include "common/enforce.h"
#include "framework/data_type.h"
#include "framework/tensor.h"
#include "operators/math/gemm.h"
......@@ -35,35 +35,34 @@ struct TensorSetConstant {
float value_;
};
void set_constant(framework::Tensor *tensor, float value) {
void SetConstant(framework::Tensor *tensor, float value) {
framework::VisitDataType(framework::ToDataType(tensor->type()),
TensorSetConstant(tensor, value));
}
template <>
void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu,
float *bias) {
void MatMul<float, float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out,
float beta, bool relu, float *bias) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_MOBILE_ENFORCE(
dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
"The input and output of MatMul be matrix");
int M = dim_out[0];
int N = dim_out[1];
int K = (!trans_a) ? dim_a[1] : dim_a[0];
Gemm gemm;
if (trans_a) {
framework::Tensor matrix_trans;
int numel = matrix_a.numel();
int m = matrix_a.dims()[0];
int n = matrix_a.dims()[1];
float *tmp = (float *)(matrix_a.data<float>()); // NOLINT
float *a = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * numel));
float *a = matrix_trans.mutable_data<float>(matrix_a.dims());
int index = 0;
for (int j = 0; j < n; j++) {
for (int i = 0; i < m; i++) {
......@@ -72,7 +71,6 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
}
#ifdef _OPENMP
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<float>(), N, beta,
matrix_out->data<float>(), N, relu, bias);
#else
......@@ -92,19 +90,18 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
}
}
template <>
void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out, float beta,
bool relu, framework::Tensor *new_scale,
framework::Tensor *new_bias, int group, float *bias) {
void MatMulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu,
framework::Tensor *new_scale, framework::Tensor *new_bias,
int group, float *bias) {
Gemm gemm;
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_MOBILE_ENFORCE(
dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
"The input and output of MatMul be matrix");
int M = dim_out[0];
int N = dim_out[1];
......@@ -122,7 +119,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
new_bias->data<float>() + group, bias);
#endif
}
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
void MatMulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
framework::Tensor *matrix_out, float *p, std::string mode,
float *bias, float *bias1) {
......@@ -132,7 +129,7 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
auto dim_out = matrix_out->dims();
PADDLE_MOBILE_ENFORCE(
dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
"The input and output of MatMul be matrix");
int M = dim_out[0];
int N = dim_out[1];
......@@ -146,7 +143,6 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
gemm.SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K,
matrix_b.data<float>(), N, matrix_out->data<float>(), N,
p, mode, bias, bias1);
#endif
}
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once
#include <cmath>
#include <string>
#include "framework/tensor.h"
......@@ -22,37 +21,37 @@ namespace paddle_mobile {
namespace operators {
namespace math {
void set_constant(framework::Tensor *tensor, float value);
void SetConstant(framework::Tensor *tensor, float value);
template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false,
float *bias = nullptr);
template <typename Itype, typename Otype>
void MatMul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu = false,
Otype *bias = nullptr);
template <typename T, typename S>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false,
S *bias = nullptr, bool addOnRow = false);
template <typename Itype, typename Otype>
void MatMul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu, Otype *bias,
bool addOnRow);
template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu,
void MatMulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu,
framework::Tensor *new_scale, framework::Tensor *new_bias,
int group, float *bias = nullptr);
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
void MatMulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
framework::Tensor *matrix_out, float *p, std::string mode,
float *bias, float *bias1);
template <typename DeviceType, typename T>
template <typename Device, typename T>
struct ClearTensor {
void operator()(framework::Tensor *tensor);
};
template <typename DeviceType, typename T>
template <typename Device, typename T>
struct RowwiseAdd {
void operator()(const framework::Tensor &input, const framework::Tensor &vec,
framework::Tensor *output);
......
......@@ -22,16 +22,17 @@ namespace operators {
namespace math {
template <>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu, int32_t *bias,
void MatMul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out,
float beta, bool relu, int32_t *bias,
bool addOnRow) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_MOBILE_ENFORCE(
dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
"The input and output of MatMul be matrix");
int32_t M = dim_out[0];
int32_t N = dim_out[1];
......@@ -93,6 +94,16 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
#endif
}
}
template <>
void MatMul<int8_t, int32_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out,
float beta, bool relu, int32_t *bias) {
MatMul<int8_t, int32_t>(matrix_a, trans_a, matrix_b, trans_b, alpha,
matrix_out, beta, relu, bias, false);
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "operators/math/pool_2x2.h"
#include <algorithm>
#include <vector>
namespace paddle_mobile {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
const Tensor *input, Tensor *output) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = 2;
const int ksize_width = 2;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int w1 = input_width / 16;
int _w1 = input_width % 16;
int w2 = _w1 / 4;
int _w2 = _w1 % 4;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < input_height; ph += 2) {
const float *in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width;
const float *in_ptr2 = in_ptr1 + input_width;
if (ph != input_height && ph + 1 >= input_height) {
in_ptr2 = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * input_width));
memset(static_cast<void *>(const_cast<float *>(in_ptr2)), -FLT_MAX,
sizeof(float) * input_width);
}
float *out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width;
#if __ARM_NEON
#if __aarch64__
#else
asm volatile(
"subs %[w1], %[w1], #1 \n\t"
"blt end_w1_%= \n\t"
"loop_w1_%=: \n\t"
"pld [%[in_ptr1], #64] \n\t"
"pld [%[in_ptr2], #64] \n\t"
"vld1.f32 {q0, q1}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q2, q3}, [%[in_ptr2]]! \n\t"
"vld1.f32 {q6, q7}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q8, q9}, [%[in_ptr2]]! \n\t"
"vmax.f32 q0, q0, q2 \n\t"
"vmax.f32 q1, q1, q3 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q9 \n\t"
"vpmax.f32 d8, d0, d1 \n\t"
"vpmax.f32 d9, d2, d3 \n\t"
"vpmax.f32 d10, d12, d13 \n\t"
"vpmax.f32 d11, d14, d15 \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"subs %[w1], %[w1], #1 \n\t"
"bge loop_w1_%= \n\t"
"end_w1_%=: \n\t"
"subs %[w2], %[w2], #1 \n\t"
"blt end_w2_%= \n\t"
"loop_w2_%=: \n\t"
"vld1.f32 {q0}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q1}, [%[in_ptr2]]! \n\t"
"vmax.f32 q0, q0, q1 \n\t"
"vpmax.f32 d4, d0, d1 \n\t"
"vst1.32 {d4}, [%[out_ptr]]! \n\t"
"subs %[w2], %[w2], #1 \n\t"
"bge loop_w2_%= \n\t"
"end_w2_%=: \n\t"
:
: [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
[in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9");
#endif
#endif
if (_w2 != 0) {
in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width + 16 * w1 +
4 * w2;
in_ptr2 = in_ptr1 + input_width;
out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width + 8 * w1 +
2 * w2;
if (_w2 == 1) {
*out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
} else if (_w2 == 2) {
float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
in_ptr1++;
in_ptr2++;
float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
*out_ptr = (temp > temp1) ? temp : temp1;
} else if (_w2 == 3) {
float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
in_ptr1++;
in_ptr2++;
float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
in_ptr1++;
in_ptr2++;
*out_ptr = (temp > temp1) ? temp : temp1;
out_ptr++;
*out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
}
}
}
}
}
}
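// Illustrative reference sketch (not part of this diff; Pool2x2MaxRef is a
// hypothetical helper): a plain scalar 2x2, stride-2 max pooling over one
// channel, assuming even height/width and no padding, useful for cross-checking
// the vectorised path above on small inputs.
//
//   static void Pool2x2MaxRef(const float *in, int h, int w, float *out) {
//     for (int ph = 0; ph < h / 2; ++ph) {
//       for (int pw = 0; pw < w / 2; ++pw) {
//         const float *p = in + 2 * ph * w + 2 * pw;
//         out[ph * (w / 2) + pw] =
//             std::max(std::max(p[0], p[1]), std::max(p[w], p[w + 1]));
//       }
//     }
//   }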
void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
const Tensor *input, Tensor *output) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = 2;
const int ksize_width = 2;
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
int w1 = input_width / 16;
int _w1 = input_width % 16;
int w2 = _w1 / 4;
int _w2 = _w1 % 4;
float quarter = 0.25;
for (int i = 0; i < batch_size; ++i) {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < input_height; ph += 2) {
const float *in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width;
const float *in_ptr2 = in_ptr1 + input_width;
if (ph + 1 >= input_height) {
in_ptr2 = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * input_width));
memset(static_cast<void *>(const_cast<float *>(in_ptr2)), 0,
sizeof(float) * input_width);
}
float *out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width;
#if __ARM_NEON
#if __aarch64__
#else
asm volatile(
"subs %[w1], %[w1], #1 \n\t"
"blt end_w1_%= \n\t"
"loop_w1_%=: \n\t"
"pld [%[in_ptr1], #64] \n\t"
"pld [%[in_ptr2], #64] \n\t"
"vmov.f32 d0[0], %[quarter] \n\t"
"vld1.f32 {q1, q2}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q3, q4}, [%[in_ptr2]]! \n\t"
"vld1.f32 {q7, q8}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q9, q10}, [%[in_ptr2]]! \n\t"
"vadd.f32 q1, q1, q3 \n\t"
"vadd.f32 q2, q2, q4 \n\t"
"vadd.f32 q7, q7, q9 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"vpadd.f32 d10, d2, d3 \n\t"
"vpadd.f32 d11, d4, d5 \n\t"
"vpadd.f32 d12, d14, d15 \n\t"
"vpadd.f32 d13, d16, d17 \n\t"
"vmul.f32 q5, q5, d0[0] \n\t"
"vmul.f32 q6, q6, d0[0] \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"subs %[w1], %[w1], #1 \n\t"
"bge loop_w1_%= \n\t"
"end_w1_%=: \n\t"
"subs %[w2], %[w2], #1 \n\t"
"blt end_w2_%= \n\t"
"loop_w2_%=: \n\t"
"vld1.f32 {q1}, [%[in_ptr1]]! \n\t"
"vld1.f32 {q2}, [%[in_ptr2]]! \n\t"
"vadd.f32 q1, q1, q2 \n\t"
"vpadd.f32 d4, d2, d3 \n\t"
"vmul.f32 d4, d4, d0[0] \n\t"
"vst1.32 {d4}, [%[out_ptr]]! \n\t"
"subs %[w2], %[w2], #1 \n\t"
"bge loop_w2_%= \n\t"
"end_w2_%=: \n\t"
:
: [w1] "r"(w1), [w2] "r"(w2), [in_ptr1] "r"(in_ptr1),
[in_ptr2] "r"(in_ptr2), [out_ptr] "r"(out_ptr),
[quarter] "r"(quarter)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9", "q10");
#endif
#endif
if (_w2 != 0) {
in_ptr1 = input_data + i * input_batch_stride +
c * input_channel_stride + ph * input_width + 16 * w1 +
4 * w2;
in_ptr2 = in_ptr1 + input_width;
out_ptr = output_data + i * output_batch_stride +
c * output_channel_stride + ph / 2 * output_width + 8 * w1 +
2 * w2;
if (_w2 == 1) {
*out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
} else if (_w2 == 2) {
float temp = 0;
temp += *in_ptr1;
temp += *in_ptr2;
in_ptr1++;
in_ptr2++;
temp += *in_ptr1;
temp += *in_ptr2;
*out_ptr = 0.25 * temp;
} else if (_w2 == 3) {
float temp = 0;
temp += *in_ptr1++;
temp += *in_ptr2++;
temp += *in_ptr1++;
temp += *in_ptr2++;
*out_ptr = 0.25 * temp;
out_ptr++;
*out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
}
}
}
}
}
}
//}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/tensor.h"
#include "operators/math/pool_3x3.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include <climits>
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::max;
using std::min;
using std::vector;
void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
#if __ARM_NEON
const int batch_size = static_cast<int>(input->dims()[0]);
const int input_channel = static_cast<int>(input->dims()[1]);
const int input_height = static_cast<int>(input->dims()[2]);
const int input_width = static_cast<int>(input->dims()[3]);
const int output_height = static_cast<int>(output->dims()[2]);
const int output_width = static_cast<int>(output->dims()[3]);
output->mutable_data<float>();
const int hxw = input_height * input_width;
const int l = input_height;
const float coef = 1.0 / 9.0;
const float coef1 = 1.0 / 6.0;
const float coef2 = 1.0 / 4.0;
float32x4_t v_coef = vdupq_n_f32(coef);
float32x4_t v_coef1 = vdupq_n_f32(coef1);
for (int b = 0; b < batch_size; b++) {
#pragma omp parallel for
for (int c = 0; c < input_channel; c++) {
const float *input_data = input->data<float>() + c * hxw;
float *output_data = output->data<float>() + c * hxw;
for (int i = 1; i < output_height - 1; i++) {
float *output_ptr;
float32x4_t in0, in1, in2, in3, in4, in5, tmp0, tmp1, tmp2, tmp3, tmp4,
tmp5, out0;
for (int m = 1; m < output_width - 4; m += 4) {
output_ptr = output_data + i * output_width + m;
in0 = vld1q_f32(input_data + (i - 1) * input_width + m - 1);
in1 = vld1q_f32(input_data + (i - 1) * input_width + m + 3);
in2 = vld1q_f32(input_data + i * input_width + m - 1);
in3 = vld1q_f32(input_data + i * input_width + m + 3);
in4 = vld1q_f32(input_data + (i + 1) * input_width + m - 1);
in5 = vld1q_f32(input_data + (i + 1) * input_width + m + 3);
tmp0 = vextq_f32(in0, in1, 1);
tmp1 = vextq_f32(in0, in1, 2);
tmp2 = vextq_f32(in2, in3, 1);
tmp3 = vextq_f32(in2, in3, 2);
tmp4 = vextq_f32(in4, in5, 1);
tmp5 = vextq_f32(in4, in5, 2);
out0 = in0;
out0 = vaddq_f32(out0, tmp0);
out0 = vaddq_f32(out0, tmp1);
out0 = vaddq_f32(out0, in2);
out0 = vaddq_f32(out0, tmp2);
out0 = vaddq_f32(out0, tmp3);
out0 = vaddq_f32(out0, in4);
out0 = vaddq_f32(out0, tmp4);
out0 = vaddq_f32(out0, tmp5);
vst1q_f32(output_ptr, vmulq_f32(out0, v_coef));
}
int m;
for (m = 1; (m + 3) < output_width - 1; m = m + 4) {
}
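// (the empty loop above only recomputes the column index where the vectorised
// loop stopped, so the scalar tail below starts at the correct position)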
for (int j = m; j < output_width - 1; j++) {
output_data[i * output_width + j] =
input_data[(i - 1) * input_width + j - 1] +
input_data[(i - 1) * input_width + j] +
input_data[(i - 1) * input_width + j + 1] +
input_data[(i)*input_width + j - 1] +
input_data[(i)*input_width + j] +
input_data[(i)*input_width + j + 1] +
input_data[(i + 1) * input_width + j - 1] +
input_data[(i + 1) * input_width + j] +
input_data[(i + 1) * input_width + j + 1];
output_data[i * output_width + j] =
output_data[i * output_width + j] * coef;
}
}
output_data[0] =
input_data[0] + input_data[1] + input_data[l] + input_data[l + 1];
output_data[l - 1] = input_data[l - 2] + input_data[l - 1] +
input_data[2 * l - 2] + input_data[2 * l - 1];
output_data[(l - 1) * l] =
input_data[(l - 2) * l] + input_data[(l - 2) * l + 1] +
input_data[(l - 1) * l] + input_data[(l - 1) * l + 1];
output_data[l * l - 1] = input_data[(l - 2) * (l + 1)] +
input_data[(l - 2) * (l + 1) + 1] +
input_data[l * l - 2] + input_data[l * l - 1];
output_data[0] = output_data[0] * coef2;
output_data[l - 1] = output_data[l - 1] * coef2;
output_data[(l - 1) * l] = output_data[(l - 1) * l] * coef2;
output_data[l * l - 1] = output_data[l * l - 1] * coef2;
for (int i = 1; i < l - 1; ++i) {
output_data[i * l] = input_data[i * l - l] + input_data[i * l - l + 1] +
input_data[i * l] + input_data[i * l + 1] +
input_data[i * l + l] + input_data[i * l + l + 1];
output_data[i * l + l - 1] =
input_data[i * l + l - 1 - l - 1] + input_data[i * l + l - 1 - l] +
input_data[i * l + l - 1 - 1] + input_data[i * l + l - 1] +
input_data[i * l + l - 1 + l - 1] + input_data[i * l + l - 1 + l];
output_data[i * l] = output_data[i * l] * coef1;
output_data[i * l + l - 1] = output_data[i * l + l - 1] * coef1;
}
int m;
for (m = 1; m < output_width - 4; m += 4) {
float *output_ptr = output_data + m;
float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0;
in0 = vld1q_f32(input_data + m - 1);
in1 = vld1q_f32(input_data + m + 3);
in2 = vld1q_f32(input_data + input_width + m - 1);
in3 = vld1q_f32(input_data + input_width + m + 3);
tmp0 = vextq_f32(in0, in1, 1);
tmp1 = vextq_f32(in0, in1, 2);
tmp2 = vextq_f32(in2, in3, 1);
tmp3 = vextq_f32(in2, in3, 2);
out0 = in0;
out0 = vaddq_f32(out0, tmp0);
out0 = vaddq_f32(out0, tmp1);
out0 = vaddq_f32(out0, in2);
out0 = vaddq_f32(out0, tmp2);
out0 = vaddq_f32(out0, tmp3);
vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1));
}
for (m = 1; (m + 3) < output_width - 1; m += 4) {
}
for (int j = m; j < output_width - 1; j++) {
output_data[j] = input_data[j - 1] + input_data[j] + input_data[j + 1] +
input_data[input_width + j - 1] +
input_data[input_width + j] +
input_data[input_width + j + 1];
output_data[j] = output_data[j] * coef1;
}
for (m = 1; m < output_width - 4; m += 4) {
float *output_ptr =
output_data + (output_height - 1) * output_width + m;
float32x4_t in0, in1, in2, in3, tmp0, tmp1, tmp2, tmp3, out0;
in0 = vld1q_f32(input_data + (output_height - 2) * input_width + m - 1);
in1 = vld1q_f32(input_data + (output_height - 2) * input_width + m + 3);
in2 = vld1q_f32(input_data + (output_height - 1) * input_width + m - 1);
in3 = vld1q_f32(input_data + (output_height - 1) * input_width + m + 3);
tmp0 = vextq_f32(in0, in1, 1);
tmp1 = vextq_f32(in0, in1, 2);
tmp2 = vextq_f32(in2, in3, 1);
tmp3 = vextq_f32(in2, in3, 2);
out0 = in0;
out0 = vaddq_f32(out0, tmp0);
out0 = vaddq_f32(out0, tmp1);
out0 = vaddq_f32(out0, in2);
out0 = vaddq_f32(out0, tmp2);
out0 = vaddq_f32(out0, tmp3);
vst1q_f32(output_ptr, vmulq_f32(out0, v_coef1));
}
for (m = 1; (m + 3) < output_width - 1; m = m + 4) {
}
for (int j = m; j < output_width - 1; j++) {
output_data[(output_height - 1) * input_width + j] =
input_data[(output_height - 2) * input_width + j - 1] +
input_data[(output_height - 2) * input_width + j] +
input_data[(output_height - 2) * input_width + j + 1] +
input_data[(output_height - 1) * input_width + j - 1] +
input_data[(output_height - 1) * input_width + j] +
input_data[(output_height - 1) * input_width + j + 1];
output_data[(output_height - 1) * output_width + j] =
output_data[(output_height - 1) * output_width + j] * coef1;
}
}
}
// const int batch_size = input->dims()[0];
//
// const int h_in = input->dims()[2];
//
// const int w_in = input->dims()[3];
//
// const int output_channels = output->dims()[1];
//
// const int h_out = output->dims()[2];
// const int w_out = output->dims()[3];
// const int outputdata_channel_stride = h_out * w_out;
// const int inputdata_channel_stride = h_in * w_in;
// const int input_batch_stride = output_channels * inputdata_channel_stride;
// const int output_batch_stride = output_channels *
// outputdata_channel_stride; float *out_data = output->data<float>(); const
// float *input_data = input->data<float>();
//
// const float coef = 1.0 / 9.0;
// for (int k = 0; k < batch_size; ++k) {
// #pragma omp parallel for
// for (int c = 0; c < output_channels; ++c) {
// const float *input_seg = input_data + c * inputdata_channel_stride;
// float *output_seg = out_data + c * outputdata_channel_stride;
// // four corner point
// output_seg[0] = (input_seg[0] + input_seg[1] + input_seg[w_in] +
// input_seg[w_in + 1]) *
// coef;
// output_seg[w_out - 1] =
// (input_seg[w_in - 2] + input_seg[w_in - 1] + input_seg[w_in * 2 -
// 2] +
// input_seg[2 * w_in - 1]) *
// coef;
// output_seg[(h_out - 1) * w_out] =
// (input_seg[(h_in - 2) * w_in] + input_seg[(h_in - 2) * w_in + 1] +
// input_seg[(h_in - 1) * w_in] + input_seg[(h_in - 1) * w_in + 1])
// *
// coef;
// output_seg[h_out * w_out - 1] =
// (input_seg[h_in * w_in - 1] + input_seg[h_in * w_in - 2] +
// input_seg[(h_in - 1) * w_in - 1] +
// input_seg[(h_in - 1) * w_in - 2]) *
// coef;
// // left side & right side
// for (int i = 1; i < h_in - 1; ++i) {
// output_seg[i * w_out] =
// (input_seg[i * w_in - w_in] + input_seg[i * w_in - w_in + 1] +
// input_seg[i * w_in] + input_seg[i * w_in + 1] +
// input_seg[i * w_in + w_in] + input_seg[i * w_in + w_in + 1]) *
// coef;
// output_seg[i * w_out + w_out - 1] =
// (input_seg[i * w_in - w_in + w_in - 2] +
// input_seg[i * w_in - w_in + 1 + w_in - 2] +
// input_seg[i * w_in + w_in - 2] +
// input_seg[i * w_in + 1 + w_in - 2] +
// input_seg[i * w_in + w_in + w_in - 2] +
// input_seg[i * w_in + w_in + 1 + w_in - 2]) *
// coef;
// }
// // top 1 row & bottom 1 row
// const float *input_tmp = input_seg;
//
// float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
// tmp3, tmp4, tmp5, sum, out0;
// float32x4_t v_coef = vdupq_n_f32(coef);
// in0 = vld1q_f32(input_tmp);
// in2 = vld1q_f32(input_tmp + w_in);
// const float *input_tmp_end = input_tmp + (h_in - 2) * w_in;
// in4 = vld1q_f32(input_tmp_end);
// in6 = vld1q_f32(input_tmp_end + w_in);
// int c_mid = w_out - 2;
// auto output_ptr = output_seg + 1;
// for (; c_mid > 3; c_mid -= 4) {
// in1 = vld1q_f32(input_tmp + 4);
// in3 = vld1q_f32(input_tmp + w_in + 4);
//
// tmp0 = vextq_f32(in0, in1, 1);
// tmp1 = vextq_f32(in0, in1, 2);
//
// tmp2 = vextq_f32(in2, in3, 1);
// tmp3 = vextq_f32(in2, in3, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
//
// vst1q_f32(output_ptr, vmulq_f32(sum, v_coef));
//
// in5 = vld1q_f32(input_tmp_end + 4);
// in7 = vld1q_f32(input_tmp_end + w_in + 4);
//
// tmp0 = vextq_f32(in4, in5, 1);
// tmp1 = vextq_f32(in4, in5, 2);
// tmp2 = vextq_f32(in6, in7, 1);
// tmp3 = vextq_f32(in6, in7, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
//
// vst1q_f32(output_ptr + (h_out - 1) * w_out, vmulq_f32(sum, v_coef));
//
// // can optimize to each 8 stride.
// input_tmp += 4;
// input_tmp_end += 4;
// output_ptr += 4;
// in0 = in1;
// in2 = in3;
// in4 = in5;
// in6 = in7;
// }
// // top right remain
// float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]);
// float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]);
//
// tmp0 = vextq_f32(in0, pad0, 1);
// tmp1 = vextq_f32(in0, pad0, 2);
// tmp2 = vextq_f32(in2, pad1, 2);
// tmp3 = vextq_f32(in2, pad1, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
// out0 = vmulq_f32(sum, v_coef);
//
// for (int i = 0; i < c_mid; ++i) {
// if (i == 0) {
// vst1q_lane_f32(output_ptr + i, out0, 0);
// }
// if (i == 1) {
// vst1q_lane_f32(output_ptr + i, out0, 1);
// }
// if (i == 2) {
// vst1q_lane_f32(output_ptr + i, out0, 2);
// }
// }
//
// // bottom_right remain
// float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]);
// float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]);
//
// tmp0 = vextq_f32(in4, pad2, 1);
// tmp1 = vextq_f32(in4, pad2, 2);
// tmp2 = vextq_f32(in6, pad3, 2);
// tmp3 = vextq_f32(in6, pad3, 2);
//
// sum = vaddq_f32(in4, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in6);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
// out0 = vmulq_f32(sum, v_coef);
//
// for (int i = 0; i < c_mid; ++i) {
// if (i == 0) {
// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 0);
// }
// if (i == 1) {
// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 1);
// }
// if (i == 2) {
// vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, out0, 2);
// }
// }
// // mid
// for (int j = 0; j < h_out - 2; ++j) {
// output_ptr = output_seg + w_out * (j + 1) + 1;
// input_tmp = input_seg + j * w_in;
//
// in0 = vld1q_f32(input_tmp);
// in2 = vld1q_f32(input_tmp + w_in);
// in4 = vld1q_f32(input_tmp + 2 * w_in);
// c_mid = w_out - 2;
// for (; c_mid > 3; c_mid -= 4) {
// in1 = vld1q_f32(input_tmp + 4);
// in3 = vld1q_f32(input_tmp + w_in + 4);
// in5 = vld1q_f32(input_tmp + 2 * w_in + 4);
//
// tmp0 = vextq_f32(in0, in1, 1);
// tmp1 = vextq_f32(in0, in1, 2);
// tmp2 = vextq_f32(in2, in3, 1);
// tmp3 = vextq_f32(in2, in3, 2);
// tmp4 = vextq_f32(in4, in5, 1);
// tmp5 = vextq_f32(in4, in5, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
// sum = vaddq_f32(sum, in4);
// sum = vaddq_f32(sum, tmp4);
// sum = vaddq_f32(sum, tmp5);
//
// out0 = vmulq_f32(sum, v_coef);
// vst1q_f32(output_ptr, out0);
// output_ptr += 4;
// input_tmp += 4;
// in0 = in1;
// in2 = in3;
// in4 = in5;
// }
// // mid remain
// float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]);
// float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
// float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
//
// tmp0 = vextq_f32(in0, pad0, 1);
// tmp1 = vextq_f32(in0, pad0, 2);
// tmp2 = vextq_f32(in2, pad1, 1);
// tmp3 = vextq_f32(in2, pad1, 2);
// tmp4 = vextq_f32(in4, pad2, 1);
// tmp5 = vextq_f32(in4, pad2, 2);
//
// sum = vaddq_f32(in0, tmp0);
// sum = vaddq_f32(sum, tmp1);
// sum = vaddq_f32(sum, in2);
// sum = vaddq_f32(sum, tmp2);
// sum = vaddq_f32(sum, tmp3);
// sum = vaddq_f32(sum, in4);
// sum = vaddq_f32(sum, tmp4);
// sum = vaddq_f32(sum, tmp5);
// out0 = vmulq_f32(sum, v_coef);
//
// for (int i = 0; i < c_mid; ++i) {
// if (i == 0) {
// vst1q_lane_f32(output_ptr + i, out0, 0);
// }
// if (i == 1) {
// vst1q_lane_f32(output_ptr + i, out0, 1);
// }
// if (i == 2) {
// vst1q_lane_f32(output_ptr + i, out0, 2);
// }
// }
// }
// // input_data += inputdata_channel_stride;
// // out_data += outputdata_channel_stride;
// }
// input_data += input_batch_stride;
// out_data += output_batch_stride;
// }
#endif
}
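// Explanatory note on Pool3x3Avgs1p1 above (not part of this diff): for a 3x3
// window with stride 1 and padding 1, interior outputs average 9 valid inputs
// (coef = 1/9), non-corner border outputs average 6 (coef1 = 1/6) and the four
// corners average 4 (coef2 = 1/4); padded positions are simply excluded from
// the divisor.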
void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
#if __ARM_NEON
const int batch_size = input->dims()[0];
const int h_in = input->dims()[2];
const int w_in = input->dims()[3];
const int output_channels = output->dims()[1];
const int h_out = output->dims()[2];
const int w_out = output->dims()[3];
const int outputdata_channel_stride = h_out * w_out;
const int inputdata_channel_stride = h_in * w_in;
const int input_batch_stride = output_channels * inputdata_channel_stride;
const int output_batch_stride = output_channels * outputdata_channel_stride;
float *out_data = output->mutable_data<float>();
const float *input_data = input->data<float>();
for (int k = 0; k < batch_size; ++k) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
const float *input_seg = input_data + c * inputdata_channel_stride;
float *output_seg = out_data + c * outputdata_channel_stride;
      // four corner points
output_seg[0] = std::max(std::max(input_seg[0], input_seg[1]),
std::max(input_seg[w_in], input_seg[w_in + 1]));
output_seg[w_out - 1] =
std::max(std::max(input_seg[w_in - 2], input_seg[w_in - 1]),
std::max(input_seg[w_in * 2 - 2], input_seg[2 * w_in - 1]));
output_seg[(h_out - 1) * w_out] =
std::max(std::max(input_seg[(h_in - 2) * w_in],
input_seg[(h_in - 2) * w_in + 1]),
std::max(input_seg[(h_in - 1) * w_in],
input_seg[(h_in - 1) * w_in + 1]));
output_seg[h_out * w_out - 1] = std::max(
std::max(input_seg[(h_in - 1) * w_in - 1],
input_seg[(h_in - 1) * w_in - 2]),
std::max(input_seg[h_in * w_in - 1], input_seg[h_in * w_in - 2]));
// left side & right side
for (int i = 1; i < h_in - 1; ++i) {
float max1 = std::max(input_seg[i * w_in - w_in],
input_seg[i * w_in - w_in + 1]);
float max2 = std::max(input_seg[i * w_in], input_seg[i * w_in + 1]);
float max3 = std::max(input_seg[i * w_in + w_in],
input_seg[i * w_in + w_in + 1]);
output_seg[i * w_out] = std::max(std::max(max1, max2), max3);
max1 = std::max(input_seg[i * w_in - w_in + w_in - 2],
input_seg[i * w_in - w_in + 1 + w_in - 2]);
max2 = std::max(input_seg[i * w_in + w_in - 2],
input_seg[i * w_in + 1 + w_in - 2]);
max3 = std::max(input_seg[i * w_in + w_in + w_in - 2],
input_seg[i * w_in + w_in + 1 + w_in - 2]);
output_seg[i * w_out + w_out - 1] =
std::max(std::max(max1, max2), max3);
}
      // top row & bottom row
const float *input_tmp = input_seg;
float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
tmp3, tmp4, tmp5, max;
in0 = vld1q_f32(input_tmp);
in2 = vld1q_f32(input_tmp + w_in);
const float *input_tmp_end = input_tmp + (h_in - 2) * w_in;
in4 = vld1q_f32(input_tmp_end);
in6 = vld1q_f32(input_tmp_end + w_in);
int c_mid = w_out - 2;
auto output_ptr = output_seg + 1;
for (; c_mid > 3; c_mid -= 4) {
in1 = vld1q_f32(input_tmp + 4);
in3 = vld1q_f32(input_tmp + w_in + 4);
tmp0 = vextq_f32(in0, in1, 1);
tmp1 = vextq_f32(in0, in1, 2);
tmp2 = vextq_f32(in2, in3, 1);
tmp3 = vextq_f32(in2, in3, 2);
max = vmaxq_f32(in0, tmp0);
max = vmaxq_f32(max, tmp1);
max = vmaxq_f32(max, in2);
max = vmaxq_f32(max, tmp2);
max = vmaxq_f32(max, tmp3);
vst1q_f32(output_ptr, max);
in5 = vld1q_f32(input_tmp_end + 4);
in7 = vld1q_f32(input_tmp_end + w_in + 4);
tmp0 = vextq_f32(in4, in5, 1);
tmp1 = vextq_f32(in4, in5, 2);
tmp2 = vextq_f32(in6, in7, 1);
tmp3 = vextq_f32(in6, in7, 2);
max = vmaxq_f32(in4, tmp0);
max = vmaxq_f32(max, tmp1);
max = vmaxq_f32(max, in6);
max = vmaxq_f32(max, tmp2);
max = vmaxq_f32(max, tmp3);
vst1q_f32(output_ptr + (h_out - 1) * w_out, max);
input_tmp += 4;
input_tmp_end += 4;
output_ptr += 4;
in0 = in1;
in2 = in3;
in4 = in5;
in6 = in7;
}
// top right remain
float32x4_t pad0 = vdupq_n_f32(input_seg[w_in - 1]);
float32x4_t pad1 = vdupq_n_f32(input_seg[2 * w_in - 1]);
tmp0 = vextq_f32(in0, pad0, 1);
tmp1 = vextq_f32(in0, pad0, 2);
tmp2 = vextq_f32(in2, pad1, 1);
tmp3 = vextq_f32(in2, pad1, 2);
max = vmaxq_f32(in0, tmp0);
max = vmaxq_f32(max, tmp1);
max = vmaxq_f32(max, in2);
max = vmaxq_f32(max, tmp2);
max = vmaxq_f32(max, tmp3);
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + i, max, 0);
}
if (i == 1) {
vst1q_lane_f32(output_ptr + i, max, 1);
}
if (i == 2) {
vst1q_lane_f32(output_ptr + i, max, 2);
}
}
// bottom_right remain
float32x4_t pad2 = vdupq_n_f32(input_seg[(h_in - 1) * w_in - 1]);
float32x4_t pad3 = vdupq_n_f32(input_seg[h_in * w_in - 1]);
tmp0 = vextq_f32(in4, pad2, 1);
tmp1 = vextq_f32(in4, pad2, 2);
tmp2 = vextq_f32(in6, pad3, 1);
tmp3 = vextq_f32(in6, pad3, 2);
max = vmaxq_f32(in4, tmp0);
max = vmaxq_f32(max, tmp1);
max = vmaxq_f32(max, in6);
max = vmaxq_f32(max, tmp2);
max = vmaxq_f32(max, tmp3);
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 0);
}
if (i == 1) {
vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 1);
}
if (i == 2) {
vst1q_lane_f32(output_ptr + (h_out - 1) * w_out + i, max, 2);
}
}
// mid
for (int j = 0; j < h_out - 2; ++j) {
output_ptr = output_seg + (j + 1) * w_out + 1;
input_tmp = input_seg + j * w_in;
in0 = vld1q_f32(input_tmp);
in2 = vld1q_f32(input_tmp + w_in);
in4 = vld1q_f32(input_tmp + 2 * w_in);
c_mid = w_out - 2;
for (; c_mid > 3; c_mid -= 4) {
in1 = vld1q_f32(input_tmp + 4);
in3 = vld1q_f32(input_tmp + w_in + 4);
in5 = vld1q_f32(input_tmp + 2 * w_in + 4);
tmp0 = vextq_f32(in0, in1, 1);
tmp1 = vextq_f32(in0, in1, 2);
tmp2 = vextq_f32(in2, in3, 1);
tmp3 = vextq_f32(in2, in3, 2);
tmp4 = vextq_f32(in4, in5, 1);
tmp5 = vextq_f32(in4, in5, 2);
max = vmaxq_f32(in0, tmp0);
max = vmaxq_f32(max, tmp1);
max = vmaxq_f32(max, in2);
max = vmaxq_f32(max, tmp2);
max = vmaxq_f32(max, tmp3);
max = vmaxq_f32(max, in4);
max = vmaxq_f32(max, tmp4);
max = vmaxq_f32(max, tmp5);
vst1q_f32(output_ptr, max);
output_ptr += 4;
input_tmp += 4;
in0 = in1;
in2 = in3;
in4 = in5;
}
// mid remain
float32x4_t pad0 = vdupq_n_f32(input_seg[(j + 1) * w_in - 1]);
float32x4_t pad1 = vdupq_n_f32(input_seg[(j + 2) * w_in - 1]);
float32x4_t pad2 = vdupq_n_f32(input_seg[(j + 3) * w_in - 1]);
tmp0 = vextq_f32(in0, pad0, 1);
tmp1 = vextq_f32(in0, pad0, 2);
tmp2 = vextq_f32(in2, pad1, 1);
tmp3 = vextq_f32(in2, pad1, 2);
tmp4 = vextq_f32(in4, pad2, 1);
tmp5 = vextq_f32(in4, pad2, 2);
max = vmaxq_f32(in0, tmp0);
max = vmaxq_f32(max, tmp1);
max = vmaxq_f32(max, in2);
max = vmaxq_f32(max, tmp2);
max = vmaxq_f32(max, tmp3);
max = vmaxq_f32(max, in4);
max = vmaxq_f32(max, tmp4);
max = vmaxq_f32(max, tmp5);
for (int i = 0; i < c_mid; ++i) {
if (i == 0) {
vst1q_lane_f32(output_ptr + i, max, 0);
}
if (i == 1) {
vst1q_lane_f32(output_ptr + i, max, 1);
}
if (i == 2) {
vst1q_lane_f32(output_ptr + i, max, 2);
}
}
}
// input_data += inputdata_channel_stride;
// out_data += outputdata_channel_stride;
}
input_data += input_batch_stride;
out_data += output_batch_stride;
}
#else
#endif
}
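// 3x3 max pooling for the stride/padding given in `strides`/`paddings` (only
// the first element of each is used). Windows fully inside the input are
// reduced with NEON (intrinsics on aarch64, inline assembly on armv7);
// clipped windows at the border fall back to a scalar loop.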
void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#if __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
// const int _kernel_size = 3;
const int stride = strides[0];
// const int stride_width = strides[1];
const int padding = paddings[0];
// const int padding_width = paddings[1];
const float negative_max = -INT_MAX;
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
const float *pos1, *output_ptr;
int hstart, wstart, hend, wend;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
const float *input_seg = input_data + c * input_channel_stride;
float *output_seg = output_data + c * output_channel_stride;
for (int ph = 0; ph < output_height; ph++) {
int hstart = ph * stride - padding;
int hend = min(hstart + 3, input_height);
hstart = max(hstart, 0);
for (int pw = 0; pw < output_width; pw++) {
int wstart = pw * stride - padding;
int wend = min(wstart + 3, input_width);
wstart = max(wstart, 0);
const float *pos1 = input_seg + hstart * input_width + wstart;
const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;
output_ptr = output_seg + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
float max_value = -INT_MAX;
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
float value = input_seg[h * input_width + w];
if (value > max_value) {
max_value = value;
}
}
}
output_seg[ph * output_width + pw] = max_value;
} else {
#if __aarch64__
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos1 + input_width);
const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width);
const float32x4_t max_data =
vmaxq_f32(vmaxq_f32(data1, data2), data3);
float32x2_t res =
vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
vget_low_f32(max_data));
res = vpmax_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0);
#else
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t"
"vld1.32 {q3}, [%[pos3]] \n\t"
"vmax.f32 q1, q1, q2 \n\t"
"vmax.f32 q2, q1, q3 \n\t"
"vmov.f32 d5[1], %[negative_max] \n\t"
"vpmax.f32 d6, d4, d5 \n\t"
"vpmax.f32 d7, d6, d6 \n\t"
"vst1.32 {d7[0]},[%[output_ptr]] \n\t"
:
: [input_seg] "r"(input_seg), [pos1] "r"(pos1),
[pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
: "memory", "q1", "q2", "q3", "q4");
#endif
}
}
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#endif
}
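// 3x3 average pooling for the given stride/padding. Windows fully inside the
// input are summed with NEON and scaled by 1/9; clipped windows at the border
// are averaged over the number of input elements they actually cover.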
void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#if __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int stride = strides[0];
const int padding = paddings[0];
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
const float zero = 0;
const float nine = 1.0 / 9.0;
const float nine_ptr[] = {nine, nine};
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
const float *input_seg = input_data + c * input_channel_stride;
float *output_seg = output_data + c * output_channel_stride;
for (int ph = 0; ph < output_height; ph++) {
for (int pw = 0; pw < output_width; pw++) {
int hstart = ph * stride - padding;
int wstart = pw * stride - padding;
int hend = min(hstart + 3, input_height + padding);
int wend = min(wstart + 3, input_width + padding);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
hend = min(hend, input_height);
wend = min(wend, input_width);
const float *pos1 = input_seg + hstart * input_width + wstart;
const float *pos2 = input_seg + (hstart + 1) * input_width + wstart;
const float *pos3 = input_seg + (hstart + 2) * input_width + wstart;
float *output_ptr = output_seg + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
float sum = 0;
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
sum += input_seg[h * input_width + w];
}
}
output_seg[ph * output_width + pw] =
sum / ((hend - hstart) * (wend - wstart) * 1.0);
} else {
#if __aarch64__
            const float32x4_t data1 = vld1q_f32(pos1);
            const float32x4_t data2 = vld1q_f32(pos2);
            const float32x4_t data3 = vld1q_f32(pos3);
            const float32x4_t sum_data =
                vaddq_f32(vaddq_f32(data1, data3), data2);
            float32x2_t res =
                vpadd_f32(vget_high_f32(vsetq_lane_f32(0, sum_data, 3)),
                          vget_low_f32(sum_data));
            res = vpadd_f32(res, res);
            output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
#else
            asm volatile(
                "vld1.32  {q1}, [%[pos1]]        \n\t"
                "vld1.32  {q2}, [%[pos2]]        \n\t"
                "vld1.32  {q3}, [%[pos3]]        \n\t"
                "vadd.f32 q1, q1, q2             \n\t"
                "vadd.f32 q2, q1, q3             \n\t"
                "vmov.f32 d5[1],  %[zero]        \n\t"
                "vpadd.f32  d6, d4, d5           \n\t"
                "vpadd.f32  d6, d6, d6           \n\t"
                "vld1.f32 d7, [%[nine_ptr]]!     \n\t"
                "vmul.f32 d6,d7                  \n\t"
                "vst1.32 {d6[0]},[%[output_ptr]] \n\t"
                :
                : [input_seg] "r"(input_seg), [pos1] "r"(pos1),
                  [pos2] "r"(pos2), [pos3] "r"(pos3),
                  [output_ptr] "r"(output_ptr), [zero] "r"(zero),
                  [nine_ptr] "r"(nine_ptr)
                : "memory", "r6", "q1", "q2", "q3", "q4");
#endif
}
}
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#else
#endif
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/tensor.h"
#include "operators/math/pool_3x3.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include <climits>
#include <iostream>
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::max;
using std::min;
using std::vector;
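// Copies `input` into `padded_input`, enlarging the spatial dimensions by the
// given top/bottom/left/right margins and filling every margin cell with
// `value` (e.g. -SCHAR_MAX for int8 max pooling).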
template <typename T>
static void make_paddings(const Tensor *input, Tensor *padded_input,
int32_t top, int32_t bottom, int32_t left,
int32_t right, T value) {
const int32_t batch_size = input->dims()[0];
const int32_t c_in = input->dims()[1];
const int32_t h_in = input->dims()[2];
const int32_t w_in = input->dims()[3];
const int32_t h_padded = h_in + top + bottom;
const int32_t w_padded = w_in + left + right;
padded_input->Resize({batch_size, c_in, h_padded, w_padded});
T *padded_input_data = padded_input->mutable_data<T>();
const T *input_data = input->data<T>();
const int32_t input_channel_stride = h_in * w_in;
const int32_t input_batch_stride = c_in * input_channel_stride;
const int32_t padded_channel_stride = h_padded * w_padded;
const int32_t padded_batch_stride = c_in * padded_channel_stride;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int j = 0; j < c_in; ++j) {
const T *img_in = input_data + j * input_channel_stride;
T *img_padded = padded_input_data + j * padded_channel_stride;
int k = 0;
for (; k < top; ++k) {
for (int l = 0; l < w_padded; ++l) {
img_padded[l] = value;
}
img_padded += w_padded;
}
for (; k < top + h_in; ++k) {
int l = 0;
for (; l < left; ++l) {
img_padded[l] = value;
}
memcpy(img_padded + left, img_in, w_in * sizeof(T));
l += w_in;
img_in += w_in;
for (; l < w_padded; ++l) {
img_padded[l] = value;
}
img_padded += w_padded;
}
for (; k < h_padded; ++k) {
for (int l = 0; l < w_padded; ++l) {
img_padded[l] = value;
}
img_padded += w_padded;
}
}
input_data += input_batch_stride;
padded_input_data += padded_batch_stride;
}
// input_data = input->data<T>();
// std::cout << "+++++++++++++++++++Origin begin++++++++++++++++++++"
// << std::endl;
// for (int i = 0; i < 1; ++i) {
// for (int j = 0; j < 1; ++j) {
// const T *img_in = input_data + j * input_channel_stride;
// for (int k = 0; k < h_in; ++k) {
// for (int l = 0; l < w_in; ++l) {
// std::cout << (int32_t)*img_in << "\t";
// img_in++;
// }
// std::cout << std::endl;
// }
// }
// input_data += input_batch_stride;
// }
// std::cout << "+++++++++++++++++++Origin end++++++++++++++++++++" <<
// std::endl;
//
// padded_input_data = padded_input->data<T>();
// std::cout << "******************Padding begin**********************"
// << std::endl;
// for (int i = 0; i < 1; ++i) {
// for (int j = 0; j < 1; ++j) {
// T *img_padded = padded_input_data + j * padded_channel_stride;
// for (int k = 0; k < h_padded; ++k) {
// for (int l = 0; l < w_padded; ++l) {
// std::cout << (int32_t)*img_padded << "\t";
// img_padded++;
// }
// std::cout << std::endl;
// }
// }
// padded_input_data += padded_batch_stride;
// }
// std::cout << "******************Padding end**********************"
// << std::endl;
}
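// int8 3x3 max pooling with stride 1. When padding is requested the input is
// first materialized into a padded copy; each output row is then computed from
// three consecutive input rows, 16 outputs at a time with q registers and 8
// with d registers on armv7 NEON (the remaining columns are stored lane by
// lane inside the same assembly), with a plain scalar loop when NEON is
// unavailable.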
void Pool3x3Maxs1_int8(const Tensor *input, Tensor *output, int32_t pad_h,
int32_t pad_w) {
Tensor padded_input;
if (pad_h != 0 && pad_w != 0) {
int8_t value = -SCHAR_MAX;
make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value);
input = &padded_input;
}
const int32_t batch_size = input->dims()[0];
const int32_t h_in = input->dims()[2];
const int32_t w_in = input->dims()[3];
const int8_t *input_data = input->data<int8_t>();
const int32_t output_channels = output->dims()[1];
const int32_t h_out = output->dims()[2];
const int32_t w_out = output->dims()[3];
int8_t *output_data = output->mutable_data<int8_t>();
const int32_t outputdata_channel_stride = h_out * w_out;
const int32_t inputdata_channel_stride = h_in * w_in;
const int32_t input_batch_stride = output_channels * inputdata_channel_stride;
const int32_t output_batch_stride =
output_channels * outputdata_channel_stride;
// std::cout << "h_out = " << h_out << ", w_out=" << w_out << std::endl;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int j = 0; j < output_channels; ++j) {
const int8_t *img_in = input_data + j * inputdata_channel_stride;
int8_t *img_out = output_data + j * outputdata_channel_stride;
for (int k = 0; k < h_out; ++k) {
const int8_t *row0 = img_in + k * w_in;
const int8_t *row1 = img_in + (k + 1) * w_in;
const int8_t *row2 = img_in + (k + 2) * w_in;
#if __ARM_NEON
int32_t nw = w_out >> 4;
int32_t left_w = w_out & 0xf;
int32_t nw1 = left_w >> 3;
int32_t left_w1 = left_w & 0x7;
#if __aarch64__
// TODO
#else
if (nw > 0) {
#define LOOP_LABEL "1"
// result: q15
asm volatile(
"vld1.8 {q0}, [%[row0]]! \n\t" // q0=0-15
"vld1.8 {q2}, [%[row1]]! \n\t"
"vld1.8 {q4}, [%[row2]]! \n\t"
LOOP_LABEL
": \n\t"
"vld1.8 {q1}, [%[row0]]! \n\t" // q1=16-31
"vext.8 q6, q0, q1, #1 \n\t"
"vext.8 q7, q0, q1, #2 \n\t"
"vld1.8 {q3}, [%[row1]]! \n\t"
"vmax.s8 q15, q0, q6 \n\t"
"vmax.s8 q15, q15, q7 \n\t"
"vext.8 q6, q2, q3, #1 \n\t"
"vext.8 q7, q2, q3, #2 \n\t"
"vld1.8 {q5}, [%[row2]]! \n\t"
"vmax.s8 q14, q2, q6 \n\t"
"vmax.s8 q14, q14, q7 \n\t"
"vext.8 q6, q4, q5, #1 \n\t"
"vext.8 q7, q4, q5, #2 \n\t"
"vmax.s8 q13, q4, q6 \n\t"
"vmax.s8 q13, q13, q7 \n\t"
"vmax.s8 q15, q15, q14 \n\t"
"vmax.s8 q15, q15, q13 \n\t"
"vmov.s8 q0, q1 \n\t"
"vmov.s8 q2, q3 \n\t"
"vmov.s8 q4, q5 \n\t"
"vst1.8 {q15}, [%[img_out]]! \n\t"
"subs %[nw], #1 \n\t"
"bne " LOOP_LABEL
"b \n\t"
"sub %[row0], #16 \n\t"
"sub %[row1], #16 \n\t"
"sub %[row2], #16 \n\t"
: [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1),
[row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q13", "q14", "q15");
#undef LOOP_LABEL
}
if (nw1 > 0 || left_w1 > 0) {
#define PADDLE_LABEL_LESS8 "1"
#define PADDLE_LABEL_LESS8_SAVE "2"
#define PADDLE_LABEL_OVER "3"
// result: d15
asm volatile(
"vld1.8 {d0}, [%[row0]]! \n\t" // d0=0-8
"vld1.8 {d2}, [%[row1]]! \n\t"
"vld1.8 {d4}, [%[row2]]! \n\t"
"mov r0, #1 \n\t"
"cmp %[nw1], #0 \n\t"
"beq " PADDLE_LABEL_LESS8
"f\n\t"
"vld1.8 {d1}, [%[row0]]! \n\t" // d1=9-15
"vext.8 d6, d0, d1, #1 \n\t"
"vext.8 d7, d0, d1, #2 \n\t"
"vld1.8 {d3}, [%[row1]]! \n\t"
"vmax.s8 d15, d0, d6 \n\t"
"vmax.s8 d15, d15, d7 \n\t"
"vext.8 d6, d2, d3, #1 \n\t"
"vext.8 d7, d2, d3, #2 \n\t"
"vld1.8 {d5}, [%[row2]]! \n\t"
"vmax.s8 d14, d2, d6 \n\t"
"vmax.s8 d14, d14, d7 \n\t"
"vext.8 d6, d4, d5, #1 \n\t"
"vext.8 d7, d4, d5, #2 \n\t"
"vmax.s8 d13, d4, d6 \n\t"
"vmax.s8 d13, d13, d7 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
"vmov.s8 d0, d1 \n\t"
"vmov.s8 d2, d3 \n\t"
"vmov.s8 d4, d5 \n\t"
"vst1.8 {d15}, [%[img_out]]! \n\t"
PADDLE_LABEL_LESS8
": \n\t"
"cmp %[left_w1], #0 \n\t"
"beq " PADDLE_LABEL_OVER
"f\n\t"
"vld1.8 {d1}, [%[row0]] \n\t" // d1=9-15
"vext.8 d6, d0, d1, #1 \n\t"
"vext.8 d7, d0, d1, #2 \n\t"
"vld1.8 {d3}, [%[row1]] \n\t"
"vmax.s8 d15, d0, d6 \n\t"
"vmax.s8 d15, d15, d7 \n\t"
"vext.8 d6, d2, d3, #1 \n\t"
"vext.8 d7, d2, d3, #2 \n\t"
"vld1.8 {d5}, [%[row2]] \n\t"
"vmax.s8 d14, d2, d6 \n\t"
"vmax.s8 d14, d14, d7 \n\t"
"vext.8 d6, d4, d5, #1 \n\t"
"vext.8 d7, d4, d5, #2 \n\t"
"vmax.s8 d13, d4, d6 \n\t"
"vmax.s8 d13, d13, d7 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
PADDLE_LABEL_LESS8_SAVE
": \n\t"
"vst1.8 {d15[0]}, [%[img_out]], r0\n\t"
"add %[row0], %[row0], #1 \n\t"
"add %[row1], %[row1], #1 \n\t"
"add %[row2], %[row2], #1 \n\t"
"vext.8 d15, d15, d15, #1 \n\t"
"subs %[left_w1], #1 \n\t"
"bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t"
PADDLE_LABEL_OVER ": \n\t"
: [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0),
[row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7", "d13", "d14", "d15");
#undef PADDLE_LABEL_OVER
#undef PADDLE_LABEL_LESS8_SAVE
#undef PADDLE_LABEL_LESS8
}
#endif // __aarch64__
#else
int32_t left = w_out;
while (left > 0) {
const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]);
const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]);
const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]);
*img_out = std::max(std::max(max0, max1), max2);
row0 += 1;
row1 += 1;
row2 += 1;
img_out++;
left--;
}
#endif // __ARM_NEON
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
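// int8 3x3 max pooling with stride 2, structured like the stride-1 version
// but using de-interleaving vld2 loads so that neighbouring even/odd columns
// are combined and the row pointers advance by two input columns per output
// column.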
void Pool3x3Maxs2_int8(const Tensor *input, Tensor *output, int32_t pad_h,
int32_t pad_w) {
Tensor padded_input;
if (pad_h != 0 && pad_w != 0) {
int8_t value = -SCHAR_MAX;
make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value);
input = &padded_input;
}
const int32_t batch_size = input->dims()[0];
const int32_t h_in = input->dims()[2];
const int32_t w_in = input->dims()[3];
const int32_t output_channels = output->dims()[1];
const int32_t h_out = output->dims()[2];
const int32_t w_out = output->dims()[3];
const int32_t outputdata_channel_stride = h_out * w_out;
const int32_t inputdata_channel_stride = h_in * w_in;
const int32_t output_batch_stride =
output_channels * outputdata_channel_stride;
const int32_t input_batch_stride = output_channels * inputdata_channel_stride;
const int8_t *input_data = input->data<int8_t>();
int8_t *output_data = output->mutable_data<int8_t>();
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int j = 0; j < output_channels; ++j) {
const int8_t *img_in = input_data + j * inputdata_channel_stride;
int8_t *img_out = output_data + j * outputdata_channel_stride;
for (int k = 0; k < h_out; ++k) {
const int8_t *row0 = img_in + 2 * k * w_in;
const int8_t *row1 = img_in + (2 * k + 1) * w_in;
const int8_t *row2 = img_in + (2 * k + 2) * w_in;
#if __ARM_NEON
int32_t nw = w_out >> 4;
int32_t left_w = w_out & 0xf;
int32_t nw1 = left_w >> 3;
int32_t left_w1 = left_w & 0x7;
#if __aarch64__
// TODO
#else
if (nw > 0) {
#define LOOP_LABEL "1"
// result: q15
asm volatile(
"vld2.8 {q0, q1}, [%[row0]]! \n\t" // q0=0-30, q1=1-31
"vld2.8 {q2, q3}, [%[row1]]! \n\t"
"vld2.8 {q4, q5}, [%[row2]]! \n\t"
LOOP_LABEL
": \n\t"
"vmax.s8 q15, q0, q1 \n\t"
"vld2.8 {q6, q7}, [%[row0]]! \n\t" // q0=32-62, q1=33-63
"vmax.s8 q14, q2, q3 \n\t"
"vmax.s8 q13, q4, q5 \n\t"
"vld2.8 {q8, q9}, [%[row1]]! \n\t"
"vext.8 q0, q0, q6, #1 \n\t"
"vmax.s8 q15, q15, q0 \n\t"
"vld2.8 {q10, q11}, [%[row2]]! \n\t"
"vext.8 q2, q2, q8, #1 \n\t"
"vmax.s8 q14, q14, q2 \n\t"
"vext.8 q4, q4, q10, #1 \n\t"
"vmax.s8 q13, q13, q4 \n\t"
"vmax.s8 q15, q15, q14 \n\t"
"vmax.s8 q15, q15, q13 \n\t"
"vmov.s8 q0, q6 \n\t"
"vmov.s8 q1, q7 \n\t"
"vmov.s8 q2, q8 \n\t"
"vmov.s8 q3, q9 \n\t"
"vmov.s8 q4, q10 \n\t"
"vmov.s8 q5, q11 \n\t"
"vst1.8 {q15}, [%[img_out]]! \n\t"
"subs %[nw], #1 \n\t"
"bne " LOOP_LABEL
"b \n\t"
"sub %[row0], #32 \n\t"
"sub %[row1], #32 \n\t"
"sub %[row2], #32 \n\t"
: [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1),
[row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q13", "q14", "q15");
#undef LOOP_LABEL
}
if (nw1 > 0 || left_w1 > 0) {
#define PADDLE_LABEL_LESS8 "1"
#define PADDLE_LABEL_LESS8_SAVE "2"
#define PADDLE_LABEL_OVER "3"
// result: d15
asm volatile(
"vld2.8 {d0, d1}, [%[row0]]! \n\t" // d0=0-14, d1=1-15
"vld2.8 {d2, d3}, [%[row1]]! \n\t"
"vld2.8 {d4, d5}, [%[row2]]! \n\t"
"mov r0, #1 \n\t"
"cmp %[nw1], #0 \n\t"
"beq " PADDLE_LABEL_LESS8
"f\n\t"
"vmax.s8 d15, d0, d1 \n\t"
"vld2.8 {d6, d7}, [%[row0]]! \n\t" // d0=32-62, d1=33-63
"vmax.s8 d14, d2, d3 \n\t"
"vmax.s8 d13, d4, d5 \n\t"
"vld2.8 {d8, d9}, [%[row1]]! \n\t"
"vext.8 d0, d0, d6, #1 \n\t"
"vmax.s8 d15, d15, d0 \n\t"
"vld2.8 {d10, d11}, [%[row2]]! \n\t"
"vext.8 d2, d2, d8, #1 \n\t"
"vmax.s8 d14, d14, d2 \n\t"
"vext.8 d4, d4, d10, #1 \n\t"
"vmax.s8 d13, d13, d4 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
"vmov.s8 d0, d6 \n\t"
"vmov.s8 d1, d7 \n\t"
"vmov.s8 d2, d8 \n\t"
"vmov.s8 d3, d9 \n\t"
"vmov.s8 d4, d10 \n\t"
"vmov.s8 d5, d11 \n\t"
"vst1.8 {d15}, [%[img_out]]! \n\t"
PADDLE_LABEL_LESS8
": \n\t"
"cmp %[left_w1], #0 \n\t"
"beq " PADDLE_LABEL_OVER
"f\n\t"
"vmax.s8 d15, d0, d1 \n\t"
"vld2.8 {d6, d7}, [%[row0]] \n\t" // d0=32-62, d1=33-63
"vmax.s8 d14, d2, d3 \n\t"
"vmax.s8 d13, d4, d5 \n\t"
"vld2.8 {d8, d9}, [%[row1]] \n\t"
"vext.8 d0, d0, d6, #1 \n\t"
"vmax.s8 d15, d15, d0 \n\t"
"vld2.8 {d10, d11}, [%[row2]] \n\t"
"vext.8 d2, d2, d8, #1 \n\t"
"vmax.s8 d14, d14, d2 \n\t"
"vext.8 d4, d4, d10, #1 \n\t"
"vmax.s8 d13, d13, d4 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
PADDLE_LABEL_LESS8_SAVE
": \n\t"
"vst1.8 {d15[0]}, [%[img_out]], r0\n\t"
"add %[row0], %[row0], #2 \n\t"
"add %[row1], %[row1], #2 \n\t"
"add %[row2], %[row2], #2 \n\t"
"vext.8 d15, d15, d15, #1 \n\t"
"subs %[left_w1], #1 \n\t"
"bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t"
PADDLE_LABEL_OVER ": \n\t"
: [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0),
[row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7", "d8", "d9", "d10", "d11", "d13", "d14", "d15");
#undef PADDLE_LABEL_OVER
#undef PADDLE_LABEL_LESS8_SAVE
#undef PADDLE_LABEL_LESS8
}
#endif // __aarch64__
#else
int32_t left = w_out;
while (left > 0) {
const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]);
const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]);
const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]);
*img_out = std::max(std::max(max0, max1), max2);
row0 += 2;
row1 += 2;
row2 += 2;
img_out++;
left--;
}
#endif // __ARM_NEON
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
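// Generic int8 3x3 max pooling for arbitrary stride/padding. Full 3x3 windows
// use vmax.s8/vpmax.s8 on armv7 NEON; clipped windows and non-NEON builds use
// scalar std::max.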
void Pool3x3Max_int8(const vector<int> &strides, const vector<int> &paddings,
const Tensor *input, Tensor *output) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
// const int _kernel_size = 3;
const int stride = strides[0];
// const int stride_width = strides[1];
const int padding = paddings[0];
// const int padding_width = paddings[1];
const int8_t negative_max = -SCHAR_MAX;
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int8_t *input_data = input->data<int8_t>();
int8_t *output_data = output->mutable_data<int8_t>();
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
const int8_t *input_seg = input_data + c * input_channel_stride;
int8_t *output_seg = output_data + c * output_channel_stride;
for (int ph = 0; ph < output_height; ph++) {
int hstart = ph * stride - padding;
int hend = min(hstart + 3, input_height);
hstart = max(hstart, 0);
for (int pw = 0; pw < output_width; pw++) {
int wstart = pw * stride - padding;
int wend = min(wstart + 3, input_width);
wstart = max(wstart, 0);
const int8_t *pos1 = input_seg + hstart * input_width + wstart;
const int8_t *pos2 = input_seg + (hstart + 1) * input_width + wstart;
const int8_t *pos3 = input_seg + (hstart + 2) * input_width + wstart;
int8_t *output_ptr = output_seg + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
int8_t max_value = -SCHAR_MAX;
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
int8_t value = input_seg[h * input_width + w];
if (value > max_value) {
max_value = value;
}
}
}
output_seg[ph * output_width + pw] = max_value;
} else {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.8 {d0}, [%[pos1]] \n\t"
"vld1.8 {d1}, [%[pos2]] \n\t"
"vld1.8 {d2}, [%[pos3]] \n\t"
"vmax.s8 d3, d0, d1 \n\t"
"vmax.s8 d4, d2, d3 \n\t"
"vmov.s8 d4[3], %[negative_max] \n\t"
"vpmax.s8 d5, d4, d4 \n\t"
"vpmax.s8 d6, d5, d5 \n\t"
"vst1.8 {d6[0]},[%[output_ptr]] \n\t"
:
: [pos1] "r"(pos1), [pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
: "memory", "q0", "q1", "q2", "q3");
#endif
#else
const int8_t max0 = std::max(std::max(pos1[0], pos1[1]), pos1[2]);
const int8_t max1 = std::max(std::max(pos2[0], pos2[1]), pos2[2]);
const int8_t max2 = std::max(std::max(pos3[0], pos3[1]), pos3[2]);
*output_ptr = std::max(std::max(max0, max1), max2);
#endif // __ARM_NEON
}
}
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -14,53 +14,42 @@ limitations under the License. */
#ifdef POOL_OP
#include "pooling.h"
#include "common/types.h"
#ifdef _OPENMP
#include <omp.h>
#endif
#include "operators/math/pooling.h"
namespace paddle_mobile {
namespace operators {
namespace math {
/*
* All tensors are in NCHW format.
 * Ksize, strides and paddings each contain two elements, representing
 * height and width respectively.
*/
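// For example, kernel_size = {3, 3}, strides = {2, 2}, paddings = {1, 1}
// describes 3x3 windows that move by 2 in both height and width, with one
// padded row/column on each border.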
template <typename PoolProcess, typename T>
class PoolFunctor<CPU, PoolProcess, T> {
public:
void operator()(const framework::Tensor &input, const std::vector<int> &ksize,
template <PoolingType P>
void Pooling<P>::operator()(const framework::Tensor &input,
const std::vector<int> &kernel_size,
const std::vector<int> &strides,
const std::vector<int> &paddings, PoolProcess pool_process,
const std::vector<int> &paddings,
framework::Tensor *output) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int ksize_height = kernel_size[0];
const int ksize_width = kernel_size[1];
const int stride_height = strides[0];
const int stride_width = strides[1];
const int padding_height = paddings[0];
const int padding_width = paddings[1];
const int input_stride = input_height * input_width;
const int output_stride = output_height * output_width;
const float *input_data = input.data<float>();
float *output_data = output->mutable_data<float>();
const size_t input_spatial_size = input_height * input_width;
const size_t output_spatial_size = output_height * output_width;
const T *input_data = input.data<T>();
T *output_data = output->mutable_data<T>();
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
#pragma omp parallel for
int channel = i * output_channels + c;
const float *input_ptr = input_data + channel * input_spatial_size;
float *output_ptr = output_data + channel * output_spatial_size;
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
......@@ -70,30 +59,24 @@ class PoolFunctor<CPU, PoolProcess, T> {
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
auto ele = pool_process.initial();
PoolingVal<P> val;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(input_data[h * input_width + w], &ele);
val += input_ptr[h * input_width + w];
}
}
int pool_size = (hend - hstart) * (wend - wstart);
pool_process.finalize(static_cast<float>(pool_size), &ele);
output_data[ph * output_width + pw] = static_cast<T>(ele);
output_ptr[ph * output_width + pw] = val.Value();
}
}
input_data += input_stride;
output_data += output_stride;
}
}
}
};
}
template struct Pooling<MAX>;
template struct Pooling<AVG>;
template class PoolFunctor<CPU, math::AvgPool<float, float>, float>;
template class PoolFunctor<CPU, math::MaxPool<float>, float>;
template class PoolFunctor<CPU, math::AvgPool<int8_t, int32_t>, int8_t>;
template class PoolFunctor<CPU, math::MaxPool<int8_t>, int8_t>;
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // POOL_OP
......@@ -16,75 +16,138 @@ limitations under the License. */
#pragma once
#include <climits>
#include <algorithm>
#include <cmath>
#include "common/log.h"
#include <limits>
#include <vector>
#include "common/types.h"
#include "framework/tensor.h"
#include "pool_2x2.h"
#include "pool_3x3.h"
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
namespace math {
#define FLT_MAX __FLT_MAX__
/*
 * \brief Extracts the simple per-element operations used by pooling.
 *        Both MaxPool and AvgPool need an "initial", "compute" and "finalize"
 * operation.
 *        MaxPool initializes its accumulator to the negative maximum so the
 * maximum value in the pooling window can be found.
 *        AvgPool initializes its accumulator to zero, accumulates all values
 * in the pooling window, and finally takes the average.
 *        MaxPoolGrad and AvgPoolGrad are the corresponding gradient operations.
*/
template <typename T>
class MaxPool {
public:
inline T initial() {
if (typeid(T) == typeid(int8_t)) {
return static_cast<T>(-SCHAR_MAX);
template <PoolingType P = MAX>
struct PoolingVal {
float val;
int count;
PoolingVal() : count(0) { val = -std::numeric_limits<float>::max(); }
inline PoolingVal<P> &operator+=(const float &x) {
val = std::max(val, x);
++count;
return *this;
}
return static_cast<T>(-FLT_MAX);
inline float Value() { return (count > 0) ? val : 0.f; }
};
template <>
struct PoolingVal<AVG> {
float val;
int count;
PoolingVal() : val(0.f), count(0) {}
inline PoolingVal<AVG> &operator+=(const float &x) {
val += x;
++count;
return *this;
}
inline float Value() { return (count > 0) ? val / count : 0.f; }
};
inline void compute(const T &x, T *y) { *y = *y > x ? *y : x; }
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
template <PoolingType P = MAX>
inline float32x4_t vPoolInitq_f32() {
return vdupq_n_f32(-std::numeric_limits<float>::max());
}
template <>
inline float32x4_t vPoolInitq_f32<AVG>() {
return vdupq_n_f32(0.f);
}
template <PoolingType P = MAX>
inline float32x4_t vPoolPreq_f32(const float32x4_t &x1, const float32x4_t &x2) {
return vmaxq_f32(x1, x2);
}
template <>
inline float32x4_t vPoolPreq_f32<AVG>(const float32x4_t &x1,
const float32x4_t &x2) {
return vaddq_f32(x1, x2);
}
template <PoolingType P = MAX>
inline float32x4_t vPoolPostq_f32(const float32x4_t &x,
const float32x4_t &post) {
return x;
}
template <>
inline float32x4_t vPoolPostq_f32<AVG>(const float32x4_t &x,
const float32x4_t &post) {
return vmulq_f32(x, post);
}
#endif // __ARM_NEON__
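// Scalar counterparts of vPoolPreq_f32 / vPoolPostq_f32: the pre-combine step
// is max (MAX) or add (AVG); the post step is the identity (MAX) or a multiply
// by the precomputed 1/window_size factor (AVG).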
template <PoolingType P = MAX>
inline float PoolPre(const float &x1, const float &x2) {
return std::max(x1, x2);
}
template <>
inline float PoolPre<AVG>(const float &x1, const float &x2) {
return x1 + x2;
}
template <PoolingType P = MAX>
inline float PoolPost(const float &x, const float &post) {
return x;
}
template <>
inline float PoolPost<AVG>(const float &x, const float &post) {
return x * post;
}
template <PoolingType P>
struct Pooling {
inline void operator()(const framework::Tensor &input,
const std::vector<int> &kernel_size,
const std::vector<int> &strides,
const std::vector<int> &paddings,
framework::Tensor *output);
};
inline void finalize(const T &pool_field, T *y) {}
template <PoolingType P, int Stride>
struct Pooling2x2 {
inline void operator()(const framework::Tensor &input,
const std::vector<int> &paddings,
framework::Tensor *output);
};
template <typename Itype, typename Otype>
class AvgPool {
public:
inline Otype initial() { return static_cast<Otype>(0); }
inline void compute(const Itype &x, Otype *y) { *y += x; }
inline void finalize(const float &pool_field, Otype *y) {
if (typeid(Itype) == typeid(int8_t)) {
float tmp = *y / pool_field;
if (tmp > SCHAR_MAX) {
*y = SCHAR_MAX;
} else if (tmp < -SCHAR_MAX) {
*y = -SCHAR_MAX;
} else {
*y = static_cast<Otype>(std::round(tmp));
}
} else {
*y /= pool_field;
}
}
template <PoolingType P, int Stride>
struct Pooling3x3 {
inline void operator()(const framework::Tensor &input,
const std::vector<int> &paddings,
framework::Tensor *output);
};
template <typename DeviceType, typename PoolProcess, typename T>
class PoolFunctor {
public:
void operator()(const framework::Tensor &input, const std::vector<int> &ksize,
const std::vector<int> &strides,
const std::vector<int> &paddings, PoolProcess pool_compute,
template <PoolingType P, int Stride>
struct Pooling5x5 {
inline void operator()(const framework::Tensor &input,
const std::vector<int> &paddings,
framework::Tensor *output);
};
template <PoolingType P, int Stride>
struct Pooling7x7 {
inline void operator()(const framework::Tensor &input,
const std::vector<int> &paddings,
framework::Tensor *output);
};
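// A minimal usage sketch (hypothetical call site; it assumes the output tensor
// has already been resized to the pooled shape):
//   Pooling<MAX>()(input, /*kernel_size=*/{3, 3}, /*strides=*/{1, 1},
//                  /*paddings=*/{1, 1}, &output);
//   Pooling3x3<AVG, 2>()(input, /*paddings=*/{1, 1}, &output);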
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "operators/math/pooling.h"
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif // __ARM_NEON
namespace paddle_mobile {
namespace operators {
namespace math {
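// Expands to the border-column loop used by Pooling3x3NormalRow: for each
// output column w in [start, end) it accumulates the part of the 3x3 window
// that falls inside the input into a PoolingVal<P> and writes val.Value() to
// output_ptr[w]. h_start, h_end, padding_w, input_w, input and output_ptr must
// be in scope at the expansion site.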
#define POOLING3X3_NORMAL_BORDER(start, end) \
for (int w = start; w < end; ++w) { \
const int w_in_start = -padding_w + w * Stride; \
const int w_in_end = w_in_start + 3; \
const int w_start = w_in_start > 0 ? w_in_start : 0; \
const int w_end = w_in_end < input_w ? w_in_end : input_w; \
PoolingVal<P> val; \
for (int h_in = h_start; h_in < h_end; ++h_in) { \
for (int w_in = w_start; w_in < w_end; ++w_in) { \
val += input[h_in * input_w + w_in]; \
} \
} \
output_ptr[w] = val.Value(); \
}
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
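// Gathers the horizontally clipped part of the 3x3 window for a border output
// column: the valid columns of 8 (stride 1) or 13 (stride 2) consecutive input
// rows are copied into a small stack buffer, loaded as vectors, shifted with
// vext and reduced into y0 with the pooling pre-combine op, yielding six
// vertically pooled output rows at once.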
template <PoolingType P, int Stride = 1>
struct Pooling3x3ValidColLoadInput {
inline void operator()(const float *input, const int input_w,
const int valid_cols, float32x4x2_t &x0, // NOLINT
float32x4x2_t &x1, float32x4x2_t &x2, // NOLINT
float32x4x2_t &y0) { // NOLINT
float fake_input[3][8];
if (valid_cols == 1) {
for (int i = 0; i < 8; ++i, input += input_w) {
fake_input[0][i] = input[0];
}
} else if (valid_cols == 2) {
for (int i = 0; i < 8; ++i, input += input_w) {
fake_input[0][i] = input[0];
fake_input[1][i] = input[1];
}
} else {
for (int i = 0; i < 8; ++i, input += input_w) {
fake_input[0][i] = input[0];
fake_input[1][i] = input[1];
fake_input[2][i] = input[2];
}
}
y0.val[0] = vPoolInitq_f32<P>();
y0.val[1] = vPoolInitq_f32<P>();
for (int i = 0; i < valid_cols; ++i) {
x0.val[0] = vld1q_f32(fake_input[i]);
x0.val[1] = vld1q_f32(fake_input[i] + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x1.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x1.val[1], y0.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x2.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x2.val[1], y0.val[1]);
}
}
};
template <PoolingType P>
struct Pooling3x3ValidColLoadInput<P, 2> {
inline void operator()(const float *input, const int input_w,
const int valid_cols, float32x4x2_t &x0, // NOLINT
float32x4x2_t &x1, float32x4x2_t &x2, // NOLINT
float32x4x2_t &y0) { // NOLINT
float fake_input[3][13];
if (valid_cols == 1) {
for (int i = 0; i < 13; ++i, input += input_w) {
fake_input[0][i] = input[0];
}
} else if (valid_cols == 2) {
for (int i = 0; i < 13; ++i, input += input_w) {
fake_input[0][i] = input[0];
fake_input[1][i] = input[1];
}
} else {
for (int i = 0; i < 13; ++i, input += input_w) {
fake_input[0][i] = input[0];
fake_input[1][i] = input[1];
fake_input[2][i] = input[2];
}
}
for (int i = 0; i < valid_cols; ++i) {
x0 = vld2q_f32(fake_input[i]);
x1 = vld2q_f32(fake_input[i] + 8);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
}
}
};
template <PoolingType P, int Stride = 1>
struct Pooling3x3NormalRowLoadInput {
inline void operator()(const float *input, float32x4x2_t &x0, // NOLINT
float32x4x2_t &x1, float32x4x2_t &x2, // NOLINT
float32x4x2_t &y0) { // NOLINT
x0.val[0] = vld1q_f32(input);
x0.val[1] = vld1q_f32(input + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x1.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x1.val[1], y0.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x2.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x2.val[1], y0.val[1]);
}
};
template <PoolingType P>
struct Pooling3x3NormalRowLoadInput<P, 2> {
inline void operator()(const float *input, float32x4x2_t &x0, // NOLINT
float32x4x2_t &x1, float32x4x2_t &x2, // NOLINT
float32x4x2_t &y0) { // NOLINT
x0 = vld2q_f32(input);
x1 = vld2q_f32(input + 8);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
}
};
#endif // __ARM_NEON__
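// Computes one output column at the left or right border (w_output) for output
// rows [h_output, h_output_end). The NEON path produces six output rows per
// iteration via Pooling3x3ValidColLoadInput; the remaining rows use a scalar
// PoolingVal accumulation.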
template <PoolingType P, int Stride>
inline void Pooling3x3ValidCol(const float *input, const int h_output,
const int h_output_end, const int w_output,
const int input_h, const int input_w,
const int padding_h, const int padding_w,
const int output_w, float *output) {
const int w_in_start = -padding_w + w_output * Stride;
const int w_in_end = w_in_start + 3;
const int w_start = w_in_start > 0 ? w_in_start : 0;
const int w_end = w_in_end < input_w ? w_in_end : input_w;
int remain_start = h_output;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
int output_tiles = (h_output_end - h_output) / 6;
remain_start = h_output + output_tiles * 6;
int input_h_start = h_output * Stride - padding_h;
size_t input_offset = input_h_start * input_w + w_start;
size_t output_offset = h_output * output_w + w_output;
int valid_cols = w_end - w_start;
Pooling3x3ValidColLoadInput<P, Stride> PoolingCompute;
float32x4x2_t x0, x1, x2, y0;
float32x4_t avg = vdupq_n_f32(1.f / (3 * valid_cols));
for (int h = 0; h < output_tiles * 6; h += 6) {
float *output0 = output + output_offset;
float *output1 = output0 + output_w;
float *output2 = output1 + output_w;
float *output3 = output2 + output_w;
float *output4 = output3 + output_w;
float *output5 = output4 + output_w;
y0.val[0] = vPoolInitq_f32<P>();
y0.val[1] = vPoolInitq_f32<P>();
PoolingCompute(input + input_offset, input_w, valid_cols, x0, x1, x2, y0);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], avg);
y0.val[1] = vPoolPostq_f32<P>(y0.val[1], avg);
vst1q_lane_f32(output0, y0.val[0], 0);
vst1q_lane_f32(output1, y0.val[0], 1);
vst1q_lane_f32(output2, y0.val[0], 2);
vst1q_lane_f32(output3, y0.val[0], 3);
vst1q_lane_f32(output4, y0.val[1], 0);
vst1q_lane_f32(output5, y0.val[1], 1);
input_offset += 6 * Stride * input_w;
output_offset += 6 * output_w;
}
#endif
for (int h = remain_start; h < h_output_end; ++h) {
PoolingVal<P> val;
const int h_in_start = -padding_h + h * Stride;
for (int i = 0; i < 3; ++i) {
for (int w_in = w_start; w_in < w_end; ++w_in) {
val += input[(h_in_start + i) * input_w + w_in];
}
}
output[h * output_w + w_output] = val.Value();
}
}
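// Computes one full output row whose 3x3 window may be clipped vertically
// (rows inside the top/bottom padding). Border columns use
// POOLING3X3_NORMAL_BORDER; the interior columns are vectorized six outputs
// per iteration, with a scalar loop for the remainder.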
template <PoolingType P, int Stride>
inline void Pooling3x3NormalRow(const float *input, const int h_output,
const int input_h, const int input_w,
const int padding_h, const int padding_w,
const int output_w, float *output) {
const int h_in_start = -padding_h + h_output * Stride;
const int h_in_end = h_in_start + 3;
const int h_start = h_in_start > 0 ? h_in_start : 0;
const int h_end = h_in_end < input_h ? h_in_end : input_h;
int valid_w_start = (padding_w + Stride - 1) / Stride;
int valid_w_end = (input_w - 3) / Stride + 1 + valid_w_start;
float *output_ptr = output + h_output * output_w;
// border left
POOLING3X3_NORMAL_BORDER(0, valid_w_start)
// middle
int remain_start = valid_w_start;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
int output_tiles = (valid_w_end - valid_w_start) / 6;
remain_start = valid_w_start + output_tiles * 6;
Pooling3x3NormalRowLoadInput<P, Stride> PoolingCompute;
float32x4x2_t x0, x1, x2, y0;
float32x4_t post = vdupq_n_f32(1.f / (3 * (h_end - h_start)));
for (int w = 0; w < output_tiles * 6; w += 6) {
int output_offset = valid_w_start + w;
int input_w_offset = output_offset * Stride - padding_w;
y0.val[0] = vPoolInitq_f32<P>();
y0.val[1] = vPoolInitq_f32<P>();
for (int h_in = h_start; h_in < h_end; ++h_in) {
PoolingCompute(input + h_in * input_w + input_w_offset, x0, x1, x2, y0);
}
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
y0.val[1] = vPoolPostq_f32<P>(y0.val[1], post);
vst1q_f32(output_ptr + output_offset, y0.val[0]);
vst1_f32(output_ptr + output_offset + 4, vget_low_f32(y0.val[1]));
}
#endif // __ARM_NEON__
for (int w = remain_start; w < valid_w_end; ++w) {
PoolingVal<P> val;
int input_start = -padding_w + w * Stride;
for (int h_in = h_start; h_in < h_end; ++h_in) {
for (int j = 0; j < 3; ++j) {
val += input[h_in * input_w + j + input_start];
}
}
output_ptr[w] = val.Value();
}
// border right
POOLING3X3_NORMAL_BORDER(valid_w_end, output_w)
}
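// Stride-1 3x3 pooling. Rows and columns touched by the padding are delegated
// to Pooling3x3NormalRow / Pooling3x3ValidCol; the interior is processed four
// output rows by six output columns per NEON iteration, followed by a 4-wide
// block and a scalar remainder.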
template <PoolingType P>
struct Pooling3x3<P, 1> {
inline void operator()(const framework::Tensor &input,
const std::vector<int> &paddings,
framework::Tensor *output) {
const float *input_data = input.data<float>();
float *output_data = output->mutable_data<float>();
int input_h = input.dims()[2];
int input_w = input.dims()[3];
int output_h = output->dims()[2];
int output_w = output->dims()[3];
int padding_h = paddings[0];
int padding_w = paddings[1];
int image_size = input_h * input_w;
int out_image_size = output_h * output_w;
int valid_h_start = padding_h;
int valid_h = input_h - 2;
int valid_h_end = valid_h_start + valid_h;
int valid_w_start = padding_w;
int valid_w = input_w - 2;
int valid_w_end = valid_w_start + valid_w;
float avg = 1.f / 9;
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < output->dims()[0]; ++batch) {
for (int c = 0; c < output->dims()[1]; ++c) {
int channel = batch * output->dims()[1] + c;
const float *input_ptr = input_data + channel * image_size;
float *output_ptr = output_data + channel * out_image_size;
// top
for (int h = 0; h < valid_h_start; ++h) {
Pooling3x3NormalRow<P, 1>(input_ptr, h, input_h, input_w, padding_h,
padding_w, output_w, output_ptr);
}
// left
for (int w = 0; w < valid_w_start; ++w) {
Pooling3x3ValidCol<P, 1>(input_ptr, valid_h_start, valid_h_end, w,
input_h, input_w, padding_h, padding_w,
output_w, output_ptr);
}
// right
for (int w = valid_w_end; w < output_w; ++w) {
Pooling3x3ValidCol<P, 1>(input_ptr, valid_h_start, valid_h_end, w,
input_h, input_w, padding_h, padding_w,
output_w, output_ptr);
}
// bottom
for (int h = valid_h_end; h < output_h; ++h) {
Pooling3x3NormalRow<P, 1>(input_ptr, h, input_h, input_w, padding_h,
padding_w, output_w, output_ptr);
}
// valid
int output_w_tiles = valid_w / 6;
int output_w_remain = valid_w - output_w_tiles * 6;
for (int h = valid_h_start; h < valid_h_end - 3; h += 4) {
const float *input_ptr0 = input_ptr + (h - padding_h) * input_w;
const float *input_ptr1 = input_ptr0 + input_w;
const float *input_ptr2 = input_ptr1 + input_w;
const float *input_ptr3 = input_ptr2 + input_w;
const float *input_ptr4 = input_ptr3 + input_w;
const float *input_ptr5 = input_ptr4 + input_w;
float *output_ptr0 = output_ptr + h * output_w + valid_w_start;
float *output_ptr1 = output_ptr0 + output_w;
float *output_ptr2 = output_ptr1 + output_w;
float *output_ptr3 = output_ptr2 + output_w;
int remain = output_w_remain;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
float32x4x2_t x0, x1, x2;
float32x4x2_t y0, y1, y2;
float32x4_t post = vdupq_n_f32(1.f / 9);
for (int loop = 0; loop < output_w_tiles; ++loop) {
x0.val[0] = vld1q_f32(input_ptr0);
x0.val[1] = vld1q_f32(input_ptr0 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x1.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
x0.val[0] = vld1q_f32(input_ptr1);
x0.val[1] = vld1q_f32(input_ptr1 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x1.val[1]);
y1.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y1.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(y1.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(y1.val[1], y0.val[1]);
x0.val[0] = vld1q_f32(input_ptr2);
x0.val[1] = vld1q_f32(input_ptr2 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x1.val[1]);
y2.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y2.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y1.val[0] = vPoolPreq_f32<P>(y2.val[0], y1.val[0]);
y1.val[1] = vPoolPreq_f32<P>(y2.val[1], y1.val[1]);
y0.val[0] = vPoolPreq_f32<P>(y2.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(y2.val[1], y0.val[1]);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
y0.val[1] = vPoolPostq_f32<P>(y0.val[1], post);
vst1q_f32(output_ptr0, y0.val[0]);
vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1]));
x0.val[0] = vld1q_f32(input_ptr3);
x0.val[1] = vld1q_f32(input_ptr3 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x1.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y1.val[0] = vPoolPreq_f32<P>(y0.val[0], y1.val[0]);
y1.val[1] = vPoolPreq_f32<P>(y0.val[1], y1.val[1]);
y2.val[0] = vPoolPreq_f32<P>(y0.val[0], y2.val[0]);
y2.val[1] = vPoolPreq_f32<P>(y0.val[1], y2.val[1]);
y1.val[0] = vPoolPostq_f32<P>(y1.val[0], post);
y1.val[1] = vPoolPostq_f32<P>(y1.val[1], post);
vst1q_f32(output_ptr1, y1.val[0]);
vst1_f32(output_ptr1 + 4, vget_low_f32(y1.val[1]));
x0.val[0] = vld1q_f32(input_ptr4);
x0.val[1] = vld1q_f32(input_ptr4 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x1.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
y2.val[0] = vPoolPreq_f32<P>(x0.val[0], y2.val[0]);
y2.val[1] = vPoolPreq_f32<P>(x0.val[1], y2.val[1]);
y2.val[0] = vPoolPostq_f32<P>(y2.val[0], post);
y2.val[1] = vPoolPostq_f32<P>(y2.val[1], post);
vst1q_f32(output_ptr2, y2.val[0]);
vst1_f32(output_ptr2 + 4, vget_low_f32(y2.val[1]));
x0.val[0] = vld1q_f32(input_ptr5);
x0.val[1] = vld1q_f32(input_ptr5 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x1.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
y0.val[1] = vPoolPostq_f32<P>(y0.val[1], post);
vst1q_f32(output_ptr3, y0.val[0]);
vst1_f32(output_ptr3 + 4, vget_low_f32(y0.val[1]));
input_ptr0 += 6;
input_ptr1 += 6;
input_ptr2 += 6;
input_ptr3 += 6;
input_ptr4 += 6;
input_ptr5 += 6;
output_ptr0 += 6;
output_ptr1 += 6;
output_ptr2 += 6;
output_ptr3 += 6;
}
// remain width
if (remain >= 4) {
x0.val[0] = vld1q_f32(input_ptr0);
x0.val[1] = vld1q_f32(input_ptr0 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[0] = vld1q_f32(input_ptr1);
x0.val[1] = vld1q_f32(input_ptr1 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
y1.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[0] = vPoolPreq_f32<P>(y1.val[0], y0.val[0]);
x0.val[0] = vld1q_f32(input_ptr2);
x0.val[1] = vld1q_f32(input_ptr2 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
y2.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y1.val[0] = vPoolPreq_f32<P>(y2.val[0], y1.val[0]);
y0.val[0] = vPoolPreq_f32<P>(y2.val[0], y0.val[0]);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
vst1q_f32(output_ptr0, y0.val[0]);
x0.val[0] = vld1q_f32(input_ptr3);
x0.val[1] = vld1q_f32(input_ptr3 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y1.val[0] = vPoolPreq_f32<P>(y0.val[0], y1.val[0]);
y2.val[0] = vPoolPreq_f32<P>(y0.val[0], y2.val[0]);
y1.val[0] = vPoolPostq_f32<P>(y1.val[0], post);
vst1q_f32(output_ptr1, y1.val[0]);
x0.val[0] = vld1q_f32(input_ptr4);
x0.val[1] = vld1q_f32(input_ptr4 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y2.val[0] = vPoolPreq_f32<P>(x0.val[0], y2.val[0]);
y2.val[0] = vPoolPostq_f32<P>(y2.val[0], post);
vst1q_f32(output_ptr2, y2.val[0]);
x0.val[0] = vld1q_f32(input_ptr5);
x0.val[1] = vld1q_f32(input_ptr5 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
vst1q_f32(output_ptr3, y0.val[0]);
input_ptr0 += 4;
input_ptr1 += 4;
input_ptr2 += 4;
input_ptr3 += 4;
input_ptr4 += 4;
input_ptr5 += 4;
output_ptr0 += 4;
output_ptr1 += 4;
output_ptr2 += 4;
output_ptr3 += 4;
remain -= 4;
}
#endif // __ARM_NEON__
for (int r = 0; r < remain; ++r) {
float m0 = PoolPre<P>(input_ptr0[r], input_ptr0[r + 1]);
m0 = PoolPre<P>(m0, input_ptr0[r + 2]);
float m1 = PoolPre<P>(input_ptr1[r], input_ptr1[r + 1]);
m1 = PoolPre<P>(m1, input_ptr1[r + 2]);
float m2 = PoolPre<P>(input_ptr2[r], input_ptr2[r + 1]);
m2 = PoolPre<P>(m2, input_ptr2[r + 2]);
float m3 = PoolPre<P>(input_ptr3[r], input_ptr3[r + 1]);
m3 = PoolPre<P>(m3, input_ptr3[r + 2]);
float m4 = PoolPre<P>(input_ptr4[r], input_ptr4[r + 1]);
m4 = PoolPre<P>(m4, input_ptr4[r + 2]);
float m5 = PoolPre<P>(input_ptr5[r], input_ptr5[r + 1]);
m5 = PoolPre<P>(m5, input_ptr5[r + 2]);
m0 = PoolPre<P>(PoolPre<P>(m0, m1), m2);
m1 = PoolPre<P>(PoolPre<P>(m1, m2), m3);
m2 = PoolPre<P>(PoolPre<P>(m2, m3), m4);
m3 = PoolPre<P>(PoolPre<P>(m3, m4), m5);
output_ptr0[r] = PoolPost<P>(m0, avg);
output_ptr1[r] = PoolPost<P>(m1, avg);
output_ptr2[r] = PoolPost<P>(m2, avg);
output_ptr3[r] = PoolPost<P>(m3, avg);
}
}
// remain height
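// the blocked loop above produces 4 output rows per iteration, so resume
// from the largest multiple of 4 within the valid height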
int start_h = valid_h_start + (valid_h & 0xFFFC);
for (int h = start_h; h < valid_h_end; ++h) {
const float *input_ptr0 = input_ptr + (h - padding_h) * input_w;
const float *input_ptr1 = input_ptr0 + input_w;
const float *input_ptr2 = input_ptr1 + input_w;
float *output_ptr0 = output_ptr + h * output_w + valid_w_start;
int remain = output_w_remain;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
float32x4x2_t x0, x1, x2, y0;
float32x4_t post = vdupq_n_f32(1.f / 9);
for (int loop = 0; loop < output_w_tiles; ++loop) {
x0.val[0] = vld1q_f32(input_ptr0);
x0.val[1] = vld1q_f32(input_ptr0 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x1.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
x0.val[0] = vld1q_f32(input_ptr1);
x0.val[1] = vld1q_f32(input_ptr1 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x1.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
x0.val[0] = vld1q_f32(input_ptr2);
x0.val[1] = vld1q_f32(input_ptr2 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x1.val[1] = vextq_f32(x0.val[1], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x2.val[1] = vextq_f32(x0.val[1], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x1.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
y0.val[1] = vPoolPostq_f32<P>(y0.val[1], post);
vst1q_f32(output_ptr0, y0.val[0]);
vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1]));
input_ptr0 += 6;
input_ptr1 += 6;
input_ptr2 += 6;
output_ptr0 += 6;
}
// remain width
if (remain >= 4) {
x0.val[0] = vld1q_f32(input_ptr0);
x0.val[1] = vld1q_f32(input_ptr0 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[0] = vld1q_f32(input_ptr1);
x0.val[1] = vld1q_f32(input_ptr1 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
x0.val[0] = vld1q_f32(input_ptr2);
x0.val[1] = vld1q_f32(input_ptr2 + 4);
x1.val[0] = vextq_f32(x0.val[0], x0.val[1], 1);
x2.val[0] = vextq_f32(x0.val[0], x0.val[1], 2);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x1.val[0]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
vst1q_f32(output_ptr0, y0.val[0]);
input_ptr0 += 4;
input_ptr1 += 4;
input_ptr2 += 4;
output_ptr0 += 4;
remain -= 4;
}
#endif // __ARM_NEON__
for (int r = 0; r < remain; ++r) {
float m0 = PoolPre<P>(input_ptr0[r], input_ptr0[r + 1]);
m0 = PoolPre<P>(m0, input_ptr0[r + 2]);
float m1 = PoolPre<P>(input_ptr1[r], input_ptr1[r + 1]);
m1 = PoolPre<P>(m1, input_ptr1[r + 2]);
float m2 = PoolPre<P>(input_ptr2[r], input_ptr2[r + 1]);
m2 = PoolPre<P>(m2, input_ptr2[r + 2]);
m0 = PoolPre<P>(PoolPre<P>(m0, m1), m2);
output_ptr0[r] = PoolPost<P>(m0, avg);
}
}
}
}
}
};
template <PoolingType P>
struct Pooling3x3<P, 2> {
inline void operator()(const framework::Tensor &input,
const std::vector<int> &paddings,
framework::Tensor *output) {
const float *input_data = input.data<float>();
float *output_data = output->mutable_data<float>();
int input_h = input.dims()[2];
int input_w = input.dims()[3];
int output_h = output->dims()[2];
int output_w = output->dims()[3];
int padding_h = paddings[0];
int padding_w = paddings[1];
int image_size = input_h * input_w;
int out_image_size = output_h * output_w;
int valid_h_start = (padding_h + 1) / 2;
int valid_h = (input_h - 3) / 2 + 1;
int valid_h_end = valid_h_start + valid_h;
int valid_w_start = (padding_w + 1) / 2;
int valid_w = (input_w - 3) / 2 + 1;
int valid_w_end = valid_w_start + valid_w;
float avg = 1.f / 9;
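// note: inside the fully-covered valid region below, AVG pooling always
// divides by the full window size (the fixed 1.f/9 factor), regardless of padding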
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < output->dims()[0]; ++batch) {
for (int c = 0; c < output->dims()[1]; ++c) {
int channel = batch * output->dims()[1] + c;
const float *input_ptr = input_data + channel * image_size;
float *output_ptr = output_data + channel * out_image_size;
// top
for (int h = 0; h < valid_h_start; ++h) {
Pooling3x3NormalRow<P, 2>(input_ptr, h, input_h, input_w, padding_h,
padding_w, output_w, output_ptr);
}
// left
for (int w = 0; w < valid_w_start; ++w) {
Pooling3x3ValidCol<P, 2>(input_ptr, valid_h_start, valid_h_end, w,
input_h, input_w, padding_h, padding_w,
output_w, output_ptr);
}
// right
for (int w = valid_w_end; w < output_w; ++w) {
Pooling3x3ValidCol<P, 2>(input_ptr, valid_h_start, valid_h_end, w,
input_h, input_w, padding_h, padding_w,
output_w, output_ptr);
}
// bottom
for (int h = valid_h_end; h < output_h; ++h) {
Pooling3x3NormalRow<P, 2>(input_ptr, h, input_h, input_w, padding_h,
padding_w, output_w, output_ptr);
}
// valid
int input_w_start = 2 * valid_w_start - padding_w;
int output_w_tiles = valid_w / 6;
int output_w_remain = valid_w - output_w_tiles * 6;
for (int h = valid_h_start; h < valid_h_end - 2; h += 3) {
size_t offset = (2 * h - padding_h) * input_w + input_w_start;
const float *input_ptr0 = input_ptr + offset;
const float *input_ptr1 = input_ptr0 + input_w;
const float *input_ptr2 = input_ptr1 + input_w;
const float *input_ptr3 = input_ptr2 + input_w;
const float *input_ptr4 = input_ptr3 + input_w;
const float *input_ptr5 = input_ptr4 + input_w;
const float *input_ptr6 = input_ptr5 + input_w;
float *output_ptr0 = output_ptr + h * output_w + valid_w_start;
float *output_ptr1 = output_ptr0 + output_w;
float *output_ptr2 = output_ptr1 + output_w;
int remain = output_w_remain;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
float32x4x2_t x0, x1, x2;
float32x4x2_t y0, y1, y2;
float32x4_t post = vdupq_n_f32(1.f / 9);
for (int loop = 0; loop < output_w_tiles; ++loop) {
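// vld2q_f32 de-interleaves 16 consecutive floats into even columns (val[0])
// and odd columns (val[1]); combining evens, odds and the evens shifted by
// one element yields four 3-wide, stride-2 windows per register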
x0 = vld2q_f32(input_ptr0);
x1 = vld2q_f32(input_ptr0 + 8);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
x0 = vld2q_f32(input_ptr1);
x1 = vld2q_f32(input_ptr1 + 8);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
x0 = vld2q_f32(input_ptr2);
x1 = vld2q_f32(input_ptr2 + 8);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
y1.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y1.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(y1.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(y1.val[1], y0.val[1]);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
y0.val[1] = vPoolPostq_f32<P>(y0.val[1], post);
vst1q_f32(output_ptr0, y0.val[0]);
vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1]));
x0 = vld2q_f32(input_ptr3);
x1 = vld2q_f32(input_ptr3 + 8);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y1.val[0] = vPoolPreq_f32<P>(x0.val[0], y1.val[0]);
y1.val[1] = vPoolPreq_f32<P>(x0.val[1], y1.val[1]);
x0 = vld2q_f32(input_ptr4);
x1 = vld2q_f32(input_ptr4 + 8);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y1.val[0] = vPoolPreq_f32<P>(y0.val[0], y1.val[0]);
y1.val[1] = vPoolPreq_f32<P>(y0.val[1], y1.val[1]);
y1.val[0] = vPoolPostq_f32<P>(y1.val[0], post);
y1.val[1] = vPoolPostq_f32<P>(y1.val[1], post);
vst1q_f32(output_ptr1, y1.val[0]);
vst1_f32(output_ptr1 + 4, vget_low_f32(y1.val[1]));
x0 = vld2q_f32(input_ptr5);
x1 = vld2q_f32(input_ptr5 + 8);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
x0 = vld2q_f32(input_ptr6);
x1 = vld2q_f32(input_ptr6 + 8);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
y0.val[1] = vPoolPostq_f32<P>(y0.val[1], post);
vst1q_f32(output_ptr2, y0.val[0]);
vst1_f32(output_ptr2 + 4, vget_low_f32(y0.val[1]));
input_ptr0 += 12;
input_ptr1 += 12;
input_ptr2 += 12;
input_ptr3 += 12;
input_ptr4 += 12;
input_ptr5 += 12;
input_ptr6 += 12;
output_ptr0 += 6;
output_ptr1 += 6;
output_ptr2 += 6;
}
// remain width
if (remain >= 4) {
x0 = vld2q_f32(input_ptr0);
x1.val[0] = vdupq_n_f32(input_ptr0[8]);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0 = vld2q_f32(input_ptr1);
x1.val[0] = vdupq_n_f32(input_ptr1[8]);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
x0 = vld2q_f32(input_ptr2);
x1.val[0] = vdupq_n_f32(input_ptr2[8]);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
y1.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[0] = vPoolPreq_f32<P>(y1.val[0], y0.val[0]);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
vst1q_f32(output_ptr0, y0.val[0]);
x0 = vld2q_f32(input_ptr3);
x1.val[0] = vdupq_n_f32(input_ptr3[8]);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y1.val[0] = vPoolPreq_f32<P>(x0.val[0], y1.val[0]);
x0 = vld2q_f32(input_ptr4);
x1.val[0] = vdupq_n_f32(input_ptr4[8]);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y1.val[0] = vPoolPreq_f32<P>(y0.val[0], y1.val[0]);
y1.val[0] = vPoolPostq_f32<P>(y1.val[0], post);
vst1q_f32(output_ptr1, y1.val[0]);
x0 = vld2q_f32(input_ptr5);
x1.val[0] = vdupq_n_f32(input_ptr5[8]);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
x0 = vld2q_f32(input_ptr6);
x1.val[0] = vdupq_n_f32(input_ptr6[8]);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
vst1q_f32(output_ptr2, y0.val[0]);
input_ptr0 += 8;
input_ptr1 += 8;
input_ptr2 += 8;
input_ptr3 += 8;
input_ptr4 += 8;
input_ptr5 += 8;
input_ptr6 += 8;
output_ptr0 += 4;
output_ptr1 += 4;
output_ptr2 += 4;
remain -= 4;
}
#endif // __ARM_NEON__
for (int r = 0; r < remain; ++r) {
float m0 = PoolPre<P>(input_ptr0[2 * r], input_ptr0[2 * r + 1]);
m0 = PoolPre<P>(m0, input_ptr0[2 * r + 2]);
float m1 = PoolPre<P>(input_ptr1[2 * r], input_ptr1[2 * r + 1]);
m1 = PoolPre<P>(m1, input_ptr1[2 * r + 2]);
float m2 = PoolPre<P>(input_ptr2[2 * r], input_ptr2[2 * r + 1]);
m2 = PoolPre<P>(m2, input_ptr2[2 * r + 2]);
float m3 = PoolPre<P>(input_ptr3[2 * r], input_ptr3[2 * r + 1]);
m3 = PoolPre<P>(m3, input_ptr3[2 * r + 2]);
float m4 = PoolPre<P>(input_ptr4[2 * r], input_ptr4[2 * r + 1]);
m4 = PoolPre<P>(m4, input_ptr4[2 * r + 2]);
float m5 = PoolPre<P>(input_ptr5[2 * r], input_ptr5[2 * r + 1]);
m5 = PoolPre<P>(m5, input_ptr5[2 * r + 2]);
float m6 = PoolPre<P>(input_ptr6[2 * r], input_ptr6[2 * r + 1]);
m6 = PoolPre<P>(m6, input_ptr6[2 * r + 2]);
m0 = PoolPre<P>(PoolPre<P>(m0, m1), m2);
m1 = PoolPre<P>(PoolPre<P>(m2, m3), m4);
m2 = PoolPre<P>(PoolPre<P>(m4, m5), m6);
output_ptr0[r] = PoolPost<P>(m0, avg);
output_ptr1[r] = PoolPost<P>(m1, avg);
output_ptr2[r] = PoolPost<P>(m2, avg);
}
}
// remain height
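// the blocked loop above produces 3 output rows per iteration, so resume
// from the largest multiple of 3 within the valid height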
int start_h = valid_h_start + valid_h / 3 * 3;
for (int h = start_h; h < valid_h_end; ++h) {
size_t offset = (2 * h - padding_h) * input_w + input_w_start;
const float *input_ptr0 = input_ptr + offset;
const float *input_ptr1 = input_ptr0 + input_w;
const float *input_ptr2 = input_ptr1 + input_w;
float *output_ptr0 = output_ptr + h * output_w + valid_w_start;
int remain = output_w_remain;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
float32x4x2_t x0, x1, x2, y0;
float32x4_t post = vdupq_n_f32(1.f / 9);
for (int loop = 0; loop < output_w_tiles; ++loop) {
x0 = vld2q_f32(input_ptr0);
x1 = vld2q_f32(input_ptr0 + 8);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
x0 = vld2q_f32(input_ptr1);
x1 = vld2q_f32(input_ptr1 + 8);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
x0 = vld2q_f32(input_ptr2);
x1 = vld2q_f32(input_ptr2 + 8);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x2.val[1] = vextq_f32(x1.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[1] = vPoolPreq_f32<P>(x1.val[0], x1.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0.val[1] = vPoolPreq_f32<P>(x0.val[1], x2.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[1] = vPoolPreq_f32<P>(x0.val[1], y0.val[1]);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
y0.val[1] = vPoolPostq_f32<P>(y0.val[1], post);
vst1q_f32(output_ptr0, y0.val[0]);
vst1_f32(output_ptr0 + 4, vget_low_f32(y0.val[1]));
input_ptr0 += 12;
input_ptr1 += 12;
input_ptr2 += 12;
output_ptr0 += 6;
}
// remain width
if (remain >= 4) {
x0 = vld2q_f32(input_ptr0);
x1.val[0] = vdupq_n_f32(input_ptr0[8]);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
x0 = vld2q_f32(input_ptr1);
x1.val[0] = vdupq_n_f32(input_ptr1[8]);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
x0 = vld2q_f32(input_ptr2);
x1.val[0] = vdupq_n_f32(input_ptr2[8]);
x2.val[0] = vextq_f32(x0.val[0], x1.val[0], 1);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x0.val[1]);
x0.val[0] = vPoolPreq_f32<P>(x0.val[0], x2.val[0]);
y0.val[0] = vPoolPreq_f32<P>(x0.val[0], y0.val[0]);
y0.val[0] = vPoolPostq_f32<P>(y0.val[0], post);
vst1q_f32(output_ptr0, y0.val[0]);
input_ptr0 += 8;
input_ptr1 += 8;
input_ptr2 += 8;
output_ptr0 += 4;
remain -= 4;
}
#endif // __ARM_NEON__
for (int r = 0; r < remain; ++r) {
float m0 = PoolPre<P>(input_ptr0[2 * r], input_ptr0[2 * r + 1]);
m0 = PoolPre<P>(m0, input_ptr0[2 * r + 2]);
float m1 = PoolPre<P>(input_ptr1[2 * r], input_ptr1[2 * r + 1]);
m1 = PoolPre<P>(m1, input_ptr1[2 * r + 2]);
float m2 = PoolPre<P>(input_ptr2[2 * r], input_ptr2[2 * r + 1]);
m2 = PoolPre<P>(m2, input_ptr2[2 * r + 2]);
m0 = PoolPre<P>(PoolPre<P>(m0, m1), m2);
output_ptr0[r] = PoolPost<P>(m0, avg);
}
}
}
}
}
};
template struct Pooling3x3<MAX, 1>;
template struct Pooling3x3<AVG, 1>;
template struct Pooling3x3<MAX, 2>;
template struct Pooling3x3<AVG, 2>;
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif // POOL_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef QUANT_OP
#pragma once
#include <cmath>
#include "common/types.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
namespace math {
template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO>
inline int8_t Round(const float &x) {
return static_cast<int8_t>(x);
}
template <>
inline int8_t Round<ROUND_NEAREST_AWAY_ZERO>(const float &x) {
return std::round(x);
}
template <>
inline int8_t Round<ROUND_NEAREST_TO_EVEN>(const float &x) {
float v = std::round(x);
int32_t q = static_cast<int32_t>(v);
if (std::abs(std::abs(q - v) - 0.5) <= 0) {
if (std::abs(q) % 2 != 0) {
q = q + ((q > 0) ? -1 : 1);
}
}
return static_cast<int8_t>(q);
}
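// Worked examples (illustrative): for x = 2.5f,
//   Round<ROUND_NEAREST_TOWARDS_ZERO>(x) == 2  (plain truncation),
//   Round<ROUND_NEAREST_AWAY_ZERO>(x)    == 3  (std::round),
//   Round<ROUND_NEAREST_TO_EVEN>(x)      == 2  (ties go to the even neighbour,
//                                               e.g. 3.5f rounds to 4).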
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
template <RoundType R = ROUND_NEAREST_TOWARDS_ZERO>
inline int32x4_t vRoundq_f32(const float32x4_t &x) {
return vcvtq_s32_f32(x);
}
template <>
inline int32x4_t vRoundq_f32<ROUND_NEAREST_AWAY_ZERO>(const float32x4_t &x) {
float32x4_t plus = vdupq_n_f32(0.5);
float32x4_t minus = vdupq_n_f32(-0.5);
float32x4_t zero = vdupq_n_f32(0);
uint32x4_t more_than_zero = vcgtq_f32(x, zero);
float32x4_t temp = vbslq_f32(more_than_zero, plus, minus);
temp = vaddq_f32(x, temp);
int32x4_t ret = vcvtq_s32_f32(temp);
return ret;
}
template <>
inline int32x4_t vRoundq_f32<ROUND_NEAREST_TO_EVEN>(const float32x4_t &x) {
float32x4_t point5 = vdupq_n_f32(0.5);
int32x4_t one = vdupq_n_s32(1);
int32x4_t zero = vdupq_n_s32(0);
int32x4_t rnd = math::vRoundq_f32<ROUND_NEAREST_AWAY_ZERO>(x);
float32x4_t frnd = vcvtq_f32_s32(rnd);
frnd = vsubq_f32(frnd, x);
frnd = vabsq_f32(frnd);
uint32x4_t equal_point5 = vceqq_f32(frnd, point5);
int32x4_t abs_rnd = vabsq_s32(rnd);
abs_rnd = vandq_s32(abs_rnd, one);
uint32x4_t not_mod2 = vreinterpretq_u32_s32(abs_rnd);
uint32x4_t mask = vandq_u32(equal_point5, not_mod2);
uint32x4_t more_than_zero = vcgtq_s32(rnd, zero);
more_than_zero = vandq_u32(more_than_zero, vreinterpretq_u32_s32(one));
mask = veorq_u32(more_than_zero, mask);
more_than_zero = veorq_u32(more_than_zero, vreinterpretq_u32_s32(one));
mask = vaddq_u32(more_than_zero, mask);
int32x4_t smask = vreinterpretq_s32_u32(mask);
smask = vsubq_s32(smask, one);
rnd = vaddq_s32(rnd, smask);
return rnd;
}
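// On AArch64 this could presumably be replaced by a single vcvtnq_s32_f32
// (convert with round-to-nearest, ties-to-even); the masked sequence above is
// kept so the code also builds for ARMv7 NEON.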
#endif // __ARM_NEON__
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif // QUANT_OP
......@@ -69,10 +69,10 @@ class LoDTensor2BatchFunctor {
auto lods = lod_tensor.lod();
PADDLE_MOBILE_ENFORCE((lods.size() == 1UL),
"Only support one level sequence now.");
"Only support 1 level sequence, but %d is given",
lods.size());
const auto& lod = lods[0];
std::vector<SeqInfo> seq_info;
for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
int length = lod[seq_id + 1] - lod[seq_id];
......
......@@ -15,154 +15,131 @@ limitations under the License. */
#ifdef SOFTMAX_OP
#include "operators/math/softmax.h"
#include "common/types.h"
#ifdef __ARM_NEON
#include <math.h>
#include <algorithm>
#include <limits>
#include "common/types.h"
#include "operators/math/math_func_neon.h"
#endif
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::DDim;
using framework::Tensor;
template <typename T>
class SoftmaxFuntor<CPU, T> {
#ifdef __ARM_NEON
void sum(float *input, float *sumptr, int inner_size, int outter_size) {
float32x4_t acc = vdupq_n_f32(0);
float sum_ = 0;
for (int i = 0; i < outter_size; ++i) {
float *input_outer_ptr = input + i * inner_size;
int nn = inner_size >> 2;
int left = inner_size - (nn << 2);
for (; nn > 0; nn--) {
float32x4_t vec_input = vld1q_f32(input_outer_ptr);
acc = vaddq_f32(acc, vec_input);
input_outer_ptr += 4;
}
float32x2_t vsum_ = vadd_f32(vget_high_f32(acc), vget_low_f32(acc));
sum_ = vget_lane_f32(vsum_, 0) + vget_lane_f32(vsum_, 1);
for (; left > 0; left--) {
sum_ += *input_outer_ptr;
input_outer_ptr++;
}
}
for (int j = 0; j < inner_size * outter_size; ++j) {
sumptr[j] = sum_;
}
}
void SoftmaxCacl(const Tensor *X, Tensor *Y) {
const float *input = X->data<float>();
const DDim &dDim = X->dims();
int axis_index = 1;
if (dDim.size() < 4) {
axis_index = 0;
}
DDim outer_ddim =
paddle_mobile::framework::slice_ddim(dDim, 0, axis_index + 1);
DDim inner_ddim =
paddle_mobile::framework::slice_ddim(dDim, axis_index + 1, dDim.size());
int out_size = paddle_mobile::framework::product(outer_ddim);
int inner_size = paddle_mobile::framework::product(inner_ddim);
auto *max_ptr = new float[inner_size * out_size];
// max
for (int j = 0; j < out_size; ++j) {
const float *input_outer_ptr = input + j * inner_size;
float *max_outer_ptr = max_ptr + j * inner_size;
float max_ = 0;
for (int i = 0; i < inner_size; ++i) {
const float *input_inner_ptr = input_outer_ptr + i;
max_ = std::max(max_, input_inner_ptr[0]);
}
for (int k = 0; k < inner_size; ++k) {
max_outer_ptr[k] = max_;
}
}
// exp(value - max)
float *exp_sub_max = new float[inner_size * out_size];
float *exp_sub_max_ptr = &exp_sub_max[0];
for (int l = 0; l < out_size; ++l) {
const float *input_outer_ptr = input + l * inner_size;
float *max_outer_ptr = max_ptr + l * inner_size;
int nn = inner_size >> 2;
int left = inner_size - (nn << 2);
for (; nn > 0; nn--) {
float32x4_t vec_input = vld1q_f32(input_outer_ptr);
float32x4_t vec_max = vld1q_f32(max_outer_ptr);
float32x4_t vec_sub = vsubq_f32(vec_input, vec_max);
float32x4_t vec_exp = exp_ps(vec_sub);
vst1q_f32(exp_sub_max_ptr, vec_exp);
input_outer_ptr += 4;
max_outer_ptr += 4;
exp_sub_max_ptr += 4;
}
for (; left > 0; left--) {
*exp_sub_max_ptr = expf(*input_outer_ptr - *max_outer_ptr);
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#ifndef __aarch64__
inline float32_t vmaxvq_f32(const float32x4_t &r) {
float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r));
return vget_lane_f32(vpmax_f32(v, v), 0);
}
input_outer_ptr++;
max_outer_ptr++;
exp_sub_max_ptr++;
}
}
float *sumptr = new float[inner_size * out_size];
// sum exp
sum(exp_sub_max, sumptr, inner_size, out_size);
// div
auto *out_ptr = Y->mutable_data<float>();
for (int l = 0; l < out_size; ++l) {
const float *input_outer_ptr = exp_sub_max + l * inner_size;
float *output_outer_ptr = out_ptr + l * inner_size;
float *sum_outer_ptr = sumptr + l * inner_size;
int nn = inner_size >> 2;
int left = inner_size - (nn << 2);
for (; nn > 0; nn--) {
float32x4_t vec_input = vld1q_f32(input_outer_ptr);
float32x4_t vec_sum = vld1q_f32(sum_outer_ptr);
float32x4_t vec_div = div_ps(vec_input, vec_sum);
vst1q_f32(output_outer_ptr, vec_div);
input_outer_ptr += 4;
output_outer_ptr += 4;
sum_outer_ptr += 4;
inline float32_t vaddvq_f32(const float32x4_t &r) {
float32x2_t v = vadd_f32(vget_high_f32(r), vget_low_f32(r));
return vget_lane_f32(vpadd_f32(v, v), 0);
}
#endif // __aarch64__
#endif // __ARM_NEON__
float find_max(const float *input, const int num_classes) {
int remain = num_classes;
float max = -std::numeric_limits<float>::max();
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
int loop = num_classes >> 3;
remain = num_classes & 0x7;
float32x4_t __max = vdupq_n_f32(max);
for (int i = 0; i < loop; ++i, input += 8) {
float32x4_t x0 = vld1q_f32(input);
float32x4_t x1 = vld1q_f32(input + 4);
__max = vmaxq_f32(x0, __max);
__max = vmaxq_f32(x1, __max);
}
max = vmaxvq_f32(__max);
#endif
for (int i = 0; i < remain; ++i) {
max = std::max(max, input[i]);
}
for (; left > 0; left--) {
*output_outer_ptr = (*input_outer_ptr) / (*sum_outer_ptr);
input_outer_ptr++;
output_outer_ptr++;
sum_outer_ptr++;
return max;
}
template <>
void SoftmaxFuntor<CPU, float>::operator()(const framework::Tensor *X,
framework::Tensor *Y) {
const framework::DDim &dims = X->dims();
int batch_size = dims[0];
int num_classes = dims[dims.size() - 1];
int channels = X->numel() / batch_size / num_classes;
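// softmax is evaluated independently over the last dimension (num_classes)
// for each of the batch_size * channels rows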
const float *x = X->data<float>();
float *y = Y->mutable_data<float>();
#pragma omp parallel for collapse(2)
for (int batch = 0; batch < X->dims()[0]; ++batch) {
for (int channel = 0; channel < channels; ++channel) {
size_t offset = (batch * channels + channel) * num_classes;
const float *input = x + offset;
float *output = y + offset;
// find max
float max = find_max(input, num_classes);
// exp(x - max)
int remain = num_classes;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
int loop = num_classes >> 3;
remain = num_classes & 0x7;
float32x4_t __max = vdupq_n_f32(max);
for (int i = 0; i < loop; ++i, input += 8, output += 8) {
float32x4_t x0 = vld1q_f32(input);
float32x4_t x1 = vld1q_f32(input + 4);
x0 = vsubq_f32(x0, __max);
x1 = vsubq_f32(x1, __max);
x0 = exp_ps(x0);
x1 = exp_ps(x1);
vst1q_f32(output, x0);
vst1q_f32(output + 4, x1);
}
#endif // __ARM_NEON__
for (int i = 0; i < remain; ++i) {
output[i] = expf(input[i] - max);
}
// sum(exp(x - max))
float sum = 0.f;
output = y + offset;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
float32x4_t __sum = vdupq_n_f32(0.f);
for (int i = 0; i < loop; ++i, output += 8) {
float32x4_t x0 = vld1q_f32(output);
float32x4_t x1 = vld1q_f32(output + 4);
__sum = vaddq_f32(x0, __sum);
__sum = vaddq_f32(x1, __sum);
}
sum += vaddvq_f32(__sum);
#endif // __ARM_NEON__
for (int i = 0; i < remain; ++i) {
sum += output[i];
}
// exp(x - max) / sum
float inv_sum = 1.f / sum;
output = y + offset;
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
float32x4_t __inv_sum = vdupq_n_f32(inv_sum);
for (int i = 0; i < loop; ++i, output += 8) {
float32x4_t x0 = vld1q_f32(output);
float32x4_t x1 = vld1q_f32(output + 4);
x0 = vmulq_f32(x0, __inv_sum);
x1 = vmulq_f32(x1, __inv_sum);
vst1q_f32(output, x0);
vst1q_f32(output + 4, x1);
}
#else
#endif // ARM_NEON
public:
void operator()(const framework::Tensor *X, framework::Tensor *Y) {
const DDim dDim = X->dims();
int dim1 = dDim[dDim.size() - 1];
int dim0 = X->numel() / dim1 / dDim[0];
framework::DDim matrix_shape = {dim0, dim1};
for (int i = 0; i < dDim[0]; ++i) {
framework::Tensor sub_X = X->Slice(i, i + 1);
framework::Tensor sub_Y = Y->Slice(i, i + 1);
sub_X.Resize(matrix_shape);
sub_Y.Resize(matrix_shape);
for (int j = 0; j < dim0; j++) {
framework::Tensor sub_x = sub_X.Slice(j, j + 1);
framework::Tensor sub_y = sub_Y.Slice(j, j + 1);
#ifdef __ARM_NEON
SoftmaxCacl(&sub_x, &sub_y);
#endif
for (int i = 0; i < remain; ++i) {
output[i] *= inv_sum;
}
}
}
};
template class SoftmaxFuntor<CPU, float>;
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // SOFTMAX_OP
......@@ -13,17 +13,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef SOFTMAX_OP
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <typename DeviceType, typename T>
template <typename Device, typename T>
class SoftmaxFuntor {
public:
void operator()(const framework::Tensor *X, framework::Tensor *Y);
};
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......
......@@ -327,8 +327,8 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
int channel = input.dims()[1];
int height = input.dims()[2];
int width = input.dims()[3];
int h_tiles = (height + 3) / 6; // (height - 8 + 5 + 6) / 6
int w_tiles = (width + 3) / 6; // (width - 8 + 5 + 6) / 6
int h_tiles = (height + 3) / 6; // (height - 2 + 5) / 6
int w_tiles = (width + 3) / 6; // (width - 2 + 5) / 6
int tiles = (h_tiles * w_tiles + 7) / 8;
framework::DDim transformed_shape =
framework::make_ddim(std::vector<int>{tiles, 64, channel, 8});
......@@ -336,16 +336,10 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
memset(outptr, 0, output->numel() * sizeof(float));
const float *inptr = input.data<float>();
int inter_h = (height - 2) / 6;
int inter_w = (width - 2) / 6;
int remain_h = height - (inter_h * 6);
int remain_w = width - (inter_w * 6);
height = h_tiles * 6 + 2;
width = w_tiles * 6 + 2;
framework::Tensor input_pad;
if (remain_h > 2 || remain_w > 2) {
inter_h += (remain_h > 2);
inter_w += (remain_w > 2);
height = (inter_h - 1) * 6 + 8;
width = (inter_w - 1) * 6 + 8;
if (height > input.dims()[2] || width > input.dims()[3]) {
framework::DDim input_shape =
framework::make_ddim(std::vector<int>{1, channel, height, width});
PadFunctor<CPU, float> pad;
......@@ -878,8 +872,8 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
framework::Tensor *output) {
// weight shape is [out_channel/4, 64, in_channel, 4],
// input shape is [hw/8, 64, in_channel, 8]
int in_channel = input.dims()[2];
int tiles = input.dims()[0];
int in_channel = input.dims()[2];
int out_channel = weight.dims()[0];
// compute U*V first
......@@ -887,7 +881,6 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
framework::DDim shape =
framework::make_ddim(std::vector<int>{out_channel, tiles, 64, 32});
float *uv_trans_ptr = uv_trans.mutable_data<float>(shape);
memset(uv_trans_ptr, 0, uv_trans.numel() * sizeof(float));
const float *input_ptr = input.data<float>();
const float *weight_ptr = weight.data<float>();
......@@ -910,7 +903,8 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"veor q14, q14, q14 \n"
"veor q15, q15, q15 \n"
"b store_res_%= \n"
"cmp %[inter_channel], #0 \n"
"ble loop_1c_%= \n"
// loop 2 channels
"loop_2c_%=: \n"
"vld1.32 {d0-d3}, [%[w_ptr]]! \n"
......@@ -936,13 +930,14 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"subs %[inter_channel], #1 \n"
"bne loop_2c_%= \n"
"mov pc, lr \n"
// loop 1 channel
"loop_c_%=: \n"
"loop_1c_%=: \n"
"cmp %[remain_channel], #0 \n"
"ble store_res_%= \n"
"vld1.32 {d0-d1}, [%[w_ptr]]! \n"
"vld1.32 {d4-d7}, [%[in_ptr]]! \n"
"vmla.f32 q8, q2, d0[0] \n"
"vmla.f32 q9, q3, d0[0] \n"
"vmla.f32 q10, q2, d0[1] \n"
......@@ -952,28 +947,16 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"vmla.f32 q14, q2, d1[1] \n"
"vmla.f32 q15, q3, d1[1] \n"
"subs %[remain_channel], #1 \n"
"bne loop_c_%= \n"
"mov pc, lr \n"
"store_res_%=: \n"
"cmp %[inter_channel], #0 \n"
"it gt \n"
"blgt loop_2c_%= \n"
"cmp %[remain_channel], #0 \n"
"it gt \n"
"blgt loop_c_%= \n"
"vst1.32 {d16-d19}, [%[uv_ptr]]! \n"
"vst1.32 {d20-d23}, [%[uv_ptr]]! \n"
"vst1.32 {d24-d27}, [%[uv_ptr]]! \n"
"vst1.32 {d28-d31}, [%[uv_ptr]]! \n"
: [w_ptr] "+r"(w_ptr), [in_ptr] "+r"(in_ptr), [uv_ptr] "+r"(uv_ptr),
[remain_channel] "+r"(remain_channel),
[inter_channel] "+r"(inter_channel)
:
: [remain_channel] "r"(remain_channel)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "pc", "lr");
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
}
}
......@@ -1223,8 +1206,10 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
size_t offset = (oc * out_h + 6 * tile_h) * out_w + 6 * tile_w;
float *out_ptr = output_ptr + offset;
int remain_row = (tile_h < h_tiles - 1) ? 6 : remain_h;
int remain_col = (tile_w < w_tiles - 1) ? 6 : remain_w;
int remain_row = out_h - 6 * tile_h;
int remain_col = out_w - 6 * tile_w;
remain_row = (remain_row > 6) ? 6 : remain_row;
remain_col = (remain_col > 6) ? 6 : remain_col;
for (int i = 0; i < remain_row; ++i, out_ptr += out_w) {
memcpy(out_ptr, output_tmp + i * 6, remain_col * sizeof(float));
}
......
......@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Inspired by https://arxiv.org/abs/1509.09308 and referred to the nnpack and
// ncnn projects.
// We refer to https://github.com/andravin/wincnn to access the winograd
// transform matrices.
#ifdef CONV_OP
#ifdef __aarch64__
#include "operators/math/pad.h"
#include "operators/math/winograd/winograd_transform.h"
namespace paddle_mobile {
......@@ -29,46 +27,382 @@ namespace math {
template <>
void winograd_transform_weight<8, 3>(const framework::Tensor &weight,
framework::Tensor *output) {
/*
* w0 = g0
* w1 = ((g0 + g2) + g1) * (-2.0 / 9)
* w2 = ((g0 + g2) - g1) * (-2.0 / 9)
* w3 = ((g0 + 4 * g2) + 2 * g1) * (1.0 / 90)
* w4 = ((g0 + 4 * g2) - 2 * g1) * (1.0 / 90)
* w5 = ((g2 + 4 * g0) + 2 * g1) * (1.0 / 180)
* w6 = ((g2 + 4 * g0) - 2 * g1) * (1.0 / 180)
* w7 = g2
*/
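// The code below applies the filter transform G of Winograd F(6x6, 3x3) to the
// kernel rows first and then to the columns, expanding every 3x3 kernel of an
// (out_channel, in_channel) pair into an 8x8 (= 64) tile.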
// TODO(hjchen2)
PADDLE_MOBILE_THROW_EXCEPTION(
"Winograd for arm v8 has not been implemented.");
// weight shape is [out_channel, in_channel, kernel_h, kernel_w]
int out_channel = weight.dims()[0];
int in_channel = weight.dims()[1];
// reshape and alloc transformed weight
framework::DDim transformed_shape =
framework::make_ddim(std::vector<int>{out_channel, in_channel, 64});
float *outptr = output->mutable_data<float>(transformed_shape);
const float *inptr = weight.data<float>();
for (int oc = 0; oc < out_channel; ++oc) {
for (int ic = 0; ic < in_channel; ++ic) {
size_t offset = oc * in_channel + ic;
float *kout = outptr + offset * 64;
const float *k = inptr + offset * 9;
float gw[3][8];
for (int i = 0; i < 3; ++i, k += 3) {
float g0 = k[0];
float g1 = k[1];
float g2 = k[2];
float d0 = g0 + g2;
float d1 = g0 + 4 * g2;
float d2 = g2 + 4 * g0;
float d3 = 2 * g1;
gw[i][0] = g0;
gw[i][1] = -2.f / 9 * (d0 + g1); // -2.f/9 * (g0 + g1 + g2)
gw[i][2] = -2.f / 9 * (d0 - g1); // -2.f/9 * (g0 - g1 + g2)
gw[i][3] = 1.f / 90 * (d1 + d3); // 1.f/90 * (g0 + 2 * g1 + 4 * g2)
gw[i][4] = 1.f / 90 * (d1 - d3); // 1.f/90 * (g0 - 2 * g1 + 4 * g2)
gw[i][5] = 1.f / 180 * (d2 + d3); // 1.f/180 * (4 * g0 + 2 * g1 + g2)
gw[i][6] = 1.f / 180 * (d2 - d3); // 1.f/180 * (4 * g0 - 2 * g1 + g2)
gw[i][7] = g2;
}
for (int i = 0; i < 8; ++i, kout += 8) {
float g0 = gw[0][i];
float g1 = gw[1][i];
float g2 = gw[2][i];
float d0 = g0 + g2;
float d1 = g0 + 4 * g2;
float d2 = g2 + 4 * g0;
float d3 = 2 * g1;
kout[0] = g0;
kout[1] = -2.f / 9 * (d0 + g1); // -2.f/9 * (k0 + k1 + k2)
kout[2] = -2.f / 9 * (d0 - g1); // -2.f/9 * (k0 - k1 + k2)
kout[3] = 1.f / 90 * (d1 + d3); // 1.f/90 * (k0 + 2 * k1 + 4 * k2)
kout[4] = 1.f / 90 * (d1 - d3); // 1.f/90 * (k0 - 2 * k1 + 4 * k2)
kout[5] = 1.f / 180 * (d2 + d3);  // 1.f/180 * (4 * k0 + 2 * k1 + k2)
kout[6] = 1.f / 180 * (d2 - d3);  // 1.f/180 * (4 * k0 - 2 * k1 + k2)
kout[7] = g2;
}
}
}
}
template <>
void winograd_transform_input<8, 3>(const framework::Tensor &input,
framework::Tensor *output) {
/*
* x0 = (d0 - d6) + (d4 - d2) * 5.25
* x1 = (d2 + d6) - 4.25 * (d4 + d3) + (d1 + d5)
* x2 = (d2 + d6) - 4.25 * (d4 - d3) - (d1 + d5)
* x3 = (0.25 * d2 - 1.25 * d4 + d6) + (0.5 * d1 - 2.5 * d3 + 2 * d5)
* x4 = (0.25 * d2 - 1.25 * d4 + d6) - (0.5 * d1 - 2.5 * d3 + 2 * d5)
* x5 = (4 * d2 - 5 * d4 + d6) + (2 * d1 - 2.5 * d3 + 0.5 * d5)
* x6 = (4 * d2 - 5 * d4 + d6) - (2 * d1 - 2.5 * d3 + 0.5 * d5)
* x7 = (d7 - d1) + (d3 - d5) * 5.25
*/
// TODO(hjchen2)
PADDLE_MOBILE_THROW_EXCEPTION(
"Winograd for arm v8 has not been implemented.");
// tile input to [c, roundup(h/6), roundup(w/6), 64] and do transformation
int channel = input.dims()[1];
int height = input.dims()[2];
int width = input.dims()[3];
int h_tiles = (height + 3) / 6; // (height + 5 - 2) / 6
int w_tiles = (width + 3) / 6; // (width + 5 - 2) / 6
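// e.g. height = 33 gives h_tiles = 6: the six 8-row tiles (stride 6) span
// 6 * 6 + 2 = 38 rows, so rows beyond the image keep the zeros written by the
// memset below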
framework::DDim transformed_shape =
framework::make_ddim(std::vector<int>{channel, h_tiles, w_tiles, 64});
float *outptr = output->mutable_data<float>(transformed_shape);
memset(outptr, 0, channel * h_tiles * w_tiles * 64 * sizeof(float));
const float *inptr = input.data<float>();
// pack input to tiles
for (int c = 0; c < channel; ++c) {
int inter_h = (height - 2) / 6;
int inter_w = (width - 2) / 6;
int remain_h = height - (inter_h * 6);
int remain_w = width - (inter_w * 6);
const float *in0 = inptr + c * height * width;
const float *in1 = in0 + width;
const float *in2 = in1 + width;
const float *in3 = in2 + width;
const float *in4 = in3 + width;
const float *in5 = in4 + width;
const float *in6 = in5 + width;
const float *in7 = in6 + width;
float *out = outptr + c * h_tiles * w_tiles * 64;
for (int h = 0; h < inter_h; ++h) {
for (int w = 0; w < inter_w; ++w) {
memcpy(out, in0, 8 * sizeof(float));
memcpy(out + 8, in1, 8 * sizeof(float));
memcpy(out + 16, in2, 8 * sizeof(float));
memcpy(out + 24, in3, 8 * sizeof(float));
memcpy(out + 32, in4, 8 * sizeof(float));
memcpy(out + 40, in5, 8 * sizeof(float));
memcpy(out + 48, in6, 8 * sizeof(float));
memcpy(out + 56, in7, 8 * sizeof(float));
in0 += 6;
in1 += 6;
in2 += 6;
in3 += 6;
in4 += 6;
in5 += 6;
in6 += 6;
in7 += 6;
out += 64;
}
// remain width
if (remain_w > 2) {
memcpy(out, in0, remain_w * sizeof(float));
memcpy(out + 8, in1, remain_w * sizeof(float));
memcpy(out + 16, in2, remain_w * sizeof(float));
memcpy(out + 24, in3, remain_w * sizeof(float));
memcpy(out + 32, in4, remain_w * sizeof(float));
memcpy(out + 40, in5, remain_w * sizeof(float));
memcpy(out + 48, in6, remain_w * sizeof(float));
memcpy(out + 56, in7, remain_w * sizeof(float));
out += 64;
}
in0 += 5 * width + remain_w;
in1 += 5 * width + remain_w;
in2 += 5 * width + remain_w;
in3 += 5 * width + remain_w;
in4 += 5 * width + remain_w;
in5 += 5 * width + remain_w;
in6 += 5 * width + remain_w;
in7 += 5 * width + remain_w;
}
// remain height
if (remain_h > 2) {
for (int w = 0; w < inter_w; ++w) {
for (int rh = 0; rh < remain_h; ++rh) {
memcpy(out + rh * 8, in0 + rh * width, 8 * sizeof(float));
}
out += 64;
in0 += 6;
}
// remain width
if (remain_w > 2) {
for (int rh = 0; rh < remain_h; ++rh) {
memcpy(out + rh * 8, in0 + rh * width, remain_w * sizeof(float));
}
}
}
}
// transform tiles, compute B_T * d(c, b) * B
for (int c = 0; c < channel; ++c) {
for (int tile = 0; tile < h_tiles * w_tiles; ++tile) {
float *out = outptr + (c * h_tiles * w_tiles + tile) * 64;
// compute B_T * d(c, b)
float bd[8][8];
for (int i = 0; i < 8; ++i) {
float d0 = out[8 * i + 0];
float d1 = out[8 * i + 1];
float d2 = out[8 * i + 2];
float d3 = out[8 * i + 3];
float d4 = out[8 * i + 4];
float d5 = out[8 * i + 5];
float d6 = out[8 * i + 6];
float d7 = out[8 * i + 7];
bd[i][0] = d0 - d6 + (d4 - d2) * 5.25;
float v1 = d2 - 4.25 * d4 + d6;
float v2 = d1 - 4.25 * d3 + d5;
// d1 + d2 - 4.25 * d3 - 4.25 * d4 + d5 + d6
bd[i][1] = v1 + v2;
// -d1 + d2 + 4.25 * d3 - 4.25 * d4 - d5 + d6
bd[i][2] = v1 - v2;
v1 = 0.25 * d2 - 1.25 * d4 + d6;
v2 = 0.5 * d1 - 2.5 * d3 + 2 * d5;
// 0.5 * d1 + 0.25 * d2 - 2.5 * d3 - 1.25 * d4 + 2 * d5 + d6
bd[i][3] = v1 + v2;
// -0.5 * d1 + 0.25 * d2 + 2.5 * d3 - 1.25 * d4 - 2 * d5 + d6
bd[i][4] = v1 - v2;
v1 = 4 * d2 - 5 * d4 + d6;
v2 = 2 * d1 - 2.5 * d3 + 0.5 * d5;
// 2 * d1 + 4 * d2 - 2.5 * d3 - 5 * d4 + 0.5 * d5 + d6
bd[i][5] = v1 + v2;
// -2 * d1 + 4 * d2 + 2.5 * d3 - 5 * d4 - 0.5 * d5 + d6
bd[i][6] = v1 - v2;
bd[i][7] = d7 - d1 + (d3 - d5) * 5.25;
}
// compute B_T * d(c, b) * B
for (int i = 0; i < 8; ++i, out += 8) {
float d0 = bd[0][i];
float d1 = bd[1][i];
float d2 = bd[2][i];
float d3 = bd[3][i];
float d4 = bd[4][i];
float d5 = bd[5][i];
float d6 = bd[6][i];
float d7 = bd[7][i];
out[0] = d0 - d6 + (d4 - d2) * 5.25;
float v1 = d2 - 4.25 * d4 + d6;
float v2 = d1 - 4.25 * d3 + d5;
// d1 + d2 - 4.25 * d3 - 4.25 * d4 + d5 + d6
out[1] = v1 + v2;
// -d1 + d2 + 4.25 * d3 - 4.25 * d4 - d5 + d6
out[2] = v1 - v2;
v1 = 0.25 * d2 - 1.25 * d4 + d6;
v2 = 0.5 * d1 - 2.5 * d3 + 2 * d5;
// 0.5 * d1 + 0.25 * d2 - 2.5 * d3 - 1.25 * d4 + 2 * d5 + d6
out[3] = v1 + v2;
// -0.5 * d1 + 0.25 * d2 + 2.5 * d3 - 1.25 * d4 - 2 * d5 + d6
out[4] = v1 - v2;
v1 = 4 * d2 - 5 * d4 + d6;
v2 = 2 * d1 - 2.5 * d3 + 0.5 * d5;
// 2 * d1 + 4 * d2 - 2.5 * d3 - 5 * d4 + 0.5 * d5 + d6
out[5] = v1 + v2;
// -2 * d1 + 4 * d2 + 2.5 * d3 - 5 * d4 - 0.5 * d5 + d6
out[6] = v1 - v2;
out[7] = d7 - d1 + (d3 - d5) * 5.25;
}
}
}
}
template <>
void winograd_transform_output<8, 3>(const framework::Tensor &input,
const framework::Tensor &weight,
framework::Tensor *output) {
// TODO(hjchen2)
PADDLE_MOBILE_THROW_EXCEPTION(
"Winograd for arm v8 has not been implemented.");
// input shape is [in_channel, h_tiles, w_tiles, 64]
// weight shape is [out_channel, in_channel, 64]
int in_channel = input.dims()[0];
int h_tiles = input.dims()[1];
int w_tiles = input.dims()[2];
int tiles = h_tiles * w_tiles;
int out_channel = weight.dims()[0];
// compute U*V first
framework::Tensor output_m;
framework::DDim shape =
framework::make_ddim(std::vector<int>{out_channel, tiles, 64});
float *output_m_ptr = output_m.mutable_data<float>(shape);
memset(output_m_ptr, 0, output_m.numel() * sizeof(float));
const float *input_ptr = input.data<float>();
const float *weight_ptr = weight.data<float>();
for (int i = 0; i < out_channel; ++i) {
for (int j = 0; j < tiles; ++j) {
const float *w_ptr = weight_ptr + i * in_channel * 64;
const float *in_ptr = input_ptr + j * 64;
float *m_ptr = output_m_ptr + (i * tiles + j) * 64;
for (int c = 0; c < in_channel; ++c) {
for (int k = 0; k < 64; ++k) {
m_ptr[k] += w_ptr[k] * in_ptr[k];
}
w_ptr += 64;
in_ptr += tiles * 64;
}
}
}
for (int oc = 0; oc < out_channel; ++oc) {
for (int tile = 0; tile < tiles; ++tile) {
float *m = output_m_ptr + (oc * tiles + tile) * 64;
// compute A_T * m
float am[6][8];
for (int i = 0; i < 8; ++i) {
float d0 = m[i * 8 + 0];
float d1 = m[i * 8 + 1];
float d2 = m[i * 8 + 2];
float d3 = m[i * 8 + 3];
float d4 = m[i * 8 + 4];
float d5 = m[i * 8 + 5];
float d6 = m[i * 8 + 6];
float d7 = m[i * 8 + 7];
float v0 = d1 + d2;
float v1 = d1 - d2;
float v2 = d3 + d4;
float v3 = d3 - d4;
float v4 = d5 + d6;
float v5 = d5 - d6;
am[0][i] = d0 + v0 + v2 + 32 * v4;
am[1][i] = v1 + 2 * v3 + 16 * v5;
am[2][i] = v0 + 4 * v2 + 8 * v4;
am[3][i] = v1 + 8 * v3 + 4 * v5;
am[4][i] = v0 + 16 * v2 + 2 * v4;
am[5][i] = v1 + 32 * v3 + v5 + d7;
}
// compute A_T * m * A
for (int i = 0; i < 6; ++i, m += 8) {
float d0 = am[i][0];
float d1 = am[i][1];
float d2 = am[i][2];
float d3 = am[i][3];
float d4 = am[i][4];
float d5 = am[i][5];
float d6 = am[i][6];
float d7 = am[i][7];
float v0 = d1 + d2;
float v1 = d1 - d2;
float v2 = d3 + d4;
float v3 = d3 - d4;
float v4 = d5 + d6;
float v5 = d5 - d6;
m[0] = d0 + v0 + v2 + 32 * v4;
m[1] = v1 + 2 * v3 + 16 * v5;
m[2] = v0 + 4 * v2 + 8 * v4;
m[3] = v1 + 8 * v3 + 4 * v5;
m[4] = v0 + 16 * v2 + 2 * v4;
m[5] = v1 + 32 * v3 + v5 + d7;
}
}
}
int out_h = output->dims()[2];
int out_w = output->dims()[3];
float *output_ptr = output->mutable_data<float>();
// copy valid region to final output
for (int oc = 0; oc < out_channel; ++oc) {
int inter_h = out_h / 6;
int inter_w = out_w / 6;
int remain_h = out_h - inter_h * 6;
int remain_w = out_w - inter_w * 6;
float *out_ptr0 = output_ptr + oc * out_h * out_w;
float *out_ptr1 = out_ptr0 + out_w;
float *out_ptr2 = out_ptr1 + out_w;
float *out_ptr3 = out_ptr2 + out_w;
float *out_ptr4 = out_ptr3 + out_w;
float *out_ptr5 = out_ptr4 + out_w;
const float *m_ptr = output_m_ptr + oc * tiles * 64;
for (int tile_h = 0; tile_h < inter_h; ++tile_h) {
for (int tile_w = 0; tile_w < inter_w; ++tile_w) {
const float *m = m_ptr + (tile_h * w_tiles + tile_w) * 64;
memcpy(out_ptr0, m, 6 * sizeof(float));
memcpy(out_ptr1, m + 8, 6 * sizeof(float));
memcpy(out_ptr2, m + 16, 6 * sizeof(float));
memcpy(out_ptr3, m + 24, 6 * sizeof(float));
memcpy(out_ptr4, m + 32, 6 * sizeof(float));
memcpy(out_ptr5, m + 40, 6 * sizeof(float));
out_ptr0 += 6;
out_ptr1 += 6;
out_ptr2 += 6;
out_ptr3 += 6;
out_ptr4 += 6;
out_ptr5 += 6;
}
// remain w
if (remain_w > 0) {
const float *m = m_ptr + (tile_h * w_tiles + inter_w) * 64;
memcpy(out_ptr0, m, remain_w * sizeof(float));
memcpy(out_ptr1, m + 8, remain_w * sizeof(float));
memcpy(out_ptr2, m + 16, remain_w * sizeof(float));
memcpy(out_ptr3, m + 24, remain_w * sizeof(float));
memcpy(out_ptr4, m + 32, remain_w * sizeof(float));
memcpy(out_ptr5, m + 40, remain_w * sizeof(float));
out_ptr0 += remain_w;
out_ptr1 += remain_w;
out_ptr2 += remain_w;
out_ptr3 += remain_w;
out_ptr4 += remain_w;
out_ptr5 += remain_w;
}
out_ptr0 += 5 * out_w;
out_ptr1 += 5 * out_w;
out_ptr2 += 5 * out_w;
out_ptr3 += 5 * out_w;
out_ptr4 += 5 * out_w;
out_ptr5 += 5 * out_w;
}
// remain h
if (remain_h > 0) {
for (int tile_w = 0; tile_w < inter_w; ++tile_w) {
const float *m = m_ptr + (inter_h * w_tiles + tile_w) * 64;
for (int rh = 0; rh < remain_h; ++rh) {
memcpy(out_ptr0 + rh * out_w, m + rh * 8, 6 * sizeof(float));
}
out_ptr0 += 6;
}
if (remain_w > 0) {
const float *m = m_ptr + (inter_h * w_tiles + inter_w) * 64;
for (int rh = 0; rh < remain_h; ++rh) {
memcpy(out_ptr0 + rh * out_w, m + rh * 8, remain_w * sizeof(float));
}
}
}
}
}
} // namespace math
......
......@@ -439,10 +439,11 @@ class ConvParam : public OpParam {
#endif
protected:
public:
RType *input_;
RType *output_;
RType *filter_;
RType *transformed_filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
......@@ -455,7 +456,7 @@ class ConvParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA
private:
public:
fpga::SplitConvArgs fpga_conv_args;
public:
......@@ -1632,10 +1633,6 @@ class FusionFcParam : public OpParam {
x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
axis_ = GetAttr<int>("axis", attrs);
#ifdef FUSION_FC_INT8_OP
scale_ = InputScaleFrom<GType>(inputs, scope);
#endif
}
GType *InputX() const { return input_x_; }
......@@ -1660,16 +1657,8 @@ class FusionFcParam : public OpParam {
int y_num_col_dims_;
int axis_;
#ifdef FUSION_FC_INT8_OP
public:
const RType *InputScale() const { return scale_; }
private:
RType *scale_;
#endif
#ifdef PADDLE_MOBILE_FPGA
private:
private: // NOLINT
fpga::SplitConvArgs fpga_conv_args;
public:
......@@ -1719,19 +1708,7 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
FusionConvAddReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {
#ifdef FUSION_CONVADDRELU_INT8_OP
scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
#endif
}
#ifdef FUSION_CONVADDRELU_INT8_OP
typedef typename DtypeTensorTrait<DeviceType>::gtype GType;
typedef typename DtypeTensorTrait<DeviceType>::rtype RType;
const RType *InputScale() const { return scale_; }
private:
RType *scale_;
#endif
: FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {}
};
#endif
......@@ -2539,6 +2516,52 @@ class ShapeParam : public OpParam {
};
#endif
#ifdef TOP_K_OP
template <typename Dtype>
class TopKParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
TopKParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_ = OpParam::GetVarValue<GType>("X", inputs, scope);
output_ = OpParam::GetVarValue<GType>("Out", outputs, scope);
indices_ = OpParam::GetVarValue<GType>("Indices", outputs, scope);
k_ = OpParam::GetAttr<int>("k", attrs);
}
public:
RType *input_;
RType *output_;
RType *indices_;
int k_;
};
#endif // TOP_K_OP
#ifdef CAST_OP
template <typename Dtype>
class CastParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
CastParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_ = OpParam::GetVarValue<GType>("X", inputs, scope);
output_ = OpParam::GetVarValue<GType>("Out", outputs, scope);
input_type_ = OpParam::GetAttr<int>("in_dtype", attrs);
output_type_ = OpParam::GetAttr<int>("out_dtype", attrs);
}
public:
RType *input_;
RType *output_;
int input_type_;
int output_type_;
};
#endif // CAST_OP
#ifdef QUANT_OP
template <typename Dtype>
class QuantizeParam : public OpParam {
......@@ -2554,38 +2577,29 @@ class QuantizeParam : public OpParam {
// scale = max(abs(x))
online_scale_ = OpParam::GetVarValue<GType>("OutScale", outputs, scope);
// offline
if (HasAttr("static_scale", attrs)) {
is_static_ = true;
static_scale_ = GetAttr<float>("static_scale", attrs);
if (inputs.count("InScale")) {
offline_ = true;
offline_scale_ = OpParam::GetVarValue<GType>("InScale", inputs, scope);
}
// x = round(scale * x)
if (HasAttr("round_type", attrs)) {
round_type_ = GetAttr<RoundType>("round_type", attrs);
}
// get paddings
paddings_ = std::vector<int>({0, 0});
if (HasAttr("paddings", attrs)) {
paddings_ = GetAttr<vector<int>>("paddings", attrs);
if (OpParam::HasAttr("round_type", attrs)) {
round_type_ = OpParam::GetAttr<RoundType>("round_type", attrs);
}
}
public:
// op input
RType *input_;
GType *input_;
// op output
RType *output_;
GType *output_;
RType *online_scale_;
// if static scale or not
bool is_static_ = false;
// quantize scale
float static_scale_ = 1.0f;
// quantize offline scale
RType *offline_scale_;
// if offine scale or not
bool offline_ = false;
// round method type
// nearest_zero and nearest_even is valid currently
// RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
// optional paddings
std::vector<int> paddings_;
int8_t padding_val_;
};
#endif
......@@ -2599,31 +2613,31 @@ class DequantizeParam : public OpParam {
DequantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_ = InputXFrom<GType>(inputs, scope);
if (outputs.count("Out")) {
output_ = OutFrom<GType>(outputs, scope);
}
activation_scale_ = OpParam::GetVarValue<GType>("Scale", inputs, scope);
// dequantization is performed as x = x / static_scale / online_scale
if (HasAttr("weight_scale", attrs)) {
weight_scale_ = GetAttr<float>("weight_scale", attrs);
if (OpParam::HasAttr("weight_scale", attrs)) {
weight_scale_ = OpParam::GetAttr<float>("weight_scale", attrs);
} else {
weight_scale_ = GetAttr<float>("max_range", attrs);
weight_scale_ = OpParam::GetAttr<float>("max_range", attrs);
}
}
public:
// op input
RType *input_;
GType *input_;
// op output
RType *output_;
GType *output_;
RType *activation_scale_;
float weight_scale_;
};
#endif
#if defined(FUSION_DEQUANT_ADD_BN_OP) || \
#if defined(FUSION_DEQUANT_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_BN_OP)
defined(FUSION_DEQUANT_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
template <typename Dtype>
class FusionDequantBNParam : public DequantizeParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
......@@ -2640,10 +2654,6 @@ class FusionDequantBNParam : public DequantizeParam<Dtype> {
bn_scale_ = OpParam::GetVarValue<GType>("BNScale", inputs, scope);
bn_bias_ = OpParam::GetVarValue<GType>("BNBias", inputs, scope);
epsilon_ = OpParam::GetAttr<float>("epsilon", attrs);
// output
if (outputs.count("Y")) {
this->output_ = OpParam::OutputYFrom<GType>(outputs, scope);
}
}
public:
......@@ -2656,7 +2666,10 @@ class FusionDequantBNParam : public DequantizeParam<Dtype> {
};
#endif
#if defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || defined(FUSION_DEQUANT_ADD_BN_OP)
#if defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \
defined(FUSION_DEQUANT_ADD_BN_OP) || \
defined(FUSION_DEQUANT_ADD_BN_QUANT_OP) || \
defined(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
template <typename Dtype>
class FusionDequantAddBNParam : public FusionDequantBNParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
......@@ -2670,10 +2683,6 @@ class FusionDequantAddBNParam : public FusionDequantBNParam<Dtype> {
// element wise add params
axis_ = OpParam::GetAttr<int>("axis", attrs);
bias_ = OpParam::InputYFrom<GType>(inputs, scope);
// output
if (outputs.count("Y")) {
this->output_ = OpParam::OutputYFrom<GType>(outputs, scope);
}
}
public:
......@@ -2683,41 +2692,39 @@ class FusionDequantAddBNParam : public FusionDequantBNParam<Dtype> {
};
#endif
#ifdef FUSION_DEQUANT_BN_RELU_OP
#ifdef FUSION_DEQUANT_ADD_BN_QUANT_OP
template <typename Dtype>
class FusionDequantBNReluParam : public FusionDequantBNParam<Dtype> {
class FusionDequantAddBNQuantParam : public FusionDequantAddBNParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDequantBNReluParam(const VariableNameMap &inputs,
FusionDequantAddBNQuantParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: FusionDequantBNParam<Dtype>(inputs, outputs, attrs, scope) {
// output
if (outputs.count("Out")) {
this->output_ = OpParam::OutFrom<GType>(outputs, scope);
: FusionDequantAddBNParam<Dtype>(inputs, outputs, attrs, scope) {
// scale output
online_scale_ = OpParam::GetVarValue<GType>("OutScale", outputs, scope);
// offline
if (inputs.count("InScale")) {
offline_ = true;
offline_scale_ = OpParam::GetVarValue<GType>("InScale", inputs, scope);
}
// x = round(scale * x)
if (OpParam::HasAttr("round_type", attrs)) {
round_type_ = OpParam::GetAttr<RoundType>("round_type", attrs);
}
}
};
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <typename Dtype>
class FusionDequantAddBNReluParam : public FusionDequantAddBNParam<Dtype> {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionDequantAddBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: FusionDequantAddBNParam<Dtype>(inputs, outputs, attrs, scope) {
// output
if (outputs.count("Out")) {
this->output_ = OpParam::OutFrom<GType>(outputs, scope);
}
}
RType *online_scale_;
// quantize offline scale
RType *offline_scale_;
// whether an offline scale is provided
bool offline_ = false;
// round method type
// RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
};
#endif
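Note: the round_type attribute read above controls how the requantization step x = round(scale * x) rounds before saturating to int8. The helper below is a minimal standalone sketch of that step, rounding toward zero the same way the round2zero/qscale_int32 reference helpers later in this diff do; the function name is illustrative and not part of the framework.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative only: quantize one float with a scale, rounding toward zero
// and saturating to the int8 range [-127, 127].
inline int8_t QuantizeOneValue(float x, float scale) {
  float r = x * scale;                              // x = round(scale * x)
  r = (r > 0.f) ? std::floor(r) : std::ceil(r);     // round toward zero
  r = std::min(127.f, std::max(-127.f, r));         // saturate to int8 range
  return static_cast<int8_t>(r);
}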
......
......@@ -22,10 +22,7 @@ namespace operators {
template <typename DeviceType, typename T>
void QuantizeOp<DeviceType, T>::InferShape() const {
auto input_dims = this->param_.input_->dims();
const std::vector<int> &paddings = this->param_.paddings_;
input_dims[2] += 2 * paddings[0];
input_dims[3] += 2 * paddings[1];
const auto &input_dims = this->param_.input_->dims();
this->param_.output_->Resize(input_dims);
auto scale_dims = framework::make_ddim(std::vector<int>{1});
this->param_.online_scale_->Resize(scale_dims);
......@@ -39,4 +36,4 @@ namespace ops = paddle_mobile::operators;
REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp);
#endif
#endif
#endif // QUANT_OP
......@@ -43,4 +43,4 @@ class QuantizeOp : public framework::OperatorWithKernel<
} // namespace operators
} // namespace paddle_mobile
#endif
#endif // QUANT_OP
......@@ -24,17 +24,19 @@ void ReluOp<Dtype, T>::InferShape() const {
this->param_.Out()->Resize(input_dims);
}
template <typename Dtype, typename T>
void Relu6Op<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
} // namespace operators
} // namespace paddle_mobile
/*
* @b Every op must be registered like this;
* the argument of USE_OP and the first argument of REGISTER_OPERATOR
* must both match the op type recorded in the model.
* */
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
REGISTER_OPERATOR_CPU(relu6, ops::Relu6Op);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
......
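The registration comment above states the rule: the type string passed to REGISTER_OPERATOR_* must match both the op type in the model's program description and the argument of the corresponding USE_OP-style macro. A minimal illustration of the pairing follows; the USE_OP macro spelling is an assumption, the exact name lives in the framework headers.

// In the op's .cpp, as added in this diff: register the CPU kernel under the
// same type string the model uses.
REGISTER_OPERATOR_CPU(relu6, ops::Relu6Op);
// In the binary that links the op statically, a matching USE_OP-style macro
// must name the identical string, e.g. something along the lines of
//   USE_OP_CPU(relu6);
// (assumed spelling -- check the framework's operator headers for the real macro).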
......@@ -25,25 +25,34 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class ReluOp : public framework::OperatorWithKernel<
DeviceType, ReluParam<DeviceType>,
operators::ReluKernel<DeviceType, T>> {
public:
/*
* @b The op's constructor must call the parent class constructor and construct its own parameter struct.
* */
ReluOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, ReluParam<DeviceType>,
operators::ReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
template <typename DeviceType, typename T>
class Relu6Op : public framework::OperatorWithKernel<
DeviceType, ReluParam<DeviceType>,
operators::Relu6Kernel<DeviceType, T>> {
public:
Relu6Op(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, ReluParam<DeviceType>,
operators::Relu6Kernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
protected:
void InferShape() const override;
};
} // namespace operators
......
......@@ -12,26 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#ifdef TOP_K_OP
#pragma once
#include "operators/top_k_op.h"
#include "framework/tensor.h"
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::vector;
void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
const Tensor *input, Tensor *output);
template <typename DeviceType, typename T>
void TopKOp<DeviceType, T>::InferShape() const {
const int k = this->param_.k_;
auto dims = this->param_.input_->dims();
// should check k <= dims[-1] && k >= 1
dims[dims.size() - 1] = k;
this->param_.output_->Resize(dims);
this->param_.indices_->Resize(dims);
}
void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
const Tensor *in_x, Tensor *out);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(top_k, ops::TopKOp);
#endif
#endif // TOP_K_OP
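For reference, the InferShape added for TopKOp only rewrites the trailing dimension: every leading dimension of the input is kept and the last one becomes k, for both the values and the indices outputs. A tiny standalone sketch of that rule (the function name is illustrative):

#include <cassert>
#include <cstdint>
#include <vector>

// e.g. input dims {8, 1000} with k = 5 -> output and indices dims {8, 5}
std::vector<int64_t> TopKOutputDims(std::vector<int64_t> dims, int k) {
  assert(k >= 1 && k <= dims.back());  // the "should check" noted in InferShape
  dims[dims.size() - 1] = k;           // only the last dimension changes
  return dims;
}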
......@@ -12,35 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TOP_K_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/kernels.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef FUSION_DEQUANT_BN_RELU_OP
template <typename DeviceType, typename T>
class FusionDequantBNReluKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantBNReluParam<DeviceType>> {
public:
void Compute(const FusionDequantBNReluParam<DeviceType> &param);
bool Init(FusionDequantBNReluParam<DeviceType> *param);
};
#endif
#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
template <typename DeviceType, typename T>
class FusionDequantAddBNReluKernel
: public framework::OpKernelBase<DeviceType,
FusionDequantAddBNReluParam<DeviceType>> {
class TopKOp : public framework::OperatorWithKernel<
DeviceType, TopKParam<DeviceType>,
operators::TopKKernel<DeviceType, T>> {
public:
void Compute(const FusionDequantAddBNReluParam<DeviceType> &param);
bool Init(FusionDequantAddBNReluParam<DeviceType> *param);
TopKOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, TopKParam<DeviceType>,
operators::TopKKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
// infer output shape
void InferShape() const override;
};
#endif
} // namespace operators
} // namespace paddle_mobile
#endif // TOP_K_OP
......@@ -261,20 +261,17 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp)
target_link_libraries(test-inference-api paddle-mobile)
# gen test log
# gen test
ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp)
target_link_libraries(test-optimize paddle-mobile)
#gen test
ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-pool-op paddle-mobile)
#gen test
ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-softmax paddle-mobile)
ADD_EXECUTABLE(test-softmax-op operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-softmax-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp)
......@@ -324,10 +321,6 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-relu-int8-op operators/test_fusion_conv_add_relu_int8_op.cpp test_helper.h test_include.h)
target_link_libraries(test-conv-add-relu-int8-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
......@@ -379,5 +372,8 @@ if (NOT FOUND_MATCH)
# gen test
ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h)
target_link_libraries(test-super paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
# gen test
ADD_EXECUTABLE(test-ocr net/test_ocr.cpp test_helper.h test_include.h)
target_link_libraries(test-ocr paddle-mobile)
endif ()
......@@ -73,14 +73,14 @@ int main() {
// float
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>(
paddle_mobile::operators::math::MatMul<float, float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr);
}
auto time_start0 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>(
paddle_mobile::operators::math::MatMul<float, float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr);
}
......@@ -91,14 +91,14 @@ int main() {
// int8_t without bias
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float, int32_t>(
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0));
}
auto time_start1 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float, int32_t>(
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0));
}
......@@ -109,13 +109,13 @@ int main() {
// int8_t with bias, column element wise add
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_col, false);
}
auto time_start2 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_col, false);
}
......@@ -126,13 +126,13 @@ int main() {
// int8_t with bias, row element wise add
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_row, true);
}
auto time_start3 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_row, true);
}
......@@ -143,13 +143,13 @@ int main() {
// int8_t with bias&relu
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data_col, false);
}
auto time_start4 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
paddle_mobile::operators::math::MatMul<int8_t, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data_col, false);
}
......
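The benchmark now spells the GEMM entry point as MatMul<Itype, Otype>, where the first template argument is the input element type and the second the accumulator/output type (the old int8 calls were written matmul<float, int32_t>). The sketch below is a naive scalar version of what MatMul<int8_t, int32_t> computes for C = alpha * A * B; it is only a reference for the types involved, not the paddle-mobile kernel.

#include <cstdint>
#include <vector>

// Naive sketch: int8 inputs, int32 accumulation, C = alpha * A(MxK) * B(KxN).
void NaiveMatMulInt8(const std::vector<int8_t> &A, const std::vector<int8_t> &B,
                     std::vector<int32_t> *C, int M, int K, int N, float alpha) {
  C->assign(static_cast<size_t>(M) * N, 0);
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      int32_t acc = 0;
      for (int k = 0; k < K; ++k) {
        acc += static_cast<int32_t>(A[m * K + k]) * static_cast<int32_t>(B[k * N + n]);
      }
      (*C)[m * N + n] = static_cast<int32_t>(alpha * acc);
    }
  }
}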
......@@ -39,6 +39,7 @@ using paddle_mobile::framework::Tensor;
using paddle_mobile::framework::Variable;
using std::string;
using std::vector;
template <typename DeviceType, typename OpType>
class Executor4Test : public Executor<DeviceType> {
public:
......@@ -48,20 +49,19 @@ class Executor4Test : public Executor<DeviceType> {
this->use_optimize_ = use_optimize;
this->program_ = p;
if (this->use_optimize_) {
this->to_predict_program_ = this->program_.optimizeProgram;
this->program_desc_ = this->program_.optimizeProgram;
} else {
this->to_predict_program_ = this->program_.originProgram;
this->program_desc_ = this->program_.originProgram;
}
if (this->program_.originProgram == nullptr) {
LOG(paddle_mobile::LogLevel::kLOG_ERROR)
<< "to_predict_program_ == nullptr";
LOG(paddle_mobile::LogLevel::kLOG_ERROR) << "program_desc_ == nullptr";
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
this->to_predict_program_->Blocks();
for (std::shared_ptr<BlockDesc> block_desc : blocks) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
this->program_desc_->Blocks();
for (int block_id = 0; block_id < blocks.size(); ++block_id) {
std::vector<std::shared_ptr<OpDesc>> ops = blocks[block_id]->Ops();
for (int i = 0; i < ops.size(); ++i) {
auto op = ops[i];
if (op->Type() == op_type) {
......@@ -73,20 +73,18 @@ class Executor4Test : public Executor<DeviceType> {
paddle_mobile::framework::OpRegistry<DeviceType>::CreateOp(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), this->program_.scope);
this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
this->ops_of_block_[block_id].push_back(op_ptr);
break;
}
}
}
this->InitMemory();
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
this->to_predict_program_->Block(0);
auto &ops = this->ops_of_block_[*to_predict_block.get()];
for (const auto &ops : this->ops_of_block_) {
for (const auto &op : ops) {
op->Init();
}
}
}
template <typename T = LoDTensor>
vector<std::shared_ptr<Tensor>> Predict(const vector<Tensor> &ts,
......@@ -117,13 +115,11 @@ class Executor4Test : public Executor<DeviceType> {
output_tensor_sptrs[i].reset(output_tensors[i]);
}
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
this->to_predict_program_->Block(0);
for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size();
++j) {
auto op = this->ops_of_block_[*to_predict_block.get()][j];
for (auto &ops : this->ops_of_block_) {
for (auto &op : ops) {
op->Run();
}
}
return output_tensor_sptrs;
}
......@@ -139,14 +135,11 @@ class Executor4Test : public Executor<DeviceType> {
auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>(dDim);
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
this->to_predict_program_->Block(0);
for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size();
++j) {
auto op = this->ops_of_block_[*to_predict_block.get()][j];
for (auto &ops : this->ops_of_block_) {
for (auto &op : ops) {
op->Run();
}
}
return std::make_shared<paddle_mobile::framework::Tensor>(
paddle_mobile::framework::Tensor(*output_tensor));
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <sstream>
#include "../test_helper.h"
#include "../test_include.h"
......@@ -51,14 +52,22 @@ int main(int argc, char* argv[]) {
SetupTensor<float>(&input, in_shape, 0.f, 255.f);
// warmup
for (int i = 0; i < 10; ++i) {
output = paddle_mobile.Predict(input);
paddle_mobile.Predict(input);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
output = paddle_mobile.Predict(input);
paddle_mobile.Predict(input);
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms\n";
std::ostringstream os("output tensor size: ", std::ostringstream::ate);
output = paddle_mobile.Fetch();
os << output->numel() << "\n" << output->data<float>()[0];
for (int i = 1; i < output->numel(); ++i) {
os << ", " << output->data<float>()[i];
}
std::string output_str = os.str();
std::cout << output_str << std::endl;
}
return 0;
}
......@@ -36,11 +36,11 @@ int main() {
input_tensor.data<float>() + input_tensor.numel());
// warm up
for (int i = 0; i < 1; ++i) {
paddle_mobile.PredictLod(input_tensor);
paddle_mobile.Predict(input_tensor);
}
auto time3 = time();
for (int i = 0; i < 1; ++i) {
paddle_mobile.PredictLod(input_tensor);
paddle_mobile.Predict(input_tensor);
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
......
......@@ -16,24 +16,43 @@ limitations under the License. */
#include "../test_helper.h"
#include "../test_include.h"
int main() {
int main(int argc, char* argv[]) {
if (argc < 2) {
std::cout << "Usage: ./test_benchmark feed_shape [thread_num] [use_fuse]\n"
<< "feed_shape: input tensor shape, such as 3,224,224.\n"
<< "thread_num: optional int, threads count, default is 1.\n"
<< "use_fuse: optional bool, default is 0.\n";
return 1;
}
int thread_num = 1;
bool optimize = false;
char* feed_shape = argv[1];
if (argc >= 3) {
thread_num = atoi(argv[2]);
}
if (argc >= 4) {
optimize = atoi(argv[3]);
}
#ifdef PADDLE_MOBILE_FPGA
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
#endif
#ifdef PADDLE_MOBILE_CPU
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
#endif
paddle_mobile.SetThreadNum(1);
bool optimize = true;
paddle_mobile.SetThreadNum(thread_num);
auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) {
std::vector<float> output;
if (paddle_mobile.Load(g_googlenet, optimize, false, 1, true)) {
auto time2 = paddle_mobile::time();
std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
<< std::endl;
std::vector<float> input;
std::vector<float> output;
std::vector<int64_t> dims{1, 3, 224, 224};
if (feed_shape) {
  int c = 3, h = 224, w = 224;
  sscanf(feed_shape, "%d,%d,%d", &c, &h, &w);
  dims[1] = c, dims[2] = h, dims[3] = w;
}
std::cout << "feed shape: [" << dims[0] << ", " << dims[1] << ", "
<< dims[2] << ", " << dims[3] << "]" << std::endl;
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
// warmup
for (int i = 0; i < 10; ++i) {
......@@ -44,7 +63,6 @@ int main() {
output = paddle_mobile.Predict(input, dims);
}
auto time4 = time();
std::cout << "predict cost: " << time_diff(time3, time4) / 10 << "ms\n";
}
return 0;
......
......@@ -48,8 +48,8 @@ int main() {
DLOG << "words lod 22: " << words.lod();
auto time3 = time();
for (int i = 0; i < 1; ++i) {
auto vec_result = paddle_mobile.PredictLod(words);
DLOG << *vec_result;
paddle_mobile.Predict(words);
DLOG << *paddle_mobile.Fetch();
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms"
......@@ -84,8 +84,8 @@ int main() {
DLOG << "words lod 22: " << words.lod();
auto time3 = time();
for (int i = 0; i < 1; ++i) {
auto vec_result = paddle_mobile.PredictLod(words);
DLOG << *vec_result;
paddle_mobile.Predict(words);
DLOG << *paddle_mobile.Fetch();
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
void load_images(const char *image_dir, const char *images_list,
std::vector<std::string> *image_names,
std::vector<std::pair<int, int>> *image_shapes) {
int height, width;
std::string filename;
std::ifstream if_list(images_list, std::ios::in);
while (if_list >> height >> width >> filename) {
image_shapes->push_back(std::make_pair(height, width));
image_names->push_back(filename);
}
if_list.close();
}
int main(int argc, char **argv) {
if (argc < 4) {
std::cerr << "Usage: ./test_ocr model_dir image_dir images_list."
<< std::endl;
return 1;
}
char *model_dir = argv[1];
char *image_dir = argv[2];
char *images_list = argv[3];
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(8);
auto isok = paddle_mobile.Load(std::string(model_dir) + "/model",
std::string(model_dir) + "/params", true,
false, 1, true);
DLOG << "pass init model";
std::vector<std::string> image_names;
std::vector<std::pair<int, int>> image_shapes;
load_images(image_dir, images_list, &image_names, &image_shapes);
DLOG << "pass load images";
for (int i = 0; i < image_names.size(); i++) {
std::string file_name = image_names[i];
std::vector<float> input_vec;
std::vector<int64_t> dims{1, 1, 48, 512};
dims[2] = image_shapes[i].first;
dims[3] = image_shapes[i].second;
// load input image
std::string img_path = std::string(image_dir) + "/" + file_name;
std::cerr << "img_path: " << img_path << std::endl;
std::cerr << "shape = [" << dims[0] << ", " << dims[1] << ", " << dims[2]
<< ", " << dims[3] << "]" << std::endl;
GetInput<float>(img_path, &input_vec, dims);
framework::Tensor input(input_vec, framework::make_ddim(dims));
// predict
paddle_mobile.Predict(input);
auto output_topk = paddle_mobile.Fetch("top_k_1.tmp_0");
auto output_indices = paddle_mobile.Fetch("cast_68.tmp_0");
// print result
std::cerr << file_name << std::endl;
std::cerr << output_topk->data<float>()[0];
for (int j = 1; j < output_topk->numel(); ++j) {
std::cerr << " " << output_topk->data<float>()[j];
}
std::cerr << std::endl;
std::cerr << output_indices->data<float>()[0];
for (int j = 1; j < output_indices->numel(); ++j) {
std::cerr << " " << output_indices->data<float>()[j];
}
std::cerr << std::endl;
}
return 0;
}
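The tests above (test_ocr and the NLP tests before it) follow the reworked run-then-fetch API: PredictLod has been replaced by a Predict call that runs the graph, followed by Fetch() for the default output or Fetch("var_name") for a named variable. The fragment below only restates that call pattern as used in this diff; it is not standalone code.

// Call pattern used by the updated tests (fragment, not a full program):
paddle_mobile.Predict(input);                            // feed input and run
auto default_out = paddle_mobile.Fetch();                // default fetch target
auto topk_out = paddle_mobile.Fetch("top_k_1.tmp_0");    // fetch by variable name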
......@@ -129,7 +129,8 @@ void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
}
template <typename Itype, typename Otype, int Kernel, int Pad, int Stride>
int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
int TestConvOp(int in_channels, int in_height, int in_width, int out_channels,
int groups) {
int kernel_h = Kernel;
int kernel_w = Kernel;
int pad_h = Pad;
......@@ -147,7 +148,7 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
framework::DDim filter_shape =
framework::make_ddim({output_c, input_c, kernel_h, kernel_w});
framework::make_ddim({output_c, input_c / groups, kernel_h, kernel_w});
VariableNameMap inputs;
VariableNameMap outputs;
......@@ -164,13 +165,22 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
auto filter = filter_var->template GetMutable<framework::LoDTensor>();
SetupTensor<Itype>(filter, filter_shape, -20, 20);
for (int i = 0; i < input->numel(); ++i) {
DLOG << "input[" << i
<< "] = " << static_cast<int>(input->data<int8_t>()[i]);
}
for (int i = 0; i < filter->numel(); ++i) {
DLOG << "filter[" << i
<< "] = " << static_cast<int>(filter->data<int8_t>()[i]);
}
auto output_var = scope.get()->Var("output");
framework::AttributeMap attrs;
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["dilations"].Set<vector<int>>(
std::vector<int>({dilation_h, dilation_w}));
attrs["groups"].Set<int>(1);
attrs["groups"].Set<int>(groups);
auto *op = new operators::ConvOp<CPU, float>("conv2d", inputs, outputs, attrs,
scope);
......@@ -204,15 +214,15 @@ int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
Otype *output_cmp_data = output_cmp.data<Otype>();
for (int i = 0; i < output->numel(); ++i) {
float gap = output_data[i] - output_cmp_data[i];
PADDLE_MOBILE_ENFORCE(std::abs(gap / (output_data[i] + 1e-5)) < 1e-3,
"output[%d] = %d, output_cmp[%d] = %d", i,
output_data[i], i, output_cmp_data[i]);
// if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) {
// LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
// << ", output_cmp_data[" << i << "] = " <<
// output_cmp_data[i];
// return 1;
// }
// PADDLE_MOBILE_ENFORCE(std::abs(gap / (output_data[i] + 1e-5)) < 1e-3,
// "output[%d] = %d, output_cmp[%d] = %d", i,
// output_data[i], i, output_cmp_data[i]);
if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) {
LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
<< ", output_cmp_data[" << i
<< "] = " << output_cmp_data[i];
exit(1);
}
}
delete op;
return 0;
......@@ -224,7 +234,8 @@ int main(int argc, char *argv[]) {
if (argc < 5) {
LOG(paddle_mobile::kLOG_INFO)
<< "Usage:\n"
<< " ./test-int8-conv-op in_channels in_height in_width out_channels\n"
<< " ./test-int8-conv-op in_channels in_height in_width out_channels "
"[groups]\n"
<< " params:\n"
<< " -in_channels: int, input image's channels\n"
<< " -in_height: int, input image's height\n"
......@@ -236,72 +247,134 @@ int main(int argc, char *argv[]) {
int in_height = atoi(argv[2]);
int in_width = atoi(argv[3]);
int out_channels = atoi(argv[4]);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 2>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 2>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 3, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 2>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 3, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 5, stride = 3
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 5, 3>(in_channels, in_height,
in_width, out_channels);
// kernel = 7, pad = 3, stride = 4
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 4>(in_channels, in_height,
in_width, out_channels);
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(in_channels, in_height,
in_width, out_channels);
int groups = 1;
if (argc == 6) {
groups = atoi(argv[5]);
}
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1";
paddle_mobile::TestConvOp<float, float, 3, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(in_channels, in_height,
in_width, out_channels);
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 5, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1";
paddle_mobile::TestConvOp<float, float, 5, 0, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 5, 2, 1>(in_channels, in_height,
in_width, out_channels);
// kernel = 5, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1";
paddle_mobile::TestConvOp<float, float, 5, 2, 1>(in_channels, in_height,
in_width, out_channels);
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=2, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 2, 1>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 5, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=5, stride=1";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 5, 1>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 2>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 2>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 2, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=2, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 2, 2>(
in_channels, in_height, in_width, out_channels, groups);
// kernel = 3, pad = 5, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=5, stride=2";
paddle_mobile::TestConvOp<int8_t, int32_t, 3, 5, 2>(
in_channels, in_height, in_width, out_channels, groups);
// // kernel = 7, pad = 0, stride = 2
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 2>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 1, stride = 2
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 2>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 3, stride = 2
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 2>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 1, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 3, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 5, stride = 3
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 5, 3>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 7, pad = 3, stride = 4
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
// paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 4>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 3, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 3, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1";
// paddle_mobile::TestConvOp<float, float, 3, 0, 1>(in_channels, in_height,
// in_width, out_channels,
// groups);
// // kernel = 3, pad = 1, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 3, pad = 1, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
// paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
// in_width, out_channels,
// groups);
// // kernel = 5, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 5, 0, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 5, pad = 0, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1";
// paddle_mobile::TestConvOp<float, float, 5, 0, 1>(in_channels, in_height,
// in_width, out_channels,
// groups);
// // kernel = 5, pad = 2, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
// paddle_mobile::TestConvOp<int8_t, int32_t, 5, 2, 1>(in_channels,
// in_height,
// in_width,
// out_channels, groups);
// // kernel = 5, pad = 2, stride = 1
// LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1";
// paddle_mobile::TestConvOp<float, float, 5, 2, 1>(in_channels, in_height,
// in_width, out_channels,
// groups);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#ifdef FUSION_CONVADDRELU_INT8_OP
#include <limits>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/fusion_conv_add_relu_int8_op.h"
namespace paddle_mobile {
int32_t qadd_int32(int32_t l, int32_t r) {
int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
if (res > std::numeric_limits<int32_t>::max())
return std::numeric_limits<int32_t>::max();
else if (res < std::numeric_limits<int32_t>::min())
return std::numeric_limits<int32_t>::min();
else
return static_cast<int32_t>(res);
}
// round to zero
float round2zero(float v) {
float res = 0.f;
if (v > 0)
res = std::floor(v);
else if (v < 0)
res = std::ceil(v);
return res;
}
int8_t qscale_int32(int32_t v, float scale) {
float res = static_cast<float>(v) * scale;
res = round2zero(res);
if (res > 127)
return static_cast<int8_t>(127);
else if (res < -127)
return static_cast<int8_t>(-127);
else
return static_cast<int8_t>(res);
}
// Reference convolution from Caffe for checking results.
// accumulate through explicit loops over input, output, and filters.
template <typename T>
void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
const framework::Tensor *bias, const framework::AttributeMap &attrs,
framework::Tensor *output, float scale) {
framework::AttrReader attr_reader(attrs);
std::vector<int> paddings = attr_reader.Get<std::vector<int>>("paddings");
std::vector<int> strides = attr_reader.Get<std::vector<int>>("strides");
std::vector<int> dilations = attr_reader.Get<std::vector<int>>("dilations");
int groups = attr_reader.Get<int>("groups");
int kernel_h = filter->dims()[2];
int kernel_w = filter->dims()[3];
int pad_h = paddings[0];
int pad_w = paddings[1];
int stride_h = strides[0];
int stride_w = strides[1];
int dilation_h = dilations[0];
int dilation_w = dilations[1];
auto in_shape = input->dims();
auto out_shape = output->dims();
const bool has_depth = 0;
int kernel_d, pad_d, stride_d, dilation_d;
if (has_depth) {
kernel_d = kernel_h;
stride_d = stride_h;
pad_d = pad_h;
dilation_d = dilation_h;
} else {
kernel_d = stride_d = dilation_d = 1;
pad_d = 0;
}
// Groups
int o_g = out_shape[1] / groups;
int k_g = in_shape[1] / groups;
int o_head, k_head;
// Convolution
vector<int> weight_offset(4 + has_depth);
vector<int> in_offset(4 + has_depth);
vector<int> out_offset(4 + has_depth);
auto offset = [](const framework::Tensor *input, const vector<int> &indics) {
framework::DDim shape = input->dims();
size_t count = 0;
for (int i = 0; i < indics.size(); ++i) {
count *= shape[i];
count += indics[i];
}
return count;
};
const T *in_data = input->data<T>();
const T *w_data = filter->data<T>();
framework::Tensor output_32;
int32_t *out_data_32 = output_32.mutable_data<int32_t>(out_shape);
memset(out_data_32, 0, output_32.numel() * sizeof(int32_t));
for (int n = 0; n < out_shape[0]; n++) {
for (int g = 0; g < groups; g++) {
o_head = o_g * g;
k_head = k_g * g;
for (int o = 0; o < o_g; o++) {
for (int k = 0; k < k_g; k++) {
for (int z = 0; z < (has_depth ? out_shape[2] : 1); z++) {
for (int y = 0; y < out_shape[2 + has_depth]; y++) {
for (int x = 0; x < out_shape[3 + has_depth]; x++) {
for (int r = 0; r < kernel_d; r++) {
for (int p = 0; p < kernel_h; p++) {
for (int q = 0; q < kernel_w; q++) {
int in_z = z * stride_d - pad_d + r * dilation_d;
int in_y = y * stride_h - pad_h + p * dilation_h;
int in_x = x * stride_w - pad_w + q * dilation_w;
if (in_z >= 0 && in_z < (has_depth ? in_shape[2] : 1) &&
in_y >= 0 && in_y < in_shape[2 + has_depth] &&
in_x >= 0 && in_x < in_shape[3 + has_depth]) {
weight_offset[0] = o + o_head;
weight_offset[1] = k;
if (has_depth) {
weight_offset[2] = r;
}
weight_offset[2 + has_depth] = p;
weight_offset[3 + has_depth] = q;
in_offset[0] = n;
in_offset[1] = k + k_head;
if (has_depth) {
in_offset[2] = in_z;
}
in_offset[2 + has_depth] = in_y;
in_offset[3 + has_depth] = in_x;
out_offset[0] = n;
out_offset[1] = o + o_head;
if (has_depth) {
out_offset[2] = z;
}
out_offset[2 + has_depth] = y;
out_offset[3 + has_depth] = x;
out_data_32[offset(output, out_offset)] +=
in_data[offset(input, in_offset)] *
w_data[offset(filter, weight_offset)];
}
}
}
}
}
}
}
}
}
}
}
T *out_data = output->mutable_data<T>();
int32_t n = out_shape[0];
int32_t c = out_shape[1];
int32_t h = out_shape[2];
int32_t w = out_shape[3];
const int32_t *bias_data = bias->data<int32_t>();
for (int i = 0; i < n; ++i) {
for (int j = 0; j < c; ++j) {
int32_t bias_v = bias_data[j];
for (int k = 0; k < h; ++k) {
for (int l = 0; l < w; ++l) {
int32_t tmp = out_data_32[i * c * h * w + j * h * w + k * w + l];
tmp = qadd_int32(tmp, bias_v);
tmp = std::max(0, tmp);
out_data[i * c * h * w + j * h * w + k * w + l] =
qscale_int32(tmp, scale);
}
}
}
}
}
template <typename T, int Kernel, int Pad, int Stride>
int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
int kernel_h = Kernel;
int kernel_w = Kernel;
int pad_h = Pad;
int pad_w = Pad;
int stride_h = Stride;
int stride_w = Stride;
int dilation_h = 1;
int dilation_w = 1;
int batch_size = 1;
int input_c = in_channels;
int input_h = in_height;
int input_w = in_width;
int output_c = out_channels;
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
framework::DDim filter_shape =
framework::make_ddim({output_c, input_c, kernel_h, kernel_w});
int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
int output_h = (input_h + 2 * pad_h - kernel_extent_h) / stride_h + 1;
int output_w = (input_w + 2 * pad_w - kernel_extent_w) / stride_w + 1;
framework::DDim output_shape = framework::make_ddim(
std::vector<int>({batch_size, output_c, output_h, output_w}));
framework::DDim bias_shape = framework::make_ddim({output_c});
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
inputs["Input"] = std::vector<std::string>({"input"});
inputs["Filter"] = std::vector<std::string>({"filter"});
inputs["Scale"] = std::vector<std::string>({"scale"});
inputs["Y"] = std::vector<std::string>({"bias"});
outputs["Out"] = std::vector<std::string>({"output"});
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(input, input_shape, -127, 127);
auto filter_var = scope.get()->Var("filter");
auto filter = filter_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(filter, filter_shape, -127, 127);
auto scale_var = scope.get()->Var("scale");
auto scale = scale_var->template GetMutable<framework::LoDTensor>();
scale->Resize(framework::make_ddim({1}));
float scale_v = 0.000828f;
scale->mutable_data<float>()[0] = scale_v;
auto bias_var = scope.get()->Var("bias");
auto bias = bias_var->template GetMutable<framework::LoDTensor>();
SetupTensor<int32_t>(bias, bias_shape, -127, 127);
auto output_var = scope.get()->Var("output");
framework::AttributeMap attrs;
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["dilations"].Set<vector<int>>(
std::vector<int>({dilation_h, dilation_w}));
attrs["groups"].Set<int>(1);
attrs["axis"].Set<int>(0);
auto *op = new operators::FusionConvAddReluInt8Op<CPU, T>(
"fusion_conv_add_relu_int8", inputs, outputs, attrs, scope);
op->InferShape();
op->Init();
op->Run();
framework::Tensor output_cmp;
output_cmp.mutable_data<T>(output_shape);
conv2d<T>(input, filter, bias, attrs, &output_cmp, scale_v);
// compare results
int eq = 0;
int neq = 0;
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
T *output_cmp_data = output_cmp.data<T>();
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(
output_data[i] == output_cmp_data[i],
"The execution of test_fusion_conv_add_relu_int8_op is failed!");
if (output_data[i] == output_cmp_data[i]) {
++eq;
} else {
++neq;
}
}
std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int main(int argc, char *argv[]) {
if (argc < 5) {
LOG(paddle_mobile::kLOG_INFO)
<< "Usage:\n"
<< " ./test-conv-add-relu-int8-op in_channels in_height in_width "
"out_channels\n"
<< " params:\n"
<< " -in_channels: int, input image's channels\n"
<< " -in_height: int, input image's height\n"
<< " -in_width: int, input image's width\n"
<< " -out_channels: int, conv output channels\n";
return 1;
}
int in_channels = atoi(argv[1]);
int in_height = atoi(argv[2]);
int in_width = atoi(argv[3]);
int out_channels = atoi(argv[4]);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8_t, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, 3, 1, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
paddle_mobile::TestConvOp<int8_t, 7, 0, 2>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
paddle_mobile::TestConvOp<int8_t, 7, 1, 2>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 3, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
paddle_mobile::TestConvOp<int8_t, 7, 3, 2>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, 7, 0, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, 7, 1, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 3, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
paddle_mobile::TestConvOp<int8_t, 7, 3, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 5, stride = 3
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
paddle_mobile::TestConvOp<int8_t, 7, 5, 3>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 3, stride = 4
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
paddle_mobile::TestConvOp<int8_t, 7, 3, 4>(in_channels, in_height, in_width,
out_channels);
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, 3, 0, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, 3, 1, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, 5, 0, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 5, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
paddle_mobile::TestConvOp<int8_t, 5, 2, 1>(in_channels, in_height, in_width,
out_channels);
}
#else
int main() {
std::cout << "FUSION_CONVADDRELU_INT8_OP is not defined!" << std::endl;
return 0;
}
#endif
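As a numeric check of the reference path in conv2d<T> above (int32 accumulate, add bias with saturation, relu, then scale and round toward zero into int8), here is a standalone walk-through with made-up values; it reproduces the arithmetic of qadd_int32/qscale_int32 without depending on this test file.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  int32_t acc = 3500;                      // int32 convolution accumulator
  int32_t bias = 500;
  int32_t tmp = acc + bias;                // qadd_int32 also saturates at int32 limits
  tmp = std::max(0, tmp);                  // fused relu
  float scaled = tmp * 0.000828f;          // 4000 * 0.000828 = 3.312
  float rounded = scaled > 0 ? std::floor(scaled) : std::ceil(scaled);  // round toward zero
  int8_t q = static_cast<int8_t>(std::min(127.f, std::max(-127.f, rounded)));
  std::cout << static_cast<int>(q) << std::endl;  // prints 3
  return 0;
}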
......@@ -18,9 +18,6 @@ limitations under the License. */
#include "../test_include.h"
#include "framework/operator.h"
#include "operators/fusion_fc_op.h"
#ifdef FUSION_FC_INT8_OP
#include "operators/fusion_fc_int8_op.h"
#endif
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
......@@ -105,18 +102,8 @@ int TestFcOP() {
attrs["y_num_col_dims"].Set<int>(1);
attrs["axis"].Set<int>(1);
operators::OperatorBase<CPU> *op = nullptr;
#ifdef FUSION_FC_INT8_OP
if (std::is_same<T, int8_t>::value) {
op = new operators::FusionFcInt8Op<CPU, T>("fusion_fc_int8", inputs,
outputs, attrs, scope);
} else {
op = new operators::FusionFcOp<CPU, T>("fusion_fc", inputs, outputs, attrs,
scope);
}
#else
op = new operators::FusionFcOp<CPU, T>("fusion_fc", inputs, outputs, attrs,
scope);
#endif
op->InferShape();
op->Run();
auto output = output_var->template Get<framework::LoDTensor>();
......@@ -168,9 +155,6 @@ int TestFcOP() {
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
#ifdef FUSION_FC_INT8_OP
paddle_mobile::TestFcOP<int8_t, int32_t>();
#endif
paddle_mobile::TestFcOP<float, float>();
return 0;
}
......@@ -14,88 +14,14 @@ limitations under the License. */
#include <iostream>
#include "../test_include.h"
#include "operators/kernel/central-arm-func/pool_arm_func.h"
#include "operators/math/pooling.h"
#include "operators/pool_op.h"
namespace paddle_mobile {
static int PoolOutputSize(int input_size, int filter_size, int padding,
int stride, bool ceil_mode) {
int output_size;
if (!ceil_mode) {
output_size = (input_size - filter_size + 2 * padding) / stride + 1;
} else {
output_size =
(input_size - filter_size + 2 * padding + stride - 1) / stride + 1;
}
return output_size;
}
template <typename T>
static void PoolAvgPad0(std::vector<int> ksize, std::vector<int> strides,
const framework::Tensor *input,
framework::Tensor *out) {
const int32_t batch_size = input->dims()[0];
const int32_t input_c = input->dims()[1];
const int32_t input_h = input->dims()[2];
const int32_t input_w = input->dims()[3];
const int32_t out_c = out->dims()[1];
const int32_t out_h = out->dims()[2];
const int32_t out_w = out->dims()[3];
const int32_t kernel_h = ksize[0];
const int32_t kernel_w = ksize[1];
const int32_t stride_h = strides[0];
const int32_t stride_w = strides[1];
const int32_t inputdata_channel_stride = input_h * input_w;
const int32_t input_batch_stride = input_c * inputdata_channel_stride;
const int32_t outputdata_channel_stride = out_h * out_w;
const int32_t output_batch_stride = out_c * outputdata_channel_stride;
T *out_data = out->mutable_data<T>();
const T *input_data = input->data<T>();
const T **rows = new const T *[kernel_h];
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < out_c; ++j) {
const T *img_in = input_data + j * inputdata_channel_stride;
T *img_out = out_data + j * outputdata_channel_stride;
for (int k = 0; k < out_h; ++k) {
for (int m = 0; m < kernel_h; ++m) {
rows[m] = img_in + (stride_h * k + m) * input_w;
}
int32_t left = out_w;
while (left > 0) {
float tmp = 0;
for (int m = 0; m < kernel_h; ++m) {
for (int l = 0; l < kernel_w; ++l) {
tmp += rows[m][l];
}
}
if (typeid(T) == typeid(int8_t)) {
tmp = tmp / (kernel_h * kernel_w);
if (tmp < -127) {
*img_out = -127;
} else if (tmp > 127) {
*img_out = 127;
} else {
*img_out = static_cast<T>(std::round(tmp));
}
} else {
*img_out = static_cast<T>(tmp / (kernel_h * kernel_w));
}
for (int m = 0; m < kernel_h; ++m) {
rows[m] += stride_w;
}
img_out++;
left--;
}
}
}
input_data += input_batch_stride;
out_data += output_batch_stride;
}
delete[] rows;
}
namespace math = operators::math;
template <typename T, int CeilMode, int PoolType, int Kernel, int Pad,
int Stride>
template <int PoolType, int Kernel, int Pad, int Stride>
int TestPoolOp(int in_channels, int in_height, int in_width) {
int kernel_h = Kernel;
int kernel_w = Kernel;
......@@ -103,7 +29,6 @@ int TestPoolOp(int in_channels, int in_height, int in_width) {
int pad_w = Pad;
int stride_h = Stride;
int stride_w = Stride;
bool ceil_mode = CeilMode != 0;
std::string pooling_type = (PoolType == 0 ? "max" : "avg");
int batch_size = 1;
......@@ -114,14 +39,6 @@ int TestPoolOp(int in_channels, int in_height, int in_width) {
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
std::vector<int64_t> output_shape_v({batch_size, input_c});
output_shape_v.push_back(
PoolOutputSize(input_h, kernel_h, pad_h, stride_h, ceil_mode));
output_shape_v.push_back(
PoolOutputSize(input_w, kernel_w, pad_w, stride_w, ceil_mode));
framework::DDim output_shape = framework::make_ddim(output_shape_v);
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
......@@ -130,7 +47,11 @@ int TestPoolOp(int in_channels, int in_height, int in_width) {
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(input, input_shape, -127, 127);
SetupTensor<float>(input, input_shape, -127, 127);
// for (int i = 0; i < input->numel(); ++i) {
// DLOG << "input[" << i << "] = " << input->data<float>()[i];
// }
auto output_var = scope.get()->Var("output");
framework::AttributeMap attrs;
......@@ -138,7 +59,8 @@ int TestPoolOp(int in_channels, int in_height, int in_width) {
attrs["ksize"].Set<vector<int>>(std::vector<int>({kernel_h, kernel_w}));
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["ceil_mode"].Set<bool>(false);
attrs["ceil_mode"].Set<bool>(true);
// attrs["ceil_mode"].Set<bool>(false);
attrs["global_pooling"].Set<bool>(false);
auto *op = new operators::PoolOp<CPU, float>("pool2d", inputs, outputs, attrs,
......@@ -147,43 +69,36 @@ int TestPoolOp(int in_channels, int in_height, int in_width) {
op->Init();
op->Run();
auto output = output_var->template Get<framework::LoDTensor>();
framework::Tensor output_cmp;
output_cmp.mutable_data<T>(output_shape);
if (pooling_type == "avg" && pad_h == 0 && pad_h == pad_w) {
PoolAvgPad0<T>(std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w}, input, &output_cmp);
} else {
if (typeid(T) == typeid(int8_t)) {
operators::PoolBasic<int8_t, int32_t>(
pooling_type, std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w}, std::vector<int>{pad_h, pad_w},
input, &output_cmp);
output_cmp.mutable_data<float>(output->dims());
if (pooling_type == "avg") {
math::Pooling<AVG>()(*input, std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w},
std::vector<int>{pad_h, pad_w}, &output_cmp);
} else {
operators::PoolBasic<float, float>(
pooling_type, std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w}, std::vector<int>{pad_h, pad_w},
input, &output_cmp);
}
math::Pooling<MAX>()(*input, std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w},
std::vector<int>{pad_h, pad_w}, &output_cmp);
}
// compare results
int eq = 0;
int neq = 0;
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
T *output_cmp_data = output_cmp.data<T>();
const float *output_data = output->data<float>();
float *output_cmp_data = output_cmp.data<float>();
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
"The execution of test_pool_op is failed!");
if (output_data[i] == output_cmp_data[i]) {
++eq;
} else {
++neq;
float gap = output_data[i] - output_cmp_data[i];
// PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
// "output[%d] = %d, output_cmp[%d] = %d", i,
// output_data[i], i, output_cmp_data[i]);
if (gap > 1e-5 && std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) {
LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
<< ", output_cmp_data[" << i
<< "] = " << output_cmp_data[i];
exit(1);
}
}
std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
......@@ -202,91 +117,80 @@ int main(int argc, char *argv[]) {
int in_channels = atoi(argv[1]);
int in_height = atoi(argv[2]);
int in_width = atoi(argv[3]);
#if __ARM_NEON
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1";
paddle_mobile::TestPoolOp<float, 0, 0, 3, 1, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 2
<< "float, pooling_type=max, kernel=3, pad=0, stride=1";
paddle_mobile::TestPoolOp<0, 3, 0, 1>(in_channels, in_height, in_width);
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2";
paddle_mobile::TestPoolOp<float, 0, 0, 3, 0, 2>(in_channels, in_height,
in_width);
#endif
  // kernel = 3, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=1";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 0, 1>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 1, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 1, 1>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 2, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=1";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 2, 1>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 0, stride = 2
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 0, 2>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 1, stride = 2
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=2";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 1, 2>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 2, stride = 2
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=2";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 2, 2>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 3, stride = 3
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=3, stride=3";
  paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 3, 3>(in_channels, in_height,
                                                   in_width);
  // kernel = 7, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1";
  paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 1>(in_channels, in_height,
                                                   in_width);
  // kernel = 7, pad = 0, stride = 2
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=2";
  paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 2>(in_channels, in_height,
                                                   in_width);
  // kernel = 7, pad = 0, stride = 3
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=3";
  paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 3>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=1";
  paddle_mobile::TestPoolOp<int8_t, 0, 1, 3, 0, 1>(in_channels, in_height,
                                                   in_width);
  // kernel = 3, pad = 0, stride = 3
  LOG(paddle_mobile::kLOG_INFO)
      << "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=3";
  paddle_mobile::TestPoolOp<int8_t, 0, 1, 3, 0, 3>(in_channels, in_height,
                                                   in_width);
  // kernel = 7, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1";
  paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 1>(in_channels, in_height,
                                                  in_width);
  // kernel = 7, pad = 0, stride = 4
  LOG(paddle_mobile::kLOG_INFO)
      << "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=4";
  paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 4>(in_channels, in_height,
                                                  in_width);
  // kernel = 5, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO)
      << "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1";
  paddle_mobile::TestPoolOp<float, 0, 1, 5, 0, 1>(in_channels, in_height,
                                                  in_width);
  // // kernel = 5, pad = 0, stride = 1
  // LOG(paddle_mobile::kLOG_INFO)
  //     << "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1";
  // paddle_mobile::TestPoolOp<float, 0, 1, 5, 0, 1>(in_channels, in_height,
  //                                                 in_width);
  // // kernel = 5, pad = 0, stride = 2
  // LOG(paddle_mobile::kLOG_INFO)
  //     << "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=2";
  // paddle_mobile::TestPoolOp<float, 0, 1, 5, 0, 2>(in_channels, in_height,
  //                                                 in_width);
  // // kernel = 7, pad = 0, stride = 1
  // LOG(paddle_mobile::kLOG_INFO)
  //     << "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1";
  // paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 1>(in_channels, in_height,
  //                                                 in_width);
  // // kernel = 7, pad = 0, stride = 4
  // LOG(paddle_mobile::kLOG_INFO)
  //     << "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=4";
  // paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 4>(in_channels, in_height,
  //                                                 in_width);
}
......@@ -12,29 +12,88 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include <limits>
#include "../test_include.h"
#include "operators/softmax_op.h"
namespace paddle_mobile {
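// Reference implementation used to verify the operator: the per-row maximum
// is subtracted before exponentiating so std::exp cannot overflow, then each
// row is normalized by the sum of its exponentials.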
void Softmax(const framework::Tensor *X, framework::Tensor *Y) {
const framework::DDim &dims = X->dims();
int batch_size = dims[0];
int num_classes = dims[dims.size() - 1];
int channels = X->numel() / batch_size / num_classes;
const float *x = X->data<float>();
float *y = Y->mutable_data<float>();
for (int batch = 0; batch < batch_size; ++batch) {
for (int c = 0; c < channels; ++c) {
size_t offset = (batch * channels + c) * num_classes;
const float *input = x + offset;
float *output = y + offset;
float max = -std::numeric_limits<float>::max();
for (int j = 0; j < num_classes; ++j) {
max = (input[j] > max) ? input[j] : max;
}
float sum = 0.f;
for (int j = 0; j < num_classes; ++j) {
float tmp = std::exp(input[j] - max);
sum += tmp;
output[j] = tmp;
}
for (int j = 0; j < num_classes; ++j) {
output[j] /= sum;
}
}
}
}
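// Runs a standalone SoftmaxOp on random input drawn from [-100, 100] and
// compares every output element against the reference Softmax above,
// allowing a relative error of 1e-3.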
int TestSoftmaxOp(const std::vector<int> input_shape) {
framework::DDim dims = framework::make_ddim(input_shape);
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
inputs["X"] = std::vector<std::string>({"input"});
outputs["Out"] = std::vector<std::string>({"output"});
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<float>(input, dims, -100.0, 100.0);
auto output_var = scope.get()->Var("output");
auto output = output_var->template Get<framework::LoDTensor>();
framework::AttributeMap attrs;
auto *op = new operators::SoftmaxOp<CPU, float>("softmax", inputs, outputs,
attrs, scope);
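// Drive the operator by hand: InferShape() computes the output dims, Init()
// sets up the CPU kernel, and Run() executes it once.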
op->InferShape();
op->Init();
op->Run();
framework::Tensor output_cmp;
float *output_cmp_data = output_cmp.mutable_data<float>(output->dims());
Softmax(input, &output_cmp);
const float *output_data = output->data<float>();
for (int i = 0; i < output->numel(); ++i) {
float gap = output_data[i] - output_cmp_data[i];
if (std::abs(gap / (output_data[i] + 1e-5)) > 1e-3) {
LOG(kLOG_INFO) << "output_data[" << i << "] = " << output_data[i]
<< ", output_cmp_data[" << i
<< "] = " << output_cmp_data[i];
delete op;
exit(1);
}
}
delete op;
return 0;
}
} // namespace paddle_mobile
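// Exercise both a 2-D ({batch, classes}) and a 3-D ({batch, channels,
// classes}) shape so the channel loop in the reference Softmax is covered.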
int main(int argc, char *argv[]) {
paddle_mobile::TestSoftmaxOp({128, 1000});
paddle_mobile::TestSoftmaxOp({128, 10, 1000});
return 0;
}
......@@ -213,8 +213,6 @@ if(NOT FOUND_MATCH)
set(FUSION_CONVADD_OP ON)
set(FUSION_CONVADDPRELU_OP ON)
set(FUSION_CONVADDRELU_OP ON)
set(FUSION_CONVADDRELU_INT8_OP ON)
set(FUSION_FC_INT8_OP ON)
set(FUSION_FC_OP ON)
set(LRN_OP ON)
set(MUL_OP ON)
......@@ -249,11 +247,16 @@ if(NOT FOUND_MATCH)
set(SHAPE_OP ON)
set(ELEMENTWISEMUL_OP ON)
set(SUM_OP ON)
set(TOP_K_OP ON)
set(CAST_OP ON)
set(QUANT_OP ON)
set(DEQUANT_OP ON)
set(FUSION_DEQUANT_BN_OP ON)
set(FUSION_DEQUANT_ADD_BN_OP ON)
set(FUSION_DEQUANT_BN_RELU_OP ON)
set(FUSION_DEQUANT_ADD_BN_RELU_OP ON)
set(FUSION_DEQUANT_ADD_BN_QUANT_OP ON)
set(FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP ON)
endif()
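# Every <NAME>_OP switch turned on above is mapped further down to a matching
# -D<NAME>_OP compile definition (for example TOP_K_OP becomes
# add_definitions(-DTOP_K_OP)), so only the selected operators are compiled in.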
# option(BATCHNORM_OP "" ON)
......@@ -311,9 +314,6 @@ endif()
if (FUSION_CONVADDRELU_OP)
add_definitions(-DFUSION_CONVADDRELU_OP)
endif()
if (FUSION_CONVADDRELU_INT8_OP)
add_definitions(-DFUSION_CONVADDRELU_INT8_OP)
endif()
if (FUSION_CONVADDPRELU_OP)
add_definitions(-DFUSION_CONVADDPRELU_OP)
endif()
......@@ -323,9 +323,6 @@ endif()
if (FUSION_FC_OP)
add_definitions(-DFUSION_FC_OP)
endif()
if(FUSION_FC_INT8_OP)
add_definitions(-DFUSION_FC_INT8_OP)
endif()
if (LRN_OP)
add_definitions(-DLRN_OP)
endif()
......@@ -454,13 +451,21 @@ endif()
if (SUM_OP)
add_definitions(-DSUM_OP)
endif()
if (TOP_K_OP)
add_definitions(-DTOP_K_OP)
endif()
if (CAST_OP)
add_definitions(-DCAST_OP)
endif()
if (QUANT_OP)
add_definitions(-DQUANT_OP)
endif()
if (DEQUANT_OP)
add_definitions(-DDEQUANT_OP)
endif()
if (FUSION_DEQUANT_BN_OP)
add_definitions(-DFUSION_DEQUANT_BN_OP)
endif()
if (FUSION_DEQUANT_ADD_BN_OP)
add_definitions(-DFUSION_DEQUANT_ADD_BN_OP)
endif()
......@@ -470,7 +475,12 @@ endif()
if (FUSION_DEQUANT_ADD_BN_RELU_OP)
add_definitions(-DFUSION_DEQUANT_ADD_BN_RELU_OP)
endif()
if (FUSION_DEQUANT_ADD_BN_QUANT_OP)
# add_definitions(-DFUSION_DEQUANT_ADD_BN_QUANT_OP)
endif()
if (FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
# add_definitions(-DFUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
endif()
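# Note: the two *_QUANT_OP definitions above are left commented out, so those
# fused kernels are not compiled even though their switches are set to ON.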
if (TANH_OP)
add_definitions(-DTANH_OP)
......@@ -484,4 +494,3 @@ endif()
if (FUSION_DECONVADDRELU_OP)
add_definitions(-DFUSION_DECONVADDRELU_OP)
endif()