diff --git a/README.md b/README.md index 1a478db3770e1f5e518594fd2fefabb686cf3c38..ee4e20513186979fe76c1259e7fc3ca962426843 100644 --- a/README.md +++ b/README.md @@ -26,61 +26,10 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 - **ARM CPU** -|mobilenet arm v7|1线程|2线程|4线程| -|------------|----|-----|-----| -|麒麟970(ms)|108.180|63.935|37.545| -|麒麟960(ms)|108.588|63.073|36.822| -|高通845(ms)|85.952|48.890|28.641| -|高通835(ms)|105.434|62.752|37.131| -||||| -|mobilenetssd arm v7|1线程|2线程|4线程| -|麒麟970(ms)|212.686|127.205|77.485| -|麒麟960(ms)|212.641|125.338|75.250| -|高通845(ms)|182.863|95.671|56.857| -|高通835(ms)|213.849|127.717|77.006| -||||| -|googlenet(v1) arm v7|1线程|2线程|4线程| -|麒麟970(ms)|335.288|234.559|161.295| -|麒麟960(ms)|354.443|232.642|157.815| -|高通845(ms)|282.007|173.146|122.148| -|高通835(ms)|341.250|233.354|158.554| -||||| -|squeezenet arm v7|1线程|2线程|4线程| -|麒麟970(ms)|83.726|57.944|36.923| -|麒麟960(ms)|85.835|55.762|36.496| -|高通845(ms)|71.301|41.618|28.785| -|高通835(ms)|82.407|56.176|36.455| -||||| -|yolo arm v7|1线程|2线程|4线程| -|麒麟970(ms)|129.658|79.993|49.969| -|麒麟960(ms)|130.208|78.791|48.390| -|高通845(ms)|109.244|61.736|40.600| -|高通835(ms)|130.402|80.863|50.359| - - 测试机型信息: - 麒麟970:荣耀v10 (2.36GHz * 4 + 1.8GHz * 4) - 麒麟960:华为mate9 (2.36GHz * 4 + 1.8GHz * 4) - 骁龙835:小米6 (2.45GHz * 4 + 1.9GHz * 4) - 骁龙845:OPPO FindX (2.80GHz * 4 + 1.8GHz * 4) - - **Mali GPU** - Mali GPU是百度和ARM合作开发的,双方团队近期都在致力于将paddle的op能无缝运行在ACL(arm compute library)。目前已经支持squeezenet,googlenet,resnet等几个网络模型,后续会继续加大力度。使全部移动端paddle op能高效运行在mali gpu上。 - - **苹果设备的GPU Metal实现** -|mobilenetfssd|速度| -|------------|-----| -|A9(ms)|33.78| -|A10(ms)|24.05| -|A11(ms)|17.15| -||| -|genet|速度| -|A9(ms) |3.49| -|A10(ms)|2.54| -|A11(ms)|1.43| - - - **FPGA** 目前已经支持 ZCU102 开发板。 diff --git a/doc/development_fpga.md b/doc/development_fpga.md index 14cc57c6b4055e8c4e45d8b673eb1e3be22ae256..3389ddde676a5d1c7b452dc734880eb50170bd3e 100644 --- a/doc/development_fpga.md +++ b/doc/development_fpga.md @@ -27,8 +27,9 @@ ___ ## 准备模型和数据 ___ 1. 模型文件放在./test/models/resnet50中。将[\_\_model\_\_](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)文件复制到此文件夹下。 -2. 另外下载模型[权重文件](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar),解压后也放在./test/models/resnet50 中。 -3. 将数据文件[image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)复制到/test/images下。此数据文件对应着标准数据集中的ILSVRC2012_val_00000885.JPEG,分类标签为80, 对应着"black grouse". +2. 如果不存在,则创建文件夹./test/models/resnet50 和 ./test/images。 +3. 另外下载模型[权重文件](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar),解压后也放在./test/models/resnet50 中。 +4. 
将数据文件[image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)复制到./test/images下。此数据文件对应着标准数据集中的ILSVRC2012_val_00000885.JPEG,分类标签为80, 对应着"black grouse"。 ## 运行程序 ___ diff --git a/doc/development_ios.md b/doc/development_ios.md index 1d4f28bd5bcde1c3068ddeae87627ae6686d886a..1dbc7555e8ed6db94071c571673212d0ce2b7a71 100644 --- a/doc/development_ios.md +++ b/doc/development_ios.md @@ -34,7 +34,7 @@ cd ../build/release/ios/build libpaddle-mobile.a /src/ios_io/ 下的 -PaddleMobile.h +PaddleMobileCPU.h ``` 拖入工程 diff --git a/src/common/types.cpp b/src/common/types.cpp index 8c8de7765161e61dc75036a87a34fc6abd2df43e..b90fb70f2a81b365f049632cc7281a69ec58e18d 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -40,9 +40,11 @@ const char *G_OP_TYPE_POOL2D = "pool2d"; const char *G_OP_TYPE_PRIOR_BOX = "prior_box"; const char *G_OP_TYPE_RELU = "relu"; const char *G_OP_TYPE_RESHAPE = "reshape"; +const char *G_OP_TYPE_RESHAPE2 = "reshape2"; const char *G_OP_TYPE_SIGMOID = "sigmoid"; const char *G_OP_TYPE_SOFTMAX = "softmax"; const char *G_OP_TYPE_TRANSPOSE = "transpose"; +const char *G_OP_TYPE_TRANSPOSE2 = "transpose2"; const char *G_OP_TYPE_SPLIT = "split"; const char *G_OP_TYPE_FEED = "feed"; const char *G_OP_TYPE_FETCH = "fetch"; @@ -90,6 +92,7 @@ std::unordered_map< {G_OP_TYPE_FEED, {{"X"}, {"Out"}}}, {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}, {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_TRANSPOSE2, {{"X"}, {"Out", "XShape"}}}, {G_OP_TYPE_BOX_CODER, {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}}, {G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, @@ -99,6 +102,7 @@ std::unordered_map< {G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}}, {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, diff --git a/src/framework/load_ops.h b/src/framework/load_ops.h index 2b76b0158fe06e8678208f6f98fcdb71f8d91e51..982f1c0f3525afde8475866c0121343fafc9d5a0 100644 --- a/src/framework/load_ops.h +++ b/src/framework/load_ops.h @@ -109,9 +109,15 @@ LOAD_FUSION_MATCHER(fusion_conv_add_bn_relu); #ifdef RESHAPE_OP LOAD_OP2(reshape, CPU, MALI_GPU); #endif +#ifdef RESHAPE2_OP +LOAD_OP2(reshape2, CPU, MALI_GPU); +#endif #ifdef TRANSPOSE_OP LOAD_OP1(transpose, CPU); #endif +#ifdef TRANSPOSE2_OP +LOAD_OP1(transpose2, CPU); +#endif #ifdef PRIORBOX_OP LOAD_OP1(prior_box, CPU); #endif @@ -221,5 +227,9 @@ LOAD_FUSION_MATCHER(fusion_conv_bn); #ifdef ELEMENTWISESUB_OP LOAD_OP1(elementwise_sub, CPU) #endif +#ifdef QUANT_OP LOAD_OP1(quantize, CPU); +#endif +#ifdef DEQUANT_OP LOAD_OP1(dequantize, CPU); +#endif diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp index e7552d2602b31f9a5c10e3d81122babae8fcf1a8..11a1f0a53d4886e1a07d258b76b3827671471dca 100644 --- a/src/operators/kernel/arm/quantize_kernel.cpp +++ b/src/operators/kernel/arm/quantize_kernel.cpp @@ -135,11 +135,15 @@ static void quantize_round_to_even(const Tensor *input, const float scale, #if defined(__ARM_NEON__) || defined(__ARM_NEON) size_t loop = size >> 4; size_t remain = size & 0xF; + + #pragma omp parallel for for (size_t i = 0; i < loop; ++i) { - float32x4_t r0 = vld1q_f32(x); - float32x4_t r1 = vld1q_f32(x + 4); - float32x4_t r2 = vld1q_f32(x + 8); - float32x4_t r3 = vld1q_f32(x + 12); + const float *local_x = x + (i << 4); + 
int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
     r0 = vmulq_n_f32(r0, scale);
     r1 = vmulq_n_f32(r1, scale);
     r2 = vmulq_n_f32(r2, scale);
@@ -156,12 +160,12 @@ static void quantize_round_to_even(const Tensor *input, const float scale,
     int16x8_t q6 = vcombine_s16(d2, d3);
     int8x8_t d5 = vmovn_s16(q5);
     int8x8_t d6 = vmovn_s16(q6);
-    vst1_s8(y, d5);
-    vst1_s8(y + 8, d6);
-    x += 16;
-    y += 16;
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     float value = x[i] * scale;
@@ -187,11 +191,15 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
   size_t loop = size >> 4;
   size_t remain = size & 0xF;
+
+  #pragma omp parallel for
   for (size_t i = 0; i < loop; ++i) {
-    float32x4_t r0 = vld1q_f32(x);
-    float32x4_t r1 = vld1q_f32(x + 4);
-    float32x4_t r2 = vld1q_f32(x + 8);
-    float32x4_t r3 = vld1q_f32(x + 12);
+    const float *local_x = x + (i << 4);
+    int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
     r0 = vmulq_n_f32(r0, scale);
     r1 = vmulq_n_f32(r1, scale);
     r2 = vmulq_n_f32(r2, scale);
@@ -208,12 +216,12 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
     int16x8_t q6 = vcombine_s16(d2, d3);
     int8x8_t d5 = vmovn_s16(q5);
     int8x8_t d6 = vmovn_s16(q6);
-    vst1_s8(y, d5);
-    vst1_s8(y + 8, d6);
-    x += 16;
-    y += 16;
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     y[i] = trunc(x[i] * scale);
@@ -228,11 +236,15 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
   size_t loop = size >> 4;
   size_t remain = size & 0xF;
+
+  #pragma omp parallel for
   for (size_t i = 0; i < loop; ++i) {
-    float32x4_t r0 = vld1q_f32(x);
-    float32x4_t r1 = vld1q_f32(x + 4);
-    float32x4_t r2 = vld1q_f32(x + 8);
-    float32x4_t r3 = vld1q_f32(x + 12);
+    const float *local_x = x + (i << 4);
+    int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
     r0 = vmulq_n_f32(r0, scale);
     r1 = vmulq_n_f32(r1, scale);
     r2 = vmulq_n_f32(r2, scale);
@@ -249,12 +261,12 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
     int16x8_t q6 = vcombine_s16(d2, d3);
     int8x8_t d5 = vmovn_s16(q5);
     int8x8_t d6 = vmovn_s16(q6);
-    vst1_s8(y, d5);
-    vst1_s8(y + 8, d6);
-    x += 16;
-    y += 16;
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     y[i] = round(x[i] * scale);
diff --git a/src/operators/kernel/arm/reshape2_kernel.cpp b/src/operators/kernel/arm/reshape2_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..83bbf112abb8b5e290126d6909a0fe77291f8fac
--- /dev/null
+++ b/src/operators/kernel/arm/reshape2_kernel.cpp
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE2_OP
+
+#include "operators/kernel/reshape2_kernel.h"
+#include "operators/kernel/central-arm-func/reshape2_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool Reshape2Kernel<CPU, float>::Init(Reshape2Param<CPU> *param) {
+  return true;
+}
+
+template <>
+void Reshape2Kernel<CPU, float>::Compute(
+    const Reshape2Param<CPU> &param) const {
+  Reshape2Compute<float>(param);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/arm/transpose2_kernel.cpp b/src/operators/kernel/arm/transpose2_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..656d2768840a52f50c42d3797018aa9aec037783
--- /dev/null
+++ b/src/operators/kernel/arm/transpose2_kernel.cpp
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef TRANSPOSE2_OP
+
+#include "operators/kernel/transpose2_kernel.h"
+#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
+  return true;
+}
+
+template <>
+void Transpose2Kernel<CPU, float>::Compute(
+    const Transpose2Param<CPU> &param) const {
+  Transpose2Compute<float>(param);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h
index f80a8f944139566483c47daf10f9decac49650dc..e7a8c7f52db327f3ff5871566c3557c484ba4d13 100644
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -16,7 +16,6 @@ limitations under the License. */
 #pragma once
 #include <vector>
-#include "operators/math/conv_arm_int8.h"
 #include "operators/math/conv_func.h"
 #include "operators/math/depthwise_conv_3x3.h"
 #include "operators/math/im2col.h"
@@ -28,11 +27,12 @@ limitations under the License.
*/ namespace paddle_mobile { namespace operators { -template +template inline void ConvBasic(const ConvParam ¶m) { const Tensor *input = param.Input(); Tensor filter = *param.Filter(); Tensor *output = param.Output(); + output->mutable_data(); int groups = param.Groups(); const std::vector strides = param.Strides(); const std::vector paddings = param.Paddings(); @@ -60,7 +60,7 @@ inline void ConvBasic(const ConvParam ¶m) { Tensor col; Tensor col_matrix; if (is_expand) { - col.mutable_data(col_shape); + col.mutable_data(col_shape); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } @@ -79,8 +79,8 @@ inline void ConvBasic(const ConvParam ¶m) { int in_step = static_cast(input->dims()[1]) / groups; int out_step = static_cast(output->dims()[1]) / groups; - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); @@ -109,69 +109,18 @@ inline void ConvBasic(const ConvParam ¶m) { Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, + math::matmul(filter_slice, false, col_matrix, false, static_cast(1), &out_slice, static_cast(0)); } } } -inline void ConvCompute_int8(const ConvParam ¶m) { - typedef void (*ConvFunc)(const Tensor &input, const Tensor &kernel, - Tensor *output); - static ConvFunc conv_funcs_table[7][5] = { - {0, 0, 0, 0, 0}, // k = 1 - {0, 0, 0, 0, 0}, {conv3x3s1_int8, 0, 0, 0, 0}, // k = 3 - {0, 0, 0, 0, 0}, {conv5x5s1_int8, 0, 0, 0, 0}, // k = 5 - {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, // k = 7 - }; - const Tensor *input = param.Input(); - Tensor *filter = param.Filter(); - Tensor *output = param.Output(); - int groups = param.Groups(); - const std::vector &strides = param.Strides(); - const std::vector &paddings = param.Paddings(); - const std::vector &dilations = param.Dilations(); - int kernel_h = filter->dims()[2]; - int kernel_w = filter->dims()[3]; - output->mutable_data(); - - ConvFunc conv_func = 0; - if (strides[1] == strides[0] && strides[1] < 6 && kernel_h == kernel_w && - kernel_h < 8 && groups == 1 && dilations[0] == dilations[1] && - dilations[1] == 1) { - conv_func = conv_funcs_table[kernel_h - 1][strides[0] - 1]; - } - if (conv_func) { - int batch_size = input->dims()[0]; - math::PadFunctor pad; - - Tensor input_pad; - for (int i = 0; i < batch_size; ++i) { - Tensor in_batch = input->Slice(i, i + 1); - Tensor out_batch = output->Slice(i, i + 1); - if (paddings[0] == 0 && paddings[1] == 0) { - input_pad = in_batch; - } else { - framework::DDim pad_shape = in_batch.dims(); - pad_shape[2] += 2 * paddings[0]; - pad_shape[3] += 2 * paddings[1]; - input_pad.mutable_data(pad_shape); - pad(in_batch, paddings[0], paddings[1], &input_pad); - } - conv_func(input_pad, *filter, &out_batch); - } - } else { - ConvBasic(param); - } -} - template void ConvCompute(const ConvParam ¶m) { if (param.Input()->type() == typeid(int8_t)) { - ConvCompute_int8(param); + ConvBasic(param); } else { - param.Output()->mutable_data(); if (param.Groups() == param.Input()->dims()[1] && param.Input()->dims()[1] == param.Output()->dims()[1] && param.Filter()->dims()[2] == param.Filter()->dims()[3] && @@ -185,7 +134,7 @@ void ConvCompute(const ConvParam ¶m) { math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), param.Filter(), nullptr, param.Output(), false); } else { - 
ConvBasic(param); + ConvBasic(param); } } } diff --git a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h index ff5d5d4b2a351d075fcecce209063aa66e026754..73170bdab922a46831334307aebc8af210ddfb73 100644 --- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h @@ -44,7 +44,7 @@ void DepthwiseConvCompute(const ConvParam ¶m) { Bias, false); } else { - ConvBasic(param); + ConvBasic(param); } } diff --git a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h index 0c01ef0072444479d2d2e2f7676b842d89e432ec..b6288380a04c71b3d6467f7f6648db046ae9acc9 100644 --- a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h +++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h @@ -58,6 +58,7 @@ void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { const float *input_data = input_x->data(); float *output_data = Out->mutable_data(); for (int i = 0; i < batch; ++i) { + #pragma omp parallel for for (int j = 0; j < channels; ++j) { size_t offset = (i * channels + j) * elementwise_num; const float *input = input_data + offset; diff --git a/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h b/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h index 6db4297046fba8cbb8028f1c70d8214b703158b6..9cbac1035faf4cdc5109a08ea78dfafa8e1df7f2 100644 --- a/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h +++ b/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h @@ -26,7 +26,7 @@ void PolygonBoxTransformCompute(const PolygonBoxTransformParam& param) { const auto& input_dims = input->dims(); const auto* input_data = input->data(); auto* output = param.Output(); - auto* output_data = output->mutable_data(); + auto* output_data = output->mutable_data(input_dims); int64_t batch_size = input_dims[0]; int64_t geo_channel = input_dims[1]; diff --git a/src/operators/kernel/central-arm-func/reshape2_arm_func.h b/src/operators/kernel/central-arm-func/reshape2_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..c22cf120313b039944932fb4e6cc52aa59a68fd4 --- /dev/null +++ b/src/operators/kernel/central-arm-func/reshape2_arm_func.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#ifdef RESHAPE2_OP
+#pragma once
+
+#include <vector>
+#include "operators/kernel/reshape_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename P>
+void Reshape2Compute(const Reshape2Param<CPU> &param) {
+  const auto *input_x = param.InputX();
+  const auto &input_x_dims = input_x->dims();
+  auto *out = param.Out();
+  framework::DDim out_dims = out->dims();
+  const auto *input_shape = param.InputShape();
+
+  if (input_shape) {
+    auto *shape_data = input_shape->data<int>();
+    framework::Tensor cpu_shape_tensor;
+    auto shape =
+        std::vector<int>(shape_data, shape_data + input_shape->numel());
+    out_dims = ValidateShape(shape, input_x->dims());
+  } else {
+    auto &shape = param.Shape();
+    out_dims = ValidateShape(shape, input_x_dims);
+  }
+
+  bool inplace = param.Inplace();
+  out->Resize(out_dims);
+  if (!inplace) {
+    out->mutable_data<float>();
+    framework::TensorCopy(*input_x, out);
+    out->Resize(out_dims);
+  } else {
+    out->ShareDataWith(*input_x);
+    out->Resize(out_dims);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/transpose2_arm_func.h b/src/operators/kernel/central-arm-func/transpose2_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..dea90e863b20f19820d60d9cce67b6849d3c467b
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/transpose2_arm_func.h
@@ -0,0 +1,70 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#ifdef TRANSPOSE2_OP
+#pragma once
+
+#include <vector>
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename P>
+void Transpose2Compute(const Transpose2Param<CPU>& param) {
+  const auto* input_x = param.InputX();
+  const auto input_x_dims = input_x->dims();
+  auto* out = param.Out();
+  const auto axis = param.Axis();
+  const auto* input_x_data = input_x->data<float>();
+  auto* out_data = out->mutable_data<float>();
+
+  size_t ndim = axis.size();
+  std::vector<int> xdim(ndim);
+  std::vector<int> xstride(ndim);
+  std::vector<int> xout(ndim);
+  for (int i = 0; i < ndim; i++) {
+    int j = ndim - 1 - i;
+    xdim[j] = input_x_dims[axis[i]];
+    xstride[j] = 1;
+    for (int k = axis[i] + 1; k < ndim; k++) {
+      xstride[j] *= input_x_dims[k];
+    }
+    xout[j] = xstride[j] * xdim[j];
+  }
+
+  auto numel = input_x->numel();
+  size_t pind = 0;
+  std::vector<int> ind(ndim);
+  for (int i = 0; i < numel; i++) {
+    out_data[i] = input_x_data[pind];
+    ind[0]++;
+    pind += xstride[0];
+    for (int j = 0; j < ndim - 1; j++) {
+      if (ind[j] == xdim[j]) {
+        ind[j + 1]++;
+        ind[j] = 0;
+        pind += xstride[j + 1];
+        pind -= xout[j];
+      } else {
+        break;
+      }
+    }
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/reshape2_kernel.h b/src/operators/kernel/reshape2_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d15a619d314e3f5d3085a34cff503e286b5ee37
--- /dev/null
+++ b/src/operators/kernel/reshape2_kernel.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE2_OP
+
+#pragma once
+
+#include <vector>
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class Reshape2Kernel
+    : public framework::OpKernelBase<DeviceType, Reshape2Param<DeviceType>> {
+ public:
+  void Compute(const Reshape2Param<DeviceType>& param) const;
+  bool Init(Reshape2Param<DeviceType>* param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/math/conv_arm_int8.h b/src/operators/kernel/transpose2_kernel.h
similarity index 60%
rename from src/operators/math/conv_arm_int8.h
rename to src/operators/kernel/transpose2_kernel.h
index 98843e6158bb0f9816bf49a1cbced5a2ea731446..8ae75ea483ddb887d9c53b32228ff72b41c76097
--- a/src/operators/math/conv_arm_int8.h
+++ b/src/operators/kernel/transpose2_kernel.h
@@ -12,25 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ -#ifdef CONV_OP +#ifdef TRANSPOSE2_OP #pragma once -#include "framework/tensor.h" +#include + +#include "framework/operator.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { -void conv3x3s1_int8(const framework::Tensor& input, - const framework::Tensor& weight, framework::Tensor* output); - -void conv3x3s1_int8_4c(const framework::Tensor& input, - const framework::Tensor& weight, - framework::Tensor* output); - -void conv5x5s1_int8(const framework::Tensor& input, - const framework::Tensor& weight, framework::Tensor* output); - +template +class Transpose2Kernel + : public framework::OpKernelBase> { + public: + void Compute(const Transpose2Param& param) const; + bool Init(Transpose2Param* param); +}; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/conv3x3_arm_int8.cpp b/src/operators/math/conv3x3_arm_int8.cpp deleted file mode 100644 index 283dcb2255b43052dcaf2d622ad629e923810a82..0000000000000000000000000000000000000000 --- a/src/operators/math/conv3x3_arm_int8.cpp +++ /dev/null @@ -1,761 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_OP - -#include "operators/math/conv_arm_int8.h" - -namespace paddle_mobile { -namespace operators { - -void conv3x3s1_int8(const framework::Tensor& input, - const framework::Tensor& weight, - framework::Tensor* output) { -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - const int8_t* in_data = input.data(); - const int8_t* w_data = weight.data(); - int32_t* out_data = output->mutable_data(); - // make sure that batch size is 1 - int input_c = input.dims()[1]; - int input_h = input.dims()[2]; - int input_w = input.dims()[3]; - int output_c = output->dims()[1]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - int image_size = input_h * input_w; - int out_image_size = output_h * output_w; - memset(out_data, 0, output_c * out_image_size * sizeof(int32_t)); -#if __aarch64__ - // TODO(hjchen2) -#else - int oc = 0; - #pragma omp parallel for - for (; oc < output_c - 1; oc += 2) { - for (int ic = 0; ic < input_c; ++ic) { - const int8_t* kernel0 = w_data + (oc * input_c + ic) * 9; - const int8_t* kernel1 = w_data + ((oc + 1) * input_c + ic) * 9; - int32_t* output0 = out_data + oc * out_image_size; - int32_t* output0n = output0 + output_w; - int32_t* output1 = out_data + (oc + 1) * out_image_size; - int32_t* output1n = output1 + output_w; - - int oh = 0; - for (; oh < output_h - 1; oh += 2) { - const int8_t* r0 = in_data + ic * image_size + oh * input_w; - const int8_t* r1 = r0 + input_w; - const int8_t* r2 = r1 + input_w; - const int8_t* r3 = r2 + input_w; - - int ow = output_w >> 3; - int remain = output_w & 0x7; - if (ow > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - "vld1.8 {d1}, [%[kernel1]] \n" - "ldr r6, [%[kernel1], #8] \n" - - "0: \n" - "vld1.8 {d2-d3}, [%[r0]] \n" // r0 - "add %[r0], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - 
"vdup.s8 d6, d0[0] \n" - "vdup.s8 d7, d0[1] \n" - "vdup.s8 d8, d0[2] \n" - "vdup.s8 d9, d1[0] \n" - "vdup.s8 d10, d1[1] \n" - "vdup.s8 d11, d1[2] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddl.s16 q12, d12, d14 \n" - "vaddl.s16 q13, d13, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddl.s16 q14, d12, d14 \n" - "vaddl.s16 q15, d13, d15 \n" - - "vld1.8 {d2-d3}, [%[r1]] \n" // r1 - "add %[r1], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - - "vmull.s8 q6, d2, d6 \n" // next row - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddl.s16 q8, d12, d14 \n" - "vaddl.s16 q9, d13, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddl.s16 q10, d12, d14 \n" - "vaddl.s16 q11, d13, d15 \n" - - "vdup.s8 d6, d0[3] \n" - "vdup.s8 d7, d0[4] \n" - "vdup.s8 d8, d0[5] \n" - "vdup.s8 d9, d1[3] \n" - "vdup.s8 d10, d1[4] \n" - "vdup.s8 d11, d1[5] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddw.s16 q14, q14, d12 \n" - "vaddw.s16 q15, q15, d13 \n" - "vaddw.s16 q14, q14, d14 \n" - "vaddw.s16 q15, q15, d15 \n" - - "vld1.8 {d2-d3}, [%[r2]] \n" // r2 - "add %[r2], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - - "vmull.s8 q6, d2, d6 \n" // next row - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q8, q8, d12 \n" - "vaddw.s16 q8, q8, d14 \n" - "vaddw.s16 q9, q9, d13 \n" - "vaddw.s16 q9, q9, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddw.s16 q10, q10, d12 \n" - "vaddw.s16 q11, q11, d13 \n" - "vaddw.s16 q10, q10, d14 \n" - "vaddw.s16 q11, q11, d15 \n" - - "vdup.s8 d6, d0[6] \n" - "vdup.s8 d7, d0[7] \n" - "vdup.s8 d8, r5 \n" - "vdup.s8 d9, d1[6] \n" - "vdup.s8 d10, d1[7] \n" - "vdup.s8 d11, r6 \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d15 \n" - - "vld1.32 {d12-d15}, [%[output0]] \n" - "vadd.s32 q6, q6, q12 \n" - "vadd.s32 q7, q7, q13 \n" - "vst1.32 {d12-d15}, [%[output0]]! \n" - - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddw.s16 q14, q14, d12 \n" - "vaddw.s16 q15, q15, d13 \n" - "vaddw.s16 q14, q14, d14 \n" - "vaddw.s16 q15, q15, d15 \n" - - "vld1.32 {d12-d15}, [%[output1]] \n" - "vadd.s32 q6, q6, q14 \n" - "vadd.s32 q7, q7, q15 \n" - "vst1.32 {d12-d15}, [%[output1]]! \n" - - "vld1.8 {d2-d3}, [%[r3]] \n" // r3 - "add %[r3], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - - "vmull.s8 q6, d2, d6 \n" // next row - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q8, q8, d12 \n" - "vaddw.s16 q9, q9, d15 \n" - "vaddw.s16 q8, q8, d14 \n" - "vaddw.s16 q9, q9, d13 \n" - - "vld1.32 {d12-d15}, [%[output0n]] \n" - "vadd.s32 q6, q6, q8 \n" - "vadd.s32 q7, q7, q9 \n" - "vst1.32 {d12-d15}, [%[output0n]]! 
\n" - - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddw.s16 q10, q10, d12 \n" - "vaddw.s16 q11, q11, d15 \n" - "vaddw.s16 q10, q10, d14 \n" - "vaddw.s16 q11, q11, d13 \n" - - "vld1.32 {d12-d15}, [%[output1n]] \n" - "vadd.s32 q6, q6, q10 \n" - "vadd.s32 q7, q7, q11 \n" - "vst1.32 {d12-d15}, [%[output1n]]! \n" - - "subs %[ow], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [ow] "+r"(ow), [output0] "+r"(output0), [output1] "+r"(output1), - [output0n] "+r"(output0n), [output1n] "+r"(output1n) - : [kernel0] "r"(kernel0), [kernel1] "r"(kernel1) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r5", - "r6"); - } - if (remain > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - "vld1.8 {d1}, [%[kernel1]] \n" - "ldr r6, [%[kernel1], #8] \n" - - "0: \n" - "vld1.8 d4, [%[r0]] \n" - "vld1.8 d5, [%[r1]] \n" - "vld1.8 d6, [%[r2]] \n" - "vld1.8 d7, [%[r3]] \n" - "add %[r0], #1 \n" - "add %[r1], #1 \n" - "add %[r2], #1 \n" - "add %[r3], #1 \n" - "vdup.s8 d2, r5 \n" - "vdup.s8 d3, r6 \n" - "vext.8 d8, d0, d2, #3 \n" - "vext.8 d9, d0, d2, #6 \n" - "vext.8 d10, d1, d3, #3 \n" - "vext.8 d11, d1, d3, #6 \n" - - "vmull.s8 q6, d4, d0 \n" - "vmull.s8 q7, d5, d8 \n" - "vmlal.s8 q6, d6, d9 \n" - "vaddl.s16 q12, d12, d14 \n" - "vdup.s32 d2, d24[1] \n" - "vadd.s32 d24, d24, d2 \n" - "vadd.s32 d24, d24, d25 \n" - "vmull.s8 q6, d4, d1 \n" - "vmull.s8 q7, d5, d10 \n" - "vmlal.s8 q6, d6, d11 \n" - "vaddl.s16 q13, d12, d14 \n" - "vdup.s32 d2, d26[1] \n" - "vadd.s32 d26, d26, d2 \n" - "vadd.s32 d26, d26, d27 \n" - - "ldr r7, [%[output0]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d24 \n" - "vst1.32 d14[0], [%[output0]]! \n" - "ldr r7, [%[output1]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d26 \n" - "vst1.32 d14[0], [%[output1]]! \n" - - "vmull.s8 q6, d5, d0 \n" - "vmull.s8 q7, d6, d8 \n" - "vmlal.s8 q6, d7, d9 \n" - "vaddl.s16 q12, d12, d14 \n" - "vdup.s32 d2, d24[1] \n" - "vadd.s32 d24, d24, d2 \n" - "vadd.s32 d24, d24, d25 \n" - "vmull.s8 q6, d5, d1 \n" - "vmull.s8 q7, d6, d10 \n" - "vmlal.s8 q6, d7, d11 \n" - "vaddl.s16 q13, d12, d14 \n" - "vdup.s32 d2, d26[1] \n" - "vadd.s32 d26, d26, d2 \n" - "vadd.s32 d26, d26, d27 \n" - - "ldr r7, [%[output0n]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d24 \n" - "vst1.32 d14[0], [%[output0n]]! \n" - "ldr r7, [%[output1n]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d26 \n" - "vst1.32 d14[0], [%[output1n]]! 
\n" - - "subs %[remain], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [remain] "+r"(remain), [output0] "+r"(output0), - [output1] "+r"(output1), [output0n] "+r"(output0n), - [output1n] "+r"(output1n) - : [kernel0] "r"(kernel0), [kernel1] "r"(kernel1) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "r5", "r6", "r7"); - } - output0 += output_w; - output1 += output_w; - output0n += output_w; - output1n += output_w; - } - // remain output height - for (; oh < output_h; ++oh) { - const int8_t* r0 = in_data + ic * image_size + oh * input_w; - const int8_t* r1 = r0 + input_w; - const int8_t* r2 = r1 + input_w; - const int8_t* r3 = r2 + input_w; - const int8_t* r4 = r3 + input_w; - - int ow = output_w >> 3; - int remain = output_w & 0x7; - if (ow > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - "vld1.8 {d1}, [%[kernel1]] \n" - "ldr r6, [%[kernel1], #8] \n" - - "0: \n" - "vld1.8 {d2-d3}, [%[r0]] \n" // r0 - "add %[r0], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[0] \n" - "vdup.s8 d7, d0[1] \n" - "vdup.s8 d8, d0[2] \n" - "vdup.s8 d9, d1[0] \n" - "vdup.s8 d10, d1[1] \n" - "vdup.s8 d11, d1[2] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddl.s16 q12, d12, d14 \n" - "vaddl.s16 q13, d13, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddl.s16 q14, d12, d14 \n" - "vaddl.s16 q15, d13, d15 \n" - - "vld1.8 {d2-d3}, [%[r1]] \n" // r1 - "add %[r1], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[3] \n" - "vdup.s8 d7, d0[4] \n" - "vdup.s8 d8, d0[5] \n" - "vdup.s8 d9, d1[3] \n" - "vdup.s8 d10, d1[4] \n" - "vdup.s8 d11, d1[5] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q13, q13, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddw.s16 q14, q14, d12 \n" - "vaddw.s16 q14, q14, d14 \n" - "vaddw.s16 q15, q15, d13 \n" - "vaddw.s16 q15, q15, d15 \n" - - "vld1.8 {d2-d3}, [%[r2]] \n" // r2 - "add %[r2], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[6] \n" - "vdup.s8 d7, d0[7] \n" - "vdup.s8 d8, r5 \n" - "vdup.s8 d9, d1[6] \n" - "vdup.s8 d10, d1[7] \n" - "vdup.s8 d11, r6 \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q13, q13, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddw.s16 q14, q14, d12 \n" - "vaddw.s16 q14, q14, d14 \n" - "vaddw.s16 q15, q15, d13 \n" - "vaddw.s16 q15, q15, d15 \n" - - "vld1.32 {d12-d15}, [%[output0]] \n" - "vadd.s32 q6, q6, q12 \n" - "vadd.s32 q7, q7, q13 \n" - "vst1.32 {d12-d15}, [%[output0]]! \n" - "vld1.32 {d12-d15}, [%[output1]] \n" - "vadd.s32 q6, q6, q14 \n" - "vadd.s32 q7, q7, q15 \n" - "vst1.32 {d12-d15}, [%[output1]]! 
\n" - - "subs %[ow], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [ow] "+r"(ow), - [output0] "+r"(output0), [output1] "+r"(output1) - : [kernel0] "r"(kernel0), [kernel1] "r"(kernel1) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r5", - "r6"); - } - - if (remain > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - "vld1.8 {d1}, [%[kernel1]] \n" - "ldr r6, [%[kernel1], #8] \n" - - "0: \n" - "vld1.8 d4, [%[r0]] \n" - "vld1.8 d5, [%[r1]] \n" - "vld1.8 d6, [%[r2]] \n" - "add %[r0], #1 \n" - "add %[r1], #1 \n" - "add %[r2], #1 \n" - "vdup.s8 d2, r5 \n" - "vdup.s8 d3, r6 \n" - "vext.8 d8, d0, d2, #3 \n" - "vext.8 d9, d0, d2, #6 \n" - "vext.8 d10, d1, d3, #3 \n" - "vext.8 d11, d1, d3, #6 \n" - - "vmull.s8 q6, d4, d0 \n" - "vmull.s8 q7, d5, d8 \n" - "vmlal.s8 q6, d6, d9 \n" - "vaddl.s16 q12, d12, d14 \n" - "vdup.s32 d2, d24[1] \n" - "vadd.s32 d24, d24, d2 \n" - "vadd.s32 d24, d24, d25 \n" - "vmull.s8 q6, d4, d1 \n" - "vmull.s8 q7, d5, d10 \n" - "vmlal.s8 q6, d6, d11 \n" - "vaddl.s16 q13, d12, d14 \n" - "vdup.s32 d2, d26[1] \n" - "vadd.s32 d26, d26, d2 \n" - "vadd.s32 d26, d26, d27 \n" - - "ldr r7, [%[output0]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d24 \n" - "vst1.32 d14[0], [%[output0]]! \n" - "ldr r7, [%[output1]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d26 \n" - "vst1.32 d14[0], [%[output1]]! \n" - - "subs %[remain], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), - [remain] "+r"(remain), [output0] "+r"(output0), - [output1] "+r"(output1) - : [kernel0] "r"(kernel0), [kernel1] "r"(kernel1) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "r5", "r6", "r7"); - } - } - } - } - - for (; oc < output_c; ++oc) { - for (int ic = 0; ic < input_c; ++ic) { - const int8_t* kernel0 = w_data + (oc * input_c + ic) * 9; - int32_t* output0 = out_data + oc * out_image_size; - int32_t* output0n = output0 + output_w; - - int oh = 0; - for (; oh < output_h - 1; oh += 2) { - const int8_t* r0 = in_data + ic * image_size + oh * input_w; - const int8_t* r1 = r0 + input_w; - const int8_t* r2 = r1 + input_w; - const int8_t* r3 = r2 + input_w; - - int ow = output_w >> 3; - int remain = output_w & 0x7; - if (ow > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - - "0: \n" - "vld1.8 {d2-d3}, [%[r0]] \n" // r0 - "add %[r0], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[0] \n" - "vdup.s8 d7, d0[1] \n" - "vdup.s8 d8, d0[2] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddl.s16 q12, d12, d14 \n" - "vaddl.s16 q13, d13, d15 \n" - - "vld1.8 {d2-d3}, [%[r1]] \n" // r1 - "add %[r1], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - - "vmull.s8 q6, d2, d6 \n" // next row - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddl.s16 q8, d12, d14 \n" - "vaddl.s16 q9, d13, d15 \n" - - "vdup.s8 d6, d0[3] \n" - "vdup.s8 d7, d0[4] \n" - "vdup.s8 d8, d0[5] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q13, q13, d15 \n" - - "vld1.8 {d2-d3}, [%[r2]] \n" // r2 - "add %[r2], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - - "vmull.s8 q6, d2, d6 \n" // next row - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 
q8, q8, d12 \n" - "vaddw.s16 q8, q8, d14 \n" - "vaddw.s16 q9, q9, d13 \n" - "vaddw.s16 q9, q9, d15 \n" - - "vdup.s8 d6, d0[6] \n" - "vdup.s8 d7, d0[7] \n" - "vdup.s8 d8, r5 \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q13, q13, d15 \n" - - "vld1.32 {d12-d15}, [%[output0]] \n" - "vadd.s32 q6, q6, q12 \n" - "vadd.s32 q7, q7, q13 \n" - "vst1.32 {d12-d15}, [%[output0]]! \n" - - "vld1.8 {d2-d3}, [%[r3]] \n" // r3 - "add %[r3], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - - "vmull.s8 q6, d2, d6 \n" // next row - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q8, q8, d12 \n" - "vaddw.s16 q8, q8, d14 \n" - "vaddw.s16 q9, q9, d13 \n" - "vaddw.s16 q9, q9, d15 \n" - - "vld1.32 {d12-d15}, [%[output0n]] \n" - "vadd.s32 q6, q6, q8 \n" - "vadd.s32 q7, q7, q9 \n" - "vst1.32 {d12-d15}, [%[output0n]]! \n" - - "subs %[ow], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [ow] "+r"(ow), [output0] "+r"(output0), - [output0n] "+r"(output0n) - : [kernel0] "r"(kernel0) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r5"); - } - if (remain > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - - "0: \n" - "vld1.8 d4, [%[r0]] \n" - "vld1.8 d5, [%[r1]] \n" - "vld1.8 d6, [%[r2]] \n" - "vld1.8 d7, [%[r3]] \n" - "add %[r0], #1 \n" - "add %[r1], #1 \n" - "add %[r2], #1 \n" - "add %[r3], #1 \n" - "vdup.s8 d2, r5 \n" - "vext.8 d8, d0, d2, #3 \n" - "vext.8 d9, d0, d2, #6 \n" - - "vmull.s8 q6, d4, d0 \n" - "vmull.s8 q7, d5, d8 \n" - "vmlal.s8 q6, d6, d9 \n" - "vaddl.s16 q12, d12, d14 \n" - "vdup.s32 d2, d24[1] \n" - "vadd.s32 d24, d24, d2 \n" - "vadd.s32 d24, d24, d25 \n" - - "ldr r7, [%[output0]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d24 \n" - "vst1.32 d14[0], [%[output0]]! \n" - - "vmull.s8 q6, d5, d0 \n" - "vmull.s8 q7, d6, d8 \n" - "vmlal.s8 q6, d7, d9 \n" - "vaddl.s16 q12, d12, d14 \n" - "vdup.s32 d2, d24[1] \n" - "vadd.s32 d24, d24, d2 \n" - "vadd.s32 d24, d24, d25 \n" - - "ldr r7, [%[output0n]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d24 \n" - "vst1.32 d14[0], [%[output0n]]! 
\n" - - "subs %[remain], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [remain] "+r"(remain), [output0] "+r"(output0), - [output0n] "+r"(output0n) - : [kernel0] "r"(kernel0) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "r5", "r7"); - } - output0 += output_w; - output0n += output_w; - } - // remain output height - for (; oh < output_h; ++oh) { - const int8_t* r0 = in_data + ic * image_size + oh * input_w; - const int8_t* r1 = r0 + input_w; - const int8_t* r2 = r1 + input_w; - - int ow = output_w >> 3; - int remain = output_w & 0x7; - if (ow > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - - "0: \n" - "vld1.8 {d2-d3}, [%[r0]] \n" // r0 - "add %[r0], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[0] \n" - "vdup.s8 d7, d0[1] \n" - "vdup.s8 d8, d0[2] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddl.s16 q12, d12, d14 \n" - "vaddl.s16 q13, d13, d15 \n" - - "vld1.8 {d2-d3}, [%[r1]] \n" // r1 - "add %[r1], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[3] \n" - "vdup.s8 d7, d0[4] \n" - "vdup.s8 d8, d0[5] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q13, q13, d15 \n" - - "vld1.8 {d2-d3}, [%[r2]] \n" // r2 - "add %[r2], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[6] \n" - "vdup.s8 d7, d0[7] \n" - "vdup.s8 d8, r5 \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q13, q13, d15 \n" - - "vld1.32 {d12-d15}, [%[output0]] \n" - "vadd.s32 q6, q6, q12 \n" - "vadd.s32 q7, q7, q13 \n" - "vst1.32 {d12-d15}, [%[output0]]! \n" - - "subs %[ow], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [ow] "+r"(ow), - [output0] "+r"(output0) - : [kernel0] "r"(kernel0) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r5"); - } - - if (remain > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - - "0: \n" - "vld1.8 d4, [%[r0]] \n" - "vld1.8 d5, [%[r1]] \n" - "vld1.8 d6, [%[r2]] \n" - "add %[r0], #1 \n" - "add %[r1], #1 \n" - "add %[r2], #1 \n" - "vdup.s8 d2, r5 \n" - "vext.8 d8, d0, d2, #3 \n" - "vext.8 d9, d0, d2, #6 \n" - - "vmull.s8 q6, d4, d0 \n" - "vmull.s8 q7, d5, d8 \n" - "vmlal.s8 q6, d6, d9 \n" - "vaddl.s16 q12, d12, d14 \n" - "vdup.s32 d2, d24[1] \n" - "vadd.s32 d24, d24, d2 \n" - "vadd.s32 d24, d24, d25 \n" - - "ldr r7, [%[output0]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d24 \n" - "vst1.32 d14[0], [%[output0]]! 
\n" - - "subs %[remain], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), - [remain] "+r"(remain), [output0] "+r"(output0) - : [kernel0] "r"(kernel0) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "r5", "r7"); - } - } - } - } -#endif -#else -// TODO(hjchen2) -#endif -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/math/conv5x5_arm_int8.cpp b/src/operators/math/conv5x5_arm_int8.cpp deleted file mode 100644 index c861c22d184d5428f3ab9c8f3a69b9aca5b697bd..0000000000000000000000000000000000000000 --- a/src/operators/math/conv5x5_arm_int8.cpp +++ /dev/null @@ -1,551 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_OP - -#include "operators/math/conv_arm_int8.h" - -namespace paddle_mobile { -namespace operators { - -void conv5x5s1_int8(const framework::Tensor& input, - const framework::Tensor& weight, - framework::Tensor* output) { -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - const int8_t* in_data = input.data(); - const int8_t* w_data = weight.data(); - int32_t* out_data = output->mutable_data(); - // make sure that batch size is 1 - int input_c = input.dims()[1]; - int input_h = input.dims()[2]; - int input_w = input.dims()[3]; - int output_c = output->dims()[1]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - int image_size = input_h * input_w; - int out_image_size = output_h * output_w; - memset(out_data, 0, output_c * out_image_size * sizeof(int32_t)); -#if __aarch64__ - // TODO(hjchen2) -#else - #pragma omp parallel for - for (int oc = 0; oc < output_c; ++oc) { - for (int ic = 0; ic < input_c; ++ic) { - const int8_t* kernel = w_data + (oc * input_c + ic) * 25; - int32_t* output0 = out_data + oc * out_image_size; - int32_t* output1 = output0 + output_w; - int oh = 0; - for (; oh < output_h - 1; oh += 2) { - const int8_t* r0 = in_data + ic * image_size + oh * input_w; - const int8_t* r1 = r0 + input_w; - const int8_t* r2 = r1 + input_w; - const int8_t* r3 = r2 + input_w; - const int8_t* r4 = r3 + input_w; - const int8_t* r5 = r4 + input_w; - - int ow = output_w >> 3; - int remain = output_w & 0x7; - if (ow > 0) { - asm volatile("vld1.8 {d0-d3}, [%[kernel]] \n" - : [kernel] "+r"(kernel) - : - : "cc", "memory", "q0", "q1"); - asm volatile( - "0: \n" - "vld1.8 {d4-d5}, [%[r0]] \n" // r0 - "add %[r0], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vdup.s8 d10, d0[0] \n" - "vdup.s8 d11, d0[1] \n" - "vdup.s8 d12, d0[2] \n" - "vdup.s8 d13, d0[3] \n" - "vdup.s8 d14, d0[4] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q14, d16, d18 \n" - "vaddl.s16 q15, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q14, q14, d16 \n" - "vaddw.s16 q15, q15, d17 \n" - - "vld1.8 {d4-d5}, [%[r1]] \n" // r1 - "add %[r1], #8 \n" - "vext.8 d6, d4, 
d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - - "vmull.s8 q8, d4, d10 \n" // next row - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q10, d16, d18 \n" - "vaddl.s16 q11, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q10, q10, d16 \n" - "vaddw.s16 q11, q11, d17 \n" - - "vdup.s8 d10, d0[5] \n" - "vdup.s8 d11, d0[6] \n" - "vdup.s8 d12, d0[7] \n" - "vdup.s8 d13, d1[0] \n" - "vdup.s8 d14, d1[1] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.8 {d4-d5}, [%[r2]] \n" // r2 - "add %[r2], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - - "vmull.s8 q8, d4, d10 \n" // next row - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q10, q10, q12 \n" - "vadd.s32 q11, q11, q13 \n" - - "vdup.s8 d10, d1[2] \n" - "vdup.s8 d11, d1[3] \n" - "vdup.s8 d12, d1[4] \n" - "vdup.s8 d13, d1[5] \n" - "vdup.s8 d14, d1[6] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.8 {d4-d5}, [%[r3]] \n" // r3 - "add %[r3], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - - "vmull.s8 q8, d4, d10 \n" // next row - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q10, q10, q12 \n" - "vadd.s32 q11, q11, q13 \n" - - "vdup.s8 d10, d1[7] \n" - "vdup.s8 d11, d2[0] \n" - "vdup.s8 d12, d2[1] \n" - "vdup.s8 d13, d2[2] \n" - "vdup.s8 d14, d2[3] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.8 {d4-d5}, [%[r4]] \n" // r4 - "add %[r4], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - - "vmull.s8 q8, d4, d10 \n" // next row - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q10, q10, q12 \n" - "vadd.s32 q11, q11, q13 \n" - - "vdup.s8 d10, d2[4] \n" - "vdup.s8 d11, d2[5] \n" - "vdup.s8 d12, d2[6] \n" - "vdup.s8 d13, d2[7] \n" - "vdup.s8 d14, d3[0] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, 
d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.32 {d24-d27}, [%[output0]] \n" - "vadd.s32 q12, q12, q14 \n" - "vadd.s32 q13, q13, q15 \n" - "vst1.32 {d24-d27}, [%[output0]]! \n" - - "vld1.8 {d4-d5}, [%[r5]] \n" // row 5 - "add %[r5], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q10, q10, q12 \n" - "vadd.s32 q11, q11, q13 \n" - - "vld1.32 {d24-d27}, [%[output1]] \n" - "vadd.s32 q12, q12, q10 \n" - "vadd.s32 q13, q13, q11 \n" - "vst1.32 {d24-d27}, [%[output1]]! \n" - - "subs %[ow], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [r4] "+r"(r4), [r5] "+r"(r5), [ow] "+r"(ow), - [output0] "+r"(output0), [output1] "+r"(output1) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); - } - if (remain > 0) { - asm volatile("vld1.8 {d0-d3}, [%[kernel]] \n" - : [kernel] "+r"(kernel) - : - : "cc", "memory", "q0", "q1"); - asm volatile( - "0: \n" - "vld1.8 d4, [%[r0]] \n" - "vld1.8 d5, [%[r1]] \n" - "vld1.8 d6, [%[r2]] \n" - "vld1.8 d7, [%[r3]] \n" - "vld1.8 d8, [%[r4]] \n" - "vld1.8 d9, [%[r5]] \n" - "add %[r0], #1 \n" - "add %[r1], #1 \n" - "add %[r2], #1 \n" - "add %[r3], #1 \n" - "add %[r4], #1 \n" - "add %[r5], #1 \n" - "vext.8 d10, d0, d1, #5 \n" - "vext.8 d11, d1, d2, #2 \n" - "vext.8 d12, d1, d2, #7 \n" - "vext.8 d13, d2, d3, #4 \n" - - "vmull.s8 q7, d4, d0 \n" - "vmull.s8 q8, d5, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q10, d14, d16 \n" - "vaddw.s16 q10, q10, d18 \n" - "vadd.s32 d4, d20, d21 \n" - "vaddl.s16 q10, d15, d17 \n" - "vaddw.s16 q10, q10, d19 \n" - "vdup.s32 d14, d4[0] \n" - "vdup.s32 d15, d4[1] \n" - "vadd.s32 d15, d15, d14 \n" - "vdup.s32 d14, d20[0] \n" - "vadd.s32 d15, d15, d14 \n" - - "ldr r6, [%[output0]] \n" - "vdup.s32 d14, r6 \n" - "vadd.s32 d15, d15, d14 \n" - "vst1.32 d15[0], [%[output0]]! \n" - - "vmull.s8 q7, d5, d0 \n" - "vmull.s8 q8, d6, d10 \n" - "vmull.s8 q9, d7, d11 \n" - "vmlal.s8 q8, d8, d12 \n" - "vmlal.s8 q9, d9, d13 \n" - "vaddl.s16 q10, d14, d16 \n" - "vaddw.s16 q10, q10, d18 \n" - "vadd.s32 d4, d20, d21 \n" - "vaddl.s16 q10, d15, d17 \n" - "vaddw.s16 q10, q10, d19 \n" - "vdup.s32 d14, d4[0] \n" - "vdup.s32 d15, d4[1] \n" - "vadd.s32 d15, d15, d14 \n" - "vdup.s32 d14, d20[0] \n" - "vadd.s32 d15, d15, d14 \n" - - "ldr r6, [%[output1]] \n" - "vdup.s32 d14, r6 \n" - "vadd.s32 d15, d15, d14 \n" - "vst1.32 d15[0], [%[output1]]! 
\n" - - "subs %[remain], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [r4] "+r"(r4), [r5] "+r"(r5), [remain] "+r"(remain), - [output0] "+r"(output0), [output1] "+r"(output1) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "r6"); - } - output0 += output_w; - output1 += output_w; - } - // remain output height - for (; oh < output_h; ++oh) { - const int8_t* r0 = in_data + ic * image_size + oh * input_w; - const int8_t* r1 = r0 + input_w; - const int8_t* r2 = r1 + input_w; - const int8_t* r3 = r2 + input_w; - const int8_t* r4 = r3 + input_w; - - int ow = output_w >> 3; - int remain = output_w & 0x7; - if (ow > 0) { - asm volatile("vld1.8 {d0-d3}, [%[kernel]] \n" - : [kernel] "+r"(kernel) - : - : "cc", "memory", "q0", "q1"); - asm volatile( - "0: \n" - "vld1.8 {d4-d5}, [%[r0]] \n" // r0 - "add %[r0], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vdup.s8 d10, d0[0] \n" - "vdup.s8 d11, d0[1] \n" - "vdup.s8 d12, d0[2] \n" - "vdup.s8 d13, d0[3] \n" - "vdup.s8 d14, d0[4] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q14, d16, d18 \n" - "vaddl.s16 q15, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q14, q14, d16 \n" - "vaddw.s16 q15, q15, d17 \n" - - "vld1.8 {d4-d5}, [%[r1]] \n" // r1 - "add %[r1], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vdup.s8 d10, d0[5] \n" - "vdup.s8 d11, d0[6] \n" - "vdup.s8 d12, d0[7] \n" - "vdup.s8 d13, d1[0] \n" - "vdup.s8 d14, d1[1] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.8 {d4-d5}, [%[r2]] \n" // r2 - "add %[r2], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vdup.s8 d10, d1[2] \n" - "vdup.s8 d11, d1[3] \n" - "vdup.s8 d12, d1[4] \n" - "vdup.s8 d13, d1[5] \n" - "vdup.s8 d14, d1[6] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.8 {d4-d5}, [%[r3]] \n" // r3 - "add %[r3], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vdup.s8 d10, d1[7] \n" - "vdup.s8 d11, d2[0] \n" - "vdup.s8 d12, d2[1] \n" - "vdup.s8 d13, d2[2] \n" - "vdup.s8 d14, d2[3] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.8 {d4-d5}, [%[r4]] \n" // r4 - "add %[r4], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vdup.s8 d10, d2[4] \n" - "vdup.s8 d11, d2[5] \n" - "vdup.s8 d12, d2[6] \n" 
- "vdup.s8 d13, d2[7] \n" - "vdup.s8 d14, d3[0] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.32 {d24-d27}, [%[output0]] \n" - "vadd.s32 q12, q12, q14 \n" - "vadd.s32 q13, q13, q15 \n" - "vst1.32 {d24-d27}, [%[output0]]! \n" - - "subs %[ow], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [r4] "+r"(r4), [ow] "+r"(ow), [output0] "+r"(output0) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); - } - - if (remain > 0) { - asm volatile("vld1.8 {d0-d3}, [%[kernel]] \n" - : [kernel] "+r"(kernel) - : - : "cc", "memory", "q0", "q1"); - asm volatile( - "0: \n" - "vld1.8 d4, [%[r0]] \n" - "vld1.8 d5, [%[r1]] \n" - "vld1.8 d6, [%[r2]] \n" - "vld1.8 d7, [%[r3]] \n" - "vld1.8 d8, [%[r4]] \n" - "add %[r0], #1 \n" - "add %[r1], #1 \n" - "add %[r2], #1 \n" - "add %[r3], #1 \n" - "add %[r4], #1 \n" - "vext.8 d10, d0, d1, #5 \n" - "vext.8 d11, d1, d2, #2 \n" - "vext.8 d12, d1, d2, #7 \n" - "vext.8 d13, d2, d3, #4 \n" - - "vmull.s8 q7, d4, d0 \n" - "vmull.s8 q8, d5, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q10, d14, d16 \n" - "vaddw.s16 q10, q10, d18 \n" - "vadd.s32 d4, d20, d21 \n" - "vaddl.s16 q10, d15, d17 \n" - "vaddw.s16 q10, q10, d19 \n" - "vdup.s32 d14, d4[0] \n" - "vdup.s32 d15, d4[1] \n" - "vadd.s32 d15, d15, d14 \n" - "vdup.s32 d14, d20[0] \n" - "vadd.s32 d15, d15, d14 \n" - - "ldr r6, [%[output0]] \n" - "vdup.s32 d14, r6 \n" - "vadd.s32 d15, d15, d14 \n" - "vst1.32 d15[0], [%[output0]]! 
\n" - - "subs %[remain], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [r4] "+r"(r4), [remain] "+r"(remain), [output0] "+r"(output0) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "r6"); - } - } - } - } -#endif -#else -// TODO(hjchen2) -#endif -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index ea023bc134033aee6577ebf06c95f2a762d08bca..8498992fcecbcb2c9a773fba874e108c013a04fc 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -209,12 +209,18 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, int32_t lda, int8_t *buffer); void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, int32_t ldb, int8_t *buffer); + void PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, + int32_t lda, int8_t *buffer); + void PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, + int32_t ldb, int8_t *buffer); // 8 bits int matrix product void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C, int32_t ldc, bool relu, int8_t *bias); - + void Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, + int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, + int32_t *C, int32_t ldc, bool relu, int8_t *bias); // 8 bits int write back // C = alpha * A * B + beta * C void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C, diff --git a/src/operators/math/gemm_int8.cpp b/src/operators/math/gemm_int8.cpp index 5dd8a7c3131543f426f32e258efb3181be9b2f61..b16db7fe6acf0c3c7fb2902c9fb3f6e3dc81a65f 100644 --- a/src/operators/math/gemm_int8.cpp +++ b/src/operators/math/gemm_int8.cpp @@ -30,7 +30,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, int32_t ldc) { #if __ARM_NEON #if __aarch64__ -// TODO +// TODO(wzzju) #else const int8_t *a_ptr, *b_ptr; a_ptr = a; @@ -246,7 +246,7 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, int32_t ldc) { #if __ARM_NEON #if __aarch64__ -// TODO +// TODO(wzzju) #else const int8_t *a_ptr, *b_ptr; a_ptr = a; @@ -546,8 +546,12 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha, #pragma omp parallel for for (int32_t j = 0; j < nc; j += NR) { for (int32_t i = 0; i < mc; i += MR_INT8) { +#if __aarch64__ + // TODO(wzzju) +#else // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif // __aarch64__ } } if (alpha != 1) { @@ -682,7 +686,7 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, const int8_t *b0 = &B(i, j); #if __ARM_NEON #if __aarch64__ - // TODO + // TODO(wzzju) #else asm volatile( // "pld [%[b0]] \n\t" @@ -791,7 +795,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc) { #if __ARM_NEON #if __aarch64__ -// TODO +// TODO(wzzju) #else int32_t nc1 = nc >> 4; int32_t _nc1 = nc & 15; diff --git a/src/operators/math/gemm_omp_int8.cpp b/src/operators/math/gemm_omp_int8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..21256cccfcc6dcc647f34a2129616b70804d398f --- /dev/null +++ b/src/operators/math/gemm_omp_int8.cpp @@ -0,0 +1,235 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string.h>
+#include "common/log.h"
+#include "memory/t_malloc.h"
+#include "operators/math/gemm.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+// 8 bits int matrix product (m*k x k*n)
+void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha,
+                     const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
+                     int8_t beta, int32_t *C, int32_t ldc, bool relu,
+                     int8_t *bias) {
+#ifdef _OPENMP
+  int32_t max_threads = omp_get_max_threads();
+#else
+  int32_t max_threads = 1;
+#endif
+
+  int32_t L1 = 64 / max_threads * 1024;
+  KC = k;
+  zero_int8 =
+      static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
+  memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
+  if (m > n) {
+    // tile matrix A
+    MC = L1 / (KC * sizeof(int8_t));
+    if (MC == 0) {
+      MC = MR_INT8;
+    } else {
+      int32_t mblock_num = (m + MC - 1) / MC;
+      MC = (m + mblock_num - 1) / mblock_num;
+      MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
+    }
+    // pad the columns of B up to a multiple of NR
+    NC = (n + NR - 1) / NR * NR;
+
+    packedB_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
+#if __aarch64__
+    // TODO(wzzju)
+#else
+    PackMatrixB_omp_8c(KC, n, n % NR, B, ldb, packedB_int8);
+#endif
+    packedA_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
+  } else {
+    // tile matrix B
+    NC = L1 / (KC * sizeof(int8_t));
+    if (NC == 0) {
+      NC = NR;
+    } else {
+      int32_t nblock_num = (n + NC - 1) / NC;
+      NC = (n + nblock_num - 1) / nblock_num;
+      NC = (NC + NR - 1) / NR * NR;
+    }
+    // pad the rows of A up to a multiple of MR_INT8
+    MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
+
+    packedA_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
+#if __aarch64__
+    // TODO(wzzju)
+#else
+    PackMatrixA_omp_4r(m, KC, m % MR_INT8, A, lda, packedA_int8);
+#endif
+    packedB_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
+  }
+  packedC_int8 = static_cast<int32_t *>(
+      paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
+
+  if (m > n) {
+#pragma omp parallel for
+    for (int32_t i = 0; i < m; i += MC) {
+#ifdef _OPENMP
+      int32_t local_threads = omp_get_thread_num();
+#else
+      int32_t local_threads = 0;
+#endif
+
+      int32_t mc;
+      mc = s_min(m - i, MC);
+      int8_t *local_A = packedA_int8 + MC * KC * local_threads;
+      int32_t *local_C = packedC_int8 + MC * NC * local_threads;
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, local_A);
+#endif
+      InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
+                          &C(i, 0), ldc, relu, bias + i);
+    }
+  } else {
+#pragma omp parallel for
+    for (int32_t j = 0; j < n; j += NC) {
+#ifdef _OPENMP
+      int32_t local_threads = omp_get_thread_num();
+#else
+      int32_t local_threads = 0;
+#endif
+      int32_t nc;
+      nc = s_min(n - j, NC);
+      int8_t *local_B = packedB_int8 + KC * NC * local_threads;
+      int32_t *local_C = packedC_int8 + MC * NC * local_threads;
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, local_B);
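+      // Per-thread scratch: the pack buffers were sized with a max_threads
+      // factor above, and local_B/local_C select a private slice via
+      // omp_get_thread_num(), so concurrent loop iterations never alias.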
+#endif
+      InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
+                          &C(0, j), ldc, relu, bias);
+    }
+  }
+
+  paddle_mobile::memory::Free(packedA_int8);
+  paddle_mobile::memory::Free(packedB_int8);
+  paddle_mobile::memory::Free(packedC_int8);
+  paddle_mobile::memory::Free(zero_int8);
+}
+
+void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
+                              const int8_t *B, int32_t ldb, int8_t *buffer) {
+  const int32_t j_length = n - n_tail;
+#pragma omp parallel for
+  for (int32_t j = 0; j < j_length; j += NR) {
+    int8_t *local_buffer = buffer + j * k;
+    for (int32_t i = 0; i < k; ++i) {
+      const int8_t *b0 = &B(i, j);
+#if __ARM_NEON
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      asm volatile(
+          //  "pld        [%[b0]]                 \n\t"
+          "vld1.s8    {d0},   [%[b0]]             \n\t"
+          "vst1.s8    {d0},   [%[local_buffer]]!  \n\t"
+          : [local_buffer] "+r"(local_buffer)
+          : [b0] "r"(b0)
+          : "memory", "q0");
+#endif  // __aarch64__
+#else
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+#endif  // __ARM_NEON
+    }
+  }
+  if (n_tail != 0) {
+    int8_t *local_buffer = buffer + j_length * k;
+    for (int32_t i = 0; i < k; ++i) {
+      const int8_t *b0 = &B(i, j_length);
+      for (int32_t j = j_length; j < n; ++j) {
+        *local_buffer++ = *b0++;
+      }
+      // zero-pad the ragged tail so every packed panel is NR columns wide
+      for (int32_t j = n; j < j_length + NR; ++j) {
+        *local_buffer++ = 0;
+      }
+    }
+  }
+}
+
+void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
+                              const int8_t *A, int32_t lda, int8_t *buffer) {
+  const int i_length = m - m_tail;
+#pragma omp parallel for
+  for (int32_t i = 0; i < i_length; i += MR_INT8) {
+    const int8_t *a0 = A + i * lda;
+    const int8_t *a1 = A + (i + 1) * lda;
+    const int8_t *a2 = A + (i + 2) * lda;
+    const int8_t *a3 = A + (i + 3) * lda;
+    int8_t *local_buffer = buffer + i * k;
+    for (int32_t j = 0; j < k; ++j) {
+      *local_buffer++ = *a0++;
+      *local_buffer++ = *a1++;
+      *local_buffer++ = *a2++;
+      *local_buffer++ = *a3++;
+    }
+  }
+
+  if (m_tail != 0) {
+    const int8_t *a0 = &A(i_length, 0);
+    const int8_t *a1 = a0 + lda;
+    const int8_t *a2 = a0 + 2 * lda;
+    const int8_t *a3 = a0 + 3 * lda;
+    int8_t *local_buffer = buffer + i_length * k;
+    // intentional fall-through: rows past m read from the zero buffer
+    switch (m_tail) {
+      case 1:
+        a1 = zero_int8;
+      case 2:
+        a2 = zero_int8;
+      case 3:
+        a3 = zero_int8;
+        break;
+      default:
+        break;
+    }
+    for (int j = 0; j < k; ++j) {
+      *local_buffer++ = *a0++;
+      *local_buffer++ = *a1++;
+      *local_buffer++ = *a2++;
+      *local_buffer++ = *a3++;
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
diff --git a/src/operators/math/math_function_int8.cpp b/src/operators/math/math_function_int8.cpp
index 70677223d12ded2da07ab53bc371f1e8da9fe293..e02824b290ebc0080613e2ae2365626d79576c9e 100644
--- a/src/operators/math/math_function_int8.cpp
+++ b/src/operators/math/math_function_int8.cpp
@@ -51,12 +51,23 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
     }
   }
 
+#ifdef _OPENMP
+  gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
+                 matrix_out->data<int32_t>(), N, relu, bias);
+#else
   gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
              matrix_out->data<int32_t>(), N, relu, bias);
+#endif
   } else {
+#ifdef _OPENMP
+    gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
+                   matrix_b.data<int8_t>(), N, beta,
+                   matrix_out->data<int32_t>(), N, relu, bias);
+#else
     gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
               matrix_b.data<int8_t>(), N, beta,
              matrix_out->data<int32_t>(), N, relu, bias);
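+    // Both branches compute the same int8 x int8 -> int32 product; the
+    // _OPENMP build simply routes through Sgemm_omp so the GEMM can use the
+    // OpenMP thread pool (e.g. SetThreadNum(8) in the tests below).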
+#endif
   }
 }
 }  // namespace math
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 106f5c43c1762afa7f24a8c3e3e86beac8517834..2c0075271a92cb66ef95603965dd18d0dd3c5faf 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -243,6 +243,12 @@ class OpParam {
     return GetVarValue<T>("Y", outputs, scope);
   }
 
+  template <typename T>
+  static T *OutputXShapeFrom(const VariableNameMap &outputs,
+                             const Scope &scope) {
+    return GetVarValue<T>("XShape", outputs, scope);
+  }
+
   template <typename T>
   static T *OutputBoxesFrom(const VariableNameMap &outputs,
                             const Scope &scope) {
@@ -1126,6 +1132,37 @@ class TransposeParam : public OpParam {
 };
 #endif
 
+#ifdef TRANSPOSE2_OP
+template <typename Dtype>
+class Transpose2Param : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  Transpose2Param(const VariableNameMap &inputs, const VariableNameMap &outputs,
+                  const AttributeMap &attrs, const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+    output_xshape_ = OutputXShapeFrom<GType>(outputs, scope);
+    axis_ = GetAttr<vector<int>>("axis", attrs);
+  }
+
+  const RType *InputX() const { return input_x_; }
+
+  RType *Out() const { return out_; }
+
+  RType *OutputXShape() const { return output_xshape_; }
+
+  const vector<int> &Axis() const { return axis_; }
+
+ private:
+  RType *input_x_;
+  RType *out_;
+  RType *output_xshape_;
+  vector<int> axis_;
+};
+#endif
+
 #ifdef LOOKUP_OP
 template <typename Dtype>
 class LookupParam : public OpParam {
@@ -1233,6 +1270,49 @@ class ReshapeParam : public OpParam {
 };
 #endif
 
+#ifdef RESHAPE2_OP
+template <typename Dtype>
+class Reshape2Param : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  Reshape2Param(const VariableNameMap &inputs, const VariableNameMap &outputs,
+                const AttributeMap &attrs, const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    input_shape_ = InputShapeFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+    output_xshape_ = OutputXShapeFrom<GType>(outputs, scope);
+    shape_ = GetAttr<vector<int>>("shape", attrs);
+    if (HasAttr("inplace", attrs)) {
+      inplace_ = GetAttr<bool>("inplace", attrs);
+    } else {
+      inplace_ = false;
+    }
+  }
+
+  const RType *InputX() const { return input_x_; }
+
+  const RType *InputShape() const { return input_shape_; }
+
+  RType *Out() const { return out_; }
+
+  RType *OutputXShape() const { return output_xshape_; }
+
+  const vector<int> &Shape() const { return shape_; }
+
+  const bool &Inplace() const { return inplace_; }
+
+ private:
+  RType *input_x_;
+  RType *input_shape_;
+  RType *out_;
+  RType *output_xshape_;
+  vector<int> shape_;
+  bool inplace_;
+};
+#endif
+
 #ifdef SCALE_OP
 template <typename Dtype>
 class ScaleParam : public OpParam {
diff --git a/src/operators/reshape2_op.cpp b/src/operators/reshape2_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d1623076570d466fc53f885374060c5e744365ed
--- /dev/null
+++ b/src/operators/reshape2_op.cpp
@@ -0,0 +1,47 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE2_OP
+
+#include "operators/reshape2_op.h"
+#include <vector>
+#include "operators/kernel/reshape_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void Reshape2Op<Dtype, T>::InferShape() const {
+  auto &shape = this->param_.Shape();
+  auto input_x_dims = this->param_.InputX()->dims();
+  auto out_dims = ValidateShape(shape, input_x_dims);
+  this->param_.Out()->Resize(out_dims);
+  std::vector<int64_t> xshape_dims(input_x_dims.size() + 1, 0);
+  for (int i = 0; i < input_x_dims.size(); ++i) {
+    xshape_dims[i + 1] = input_x_dims[i];
+  }
+  this->param_.OutputXShape()->Resize(framework::make_ddim(xshape_dims));
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(reshape2, ops::Reshape2Op);
+#endif
+
+#endif
diff --git a/src/operators/reshape2_op.h b/src/operators/reshape2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a06c2b9b90233b6ad0bacb6176f4cc274ff1cc0
--- /dev/null
+++ b/src/operators/reshape2_op.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE2_OP
+
+#pragma once
+
+#include <string>
+
+#include "framework/operator.h"
+#include "operators/kernel/reshape2_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class Reshape2Op : public framework::OperatorWithKernel<
+                       DeviceType, Reshape2Param<DeviceType>,
+                       operators::Reshape2Kernel<DeviceType, T>> {
+ public:
+  Reshape2Op(const std::string &type, const VariableNameMap &inputs,
+             const VariableNameMap &outputs,
+             const framework::AttributeMap &attrs,
+             std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, Reshape2Param<DeviceType>,
+            operators::Reshape2Kernel<DeviceType, T>>(type, inputs, outputs,
+                                                      attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, Reshape2Param<DeviceType>,
+      operators::Reshape2Kernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/transpose2_op.cpp b/src/operators/transpose2_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..64d07991f60b4057e3d2841afa1bfe6483f31a88
--- /dev/null
+++ b/src/operators/transpose2_op.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TRANSPOSE2_OP
+
+#include <vector>
+
+#include "common/enforce.h"
+#include "operators/transpose2_op.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void Transpose2Op<Dtype, T>::InferShape() const {
+  auto input_x_dims = this->param_.InputX()->dims();
+  auto axis = this->param_.Axis();
+
+  size_t x_dims_size = input_x_dims.size();
+  size_t axis_size = axis.size();
+
+  PADDLE_MOBILE_ENFORCE((x_dims_size == axis_size),
+                        "The input's rank must be equal to the axis' size.")
+
+  std::vector<int> count(axis_size, 0);
+  for (size_t i = 0; i < axis_size; i++) {
+    PADDLE_MOBILE_ENFORCE(
+        axis[i] < static_cast<int>(axis_size) && ++count[axis[i]] == 1,
+        "Each element of Attribute axis should be a unique value "
+        "range from 0 to (dims - 1), "
+        "where the dims is the axis's size");
+  }
+  framework::DDim out_dims(input_x_dims);
+  for (size_t i = 0; i < axis_size; i++) {
+    out_dims[i] = input_x_dims[axis[i]];
+  }
+  this->param_.Out()->Resize(out_dims);
+  std::vector<int64_t> xshape_dims(input_x_dims.size() + 1, 0);
+  for (int i = 0; i < input_x_dims.size(); ++i) {
+    xshape_dims[i + 1] = input_x_dims[i];
+  }
+  this->param_.OutputXShape()->Resize(framework::make_ddim(xshape_dims));
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(transpose2, ops::Transpose2Op);
+#endif
+
+#endif  // TRANSPOSE2_OP
diff --git a/src/operators/transpose2_op.h b/src/operators/transpose2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f1339cc59e0c71a232eddd5dcef47f62994b80da
--- /dev/null
+++ b/src/operators/transpose2_op.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TRANSPOSE2_OP
+
+#pragma once
+
+#include <string>
+
+#include "framework/operator.h"
+#include "operators/kernel/transpose2_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class Transpose2Op : public framework::OperatorWithKernel<
+                         DeviceType, Transpose2Param<DeviceType>,
+                         operators::Transpose2Kernel<DeviceType, T>> {
+ public:
+  Transpose2Op(const std::string &type, const VariableNameMap &inputs,
+               const VariableNameMap &outputs,
+               const framework::AttributeMap &attrs,
+               std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, Transpose2Param<DeviceType>,
+            operators::Transpose2Kernel<DeviceType, T>>(type, inputs, outputs,
+                                                        attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, Transpose2Param<DeviceType>,
+      operators::Transpose2Kernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a10088f9b417b628418404b8df3d340b851af383..2bd7169533f637add2a752feaceca8df132cb262 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -184,6 +184,10 @@ if (NOT FOUND_MATCH)
     ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-transpose-op paddle-mobile)
 
+    # gen test
+    ADD_EXECUTABLE(test-transpose2-op operators/test_transpose2_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-transpose2-op paddle-mobile)
+
     # gen test
     ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-multiclassnms-op paddle-mobile)
@@ -200,6 +204,10 @@ if (NOT FOUND_MATCH)
     ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-reshape-op paddle-mobile)
 
+    # gen test
+    ADD_EXECUTABLE(test-reshape2-op operators/test_reshape2_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-reshape2-op paddle-mobile)
+
     # gen test
     ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-relu-op paddle-mobile)
diff --git a/test/common/test_gemm_int8_accuracy.cpp b/test/common/test_gemm_int8_accuracy.cpp
index 80ddd40e121c81032c903955bd7116cf52695569..87f8d945648577ef1414417b57f4013d288dc043 100644
--- a/test/common/test_gemm_int8_accuracy.cpp
+++ b/test/common/test_gemm_int8_accuracy.cpp
@@ -20,6 +20,9 @@ limitations under the License. */
 #include "common/log.h"
 #include "memory/t_malloc.h"
 #include "operators/math/gemm.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif  // _OPENMP
 
 #define a(i, j) a[(i)*lda + (j)]
 #define b(i, j) b[(i)*ldb + (j)]
@@ -84,8 +87,13 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
   }
 
   paddle_mobile::operators::math::Gemm gemm;
+#ifdef _OPENMP
+  gemm.Sgemm_omp(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
+                 static_cast<int8_t>(0), c, ldc, relu, nullptr);
+#else
   gemm.Sgemm(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
              static_cast<int8_t>(0), c, ldc, relu, nullptr);
+#endif
   int eq = 0;
   int neq = 0;
   for (int i = 0; i < m * n; ++i) {
@@ -119,12 +127,17 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
 }
 
 int main() {
-  do_sgemm(9, 9, 9, false, 10);
+#ifdef _OPENMP
+  omp_set_num_threads(8);
+#endif
+  do_sgemm(9, 9, 9, false, 1);
   do_sgemm(10, 6, 12, false, 0);
   do_sgemm(512, 256, 384, false, 0);
   do_sgemm(1366, 768, 256, false, 0);
   do_sgemm(1255, 755, 333, false, 0);
-  do_sgemm(555, 777, 999, false, 0);
+  do_sgemm(599, 1133, 393, false, 0);
+  do_sgemm(777, 555, 999, false, 0);
+  do_sgemm(333, 797, 939, false, 0);
 
   do_sgemm(1024, 1024, 1024, false, 0);
   return 0;
diff --git a/test/common/test_gemm_perf.cpp b/test/common/test_gemm_perf.cpp
index 89f0012ae8effaab383719c1b85748c24eb2bf73..14da4ba284b5ac7b0660bd15de871fdf5ed04cdd 100644
--- a/test/common/test_gemm_perf.cpp
+++ b/test/common/test_gemm_perf.cpp
@@ -28,7 +28,7 @@ limitations under the License. */
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  paddle_mobile.SetThreadNum(1);
+  paddle_mobile.SetThreadNum(8);
 
   Tensor aa, bb, cc;
   auto aaptr = aa.mutable_data<float>({m, k});
   auto bbptr = bb.mutable_data<float>({k, n});
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index c88a78974c330ec270fbcb3f5c28e368ef16440e..f7d29942224b51734cf62988ba8f271f1fa05bc3 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -25,8 +25,8 @@ int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
 #endif
 
-  paddle_mobile.SetThreadNum(1);
-  bool optimize = false;
+  paddle_mobile.SetThreadNum(4);
+  bool optimize = true;
   auto time1 = time();
   if (paddle_mobile.Load(g_googlenet, optimize)) {
     auto time2 = time();
@@ -35,10 +35,10 @@ int main() {
     std::vector<float> output;
     std::vector<int64_t> dims{1, 3, 224, 224};
     GetInput<float>(g_test_image_1x3x224x224, &input, dims);
-    //  //  warm up ten times
-    //  for (int i = 0; i < 10; ++i) {
-    //    output = paddle_mobile.Predict(input, dims);
-    //  }
+    //  warm up ten times
+    for (int i = 0; i < 10; ++i) {
+      output = paddle_mobile.Predict(input, dims);
+    }
     auto time3 = time();
     for (int i = 0; i < 10; ++i) {
       output = paddle_mobile.Predict(input, dims);
@@ -47,9 +47,6 @@ int main() {
     std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
               << std::endl;
 
-    for (int i = 0; i < output.size(); ++i) {
-      DLOG << "result[" << i << "] = " << output[i];
-    }
   }
   return 0;
 }
diff --git a/test/operators/test_batchnorm_op.cpp b/test/operators/test_batchnorm_op.cpp
index 4ccad8c1512036c2400a09575b3775e75b26acce..5f064d27f3f3f9cca5428467557c9412f76735c7 100644
--- a/test/operators/test_batchnorm_op.cpp
+++ b/test/operators/test_batchnorm_op.cpp
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
-
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/batchnorm_op.h"
diff --git a/test/operators/test_box_coder_op.cpp b/test/operators/test_box_coder_op.cpp
index 92cba3995c866c67c00491ad5cc38fb094594ad3..aeef10be9650623767af4d2de8913ce53b1d2c59 100644
--- a/test/operators/test_box_coder_op.cpp
+++ b/test/operators/test_box_coder_op.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
 
 #include "../test_include.h"
 #include "operators/box_coder_op.h"
diff --git a/test/operators/test_elementwise_sub_op.cpp b/test/operators/test_elementwise_sub_op.cpp
index cfac83eff7a012d52d47f96e088bd8519603cadc..e27361b21c3146675ea856d02d70878e73e8912f 100644
--- a/test/operators/test_elementwise_sub_op.cpp
+++ b/test/operators/test_elementwise_sub_op.cpp
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
-
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/elementwise_sub_op.h"
diff --git a/test/operators/test_fill_constant_op.cpp b/test/operators/test_fill_constant_op.cpp
index b099217d1641eb221b3d0d86d780fb6ecfa929bd..99c65ed821c0a90691070b661a6967a11d4694f7 100644
--- a/test/operators/test_fill_constant_op.cpp
+++ b/test/operators/test_fill_constant_op.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
 
 #include "../test_include.h"
 #include "operators/fill_constant_op.h"
diff --git a/test/operators/test_fusion_fc_op.cpp b/test/operators/test_fusion_fc_op.cpp
index a23bde45cb74f0f75e655821b15e66b1cef4c081..aaa2d7b578dbda4c6919210eb4a2fb42ba243e53 100644
--- a/test/operators/test_fusion_fc_op.cpp
+++ b/test/operators/test_fusion_fc_op.cpp
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
-
 #include
 #include "../test_include.h"
 #include "operators/fusion_fc_op.h"
diff --git a/test/operators/test_im2sequence_op.cpp b/test/operators/test_im2sequence_op.cpp
index b45e437e12f95cd9f7050247fc03a152246d8122..6c69d1cc9d94ffd958251ee4ed783d6b5531c455 100644
--- a/test/operators/test_im2sequence_op.cpp
+++ b/test/operators/test_im2sequence_op.cpp
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
-
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/im2sequence_op.h"
diff --git a/test/operators/test_mul_op.cpp b/test/operators/test_mul_op.cpp
index 10dab2cda1b3c692f42cf8760eb2b48ae6451f39..262ee960e1c777d369d3b510eb31e5ed47b3493c 100644
--- a/test/operators/test_mul_op.cpp
+++ b/test/operators/test_mul_op.cpp
@@ -93,6 +93,8 @@ int TestMulOP() {
 }  // namespace paddle_mobile
 
 int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(8);
   paddle_mobile::TestMulOP<int8_t, int32_t>();
   paddle_mobile::TestMulOP<float, float>();
   return 0;
diff --git a/test/operators/test_multiclass_nms_op.cpp b/test/operators/test_multiclass_nms_op.cpp
index d1b98d4965fd182ab1adc480279f38cea53974be..3447bbdd10b64d2c2f497bdb4d5af15958a9a95b 100644
--- a/test/operators/test_multiclass_nms_op.cpp
+++ b/test/operators/test_multiclass_nms_op.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
 
 #include "../test_include.h"
 #include "operators/multiclass_nms_op.h"
@@ -31,14 +30,12 @@ class TestMultiClassNMSOp {
     const std::vector<std::shared_ptr<BlockDesc>> blocks =
         to_predict_program_->Blocks();
-    //  DLOG << " **block size " << blocks.size();
     for (auto block_desc : blocks) {
       std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-      //  DLOG << " ops " << ops.size();
       for (auto op : ops) {
         if (op->Type() == "multiclass_nms" &&
             op->Input("BBoxes")[0] == "box_coder_0.tmp_0") {
-          DLOG << " mul attr size: " << op->GetAttrMap().size();
+          DLOG << " attr size: " << op->GetAttrMap().size();
           DLOG << " inputs size: " << op->GetInputs().size();
           DLOG << " outputs size: " << op->GetOutputs().size();
           DLOG << " BBoxes is : " << op->Input("BBoxes")[0];
@@ -55,14 +52,6 @@ class TestMultiClassNMSOp {
                << op->GetAttrMap().at("nms_top_k").Get<int>();
           DLOG << " score_threshold : "
                << op->GetAttrMap().at("score_threshold").Get<float>();
-          //          DLOG << " variances : " <<
-          //          op->GetAttrMap().at("variances").Get<std::vector<float>>();
-          //          DLOG << " aspect_ratios : " <<
-          //          op->GetAttrMap().at("aspect_ratios").Get<std::vector<float>>();
-          //          DLOG << " min_sizes : " <<
-          //          op->GetAttrMap().at("min_sizes").Get<std::vector<float>>();
-          //          DLOG << " max_sizes : " <<
-          //          op->GetAttrMap().at("max_sizes").Get<std::vector<float>>();
           std::shared_ptr<operators::MultiClassNMSOp<Dtype, float>> priorbox =
               std::make_shared<operators::MultiClassNMSOp<Dtype, float>>(
                   op->Type(), op->GetInputs(), op->GetOutputs(),
@@ -88,16 +77,12 @@ class TestMultiClassNMSOp {
     auto *output_tensor = output->GetMutable<LoDTensor>();
     output_tensor->mutable_data<float>({1917, 6});
 
-    //  DLOG << typeid(output_tensor).name();
-    //  DLOG << "output_tensor dims: " << output_tensor->dims();
-
     std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
     out_tensor.reset(output_tensor);
 
     predict(t1, t2, 0);
 
     return out_tensor;
-    //  return outvars_tensor;
   }
 
  private:
diff --git a/test/operators/test_polygon_box_transform_op.cpp b/test/operators/test_polygon_box_transform_op.cpp
index a71177ddbd8e4d8b0f204fd6ec9c948882499cbd..5b30ce1ebfd59db972953e16e4506fa2595b8f04 100644
--- a/test/operators/test_polygon_box_transform_op.cpp
+++ b/test/operators/test_polygon_box_transform_op.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
 
 #include "../test_include.h"
 #include "operators/polygon_box_transform_op.h"
diff --git a/test/operators/test_prior_box_op.cpp b/test/operators/test_prior_box_op.cpp
index 8c697a9a7982f05b71caa5bb5f4d12e50dc9d418..2c75d01df297030b4633829ac4b29f7592aaf5c4 100644
--- a/test/operators/test_prior_box_op.cpp
+++ b/test/operators/test_prior_box_op.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
 
 #include "../test_include.h"
 #include "operators/prior_box_op.h"
diff --git a/test/operators/test_reshape2_op.cpp b/test/operators/test_reshape2_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..42c348a6274592eb23332620131faa0784a71d28
--- /dev/null
+++ b/test/operators/test_reshape2_op.cpp
@@ -0,0 +1,142 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/reshape2_op.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+template <typename Dtype>
+class TestReshape2Op {
+ public:
+  explicit TestReshape2Op(const Program<Dtype> p) : program_(p) {
+    if (use_optimize_) {
+      to_predict_program_ = program_.optimizeProgram;
+    } else {
+      to_predict_program_ = program_.originProgram;
+    }
+    const std::vector<std::shared_ptr<BlockDesc>> blocks =
+        to_predict_program_->Blocks();
+    for (auto block_desc : blocks) {
+      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+      for (auto op : ops) {
+        if (op->Type() == "reshape2") {
+          DLOG << " attr size: " << op->GetAttrMap().size();
+          std::unordered_map<std::string, Attribute> attrs = op->GetAttrMap();
+          for (std::unordered_map<std::string, Attribute>::iterator it =
+                   attrs.begin();
+               it != attrs.end(); ++it) {
+            DLOG << "  " << it->first << " " << it->second;
+          }
+
+          DLOG << " inputs size: " << op->GetInputs().size();
+          VariableNameMap inputs = op->GetInputs();
+          for (VariableNameMap::iterator it = inputs.begin();
+               it != inputs.end(); ++it) {
+            DLOG << "  " << it->first << " " << it->second;
+          }
+
+          DLOG << " outputs size: " << op->GetOutputs().size();
+          VariableNameMap outputs = op->GetOutputs();
+          for (VariableNameMap::iterator it = outputs.begin();
+               it != outputs.end(); ++it) {
+            DLOG << "  " << it->first << " " << it->second;
+          }
+
+          input_var_name = op->Input("X")[0];
+          output_var_name = op->Output("Out")[0];
+          std::shared_ptr<operators::Reshape2Op<Dtype, float>> op_ptr =
+              std::make_shared<operators::Reshape2Op<Dtype, float>>(
+                  op->Type(), op->GetInputs(), op->GetOutputs(),
+                  op->GetAttrMap(), program_.scope);
+          ops_of_block_[*block_desc.get()].push_back(op_ptr);
+          return;
+        }
+      }
+    }
+  }
+
+  std::shared_ptr<Tensor> predict(const Tensor &t) {
+    auto scope = program_.scope;
+    Variable *input_feed_value = scope->Var(input_var_name);
+    auto tensor_input = input_feed_value->GetMutable<LoDTensor>();
+    tensor_input->ShareDataWith(t);
+
+    Variable *output = scope->Var(output_var_name);
+    auto *output_tensor = output->GetMutable<LoDTensor>();
+
+    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
+    out_tensor.reset(output_tensor);
+
+    predict(t, 0);
+
+    return out_tensor;
+  }
+
+ private:
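+  // Harness state, mirroring the other single-op tests in this directory:
+  // program_ holds the loaded model, ops_of_block_ caches the single
+  // reshape2 op located above, and predict(t, 0) runs it against block 0.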
+  const framework::Program<Dtype> program_;
+  std::shared_ptr<ProgramDesc> to_predict_program_;
+  std::map<framework::BlockDesc,
+           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+      ops_of_block_;
+  bool use_optimize_ = false;
+  string input_var_name;
+  string output_var_name;
+
+  void predict(const Tensor &t, int block_id) {
+    std::shared_ptr<BlockDesc> to_predict_block =
+        to_predict_program_->Block(block_id);
+    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
+      auto op = ops_of_block_[*to_predict_block.get()][j];
+      op->Run();
+    }
+  }
+};
+
+template class TestReshape2Op<CPU>;
+}  // namespace framework
+}  // namespace paddle_mobile
+
+int main() {
+  DLOG << "----------**********----------";
+  DLOG << "begin to run Reshape2 Test";
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto program = loader.Load(std::string(g_ocr) + "/model",
+                             std::string(g_ocr) + "/params");
+
+  paddle_mobile::framework::Tensor input;
+  SetupTensor<float>(&input, {1, 4, 4}, static_cast<float>(0),
+                     static_cast<float>(1));
+  auto *input_ptr = input.data<float>();
+  for (int i = 0; i < 16; ++i) {
+    *(input_ptr + i) = i;
+  }
+  DLOG << "input : ";
+  for (int i = 0; i < input.numel(); ++i) {
+    DLOG << " index " << i << " : " << input_ptr[i];
+  }
+
+  paddle_mobile::framework::TestReshape2Op<paddle_mobile::CPU> testReshape2Op(
+      program);
+
+  auto output = testReshape2Op.predict(input);
+  auto *output_ptr = output->data<float>();
+
+  DLOG << "output : ";
+  for (int i = 0; i < output->numel(); ++i) {
+    DLOG << " index " << i << " : " << output_ptr[i];
+  }
+  return 0;
+}
diff --git a/test/operators/test_sum_op.cpp b/test/operators/test_sum_op.cpp
index e51d1cff5e99c5d9c444db046e78eee6a03f9243..467529d8d3877fcb9ac5527daf5f037aea6d18fc 100644
--- a/test/operators/test_sum_op.cpp
+++ b/test/operators/test_sum_op.cpp
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
-
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/sum_op.h"
diff --git a/test/operators/test_transpose2_op.cpp b/test/operators/test_transpose2_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b75a957cd5c1cd08dc09895e9e2448761e822274
--- /dev/null
+++ b/test/operators/test_transpose2_op.cpp
@@ -0,0 +1,143 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/transpose2_op.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+template <typename Dtype>
+class TestTranspose2Op {
+ public:
+  explicit TestTranspose2Op(const Program<Dtype> p) : program_(p) {
+    if (use_optimize_) {
+      to_predict_program_ = program_.optimizeProgram;
+    } else {
+      to_predict_program_ = program_.originProgram;
+    }
+    const std::vector<std::shared_ptr<BlockDesc>> blocks =
+        to_predict_program_->Blocks();
+    for (auto block_desc : blocks) {
+      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+      for (auto op : ops) {
+        if (op->Type() == "transpose2") {
+          DLOG << " attr size: " << op->GetAttrMap().size();
+          std::unordered_map<std::string, Attribute> attrs = op->GetAttrMap();
+          for (std::unordered_map<std::string, Attribute>::iterator it =
+                   attrs.begin();
+               it != attrs.end(); ++it) {
+            DLOG << "  " << it->first << " " << it->second;
+          }
+
+          DLOG << " inputs size: " << op->GetInputs().size();
+          VariableNameMap inputs = op->GetInputs();
+          for (VariableNameMap::iterator it = inputs.begin();
+               it != inputs.end(); ++it) {
+            DLOG << "  " << it->first << " " << it->second;
+          }
+
+          DLOG << " outputs size: " << op->GetOutputs().size();
+          VariableNameMap outputs = op->GetOutputs();
+          for (VariableNameMap::iterator it = outputs.begin();
+               it != outputs.end(); ++it) {
+            DLOG << "  " << it->first << " " << it->second;
+          }
+
+          input_var_name = op->Input("X")[0];
+          output_var_name = op->Output("Out")[0];
+          std::shared_ptr<operators::Transpose2Op<Dtype, float>> op_ptr =
+              std::make_shared<operators::Transpose2Op<Dtype, float>>(
+                  op->Type(), op->GetInputs(), op->GetOutputs(),
+                  op->GetAttrMap(), program_.scope);
+          ops_of_block_[*block_desc.get()].push_back(op_ptr);
+          return;
+        }
+      }
+    }
+  }
+
+  std::shared_ptr<Tensor> predict(const Tensor &t) {
+    auto scope = program_.scope;
+    Variable *input_feed_value = scope->Var(input_var_name);
+    auto tensor_input = input_feed_value->GetMutable<LoDTensor>();
+    tensor_input->ShareDataWith(t);
+
+    Variable *output = scope->Var(output_var_name);
+    auto *output_tensor = output->GetMutable<LoDTensor>();
+    output_tensor->mutable_data<float>({1, 2, 8});
+
+    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
+    out_tensor.reset(output_tensor);
+
+    predict(t, 0);
+
+    return out_tensor;
+  }
+
+ private:
+  const framework::Program<Dtype> program_;
+  std::shared_ptr<ProgramDesc> to_predict_program_;
+  std::map<framework::BlockDesc,
+           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+      ops_of_block_;
+  bool use_optimize_ = false;
+  string input_var_name;
+  string output_var_name;
+
+  void predict(const Tensor &t, int block_id) {
+    std::shared_ptr<BlockDesc> to_predict_block =
+        to_predict_program_->Block(block_id);
+    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
+      auto op = ops_of_block_[*to_predict_block.get()][j];
+      op->Run();
+    }
+  }
+};
+
+template class TestTranspose2Op<CPU>;
+}  // namespace framework
+}  // namespace paddle_mobile
+
+int main() {
+  DLOG << "----------**********----------";
+  DLOG << "begin to run Transpose2 Test";
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto program = loader.Load(std::string(g_ocr) + "/model",
+                             std::string(g_ocr) + "/params");
+
+  paddle_mobile::framework::Tensor input;
+  SetupTensor<float>(&input, {1, 8, 2}, static_cast<float>(0),
+                     static_cast<float>(1));
+  auto *input_ptr = input.data<float>();
+  for (int i = 0; i < 16; ++i) {
+    *(input_ptr + i) = i;
+  }
+  DLOG << "input : ";
+  for (int i = 0; i < input.numel(); ++i) {
+    DLOG << " index " << i << " : " << input_ptr[i];
+  }
+
+  paddle_mobile::framework::TestTranspose2Op<paddle_mobile::CPU>
+      testTranspose2Op(program);
+
+  auto output = testTranspose2Op.predict(input);
+  auto *output_ptr = output->data<float>();
+
+  DLOG << "output : ";
+  for (int i = 0; i < output->numel(); ++i) {
+    DLOG << " index " << i << " : " << output_ptr[i];
+  }
+  return 0;
+}
diff --git a/tools/build.sh b/tools/build.sh
index 1408822e46850752bcd448350fc483c25f70ae9a..c6554105718304c195bb4a3326c80947719033a0 100755
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -160,7 +160,7 @@ build_for_ios() {
     fi
     cd "${BUILD_DIR}"
     make -j 8
-    cp ../../../src/ios_io/PaddleMobile.h ./build/PaddleMobile.h
+    cp ../../../src/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h
     cd ./build
     # generate the symbol table
     ranlib *.a
diff --git a/tools/op.cmake b/tools/op.cmake
index f7a6ed4b134f78ddb23487cd3a861f244e6a86db..2e1e311a2c96bac02257cfdce2d2fbebcd962dfb 100644
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -201,9 +201,11 @@ if(NOT FOUND_MATCH)
     set(PRIORBOX_OP ON)
     set(RELU_OP ON)
     set(RESHAPE_OP ON)
+    set(RESHAPE2_OP ON)
     set(SIGMOID_OP ON)
     set(SOFTMAX_OP ON)
     set(TRANSPOSE_OP ON)
+    set(TRANSPOSE2_OP ON)
     set(FUSION_CONVADDBNRELU_OP ON)
     set(FUSION_CONVADDADDPRELU_OP ON)
     set(FUSION_DWCONVBNRELU_OP ON)
@@ -246,9 +248,11 @@ endif()
    # option(PRIORBOX_OP "" ON)
    # option(RELU_OP "" ON)
    # option(RESHAPE_OP "" ON)
+   # option(RESHAPE2_OP "" ON)
    # option(SIGMOID_OP "" ON)
    # option(SOFTMAX_OP "" ON)
    # option(TRANSPOSE_OP "" ON)
+   # option(TRANSPOSE2_OP "" ON)
 # endif ()
 
 if (BATCHNORM_OP)
@@ -314,6 +318,9 @@ endif()
 if (RESHAPE_OP)
     add_definitions(-DRESHAPE_OP)
 endif()
+if (RESHAPE2_OP)
+    add_definitions(-DRESHAPE2_OP)
+endif()
 if (SIGMOID_OP)
     add_definitions(-DSIGMOID_OP)
 endif()
@@ -323,6 +330,9 @@ endif()
 if (TRANSPOSE_OP)
     add_definitions(-DTRANSPOSE_OP)
 endif()
+if (TRANSPOSE2_OP)
+    add_definitions(-DTRANSPOSE2_OP)
+endif()
 if (FUSION_CONVADDBNRELU_OP)
     add_definitions(-DFUSION_CONVADDBNRELU_OP)
 endif()
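
Note on the reshape2/transpose2 additions above: both ops differ from their
reshape/transpose counterparts mainly in the extra "XShape" output, whose dims
are the input dims with a leading 0 prepended (see the xshape_dims loops in
both InferShape() implementations). A minimal stand-alone C++ sketch of that
convention follows; MakeXShapeDims is an illustrative name, not code from the
patch:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Build XShape dims the way reshape2/transpose2 InferShape() does:
    // prepend a 0, then copy the input dims after it.
    std::vector<int64_t> MakeXShapeDims(const std::vector<int64_t> &in_dims) {
      std::vector<int64_t> xshape(in_dims.size() + 1, 0);
      for (size_t i = 0; i < in_dims.size(); ++i) {
        xshape[i + 1] = in_dims[i];
      }
      return xshape;
    }

    int main() {
      // For the {1, 8, 2} input used by test_transpose2_op.cpp,
      // XShape is resized to {0, 1, 8, 2}.
      for (int64_t d : MakeXShapeDims({1, 8, 2})) {
        std::cout << d << " ";
      }
      std::cout << std::endl;  // prints: 0 1 8 2
      return 0;
    }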