Commit a973b2fb authored by xiebaiyuan

Merge remote-tracking branch 'upstream/develop' into develop

@@ -8,46 +8,19 @@
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)-->

Welcome to the Paddle-Mobile GitHub project. Paddle-Mobile is a project under the PaddlePaddle organization: a deep learning framework dedicated to embedded platforms.

## Demo directory
[Click here](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)

## Features
- High-performance support for ARM CPU
- Support for Mali GPU
- Support for Adreno GPU
- GPU support on Apple devices via Metal
- Support for FPGA development boards such as ZU5 and ZU9
- Support for arm-linux development boards such as the Raspberry Pi
## Documentation
@@ -74,18 +74,22 @@ Paddle-Mobile is a project under the PaddlePaddle organization: a deep learning framework dedicated to embedded platforms.
### 1. Train directly with Paddle Fluid
This is the most reliable approach and the recommended one.
### 2. Convert a Caffe model to a Paddle Fluid model
[https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
### 3. ONNX
ONNX stands for "Open Neural Network Exchange". The goal of the project is to let different neural network frameworks interoperate with each other.
Besides training fluid models with PaddlePaddle directly, some Paddle fluid models can also be obtained through ONNX conversion.
Baidu is also working on ONNX support. The conversion project lives here:
[https://github.com/PaddlePaddle/paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)

### 4. Download some test models and test images
[http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)

<!--## Simple Search online results
The gif below shows online main-body detection in the Simple Search app
![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)-->

## Troubleshooting
@@ -97,5 +74,3 @@ Paddle-Mobile is released under the relatively permissive Apache-2.0 open-source license [Apache-2.0 license](L
## Legacy Mobile-Deep-Learning
The original MDL (Mobile-Deep-Learning) project has been moved to [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
@@ -44,6 +44,7 @@ const char *G_OP_TYPE_RESHAPE2 = "reshape2";
const char *G_OP_TYPE_SIGMOID = "sigmoid";
const char *G_OP_TYPE_SOFTMAX = "softmax";
const char *G_OP_TYPE_TRANSPOSE = "transpose";
const char *G_OP_TYPE_TRANSPOSE2 = "transpose2";
const char *G_OP_TYPE_SPLIT = "split";
const char *G_OP_TYPE_FEED = "feed";
const char *G_OP_TYPE_FETCH = "fetch";
@@ -91,6 +92,7 @@ std::unordered_map<
        {G_OP_TYPE_FEED, {{"X"}, {"Out"}}},
        {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}},
        {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}},
        {G_OP_TYPE_TRANSPOSE2, {{"X"}, {"Out", "XShape"}}},
        {G_OP_TYPE_BOX_CODER,
         {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}},
        {G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}},
...
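The two additions above register "transpose2" as an op type and record that it consumes input "X" and produces outputs "Out" and "XShape". As a hedged illustration (not part of the diff), the sketch below shows how such a lookup table is typically queried; the literal keys stand in for the G_OP_TYPE_* constants and the table is a stripped-down stand-in for op_input_output_key.

#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using OpKeys = std::pair<std::vector<std::string>, std::vector<std::string>>;

// Stripped-down stand-in for op_input_output_key from the excerpt above.
static const std::unordered_map<std::string, OpKeys> kOpKeys = {
    {"transpose", {{"X"}, {"Out"}}},
    {"transpose2", {{"X"}, {"Out", "XShape"}}},
};

int main() {
  const OpKeys &keys = kOpKeys.at("transpose2");
  std::cout << "inputs:";
  for (const auto &in : keys.first) std::cout << " " << in;
  std::cout << " outputs:";
  for (const auto &out : keys.second) std::cout << " " << out;
  std::cout << std::endl;  // prints: inputs: X outputs: Out XShape
  return 0;
}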
@@ -115,6 +115,9 @@ LOAD_OP2(reshape2, CPU, MALI_GPU);
#ifdef TRANSPOSE_OP
LOAD_OP1(transpose, CPU);
#endif
#ifdef TRANSPOSE2_OP
LOAD_OP1(transpose2, CPU);
#endif
#ifdef PRIORBOX_OP
LOAD_OP1(prior_box, CPU);
#endif
...
@@ -35,7 +35,7 @@ template <>
void Im2SequenceKernel<CPU, float>::Compute(
    const Im2SequenceParam<CPU> &param) const {
  const Tensor *in_x = param.Input();
  framework::LoDTensor *out = param.Output();
  out->mutable_data<float>();
  std::vector<int> kernels = param.Kernels();
@@ -52,22 +52,31 @@ void Im2SequenceKernel<CPU, float>::Compute(
                                       paddings[2], strides[0]);
  int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
                                      paddings[3], strides[1]);
  out->mutable_data<float>({batch_size * output_height * output_width,
                            img_channels * kernels[0] * kernels[1]});
  const std::vector<int> dilations({1, 1});
  // TODO: verify
  auto out_dims = out->dims();
  out->Resize({batch_size, out->numel() / batch_size});
  for (int i = 0; i < batch_size; i++) {
    const Tensor src =
        in_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
    Tensor dst = out->Slice(i, i + 1).Resize(
        {output_height, output_width, img_channels, kernels[0], kernels[1]});
    math::Im2ColFunctor<math::ColFormat::kOCF, CPU, float> f;
    f(src, dilations, strides, paddings, &dst);
  }
  out->Resize(out_dims);
  framework::LoD lod(1);
  lod[0].reserve(batch_size + 1);
  int offset = 0;
  lod[0].push_back(offset);
  for (int i = 0; i < batch_size; ++i) {
    offset += output_height * output_width;
    lod[0].push_back(offset);
  }
  out->set_lod(lod);
}
template class Im2SequenceKernel<CPU, float>;
...
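The added lines give the im2sequence output a level-0 LoD so downstream sequence ops know which rows came from which input image: image i contributes output_height * output_width rows. A minimal standalone sketch of that offset computation, using a plain std::vector<std::vector<size_t>> as a stand-in for framework::LoD:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const int batch_size = 3, output_height = 2, output_width = 4;
  std::vector<std::vector<size_t>> lod(1);  // one LoD level
  lod[0].reserve(batch_size + 1);
  size_t offset = 0;
  lod[0].push_back(offset);
  for (int i = 0; i < batch_size; ++i) {
    offset += output_height * output_width;  // rows produced by image i
    lod[0].push_back(offset);
  }
  for (size_t o : lod[0]) std::cout << o << " ";  // prints: 0 8 16 24
  std::cout << std::endl;
  return 0;
}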
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE2_OP
#include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
  return true;
}
template <>
void Transpose2Kernel<CPU, float>::Compute(
    const Transpose2Param<CPU> &param) const {
  Transpose2Compute<float>(param);
}
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -29,10 +29,9 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) {
  auto *input_z_data = input_z->data<float>();
  int axis = param.Axis();
  Tensor *out = param.Out();
  // int m = out->dims()[0];
  // int n = out->dims()[1];
  auto *out_data = out->mutable_data<float>();
  const Tensor x_matrix =
      input_x->dims().size() > 2
          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
...
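ReshapeToMatrix(*input_x, param.XNumColDims()) above folds a higher-rank input into a 2-D matrix before the GEMM. A hedged sketch of that folding rule as we understand it: the first XNumColDims dimensions collapse into the row count and the remaining dimensions into the column count.

#include <iostream>
#include <vector>

int main() {
  const std::vector<int> dims = {2, 3, 4, 5};  // a 4-D input shape
  const int num_col_dims = 2;                  // plays the role of XNumColDims
  int rows = 1, cols = 1;
  for (int i = 0; i < num_col_dims; ++i) rows *= dims[i];
  for (int i = num_col_dims; i < static_cast<int>(dims.size()); ++i)
    cols *= dims[i];
  std::cout << rows << " x " << cols << std::endl;  // prints: 6 x 20
  return 0;
}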
@@ -83,6 +83,7 @@ void PoolCompute(const PoolParam<CPU> &param) {
#if __aarch64__
      PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
#else
      /// todo: fix bug in Pool2x2
      if (pooling_type == "max") {
        math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
      } else if (pooling_type == "avg") {
...
@@ -24,6 +24,7 @@ void SoftmaxCompute(const SoftmaxParam<CPU> &param) {
  Tensor *out = param.Out();
  auto x_dims = in_x->dims();
  out->Resize(x_dims);
  out->mutable_data<float>();
  math::SoftmaxFuntor<CPU, float>()(in_x, out);
}
}  // namespace operators
...
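The single added line allocates the output buffer before the softmax functor writes through its raw pointer. A small analogy (ours, not paddle-mobile code) of why the allocation has to precede the write:

#include <iostream>
#include <vector>

// Stand-in for math::SoftmaxFuntor: writes through a raw output pointer.
void Consumer(float *dst, int n) {
  for (int i = 0; i < n; ++i) dst[i] = 1.0f / n;
}

int main() {
  const int n = 4;
  std::vector<float> out;
  out.resize(n);  // plays the role of out->mutable_data<float>() above
  Consumer(out.data(), n);
  for (float v : out) std::cout << v << " ";  // prints: 0.25 0.25 0.25 0.25
  std::cout << std::endl;
  return 0;
}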
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE2_OP
#pragma once
#include <vector>
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void Transpose2Compute(const Transpose2Param<CPU>& param) {
  const auto* input_x = param.InputX();
  const auto input_x_dims = input_x->dims();
  auto* out = param.Out();
  const auto axis = param.Axis();
  const auto* input_x_data = input_x->data<float>();
  auto* out_data = out->mutable_data<float>();
  size_t ndim = axis.size();
  std::vector<int> xdim(ndim);
  std::vector<int> xstride(ndim);
  std::vector<int> xout(ndim);
  for (int i = 0; i < ndim; i++) {
    int j = ndim - 1 - i;
    xdim[j] = input_x_dims[axis[i]];
    xstride[j] = 1;
    for (int k = axis[i] + 1; k < ndim; k++) {
      xstride[j] *= input_x_dims[k];
    }
    xout[j] = xstride[j] * xdim[j];
  }
  auto numel = input_x->numel();
  size_t pind = 0;
  std::vector<int> ind(ndim);
  for (int i = 0; i < numel; i++) {
    out_data[i] = input_x_data[pind];
    ind[0]++;
    pind += xstride[0];
    for (int j = 0; j < ndim - 1; j++) {
      if (ind[j] == xdim[j]) {
        ind[j + 1]++;
        ind[j] = 0;
        pind += xstride[j + 1];
        pind -= xout[j];
      } else {
        break;
      }
    }
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif
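Transpose2Compute above precomputes, per output dimension, a source stride (xstride), the dimension extent (xdim), and the extent-times-stride step (xout), then walks the output linearly while carrying a multi-digit counter ind to advance the input index pind. A standalone sketch of the same index walk on a 2x3 row-major array with axis = {1, 0} (plain arrays instead of framework tensors):

#include <iostream>
#include <vector>

int main() {
  const std::vector<int> in_dims = {2, 3};
  const std::vector<int> axis = {1, 0};       // swap the two dimensions
  const float in[] = {1, 2, 3, 4, 5, 6};      // [[1,2,3],[4,5,6]]
  float out[6] = {0};

  const int ndim = static_cast<int>(axis.size());
  std::vector<int> xdim(ndim), xstride(ndim), xout(ndim);
  for (int i = 0; i < ndim; i++) {
    int j = ndim - 1 - i;
    xdim[j] = in_dims[axis[i]];
    xstride[j] = 1;
    for (int k = axis[i] + 1; k < ndim; k++) xstride[j] *= in_dims[k];
    xout[j] = xstride[j] * xdim[j];
  }
  int pind = 0;
  std::vector<int> ind(ndim, 0);
  for (int i = 0; i < 6; i++) {
    out[i] = in[pind];
    ind[0]++;
    pind += xstride[0];
    for (int j = 0; j < ndim - 1; j++) {
      if (ind[j] == xdim[j]) {   // carry into the next output dimension
        ind[j + 1]++;
        ind[j] = 0;
        pind += xstride[j + 1];
        pind -= xout[j];
      } else {
        break;
      }
    }
  }
  for (float v : out) std::cout << v << " ";  // prints: 1 4 2 5 3 6
  std::cout << std::endl;
  return 0;
}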
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE2_OP
#pragma once
#include <vector>
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class Transpose2Kernel
    : public framework::OpKernelBase<DeviceType, Transpose2Param<DeviceType>> {
 public:
  void Compute(const Transpose2Param<DeviceType>& param) const;
  bool Init(Transpose2Param<DeviceType>* param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -257,8 +257,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
  const int h = static_cast<int>(input->dims()[2]);
  const int w = static_cast<int>(input->dims()[3]);
  // const int l = h;
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int c = static_cast<int>(input->dims()[1]);
  const int hxw = h * w;
@@ -271,7 +270,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
        vbias = vdupq_n_f32(bias_data[j]);
      }
      int w_mid = w - 2;  // l=1->l_mid=-1,l=2->l_mid=0
      float w00 = filter_data_tmp[0];
      float w01 = filter_data_tmp[1];
      float w02 = filter_data_tmp[2];
@@ -283,39 +282,38 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
      float w22 = filter_data_tmp[8];
      output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
                       w21 * input_data[w] + w22 * input_data[w + 1];
      output_data[w - 1] =
          w10 * input_data[w - 2] + w11 * input_data[w - 1] +
          w20 * input_data[2 * w - 2] + w21 * input_data[2 * w - 1];
      output_data[(h - 1) * w] =
          w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] +
          w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
      output_data[h * w - 1] =
          w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] +
          w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1];
      if (if_bias) {
        output_data[0] += bias_data[j];
        output_data[w - 1] += bias_data[j];
        output_data[(h - 1) * w] += bias_data[j];
        output_data[h * w - 1] += bias_data[j];
      }
      for (int i = 1; i < h - 1; ++i) {
        output_data[i * w] =
            w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
            w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
            w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
        output_data[i * w + w - 1] =
            w00 * input_data[i * w + w - 1 - w - 1] +
            w01 * input_data[i * w + w - 1 - w] +
            w10 * input_data[i * w + w - 1 - 1] +
            w11 * input_data[i * w + w - 1] +
            w20 * input_data[i * w + w - 1 + w - 1] +
            w21 * input_data[i * w + w - 1 + w];
        if (if_bias) {
          output_data[i * w] += bias_data[j];
          output_data[i * w + w - 1] += bias_data[j];
        }
      }
@@ -325,15 +323,15 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
      float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
          tmp3, tmp4, tmp5, out0;
      in0 = vld1q_f32(input_tmp);
      in2 = vld1q_f32(input_tmp + w);
      const float *input_tmp_end = input_tmp + (h - 2) * w;
      in4 = vld1q_f32(input_tmp_end);
      in6 = vld1q_f32(input_tmp_end + w);
      int c_mid = w_mid;
      auto output_ptr = output_data + 1;
      for (; c_mid > 3; c_mid -= 4) {
        in1 = vld1q_f32(input_tmp + 4);
        in3 = vld1q_f32(input_tmp + w + 4);
        tmp0 = vextq_f32(in0, in1, 1);
        tmp1 = vextq_f32(in0, in1, 2);
@@ -352,7 +350,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
        vst1q_f32(output_ptr, out0);
        in5 = vld1q_f32(input_tmp_end + 4);
        in7 = vld1q_f32(input_tmp_end + w + 4);
        tmp0 = vextq_f32(in4, in5, 1);
        tmp1 = vextq_f32(in4, in5, 2);
@@ -367,7 +365,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
        out0 = vmlaq_n_f32(out0, tmp3, w12);
        out0 = vaddq_f32(out0, vbias);
        vst1q_f32(output_ptr + (h - 1) * w, out0);
        // can optimize to each 8 stride.
        input_tmp += 4;
@@ -380,8 +378,8 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
      }
      // top right pad
      float32x4_t pad0 = vdupq_n_f32(input_data[w - 1]);
      float32x4_t pad1 = vdupq_n_f32(input_data[2 * w - 1]);
      tmp0 = vextq_f32(in0, pad0, 1);
      tmp1 = vextq_f32(in0, pad0, 2);
@@ -409,8 +407,8 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
      }
      // bottom right pad
      float32x4_t pad2 = vdupq_n_f32(input_data[h * w - 1 - w]);
      float32x4_t pad3 = vdupq_n_f32(input_data[h * w - 1]);
      tmp0 = vextq_f32(in4, pad2, 1);
      tmp1 = vextq_f32(in4, pad2, 2);
@@ -427,28 +425,28 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
      for (int i = 0; i < c_mid; ++i) {
        if (i == 0) {
          vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 0);
        }
        if (i == 1) {
          vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 1);
        }
        if (i == 2) {
          vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 2);
        }
      }
      // mid
      for (int i = 0; i < h - 2; ++i) {
        auto output_ptr = output_data + (i + 1) * w + 1;
        input_tmp = input_data + i * w;
        auto in0_tmp = vld1q_f32(input_tmp);
        auto in2_tmp = vld1q_f32(input_tmp + w);
        auto in4_tmp = vld1q_f32(input_tmp + w + w);
        c_mid = w_mid;
        for (; c_mid > 3; c_mid -= 4) {
          auto in1_tmp = vld1q_f32(input_tmp + 4);
          auto in3_tmp = vld1q_f32(input_tmp + w + 4);
          auto in5_tmp = vld1q_f32(input_tmp + w + w + 4);
          tmp0 = vextq_f32(in0_tmp, in1_tmp, 1);
          tmp1 = vextq_f32(in0_tmp, in1_tmp, 2);
@@ -477,9 +475,9 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
          in4_tmp = in5_tmp;
        }
        float32x4_t pad0 = vdupq_n_f32(input_data[i * w + w - 1]);
        float32x4_t pad1 = vdupq_n_f32(input_data[i * w + w - 1 + w]);
        float32x4_t pad2 = vdupq_n_f32(input_data[i * w + w - 1 + w + w]);
        tmp0 = vextq_f32(in0_tmp, pad0, 1);
        tmp1 = vextq_f32(in0_tmp, pad0, 2);
@@ -539,8 +537,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
  const int hxw = input_height * input_width;
  // const int l = input_height;
  const int h = input_height;
  const int w = input_width;
  float32x4_t vzero = vdupq_n_f32(0);
  for (int b = 0; b < batch_size; b++) {
@@ -626,54 +625,53 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
      }
      output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
                       w21 * input_data[w] + w22 * input_data[w + 1];
      output_data[w - 1] =
          w10 * input_data[w - 2] + w11 * input_data[w - 1] +
          w20 * input_data[2 * w - 2] + w21 * input_data[2 * w - 1];
      output_data[(h - 1) * w] =
          w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] +
          w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
      output_data[h * w - 1] =
          w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] +
          w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1];
      output_data[0] = output_data[0] * newscale_data[c] + newbias_data[c];
      output_data[w - 1] =
          output_data[w - 1] * newscale_data[c] + newbias_data[c];
      output_data[(h - 1) * w] =
          output_data[(h - 1) * w] * newscale_data[c] + newbias_data[c];
      output_data[h * w - 1] =
          output_data[h * w - 1] * newscale_data[c] + newbias_data[c];
      if (if_relu) {
        output_data[0] = output_data[0] < 0 ? 0 : output_data[0];
        output_data[w - 1] = output_data[w - 1] < 0 ? 0 : output_data[w - 1];
        output_data[(h - 1) * w] =
            output_data[(h - 1) * w] < 0 ? 0 : output_data[(h - 1) * w];
        output_data[h * w - 1] =
            output_data[h * w - 1] < 0 ? 0 : output_data[h * w - 1];
      }
      for (int i = 1; i < h - 1; ++i) {
        output_data[i * w] =
            w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
            w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
            w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
        output_data[i * w + w - 1] =
            w00 * input_data[i * w + w - 1 - w - 1] +
            w01 * input_data[i * w + w - 1 - w] +
            w10 * input_data[i * w + w - 1 - 1] +
            w11 * input_data[i * w + w - 1] +
            w20 * input_data[i * w + w - 1 + w - 1] +
            w21 * input_data[i * w + w - 1 + w];
        output_data[i * w] =
            output_data[i * w] * newscale_data[c] + newbias_data[c];
        output_data[i * w + w - 1] =
            output_data[i * w + w - 1] * newscale_data[c] + newbias_data[c];
        if (if_relu) {
          output_data[i * w] = output_data[i * w] < 0 ? 0 : output_data[i * w];
          output_data[i * w + w - 1] =
              output_data[i * w + w - 1] < 0 ? 0 : output_data[i * w + w - 1];
        }
      }
@@ -776,7 +774,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
  const int h = static_cast<int>(input->dims()[2]);
  const int w = static_cast<int>(input->dims()[3]);
  // const int l = h;
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int c = static_cast<int>(input->dims()[1]);
@@ -792,7 +790,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
      vnewbias = vdupq_n_f32(newbias_data[j]);
      vnewscale = vdupq_n_f32(newscale_data[j]);
      int w_mid = w - 2;  // l=1->l_mid=-1,l=2->l_mid=0
      float w00 = filter_data_tmp[0];
      float w01 = filter_data_tmp[1];
      float w02 = filter_data_tmp[2];
@@ -804,49 +802,49 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
      float w22 = filter_data_tmp[8];
      output_data[0] = w11 * input_data[0] + w12 * input_data[1] +
                       w21 * input_data[w] + w22 * input_data[w + 1];
      output_data[w - 1] =
          w10 * input_data[w - 2] + w11 * input_data[w - 1] +
          w20 * input_data[2 * w - 2] + w21 * input_data[2 * w - 1];
      output_data[(h - 1) * w] =
          w01 * input_data[(h - 2) * w] + w02 * input_data[(h - 2) * w + 1] +
          w11 * input_data[(h - 1) * w] + w12 * input_data[(h - 1) * w + 1];
      output_data[h * w - 1] =
          w00 * input_data[h * w - w - 2] + w01 * input_data[h * w - w - 1] +
          w10 * input_data[h * w - 2] + w11 * input_data[h * w - 1];
      output_data[0] = output_data[0] * newscale_data[j] + newbias_data[j];
      output_data[w - 1] =
          output_data[w - 1] * newscale_data[j] + newbias_data[j];
      output_data[(h - 1) * w] =
          output_data[(h - 1) * w] * newscale_data[j] + newbias_data[j];
      output_data[h * w - 1] =
          output_data[h * w - 1] * newscale_data[j] + newbias_data[j];
      if (if_relu) {
        output_data[0] = output_data[0] < 0 ? 0 : output_data[0];
        output_data[w - 1] = output_data[w - 1] < 0 ? 0 : output_data[w - 1];
        output_data[(h - 1) * w] =
            output_data[(h - 1) * w] < 0 ? 0 : output_data[(h - 1) * w];
        output_data[h * w - 1] =
            output_data[h * w - 1] < 0 ? 0 : output_data[h * w - 1];
      }
      for (int i = 1; i < h - 1; ++i) {
        output_data[i * w] =
            w01 * input_data[i * w - w] + w02 * input_data[i * w - w + 1] +
            w11 * input_data[i * w] + w12 * input_data[i * w + 1] +
            w21 * input_data[i * w + w] + w22 * input_data[i * w + w + 1];
        output_data[i * w + w - 1] =
            w00 * input_data[i * w + w - 1 - w - 1] +
            w01 * input_data[i * w + w - 1 - w] +
            w10 * input_data[i * w + w - 1 - 1] +
            w11 * input_data[i * w + w - 1] +
            w20 * input_data[i * w + w - 1 + w - 1] +
            w21 * input_data[i * w + w - 1 + w];
        output_data[i * w] =
            output_data[i * w] * newscale_data[j] + newbias_data[j];
        output_data[i * w + w - 1] =
            output_data[i * w + w - 1] * newscale_data[j] + newbias_data[j];
        if (if_relu) {
          output_data[i * w] = output_data[i * w] < 0 ? 0 : output_data[i * w];
          output_data[i * w + w - 1] =
              output_data[i * w + w - 1] < 0 ? 0 : output_data[i * w + w - 1];
        }
      }
@@ -855,11 +853,11 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
      float32x4_t in0, in1, in2, in3, in4, in5, in6, in7, tmp0, tmp1, tmp2,
          tmp3, tmp4, tmp5, out0;
      in0 = vld1q_f32(input_tmp);
      in2 = vld1q_f32(input_tmp + w);
      const float *input_tmp_end = input_tmp + (h - 2) * w;
      in4 = vld1q_f32(input_tmp_end);
      in6 = vld1q_f32(input_tmp_end + w);
      int c_mid = w_mid;
      auto output_ptr = output_data + 1;
      for (; c_mid > 3; c_mid -= 4) {
        in1 = vld1q_f32(input_tmp + 4);
        in3 = vld1q_f32(input_tmp + w + 4);
        tmp0 = vextq_f32(in0, in1, 1);
        tmp1 = vextq_f32(in0, in1, 2);
@@ -880,7 +878,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
        vst1q_f32(output_ptr, out0);
        in5 = vld1q_f32(input_tmp_end + 4);
        in7 = vld1q_f32(input_tmp_end + w + 4);
        tmp0 = vextq_f32(in4, in5, 1);
        tmp1 = vextq_f32(in4, in5, 2);
@@ -897,7 +895,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
        if (if_relu) {
          out0 = vmaxq_f32(out0, vzero);
        }
        vst1q_f32(output_ptr + (h - 1) * w, out0);
        // can optimize to each 8 stride.
        input_tmp += 4;
@@ -910,8 +908,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
      }
      // top right pad
      float32x4_t pad0 = vdupq_n_f32(input_data[w - 1]);
      float32x4_t pad1 = vdupq_n_f32(input_data[2 * w - 1]);
      tmp0 = vextq_f32(in0, pad0, 1);
      tmp1 = vextq_f32(in0, pad0, 2);
@@ -941,8 +939,8 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
      }
      // bottom right pad
      float32x4_t pad2 = vdupq_n_f32(input_data[h * w - 1 - w]);
      float32x4_t pad3 = vdupq_n_f32(input_data[h * w - 1]);
      tmp0 = vextq_f32(in4, pad2, 1);
      tmp1 = vextq_f32(in4, pad2, 2);
@@ -961,29 +959,29 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
      }
      for (int i = 0; i < c_mid; ++i) {
        if (i == 0) {
          vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 0);
        }
        if (i == 1) {
          vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 1);
        }
        if (i == 2) {
          vst1q_lane_f32(output_ptr + (h - 1) * w + i, out0, 2);
        }
      }
      // mid
      for (int i = 0; i < h - 2; ++i) {
        auto output_ptr = output_data + (i + 1) * w + 1;
        input_tmp = input_data + i * w;
        auto in0_tmp = vld1q_f32(input_tmp);
        auto in2_tmp = vld1q_f32(input_tmp + w);
        auto in4_tmp = vld1q_f32(input_tmp + w + w);
        c_mid = w_mid;
        for (; c_mid > 3; c_mid -= 4) {
          auto in1_tmp = vld1q_f32(input_tmp + 4);
          auto in3_tmp = vld1q_f32(input_tmp + w + 4);
          auto in5_tmp = vld1q_f32(input_tmp + w + w + 4);
          tmp0 = vextq_f32(in0_tmp, in1_tmp, 1);
          tmp1 = vextq_f32(in0_tmp, in1_tmp, 2);
@@ -1014,9 +1012,9 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
          in4_tmp = in5_tmp;
        }
        float32x4_t pad0 = vdupq_n_f32(input_data[i * w + w - 1]);
        float32x4_t pad1 = vdupq_n_f32(input_data[i * w + w - 1 + w]);
        float32x4_t pad2 = vdupq_n_f32(input_data[i * w + w - 1 + w + w]);
        tmp0 = vextq_f32(in0_tmp, pad0, 1);
        tmp1 = vextq_f32(in0_tmp, pad0, 2);
@@ -1060,6 +1058,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
#endif
}
/// w!=h not fix
void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
                                   Tensor *output, const Tensor *new_scale,
                                   const Tensor *new_bias, bool if_relu) {
@@ -1275,7 +1274,8 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
  const int in_l = in_h;
  const int inhxw = in_h * in_w;
  const int outhxw = out_h * out_w;
  /// todo : fix if_pad when w != h
  const int if_pad = in_w - 1 == (out_w - 1) * 2 ? 1 : 0;
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int c = static_cast<int>(input->dims()[1]);
  const float *input_row_ptr;
@@ -1381,9 +1381,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
          if ((w4 != w_times)) {
            vst1q_f32(output_row_ptr, res3);
          } else {
            if (out_w - 2 - w_times * 3 == 1) {
              vst1q_lane_f32(output_row_ptr, res3, 0);
            } else if (out_w - 2 - w_times * 3 == 2) {
              vst1q_lane_f32(output_row_ptr, res3, 0);
              vst1q_lane_f32(output_row_ptr + 1, res3, 1);
            }
@@ -1393,28 +1393,28 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
      }
      output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
                           input_const[in_w] * w21 +
                           input_const[in_w + 1] * w22;
      out2in_mid = (out_w - 1) * 2;
      output_data_tmp[out_w - 1] =
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
          w20 * input_const[out2in_mid + in_w - 1] +
          w21 * input_const[out2in_mid + in_w] +
          (1 - if_pad) * (w12 * input_const[out2in_mid + 1] +
                          w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_h - 1) * 2 * in_w;
      output_data_tmp[out_w * (out_h - 1)] =
          w01 * input_const[out2in_mid - in_w] +
          w02 * input_const[out2in_mid - in_w + 1] +
          w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
          (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
                          w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2;
      output_data_tmp[out_h * out_w - 1] =
          w00 * input_const[out2in_mid - in_w - 1] +
          w01 * input_const[out2in_mid - in_w] +
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1425,21 +1425,21 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
                          w22 * input_const[out2in_mid + in_w + 1]);
      if (if_bias) {
        output_data_tmp[0] += bias_data[j];
        output_data_tmp[out_w - 1] += bias_data[j];
        output_data_tmp[out_w * (out_h - 1)] += bias_data[j];
        output_data_tmp[out_h * out_w - 1] += bias_data[j];
      }
      for (int i = 1; i < out_h - 1; i++) {
        out2in_mid = i * 2 * in_w;
        output_data_tmp[i * out_w] =
            w01 * input_const[out2in_mid - in_w] +
            w02 * input_const[out2in_mid - in_w + 1] +
            w11 * input_const[out2in_mid] +
            w12 * input_const[out2in_mid + 1] +
            w21 * input_const[out2in_mid + in_w] +
            w22 * input_const[out2in_mid + in_w + 1];
        out2in_mid = i * 2 * in_w + (out_w - 1) * 2;
        output_data_tmp[i * out_w + out_w - 1] =
            w00 * input_const[out2in_mid - in_w - 1] +
            w01 * input_const[out2in_mid - in_w] +
            w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1449,8 +1449,8 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
                            w12 * input_const[out2in_mid + 1] +
                            w22 * input_const[out2in_mid + in_w + 1]);
        if (if_bias) {
          output_data_tmp[i * out_w] += bias_data[j];
          output_data_tmp[i * out_w + out_w - 1] += bias_data[j];
        }
      }
      filter_data_tmp += 9;
@@ -1657,11 +1657,12 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
  const int in_w = static_cast<int>(input->dims()[3]);
  const int out_h = static_cast<int>(output->dims()[2]);
  const int out_w = static_cast<int>(output->dims()[3]);
  // const int out_l = out_h;
  // const int in_l = in_h;
  const int inhxw = in_h * in_w;
  const int outhxw = out_h * out_w;
  /// todo : fix if_pad when w != h
  const int if_pad = in_w - 1 == (out_w - 1) * 2 ? 1 : 0;
  const int batch_size = static_cast<int>(input->dims()[0]);
  const int c = static_cast<int>(input->dims()[1]);
  const int w_times = (out_w - 2) / 3;
@@ -1775,9 +1776,9 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
              vst1q_lane_f32(output_row_ptr + 1, res3, 1);
              vst1q_lane_f32(output_row_ptr + 2, res3, 2);
            } else {
              if (out_w - 2 - w_times * 3 == 1) {
                vst1q_lane_f32(output_row_ptr, res3, 0);
              } else if (out_w - 2 - w_times * 3 == 2) {
                vst1q_lane_f32(output_row_ptr, res3, 0);
                vst1q_lane_f32(output_row_ptr + 1, res3, 1);
              }
@@ -1787,28 +1788,28 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
      }
      output_data_tmp[0] = input_const[0] * w11 + input_const[1] * w12 +
                           input_const[in_w] * w21 +
                           input_const[in_w + 1] * w22;
      out2in_mid = (out_w - 1) * 2;
      output_data_tmp[out_w - 1] =
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
          w20 * input_const[out2in_mid + in_w - 1] +
          w21 * input_const[out2in_mid + in_w] +
          (1 - if_pad) * (w12 * input_const[out2in_mid + 1] +
                          w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_h - 1) * 2 * in_w;
      output_data_tmp[out_w * (out_h - 1)] =
          w01 * input_const[out2in_mid - in_w] +
          w02 * input_const[out2in_mid - in_w + 1] +
          w11 * input_const[out2in_mid] + w12 * input_const[out2in_mid + 1] +
          (1 - if_pad) * (w21 * input_const[out2in_mid + in_w] +
                          w22 * input_const[out2in_mid + in_w + 1]);
      out2in_mid = (out_h - 1) * 2 * in_w + (out_w - 1) * 2;
      output_data_tmp[out_h * out_w - 1] =
          w00 * input_const[out2in_mid - in_w - 1] +
          w01 * input_const[out2in_mid - in_w] +
          w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1819,38 +1820,38 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
                          w22 * input_const[out2in_mid + in_w + 1]);
      output_data_tmp[0] =
          output_data_tmp[0] * newscale_data[j] + newbias_data[j];
      output_data_tmp[out_w - 1] =
          output_data_tmp[out_w - 1] * newscale_data[j] + newbias_data[j];
      output_data_tmp[out_w * (out_h - 1)] =
          output_data_tmp[out_w * (out_h - 1)] * newscale_data[j] +
          newbias_data[j];
      output_data_tmp[out_h * out_w - 1] =
          output_data_tmp[out_h * out_w - 1] * newscale_data[j] +
          newbias_data[j];
      if (if_relu) {
        output_data_tmp[0] = output_data_tmp[0] < 0 ? 0 : output_data_tmp[0];
        output_data_tmp[out_w - 1] =
            output_data_tmp[out_w - 1] < 0 ? 0 : output_data_tmp[out_w - 1];
        output_data_tmp[out_w * (out_h - 1)] =
            output_data_tmp[out_w * (out_h - 1)] < 0
                ? 0
                : output_data_tmp[out_w * (out_h - 1)];
        output_data_tmp[out_h * out_w - 1] =
            output_data_tmp[out_h * out_w - 1] < 0
                ? 0
                : output_data_tmp[out_h * out_w - 1];
      }
      for (int i = 1; i < out_h - 1; i++) {
        out2in_mid = i * 2 * in_w;
        output_data_tmp[i * out_w] =
            w01 * input_const[out2in_mid - in_w] +
            w02 * input_const[out2in_mid - in_w + 1] +
            w11 * input_const[out2in_mid] +
            w12 * input_const[out2in_mid + 1] +
            w21 * input_const[out2in_mid + in_w] +
            w22 * input_const[out2in_mid + in_w + 1];
        out2in_mid = i * 2 * in_w + (out_w - 1) * 2;
        output_data_tmp[i * out_w + out_w - 1] =
            w00 * input_const[out2in_mid - in_w - 1] +
            w01 * input_const[out2in_mid - in_w] +
            w10 * input_const[out2in_mid - 1] + w11 * input_const[out2in_mid] +
@@ -1859,18 +1860,18 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
            (1 - if_pad) * (w02 * input_const[out2in_mid - in_w + 1] +
                            w12 * input_const[out2in_mid + 1] +
                            w22 * input_const[out2in_mid + in_w + 1]);
        output_data_tmp[i * out_w] =
            output_data_tmp[i * out_w] * newscale_data[j] + newbias_data[j];
        output_data_tmp[i * out_w + out_w - 1] =
            output_data_tmp[i * out_w + out_w - 1] * newscale_data[j] +
            newbias_data[j];
        if (if_relu) {
          output_data_tmp[i * out_w] =
              output_data_tmp[i * out_w] < 0 ? 0 : output_data_tmp[i * out_w];
          output_data_tmp[i * out_w + out_w - 1] =
              output_data_tmp[i * out_w + out_w - 1] < 0
                  ? 0
                  : output_data_tmp[i * out_w + out_w - 1];
        }
      }
    }
...
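Most of the depthwise-conv changes above replace the single side length l (which silently assumed h == w) with explicit h and w, so border and corner elements of an h x w feature map are addressed correctly when it is not square. A short standalone sketch of the row-major corner offsets that the rewritten lines rely on:

#include <iostream>

int main() {
  const int h = 3, w = 5;  // a non-square feature map
  const int top_left = 0;
  const int top_right = w - 1;
  const int bottom_left = (h - 1) * w;
  const int bottom_right = h * w - 1;
  // Under the old l == h == w assumption, bottom_left was (l - 1) * l and
  // bottom_right was l * l - 1, which is wrong whenever h != w.
  std::cout << top_left << " " << top_right << " " << bottom_left << " "
            << bottom_right << std::endl;  // prints: 0 4 10 14
  return 0;
}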
@@ -53,7 +53,7 @@ void Im2ColFunctor<ColFormat::kCFO, CPU, float>::operator()(
       (((isize - 2 * padding[0] + filter_height) % stride[0] == 0) ? 1 : 0));
  int fill = isize % 2;
  if (stride[0] == 1 && filter_height == 3 && pad1 && pad2 &&
      dilation[0] == 1 && im_height > 2 && im_height == im_width) {
    for (int c = 0; c < im_channels; ++c) {
      int oosize = osize * osize;
      int nk4 = osize / 4;
@@ -225,7 +225,7 @@ void Im2ColFunctor<ColFormat::kCFO, CPU, float>::operator()(
      im_data += isize * isize;
    }
  } else if (stride[0] == 2 && filter_height == 3 && pad1 && dilation[0] == 1 &&
             im_height > 2 && im_height == im_width) {
    for (int c = 0; c < im_channels; ++c) {
      int oosize = osize * osize;
      int nk4 = osize / 4;
@@ -605,7 +605,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
    const T *im_data = im.data<T>();
    T *col_data = col->data<T>();
    for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
      for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
        for (int channel = 0; channel < im_channels; ++channel) {
@@ -617,7 +616,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
                 ++filter_col_idx) {
              int im_col_offset =
                  col_col_idx * stride[1] + filter_col_idx - padding[1];
              int col_offset =
                  ((((col_row_idx)*col_width + col_col_idx) * im_channels +
                    channel) *
@@ -625,7 +623,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
                       filter_row_idx) *
                      filter_width +
                  filter_col_idx;
              int im_offset = (channel * im_height + im_row_offset) * im_width +
                              im_col_offset;
              col_data[col_offset] =
...
@@ -58,7 +58,7 @@ void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
        const float *in_ptr1 = input_data + i * input_batch_stride +
                               c * input_channel_stride + ph * input_width;
        const float *in_ptr2 = in_ptr1 + input_width;
-       if (ph + 1 >= input_height) {
+       if (ph != input_height && ph + 1 >= input_height) {
          in_ptr2 = static_cast<float *>(
              paddle_mobile::memory::Alloc(sizeof(float) * input_width));
          memset(static_cast<void *>(const_cast<float *>(in_ptr2)), -FLT_MAX,
@@ -122,19 +122,30 @@ void Pool2x2Maxs2p0(vector<int> strides, vector<int> paddings,
#endif
        if (_w2 != 0) {
-         in_ptr1 += 16 * w1 + 4 * w2;
-         in_ptr2 += 16 * w1 + 4 * w2;
-         out_ptr += 8 * w1 + 2 * w2;
+         in_ptr1 = input_data + i * input_batch_stride +
+                   c * input_channel_stride + ph * input_width + 16 * w1 +
+                   4 * w2;
+         in_ptr2 = in_ptr1 + input_width;
+         out_ptr = output_data + i * output_batch_stride +
+                   c * output_channel_stride + ph / 2 * output_width + 8 * w1 +
+                   2 * w2;
          if (_w2 == 1) {
            *out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
          } else if (_w2 == 2) {
-           float temp = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++;
+           float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
+           in_ptr1++;
+           in_ptr2++;
            float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
            *out_ptr = (temp > temp1) ? temp : temp1;
          } else if (_w2 == 3) {
-           float temp = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++;
-           float temp1 = (*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++;
-           *out_ptr++ = (temp > temp1) ? temp : temp1;
+           float temp = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
+           in_ptr1++;
+           in_ptr2++;
+           float temp1 = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
+           in_ptr1++;
+           in_ptr2++;
+           *out_ptr = (temp > temp1) ? temp : temp1;
+           out_ptr++;
            *out_ptr = (*in_ptr1 > *in_ptr2) ? *in_ptr1 : *in_ptr2;
          }
        }
@@ -173,7 +184,7 @@ void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
  int w2 = _w1 / 4;
  int _w2 = _w1 % 4;
- float quarter = 1 / 4;
+ float quarter = 0.25;
  for (int i = 0; i < batch_size; ++i) {
    for (int c = 0; c < output_channels; ++c) {
      for (int ph = 0; ph < input_height; ph += 2) {
@@ -250,25 +261,32 @@ void Pool2x2Avgs2p0(vector<int> strides, vector<int> paddings,
#endif
        if (_w2 != 0) {
-         in_ptr1 += 16 * w1 + 4 * w2;
-         in_ptr2 += 16 * w1 + 4 * w2;
-         out_ptr += 8 * w1 + 2 * w2;
+         in_ptr1 = input_data + i * input_batch_stride +
+                   c * input_channel_stride + ph * input_width + 16 * w1 +
+                   4 * w2;
+         in_ptr2 = in_ptr1 + input_width;
+         out_ptr = output_data + i * output_batch_stride +
+                   c * output_channel_stride + ph / 2 * output_width + 8 * w1 +
+                   2 * w2;
          if (_w2 == 1) {
            *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
          } else if (_w2 == 2) {
            float temp = 0;
-           temp += *in_ptr1++;
-           temp += *in_ptr2++;
            temp += *in_ptr1;
            temp += *in_ptr2;
-           *out_ptr = 0.5 * temp;
+           in_ptr1++;
+           in_ptr2++;
+           temp += *in_ptr1;
+           temp += *in_ptr2;
+           *out_ptr = 0.25 * temp;
          } else if (_w2 == 3) {
            float temp = 0;
            temp += *in_ptr1++;
            temp += *in_ptr2++;
            temp += *in_ptr1++;
            temp += *in_ptr2++;
-           *out_ptr++ = 0.5 * temp;
+           *out_ptr = 0.25 * temp;
+           out_ptr++;
            *out_ptr = 0.5 * (*in_ptr1 + *in_ptr2);
          }
        }
...
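Note on the pooling hunks above: the tail pointers (the _w2 != 0 branch) are now recomputed from the batch/channel/row base instead of being advanced with += past the vectorized body, the "(*in_ptr1++ > *in_ptr2++) ? *in_ptr1++ : *in_ptr2++" pattern (which compares one pair of elements but returns a value from the next pair) is unrolled into explicit compare-then-advance steps, and the average scale becomes 0.25 because "1 / 4" is integer division. A small, self-contained sketch of the two arithmetic pitfalls, with illustrative values only:

#include <cassert>

// Illustrative only: the two arithmetic pitfalls corrected in the tail code.
int main() {
  // "1 / 4" is integer division, so the old quarter was 0.0f, not 0.25f;
  // the commit hard-codes 0.25 instead.
  float wrong = 1 / 4;
  float right = 1.0f / 4;
  assert(wrong == 0.0f && right == 0.25f);

  // 2-wide max: compare first, then advance, so each comparison sees the
  // same pair of elements. The old form advanced the pointers inside the
  // comparison and ended up mixing neighbouring elements.
  const float a[2] = {3.0f, 1.0f}, b[2] = {2.0f, 4.0f};
  const float *pa = a, *pb = b;
  float m0 = (*pa > *pb) ? *pa : *pb;  // max(3, 2) == 3
  ++pa;
  ++pb;
  float m1 = (*pa > *pb) ? *pa : *pb;  // max(1, 4) == 4
  assert(m0 == 3.0f && m1 == 4.0f);
  return 0;
}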
@@ -1132,6 +1132,37 @@ class TransposeParam : public OpParam {
};
#endif
#ifdef TRANSPOSE2_OP
template <typename Dtype>
class Transpose2Param : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
Transpose2Param(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_x_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope);
output_xshape_ = OutputXShapeFrom<GType>(outputs, scope);
axis_ = GetAttr<vector<int>>("axis", attrs);
}
const RType *InputX() const { return input_x_; }
RType *Out() const { return out_; }
RType *OutputXShape() const { return output_xshape_; }
const vector<int> &Axis() const { return axis_; }
private:
RType *input_x_;
RType *out_;
RType *output_xshape_;
vector<int> axis_;
};
#endif
#ifdef LOOKUP_OP
template <typename Dtype>
class LookupParam : public OpParam {
@@ -2116,9 +2147,9 @@ class Im2SequenceParam : public OpParam {
    paddings_ = GetAttr<vector<int>>("paddings", attrs);
  }
-  const RType *Input() const { return input_x_; }
-  RType *Output() const { return out_; }
+  const GType *Input() const { return input_x_; }
+  GType *Output() const { return out_; }
  const vector<int> &Kernels() const { return kernels_; }
@@ -2127,8 +2158,8 @@ class Im2SequenceParam : public OpParam {
  const vector<int> &Paddings() const { return paddings_; }
 private:
-  RType *input_x_;
-  RType *out_;
+  GType *input_x_;
+  GType *out_;
  vector<int> kernels_;
  vector<int> strides_;
  vector<int> paddings_;
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE2_OP
#include <vector>
#include "common/enforce.h"
#include "operators/transpose2_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void Transpose2Op<Dtype, T>::InferShape() const {
auto input_x_dims = this->param_.InputX()->dims();
auto axis = this->param_.Axis();
size_t x_dims_size = input_x_dims.size();
size_t axis_size = axis.size();
PADDLE_MOBILE_ENFORCE((x_dims_size == axis_size),
"input_dims must "
"be equal to the axis_size. ")
std::vector<int> count(axis_size, 0);
for (size_t i = 0; i < axis_size; i++) {
PADDLE_MOBILE_ENFORCE(
axis[i] < static_cast<int>(axis_size) && ++count[axis[i]] == 1,
"Each element of Attribute axis should be a unique value "
"range from 0 to (dims - 1), "
"where the dims is the axis's size");
}
framework::DDim out_dims(input_x_dims);
for (size_t i = 0; i < axis_size; i++) {
out_dims[i] = input_x_dims[axis[i]];
}
this->param_.Out()->Resize(out_dims);
std::vector<int64_t> xshape_dims(input_x_dims.size() + 1, 0);
for (int i = 0; i < input_x_dims.size(); ++i) {
xshape_dims[i + 1] = input_x_dims[i];
}
this->param_.OutputXShape()->Resize(framework::make_ddim(xshape_dims));
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(transpose2, ops::Transpose2Op);
#endif
#endif  // TRANSPOSE2_OP
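Transpose2Op::InferShape above permutes the input dims by "axis" and records the pre-transpose shape, prefixed with a 0, in the XShape output. A hedged sketch of just that shape arithmetic; the {0, 2, 1} axis is an assumption borrowed from the test added later in this commit, not something this file fixes:

#include <cassert>
#include <cstdint>
#include <vector>

// Sketch of the shape arithmetic in Transpose2Op::InferShape above.
int main() {
  std::vector<int64_t> in_dims = {1, 8, 2};
  std::vector<int> axis = {0, 2, 1};  // assumed axis, see the test below

  std::vector<int64_t> out_dims(in_dims.size());
  for (size_t i = 0; i < axis.size(); ++i) out_dims[i] = in_dims[axis[i]];

  // XShape keeps the pre-transpose shape, prefixed with a 0 placeholder.
  std::vector<int64_t> xshape(in_dims.size() + 1, 0);
  for (size_t i = 0; i < in_dims.size(); ++i) xshape[i + 1] = in_dims[i];

  assert((out_dims == std::vector<int64_t>{1, 2, 8}));
  assert((xshape == std::vector<int64_t>{0, 1, 8, 2}));
  return 0;
}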
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef TRANSPOSE2_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/transpose2_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class Transpose2Op : public framework::OperatorWithKernel<
DeviceType, Transpose2Param<DeviceType>,
operators::Transpose2Kernel<DeviceType, T>> {
public:
Transpose2Op(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, Transpose2Param<DeviceType>,
operators::Transpose2Kernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, Transpose2Param<DeviceType>,
operators::Transpose2Kernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif
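The header above only wires Transpose2Op to Transpose2Kernel through OperatorWithKernel; the kernel body is not part of this hunk. For orientation, a framework-independent sketch of the data movement a transpose over "axis" performs on row-major data (this is NOT the project's Transpose2Kernel, just an illustration under that assumption):

#include <cstddef>
#include <cstdint>
#include <vector>

// Naive transpose by "axis" for row-major data; illustrative only.
void transpose_naive(const std::vector<float> &in,
                     const std::vector<int64_t> &in_dims,
                     const std::vector<int> &axis, std::vector<float> *out) {
  const size_t rank = in_dims.size();
  std::vector<size_t> in_strides(rank, 1), out_strides(rank, 1);
  std::vector<int64_t> out_dims(rank);
  for (size_t i = 0; i < rank; ++i) out_dims[i] = in_dims[axis[i]];
  for (size_t i = rank - 1; i > 0; --i) {
    in_strides[i - 1] = in_strides[i] * static_cast<size_t>(in_dims[i]);
    out_strides[i - 1] = out_strides[i] * static_cast<size_t>(out_dims[i]);
  }
  out->assign(in.size(), 0.0f);
  for (size_t idx = 0; idx < in.size(); ++idx) {
    // Decode the output coordinate for idx, then map it to the input offset.
    size_t rem = idx, in_offset = 0;
    for (size_t i = 0; i < rank; ++i) {
      const size_t coord = rem / out_strides[i];
      rem %= out_strides[i];
      in_offset += coord * in_strides[axis[i]];
    }
    (*out)[idx] = in[in_offset];
  }
}

With in_dims {1, 8, 2} and axis {0, 2, 1}, this reproduces the result expected by the transpose2 test added later in this commit.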
@@ -184,6 +184,10 @@ if (NOT FOUND_MATCH)
    ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-transpose-op paddle-mobile)
+   # gen test
+   ADD_EXECUTABLE(test-transpose2-op operators/test_transpose2_op.cpp test_helper.h test_include.h)
+   target_link_libraries(test-transpose2-op paddle-mobile)
    # gen test
    ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-multiclassnms-op paddle-mobile)
@@ -343,6 +347,10 @@ if (NOT FOUND_MATCH)
    ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h)
    target_link_libraries(test-multi-process paddle-mobile)
+   # gen test
+   ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h)
+   target_link_libraries(test-eng paddle-mobile)
    #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif ()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
#ifdef PADDLE_MOBILE_CPU
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
#endif
// paddle_mobile.SetThreadNum(4);
auto time1 = time();
if (paddle_mobile.Load(std::string(g_eng) + "/model",
std::string(g_eng) + "/params", true, false, 1,
true)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<int64_t> dims{1, 1, 48, 400};
LoDTensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 1, 48, 400}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
    // warm up
for (int i = 0; i < 1; ++i) {
paddle_mobile.PredictLod(input_tensor);
}
auto time3 = time();
for (int i = 0; i < 1; ++i) {
paddle_mobile.PredictLod(input_tensor);
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
}
return 0;
}
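Note: the load-time log above now measures time1 to time2; time() and time_diff() come from ../test_helper.h. For reference, a std::chrono stand-in with the same shape (an assumption about the helpers, not the project's actual implementation):

#include <chrono>

// Hedged sketch of timing helpers equivalent to what this test relies on;
// the real time()/time_diff() live in test_helper.h and may differ.
static std::chrono::steady_clock::time_point now_ts() {
  return std::chrono::steady_clock::now();
}
static double elapsed_ms(std::chrono::steady_clock::time_point start,
                         std::chrono::steady_clock::time_point end) {
  return std::chrono::duration<double, std::milli>(end - start).count();
}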
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#pragma once
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/batchnorm_op.h"
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#pragma once
#include "../test_include.h"
#include "operators/box_coder_op.h"
...
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#pragma once
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/elementwise_sub_op.h"
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#pragma once
#include "../test_include.h"
#include "operators/fill_constant_op.h"
...
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#pragma once
#include <framework/program/program-optimize/program_optimize.h>
#include "../test_include.h"
#include "operators/fusion_fc_op.h"
...
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#pragma once
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/im2sequence_op.h"
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#pragma once
#include "../test_include.h"
#include "operators/multiclass_nms_op.h"
@@ -31,14 +30,12 @@ class TestMultiClassNMSOp {
    const std::vector<std::shared_ptr<BlockDesc>> blocks =
        to_predict_program_->Blocks();
-    //  DLOG << " **block size " << blocks.size();
    for (auto block_desc : blocks) {
      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-      //  DLOG << " ops " << ops.size();
      for (auto op : ops) {
        if (op->Type() == "multiclass_nms" &&
            op->Input("BBoxes")[0] == "box_coder_0.tmp_0") {
-          DLOG << " mul attr size: " << op->GetAttrMap().size();
+          DLOG << " attr size: " << op->GetAttrMap().size();
          DLOG << " inputs size: " << op->GetInputs().size();
          DLOG << " outputs size: " << op->GetOutputs().size();
          DLOG << " BBoxes is : " << op->Input("BBoxes")[0];
@@ -55,14 +52,6 @@ class TestMultiClassNMSOp {
              << op->GetAttrMap().at("nms_top_k").Get<int>();
          DLOG << " score_threshold : "
              << op->GetAttrMap().at("score_threshold").Get<float>();
-          //    DLOG << " variances : " <<
-          //    op->GetAttrMap().at("variances").Get<std::vector<float>>();
-          //    DLOG << " aspect_ratios : " <<
-          //    op->GetAttrMap().at("aspect_ratios").Get<std::vector<float>>();
-          //    DLOG << " min_sizes : " <<
-          //    op->GetAttrMap().at("min_sizes").Get<std::vector<float>>();
-          //    DLOG << " max_sizes : " <<
-          //    op->GetAttrMap().at("max_sizes").Get<std::vector<float>>();
          std::shared_ptr<operators::MultiClassNMSOp<Dtype, float>> priorbox =
              std::make_shared<operators::MultiClassNMSOp<Dtype, float>>(
                  op->Type(), op->GetInputs(), op->GetOutputs(),
@@ -88,16 +77,12 @@ class TestMultiClassNMSOp {
    auto *output_tensor = output->GetMutable<LoDTensor>();
    output_tensor->mutable_data<float>({1917, 6});
-    //  DLOG << typeid(output_tensor).name();
-    //  DLOG << "output_tensor dims: " << output_tensor->dims();
    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
    out_tensor.reset(output_tensor);
    predict(t1, t2, 0);
    return out_tensor;
-    //  return outvars_tensor;
  }
 private:
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#pragma once
#include "../test_include.h"
#include "operators/polygon_box_transform_op.h"
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#pragma once
#include "../test_include.h"
#include "operators/prior_box_op.h"
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#pragma once
#include "../test_include.h"
#include "operators/reshape2_op.h"
...
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#pragma once
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/sum_op.h"
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/transpose2_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestTranspose2Op {
public:
explicit TestTranspose2Op(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
for (auto block_desc : blocks) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (auto op : ops) {
if (op->Type() == "transpose2") {
DLOG << " attr size: " << op->GetAttrMap().size();
std::unordered_map<std::string, Attribute> attrs = op->GetAttrMap();
for (std::unordered_map<std::string, Attribute>::iterator it =
attrs.begin();
it != attrs.end(); ++it) {
DLOG << " " << it->first << " " << it->second;
}
DLOG << " inputs size: " << op->GetInputs().size();
VariableNameMap inputs = op->GetInputs();
for (VariableNameMap::iterator it = inputs.begin();
it != inputs.end(); ++it) {
DLOG << " " << it->first << " " << it->second;
}
DLOG << " outputs size: " << op->GetOutputs().size();
VariableNameMap outputs = op->GetOutputs();
for (VariableNameMap::iterator it = outputs.begin();
it != outputs.end(); ++it) {
DLOG << " " << it->first << " " << it->second;
}
input_var_name = op->Input("X")[0];
output_var_name = op->Output("Out")[0];
std::shared_ptr<operators::Transpose2Op<Dtype, float>> op_ptr =
std::make_shared<operators::Transpose2Op<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(op_ptr);
return;
}
}
}
}
std::shared_ptr<Tensor> predict(const Tensor &t) {
auto scope = program_.scope;
Variable *input_feed_value = scope->Var(input_var_name);
auto tensor_input = input_feed_value->GetMutable<LoDTensor>();
tensor_input->ShareDataWith(t);
Variable *output = scope->Var(output_var_name);
auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({1, 2, 8});
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict(t, 0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
string input_var_name;
string output_var_name;
void predict(const Tensor &t, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
op->Run();
}
}
};
template class TestTranspose2Op<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run Transpose2 Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_ocr) + "/model",
std::string(g_ocr) + "/params");
paddle_mobile::framework::Tensor input;
SetupTensor<float>(&input, {1, 8, 2}, static_cast<float>(0),
static_cast<float>(1));
auto *input_ptr = input.data<float>();
for (int i = 0; i < 16; ++i) {
*(input_ptr + i) = i;
}
DLOG << "input : ";
for (int i = 0; i < input.numel(); ++i) {
DLOG << " index " << i << " : " << input_ptr[i];
}
paddle_mobile::framework::TestTranspose2Op<paddle_mobile::CPU>
testTranspose2Op(program);
auto output = testTranspose2Op.predict(input);
auto *output_ptr = output->data<float>();
DLOG << "output : ";
for (int i = 0; i < output->numel(); ++i) {
DLOG << " index " << i << " : " << output_ptr[i];
}
return 0;
}
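Hedged expectation for the test above, assuming the ocr model's transpose2 uses axis {0, 2, 1} (consistent with the {1, 2, 8} output buffer allocated in predict()): the sequential input 0..15 viewed as {1, 8, 2} should come out as the even values followed by the odd ones. A tiny self-contained check of that assumption:

#include <cassert>

// Expected permutation for in_dims {1, 8, 2}, axis {0, 2, 1} (assumed):
// out = {0, 2, 4, ..., 14, 1, 3, 5, ..., 15} with shape {1, 2, 8}.
int main() {
  float in[16], out[16];
  for (int i = 0; i < 16; ++i) in[i] = static_cast<float>(i);
  for (int c = 0; c < 2; ++c) {      // output dim 1 (old dim 2)
    for (int r = 0; r < 8; ++r) {    // output dim 2 (old dim 1)
      out[c * 8 + r] = in[r * 2 + c];
    }
  }
  assert(out[1] == 2.0f && out[8] == 1.0f && out[15] == 15.0f);
  return 0;
}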
@@ -205,6 +205,7 @@ if(NOT FOUND_MATCH)
    set(SIGMOID_OP ON)
    set(SOFTMAX_OP ON)
    set(TRANSPOSE_OP ON)
+   set(TRANSPOSE2_OP ON)
    set(FUSION_CONVADDBNRELU_OP ON)
    set(FUSION_CONVADDADDPRELU_OP ON)
    set(FUSION_DWCONVBNRELU_OP ON)
@@ -251,6 +252,7 @@ endif()
    # option(SIGMOID_OP "" ON)
    # option(SOFTMAX_OP "" ON)
    # option(TRANSPOSE_OP "" ON)
+   # option(TRANSPOSE2_OP "" ON)
    # endif ()
if (BATCHNORM_OP)
@@ -328,6 +330,9 @@ endif()
if (TRANSPOSE_OP)
    add_definitions(-DTRANSPOSE_OP)
endif()
+if (TRANSPOSE2_OP)
+    add_definitions(-DTRANSPOSE2_OP)
+endif()
if (FUSION_CONVADDBNRELU_OP)
    add_definitions(-DFUSION_CONVADDBNRELU_OP)
endif()
...