Transform kernel in op initialization

4d05a8c6 · hjchen2 · 3076c54f · 4d05a8c6 · 4d05a8c6 · 4d05a8c6
12 changed files
--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <map>
 #include <string>
+#include <utility>
 #include <vector>

 #include "common/enforce.h"
@@ -119,10 +120,6 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
  virtual void InferShape() const = 0;

  void Init() {
-    //    for (auto i : this->inputs_) {
-    //      DLOG << i.first;
-    //      DLOG << i.second;
-    //    }
    PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), "  %s kernel init failed",
                          this->type_.c_str());
  }

--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -135,22 +135,6 @@ class Tensor {
    return reinterpret_cast<T *>(mutable_data(typeid(T)));
  }

-#ifdef PADDLE_MOBILE_DEBUG
-  template <typename T>
-  inline void dump(std::string filename) const {
-    const T *dataptr = data<T>();
-    std::ofstream out(filename.c_str());
-    for (int i = 0; i < numel(); ++i) {
-      out << dataptr[i] << " ";
-    }
-    out << "形状：";
-    for (int j = 0; j < dims_.size(); ++j) {
-      out << dims_[j] << " ";
-    }
-    out.close();
-  }
-#endif
-
  inline void *mutable_data(std::type_index type) {
    if (holder_ != nullptr) {
      holder_->set_type(type);

--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
@@ -17,17 +17,69 @@ limitations under the License. */
 #include "operators/kernel/conv_kernel.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"

+#include <iostream>
+
 namespace paddle_mobile {
 namespace operators {

 template <>
 bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
+  if (param->Input()->type() == typeid(int8_t)) {
+    param->ExecMode() = ConvParam<CPU>::EXEC_GEMM_INT8;
+  } else {
+    if (param->Groups() == param->Input()->dims()[1] &&
+        param->Input()->dims()[1] == param->Output()->dims()[1] &&
+        param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+        param->Filter()->dims()[2] == 3 && param->Strides()[0] == 1) {
+      param->ExecMode() = ConvParam<CPU>::EXEC_DEPTHWISE3x3S1P1_FLOAT;
+    } else if (param->Groups() == param->Input()->dims()[1] &&
+               param->Input()->dims()[1] == param->Output()->dims()[1] &&
+               param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+               param->Filter()->dims()[2] == 3) {
+      param->ExecMode() = ConvParam<CPU>::EXEC_DEPTHWISE3x3_FLOAT;
+    } else if (param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
+               param->Strides()[0] == param->Strides()[1] &&
+               param->Dilations()[0] == param->Dilations()[1] &&
+               param->Filter()->dims()[2] == 3 && param->Strides()[0] == 1 &&
+               param->Dilations()[0] == 1 && param->Output()->dims()[1] >= 16 &&
+               param->Input()->dims()[2] >= 16) {
+      param->ExecMode() = ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT;
+      // transform weight
+      framework::Tensor *transformed_weight = new framework::Tensor;
+      operators::math::winograd_transform_weight<8, 3>(*param->Filter(),
+                                                       transformed_weight);
+      param->Filter() = transformed_weight;
+    } else {
+      param->ExecMode() = ConvParam<CPU>::EXEC_GEMM_FLOAT;
+    }
+  }
  return true;
 }

 template <>
 void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) const {
-  ConvCompute<float>(param);
+  switch (param.ExecMode()) {
+    case ConvParam<CPU>::EXEC_GEMM_INT8:
+      GemmConv<int8_t, int32_t>(param);
+      break;
+    case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1P1_FLOAT:
+      math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
+                                 nullptr, false);
+      break;
+    case ConvParam<CPU>::EXEC_DEPTHWISE3x3_FLOAT:
+      math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
+                             param.Filter(), nullptr, param.Output(), false);
+      break;
+    case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
+      WinogradConv3x3<8, 3>(param);
+      break;
+    case ConvParam<CPU>::EXEC_GEMM_FLOAT:
+      GemmConv<float, float>(param);
+      break;
+    default:
+      PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
+                                    param.ExecMode());
+  }
 }

 template class ConvKernel<CPU, float>;

--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -22,14 +22,14 @@ limitations under the License. */
 #include "operators/math/math_function.h"
 #include "operators/math/pad.h"
 #include "operators/math/vol2col.h"
-#include "operators/math/winograd/winograd.h"
+#include "operators/math/winograd/winograd_transform.h"
 #include "operators/op_param.h"

 namespace paddle_mobile {
 namespace operators {

 template <typename Itype, typename Otype>
-inline void ConvBasic(const ConvParam<CPU> &param) {
+inline void GemmConv(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor *output = param.Output();
@@ -117,9 +117,10 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
  }
 }

-inline void BatchConv3x3Winograd(const ConvParam<CPU> &param) {
+template <int tile, int kernel>
+inline void WinogradConv3x3(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
-  Tensor *filter = param.Filter();
+  const Tensor *filter = param.Filter();
  Tensor *output = param.Output();
  output->mutable_data<float>();
  int batch_size = input->dims()[0];
@@ -127,51 +128,40 @@ inline void BatchConv3x3Winograd(const ConvParam<CPU> &param) {
  const std::vector<int> &paddings = param.Paddings();
  math::PadFunctor<CPU, float> pad;

+  auto winograd_pad = [&](int width, int pad) {
+    int output_tile = tile - kernel + 1;
+    //    int tiles = (width + pad - kernel) / output_tile + 1;
+    //    return (tiles - 1) * output_tile + tile - width;
+    int pad_width = (width + 2 * pad - kernel) / output_tile * output_tile;
+    return pad_width + tile - width;
+  };
+
  Tensor input_pad;
+  framework::Tensor transformed_input;
  for (int i = 0; i < batch_size; ++i) {
    Tensor in_batch = input->Slice(i, i + 1);
    Tensor out_batch = output->Slice(i, i + 1);
-    if (paddings[0] == 0 && paddings[1] == 0) {
-      input_pad = in_batch;
-    } else {
+    int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]);
+    int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]);
+    if (paddings[0] || paddings[1] || pad_bottom || pad_right) {
      framework::DDim pad_shape = in_batch.dims();
-      pad_shape[2] += 2 * paddings[0];
-      pad_shape[3] += 2 * paddings[1];
+      pad_shape[2] += paddings[0] + pad_bottom;
+      pad_shape[3] += paddings[1] + pad_right;
      input_pad.mutable_data<float>(pad_shape);
-      pad(in_batch, paddings[0], paddings[0], paddings[1], paddings[1],
+      pad(in_batch, paddings[0], pad_bottom, paddings[1], pad_right,
          &input_pad);
-    }
-    math::winograd_f6k3(input_pad, *filter, &out_batch);
-  }
-}
-
-template <typename P>
-void ConvCompute(const ConvParam<CPU> &param) {
-  if (param.Input()->type() == typeid(int8_t)) {
-    ConvBasic<int8_t, int32_t>(param);
-  } else {
-    if (param.Groups() == param.Input()->dims()[1] &&
-        param.Input()->dims()[1] == param.Output()->dims()[1] &&
-        param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-        param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
-      math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(),
-                                 nullptr, false);
-    } else if (param.Groups() == param.Input()->dims()[1] &&
-               param.Input()->dims()[1] == param.Output()->dims()[1] &&
-               param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-               param.Filter()->dims()[2] == 3) {
-      math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
-                             param.Filter(), nullptr, param.Output(), false);
-    } else if (param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
-               param.Strides()[0] == param.Strides()[1] &&
-               param.Dilations()[0] == param.Dilations()[1] &&
-               param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 &&
-               param.Dilations()[0] == 1 && param.Output()->dims()[1] >= 16 &&
-               param.Output()->dims()[2] >= 16) {
-      BatchConv3x3Winograd(param);
    } else {
-      ConvBasic<float, float>(param);
+      input_pad = in_batch;
    }
+#if __aarch64__
+    // TODO(hjchen2): aarch64 path is not implemented; nothing is computed
+#else
+    // tile input and transform
+    math::winograd_transform_input<tile, kernel>(input_pad, &transformed_input);
+    // calculate output
+    math::winograd_transform_output<tile, kernel>(transformed_input, *filter,
+                                                  output);
+#endif
  }
 }


--- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h
@@ -44,7 +44,7 @@ void DepthwiseConvCompute(const ConvParam<CPU> &param) {
                                 Bias, false);

  } else {
-    ConvBasic<float, float>(param);
+    GemmConv<float, float>(param);
  }
 }


--- a/src/operators/math/winograd/winograd.cpp
+++ b/src/operators/math/winograd/winograd.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef CONV_OP
-
-#include "operators/math/winograd/winograd.h"
-#include "operators/math/winograd/winograd_transform.h"
-
-namespace paddle_mobile {
-namespace operators {
-namespace math {
-
-// F(2X2, 3X3)
-void winograd_f2k3(const framework::Tensor &input,
-                   const framework::Tensor &weight, framework::Tensor *output) {
-}
-// F(6X6, 3X3)
-void winograd_f6k3(const framework::Tensor &input,
-                   const framework::Tensor &weight, framework::Tensor *output) {
-  framework::Tensor transformed_input;
-  framework::Tensor transformed_weight;
-#if __aarch64__
-  // TODO(hjchen2)
-#else
-  // transform weight
-  winograd_transform_weight<8, 3>(weight, &transformed_weight);
-  // tile input and transform
-  winograd_transform_input<8, 3>(input, &transformed_input);
-  // caculate output
-  winograd_transform_output<8, 3>(transformed_input, transformed_weight,
-                                  output);
-#endif
-}
-
-// F(4X4, 5X5)
-void winograd_f4k5(const framework::Tensor &input,
-                   const framework::Tensor &weight, framework::Tensor *output) {
-}
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
--- a/src/operators/math/winograd/winograd.h
+++ b/src/operators/math/winograd/winograd.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef CONV_OP
-
-#pragma once
-
-#include "framework/tensor.h"
-
-namespace paddle_mobile {
-namespace operators {
-namespace math {
-
-// F(2X2, 3X3)
-void winograd_f2k3(const framework::Tensor &input,
-                   const framework::Tensor &weight, framework::Tensor *output);
-// F(6X6, 3X3)
-void winograd_f6k3(const framework::Tensor &input,
-                   const framework::Tensor &weight, framework::Tensor *output);
-
-// F(4X4, 5X5)
-void winograd_f4k5(const framework::Tensor &input,
-                   const framework::Tensor &weight, framework::Tensor *output);
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
--- a/src/operators/math/winograd/winograd_transform_f6k3.cpp
+++ b/src/operators/math/winograd/winograd_transform_f6k3.cpp
@@ -40,6 +40,7 @@ void winograd_transform_weight<8, 3>(const framework::Tensor &weight,
   * w7 = g2
   */
  // weight shape is [out_channel, in_channel, kernel_h, kernel_w]
+  // pack weight into [roundup(out_channel/4), 64, in_channel, 4] tiles
  int out_channel = weight.dims()[0];
  int in_channel = weight.dims()[1];
  // reshape and alloc transformed weight
@@ -322,12 +323,12 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
   * x6 = (4 * d2 - 5 * d4 + d6) - (2 * d1 - 2.5 * d3 + 0.5 * d5)
   * x7 = (d7 - d1) + (d3 - d5) * 5.25
   */
-  // pack input to [8 * roundup(h/6), 8 * roundup(w/6), channel] tiles
+  // pack input into [roundup(tiles/8), 64, channel, 8] tiles
  int channel = input.dims()[1];
  int height = input.dims()[2];
  int width = input.dims()[3];
-  int h_tiles = (height + 3) / 6;  // (height + 5 - 2) / 6
-  int w_tiles = (width + 3) / 6;   // (width + 5 - 2) / 6
+  int h_tiles = (height + 3) / 6;  // (height - 8 + 5 + 6) / 6
+  int w_tiles = (width + 3) / 6;   // (width - 8 + 5 + 6) / 6
  int tiles = (h_tiles * w_tiles + 7) / 8;
  framework::DDim transformed_shape =
      framework::make_ddim(std::vector<int>{tiles, 64, channel, 8});
@@ -335,29 +336,11 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
  memset(outptr, 0, output->numel() * sizeof(float));

  const float *inptr = input.data<float>();
-  int inter_h = (height - 2) / 6;
-  int inter_w = (width - 2) / 6;
-  int remain_h = height - (inter_h * 6);
-  int remain_w = width - (inter_w * 6);
-  framework::Tensor input_pad;
-  if (remain_h > 2 || remain_w > 2) {
-    inter_h += (remain_h > 2);
-    inter_w += (remain_w > 2);
-    height = (inter_h - 1) * 6 + 8;
-    width = (inter_w - 1) * 6 + 8;
-    framework::DDim input_shape =
-        framework::make_ddim(std::vector<int>{1, channel, height, width});
-    PadFunctor<CPU, float> pad;
-    inptr = input_pad.mutable_data<float>(input_shape);
-    pad(input, 0, height - input.dims()[2], 0, width - input.dims()[3],
-        &input_pad);
-  }
-
  size_t image_size = height * width;
  const float transform_matrix[8] = {5.25f, -5.f,   -4.25f, -2.5f,
                                     2.f,   -1.25f, 0.5f,   0.25f};
  int remain_c_start = channel & 0xFFFC;
-#if 0
+#if 1
  remain_c_start = 0;
 #else
  #pragma omp parallel for
@@ -381,14 +364,14 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
            "vld1.32    {d8-d11}, [%[in1]], %[steps]    \n"
            "vld1.32    {d12-d15}, [%[in2]], %[steps]   \n"
            "vld1.32    {d16-d19}, [%[in3]], %[steps]   \n"
-            "vtrn.32    q2, q4                          \n"  // d0: q2, q2
-            "vtrn.32    q3, q5                          \n"  // d1: q4, q3
-            "vtrn.32    q6, q8                          \n"  // d2: q6, q4
-            "vtrn.32    q7, q9                          \n"  // d3: q8, q5
-            "vswp.32    d5, d12                         \n"  // d4: q3, q6
-            "vswp.32    d9, d16                         \n"  // d5: q5, q7
-            "vswp.32    d7, d14                         \n"  // d6: q7, q8
-            "vswp.32    d11, d18                        \n"  // d7: q9, q9
+            "vtrn.32    q2, q4                          \n"  // d0: q2
+            "vtrn.32    q3, q5                          \n"  // d1: q4
+            "vtrn.32    q6, q8                          \n"  // d2: q6
+            "vtrn.32    q7, q9                          \n"  // d3: q8
+            "vswp.32    d5, d12                         \n"  // d4: q3
+            "vswp.32    d9, d16                         \n"  // d5: q5
+            "vswp.32    d7, d14                         \n"  // d6: q7
+            "vswp.32    d11, d18                        \n"  // d7: q9

            "vsub.f32   q10, q2, q7                     \n"
            "vsub.f32   q11, q3, q6                     \n"
@@ -680,14 +663,14 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
            "vld1.32    {d8-d11}, [%[in1]], %[steps]    \n"
            "vld1.32    {d12-d15}, [%[in2]], %[steps]   \n"
            "vld1.32    {d16-d19}, [%[in3]], %[steps]   \n"
-            "vtrn.32    q2, q4                          \n"  // d0: q2, q2
-            "vtrn.32    q3, q5                          \n"  // d1: q4, q3
-            "vtrn.32    q6, q8                          \n"  // d2: q6, q4
-            "vtrn.32    q7, q9                          \n"  // d3: q8, q5
-            "vswp.32    d5, d12                         \n"  // d4: q3, q6
-            "vswp.32    d9, d16                         \n"  // d5: q5, q7
-            "vswp.32    d7, d14                         \n"  // d6: q7, q8
-            "vswp.32    d11, d18                        \n"  // d7: q9, q9
+            "vtrn.32    q2, q4                          \n"  // d0: q2
+            "vtrn.32    q3, q5                          \n"  // d1: q4
+            "vtrn.32    q6, q8                          \n"  // d2: q6
+            "vtrn.32    q7, q9                          \n"  // d3: q8
+            "vswp.32    d5, d12                         \n"  // d4: q3
+            "vswp.32    d9, d16                         \n"  // d5: q5
+            "vswp.32    d7, d14                         \n"  // d6: q7
+            "vswp.32    d11, d18                        \n"  // d7: q9

            "vsub.f32   q10, q2, q7                     \n"
            "vsub.f32   q11, q3, q6                     \n"
@@ -749,11 +732,12 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,

        float *ptr0 = d_bt;
        float *ptr1 = ptr0 + 32;
-        int tile_id = h * w_tiles + w;
-        int block_id = tile_id >> 3;
-        int pack_id = tile_id & 0x7;
+        int tile_indics = h * w_tiles + w;
+        int tile_block = tile_indics >> 3;
+        int block_indics = tile_indics & 0x7;
        // (tiles / 8, 64, channel, 8)
-        float *out0 = outptr + (block_id * 64 * channel + c) * 8 + pack_id;
+        float *out0 =
+            outptr + (tile_block * 64 * channel + c) * 8 + block_indics;
        float *out1 = out0 + channel * 8;
        float *out2 = out1 + channel * 8;
        float *out3 = out2 + channel * 8;
@@ -771,7 +755,6 @@ void winograd_transform_input<8, 3>(const framework::Tensor &input,
            "vld1.32    {d8-d11}, [%[ptr0]]!            \n"  // q4: d2, q5: d3
            "vld1.32    {d12-d15}, [%[ptr1]]!           \n"  // q6: d4, q7: d5
            "vld1.32    {d16-d19}, [%[ptr1]]!           \n"  // q8: d6, q9: d7
-
            "vtrn.32    q2, q3                          \n"
            "vtrn.32    q4, q5                          \n"
            "vtrn.32    q6, q7                          \n"
@@ -918,7 +901,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
            "cmp        %[inter_channel], #0           \n"
            "ble        cmp_remain_%=                  \n"

-            "loop_4c_%=:                               \n"
+            "loop_2c_%=:                               \n"
            "vld1.32    {d0-d3}, [%[w_ptr]]!           \n"
            "vld1.32    {d4-d7}, [%[in_ptr]]!          \n"
            "vmla.f32   q8, q2, d0[0]                  \n"
@@ -941,7 +924,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
            "vmla.f32   q15, q5, d3[1]                 \n"

            "subs       %[inter_channel], #1           \n"
-            "bne        loop_4c_%=                     \n"
+            "bne        loop_2c_%=                     \n"

            // cmp remain channel > 0
            "cmp_remain_%=:                            \n"

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -379,9 +379,9 @@ class ConvParam : public OpParam {

  const RType *Input() const { return input_; }

-  RType *Filter() const { return filter_; }
+  RType *&Filter() const { return filter_; }

-  RType *Output() const { return output_; }
+  RType *&Output() const { return output_; }

  const vector<int> &Strides() const { return strides_; }

@@ -389,15 +389,28 @@ class ConvParam : public OpParam {

  const vector<int> &Dilations() const { return dilations_; }

+  enum ExecMode {
+    EXEC_INVALID = 0,
+    EXEC_GEMM_FLOAT,
+    EXEC_DEPTHWISE3x3S1P1_FLOAT,
+    EXEC_DEPTHWISE3x3_FLOAT,
+    EXEC_WINOGRAD3X3_FLOAT,
+    EXEC_WINOGRAD5X5_FLOAT,
+    EXEC_GEMM_INT8,
+  };
+
+  ExecMode &ExecMode() const { return exec_mode_; }
+
  const int &Groups() const { return groups; }

 private:
  RType *input_;
-  RType *output_;
-  RType *filter_;
+  mutable RType *output_;
+  mutable RType *filter_;
  vector<int> strides_;
  vector<int> paddings_;
  vector<int> dilations_;
+  mutable enum ExecMode exec_mode_;
  int groups;
 };
 template <typename Dtype>

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -141,7 +141,7 @@ if (NOT FOUND_MATCH)
    target_link_libraries(test-googlenet-quali paddle-mobile)

    # gen test
-    ADD_EXECUTABLE(test-conv-op operators/test_cov_op.cpp test_helper.h test_include.h executor_for_test.h)
+    ADD_EXECUTABLE(test-conv-op operators/test_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
    target_link_libraries(test-conv-op paddle-mobile)

    # gen test
@@ -220,10 +220,6 @@ if (NOT FOUND_MATCH)
    ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h)
    target_link_libraries(test-dequantize-op paddle-mobile)

-    # test int8 conv op
-    ADD_EXECUTABLE(test-int8-conv-op operators/test_int8_conv_op.cpp test_helper.h test_include.h)
-    target_link_libraries(test-int8-conv-op paddle-mobile)
-
    # gen test log
    ADD_EXECUTABLE(test-log common/test_log.cpp)
    target_link_libraries(test-log paddle-mobile)

--- a/test/operators/test_int8_conv_op.cpp
+++ b/test/operators/test_int8_conv_op.cpp
@@ -18,7 +18,7 @@ limitations under the License. */

 namespace paddle_mobile {

-// Reference convolution for checking results:
+// Reference convolution from Caffe for checking results.
 // accumulate through explicit loops over input, output, and filters.
 template <typename Itype, typename Otype>
 void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
@@ -129,7 +129,7 @@ void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
 }

 template <typename Itype, typename Otype, int Kernel, int Pad, int Stride>
-int TestConvOp() {
+int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
  int kernel_h = Kernel;
  int kernel_w = Kernel;
  int pad_h = Pad;
@@ -140,10 +140,10 @@ int TestConvOp() {
  int dilation_w = 1;

  int batch_size = 1;
-  int input_c = 3;
-  int input_h = 100;
-  int input_w = 100;
-  int output_c = 10;
+  int input_c = in_channels;
+  int input_h = in_height;
+  int input_w = in_width;
+  int output_c = out_channels;
  framework::DDim input_shape =
      framework::make_ddim({batch_size, input_c, input_h, input_w});
  framework::DDim filter_shape =
@@ -158,7 +158,7 @@ int TestConvOp() {

  auto input_var = scope.get()->Var("input");
  auto input = input_var->template GetMutable<framework::LoDTensor>();
-  SetupTensor<Itype>(input, input_shape, -20, 20);
+  SetupTensor<Itype>(input, input_shape, -20.0, 20.0);

  auto filter_var = scope.get()->Var("filter");
  auto filter = filter_var->template GetMutable<framework::LoDTensor>();
@@ -174,8 +174,9 @@ int TestConvOp() {

  auto *op = new operators::ConvOp<CPU, float>("conv2d", inputs, outputs, attrs,
                                               scope);
-  //  struct timespec ts_begin, ts_end;
  op->InferShape();
+  op->Init();
+  //  struct timespec ts_begin, ts_end;
  // warmup
  //  op->Run();
  //  clock_gettime(CLOCK_MONOTONIC, &ts_begin);
@@ -202,7 +203,8 @@ int TestConvOp() {
  const Otype *output_data = output->data<Otype>();
  Otype *output_cmp_data = output_cmp.data<Otype>();
  for (int i = 0; i < output->numel(); ++i) {
-    PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
+    float gap = output_data[i] - output_cmp_data[i];
+    PADDLE_MOBILE_ENFORCE(std::abs(gap / output_data[i]) < 1e-3,
                          "output[%d] = %d, output_cmp[%d] = %d", i,
                          output_data[i], i, output_cmp_data[i]);
  }
@@ -212,68 +214,104 @@ int TestConvOp() {

 }  // namespace paddle_mobile

-int main() {
+int main(int argc, char *argv[]) {
+  if (argc < 5) {
+    LOG(paddle_mobile::kLOG_INFO)
+        << "Usage:\n"
+        << "  ./test-int8-conv-op in_channels in_height in_width out_channels\n"
+        << "  params:\n"
+        << "   -in_channels: int, input image's channels\n"
+        << "   -in_height: int, input image's height\n"
+        << "   -in_width: int, input image's width\n"
+        << "   -out_channels: int, conv output channels\n";
+    return 1;
+  }
+  int in_channels = atoi(argv[1]);
+  int in_height = atoi(argv[2]);
+  int in_width = atoi(argv[3]);
+  int out_channels = atoi(argv[4]);
+  // kernel = 3, pad = 1, stride = 1
+  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
+  paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
+                                                   in_width, out_channels);
+
  // kernel = 7, pad = 0, stride = 2
  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 2>();
+  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 2>(in_channels, in_height,
+                                                      in_width, out_channels);

  // kernel = 7, pad = 1, stride = 2
  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 2>();
+  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 2>(in_channels, in_height,
+                                                      in_width, out_channels);

  // kernel = 7, pad = 3, stride = 2
  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 2>();
+  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 2>(in_channels, in_height,
+                                                      in_width, out_channels);

  // kernel = 7, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 1>();
+  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 0, 1>(in_channels, in_height,
+                                                      in_width, out_channels);

  // kernel = 7, pad = 1, stride = 1
  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 1>();
+  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 1, 1>(in_channels, in_height,
+                                                      in_width, out_channels);

  // kernel = 7, pad = 3, stride = 1
  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 1>();
+  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 1>(in_channels, in_height,
+                                                      in_width, out_channels);

  // kernel = 7, pad = 5, stride = 3
  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 5, 3>();
+  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 5, 3>(in_channels, in_height,
+                                                      in_width, out_channels);

  // kernel = 7, pad = 3, stride = 4
  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 4>();
-  LOG(paddle_mobile::kLOG_INFO) << "\n";
+  paddle_mobile::TestConvOp<int8_t, int32_t, 7, 3, 4>(in_channels, in_height,
+                                                      in_width, out_channels);

  // kernel = 3, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>();
+  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 0, 1>(in_channels, in_height,
+                                                      in_width, out_channels);
+
  // kernel = 3, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=0, stride=1";
-  paddle_mobile::TestConvOp<float, float, 3, 0, 1>();
-  LOG(paddle_mobile::kLOG_INFO) << "\n";
+  paddle_mobile::TestConvOp<float, float, 3, 0, 1>(in_channels, in_height,
+                                                   in_width, out_channels);

  // kernel = 3, pad = 1, stride = 1
  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>();
+  paddle_mobile::TestConvOp<int8_t, int32_t, 3, 1, 1>(in_channels, in_height,
+                                                      in_width, out_channels);
+
  // kernel = 3, pad = 1, stride = 1
  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=3, pad=1, stride=1";
-  paddle_mobile::TestConvOp<float, float, 3, 1, 1>();
-  LOG(paddle_mobile::kLOG_INFO) << "\n";
+  paddle_mobile::TestConvOp<float, float, 3, 1, 1>(in_channels, in_height,
+                                                   in_width, out_channels);

  // kernel = 5, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 5, 0, 1>();
+  paddle_mobile::TestConvOp<int8_t, int32_t, 5, 0, 1>(in_channels, in_height,
+                                                      in_width, out_channels);
+
  // kernel = 5, pad = 0, stride = 1
  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=0, stride=1";
-  paddle_mobile::TestConvOp<float, float, 5, 0, 1>();
-  LOG(paddle_mobile::kLOG_INFO) << "\n";
+  paddle_mobile::TestConvOp<float, float, 5, 0, 1>(in_channels, in_height,
+                                                   in_width, out_channels);

  // kernel = 5, pad = 2, stride = 1
  LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
-  paddle_mobile::TestConvOp<int8_t, int32_t, 5, 2, 1>();
+  paddle_mobile::TestConvOp<int8_t, int32_t, 5, 2, 1>(in_channels, in_height,
+                                                      in_width, out_channels);
+
  // kernel = 5, pad = 2, stride = 1
  LOG(paddle_mobile::kLOG_INFO) << "float, kernel=5, pad=2, stride=1";
-  paddle_mobile::TestConvOp<float, float, 5, 2, 1>();
+  paddle_mobile::TestConvOp<float, float, 5, 2, 1>(in_channels, in_height,
+                                                   in_width, out_channels);
 }
--- a/test/operators/test_cov_op.cpp
+++ b/test/operators/test_cov_op.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "../test_include.h"
-#include "operators/conv_op.h"
-
-int main() {
-  paddle_mobile::Loader<paddle_mobile::GPU_MALI> loader;
-  //  ../models/image_classification_resnet.inference.model
-  auto program = loader.Load(g_googlenet);
-
-  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
-                        "program file read fail");
-
-  Executor4Test<paddle_mobile::GPU_MALI, paddle_mobile::operators::ConvOp<
-                                             paddle_mobile::GPU_MALI, float>>
-      executor(program, "conv2d");
-
-  paddle_mobile::framework::Tensor input;
-  GetInput<float>(g_test_image_1x3x224x224, &input, {1, 3, 224, 224});
-  //  // use SetupTensor if not has local input image .
-  //  SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
-  //                     static_cast<float>(1));
-
-  auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112});
-  auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim);
-
-  auto output_ptr = output->data<float>();
-  for (int j = 0; j < 20; ++j) {
-    DLOG << " value of output: " << output_ptr[j];
-  }
-  return 0;
-}