Unverified commit d14e57f7, authored by Leonardo-Ding, committed via GitHub

[ARM] optimize depthwise int8 f3s1 arm neon kernel,test=develop (#4125)

Parent 42cefe1b
The source diff for one file is too large to display; you can view the blob instead.
@@ -106,6 +106,42 @@ void conv_depthwise_3x3s1_int8(Dtype* dout,
                               int padh,
                               ARMContext* ctx);
+void conv_depthwise_3x3s1_int8_int8_impl(int8_t* dout,
+                                         const int8_t* din,
+                                         const int8_t* weights,
+                                         const float* scale,
+                                         const float* bias,
+                                         bool flag_bias,
+                                         int flag_act,
+                                         float* alpha,
+                                         int num,
+                                         int chin,
+                                         int hin,
+                                         int win,
+                                         int hout,
+                                         int wout,
+                                         int padw,
+                                         int padh,
+                                         ARMContext* ctx);
+
+void conv_depthwise_3x3s1_int8_float_impl(float* dout,
+                                          const int8_t* din,
+                                          const int8_t* weights,
+                                          const float* scale,
+                                          const float* bias,
+                                          bool flag_bias,
+                                          int flag_act,
+                                          float* alpha,
+                                          int num,
+                                          int chin,
+                                          int hin,
+                                          int win,
+                                          int hout,
+                                          int wout,
+                                          int padw,
+                                          int padh,
+                                          ARMContext* ctx);
+
template <typename Dtype>
void conv_depthwise_3x3s2_int8(Dtype* dout,
                               const int8_t* din,
......
@@ -814,24 +814,52 @@ void conv_depthwise_3x3_int8_fp32(const void* din,
      alpha[3] = local_alpha;
    }
  }
+ bool support_act_type = flag_act <= 1;
+ bool support_pad_type =
+     (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
+     (paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
+ bool support_stride_type = (param.strides[0] == 1 && param.strides[1] == 1);
+ bool support_width_type = w_in > 9 ? true : false;
  if (stride == 1) {
+   if (!support_act_type || !support_pad_type || !support_stride_type ||
+       !support_width_type) {
      conv_depthwise_3x3s1_int8(reinterpret_cast<float*>(dout),
                                reinterpret_cast<const int8_t*>(din),
                                reinterpret_cast<const int8_t*>(weights),
                                scale,
                                bias,
                                flag_bias,
                                flag_act,
                                alpha,
                                num,
                                ch_in,
                                h_in,
                                w_in,
                                h_out,
                                w_out,
                                pad_w,
                                pad_h,
                                ctx);
+   } else {
+     conv_depthwise_3x3s1_int8_float_impl(
+         reinterpret_cast<float*>(dout),
+         reinterpret_cast<const int8_t*>(din),
+         reinterpret_cast<const int8_t*>(weights),
+         scale,
+         bias,
+         flag_bias,
+         flag_act,
+         alpha,
+         num,
+         ch_in,
+         h_in,
+         w_in,
+         h_out,
+         w_out,
+         pad_w,
+         pad_h,
+         ctx);
+   }
  } else if (stride == 2) {
    conv_depthwise_3x3s2_int8(reinterpret_cast<float*>(dout),
                              reinterpret_cast<const int8_t*>(din),
@@ -897,24 +925,52 @@ void conv_depthwise_3x3_int8_int8(const void* din,
      alpha[3] = local_alpha;
    }
  }
+ bool support_act_type = flag_act <= 1;
+ bool support_pad_type =
+     (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
+     (paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
+ bool support_stride_type = (param.strides[0] == 1 && param.strides[1] == 1);
+ bool support_width_type = w_in > 9 ? true : false;
  if (stride == 1) {
+   if (!support_act_type || !support_pad_type || !support_stride_type ||
+       !support_width_type) {
      conv_depthwise_3x3s1_int8(reinterpret_cast<int8_t*>(dout),
                                reinterpret_cast<const int8_t*>(din),
                                reinterpret_cast<const int8_t*>(weights),
                                scale,
                                bias,
                                flag_bias,
                                flag_act,
                                alpha,
                                num,
                                ch_in,
                                h_in,
                                w_in,
                                h_out,
                                w_out,
                                pad_w,
                                pad_h,
                                ctx);
+   } else {
+     conv_depthwise_3x3s1_int8_int8_impl(
+         reinterpret_cast<int8_t*>(dout),
+         reinterpret_cast<const int8_t*>(din),
+         reinterpret_cast<const int8_t*>(weights),
+         scale,
+         bias,
+         flag_bias,
+         flag_act,
+         alpha,
+         num,
+         ch_in,
+         h_in,
+         w_in,
+         h_out,
+         w_out,
+         pad_w,
+         pad_h,
+         ctx);
+   }
  } else if (stride == 2) {
    conv_depthwise_3x3s2_int8(reinterpret_cast<int8_t*>(dout),
                              reinterpret_cast<const int8_t*>(din),
......
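Both dispatch sites above gate the new f3s1 kernels on the same four conditions: the activation is none or ReLU (flag_act <= 1), all four paddings are equal and either 0 or 1, both strides are 1, and the input width is greater than 9. A condensed sketch of that predicate follows; the helper name use_dw_3x3s1_int8_fast_path is illustrative only and is not part of the patch, which inlines the checks at each call site.

#include <vector>

// Illustrative summary of the fast-path check added above (not in the patch).
// flag_act: 0 = no activation, 1 = ReLU; anything else falls back to the
// generic conv_depthwise_3x3s1_int8 kernel.
static bool use_dw_3x3s1_int8_fast_path(int flag_act,
                                        const std::vector<int>& paddings,
                                        const std::vector<int>& strides,
                                        int w_in) {
  bool support_act = flag_act <= 1;
  bool support_pad = paddings[0] == paddings[1] &&
                     paddings[2] == paddings[3] &&
                     paddings[0] == paddings[2] &&
                     (paddings[0] == 0 || paddings[0] == 1);
  bool support_stride = strides[0] == 1 && strides[1] == 1;
  bool support_width = w_in > 9;
  return support_act && support_pad && support_stride && support_width;
}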
@@ -31,7 +31,6 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
  auto paddings = *param.paddings;
  // select dw conv kernel
  if (kw == 3) {
-   // VLOG(5) << "invoke 3x3 dw conv fp32";
    bool pads_less = ((paddings[1] < 2) && (paddings[3] < 2));
    if (pads_less && paddings[0] == paddings[2] &&
        (paddings[0] == 0 || paddings[0] == 1)) {
@@ -54,7 +53,6 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
    kernel_func_name_ = "conv_depthwise_3x3_fp32";
#endif
  } else if (kw == 5) {
-   // VLOG(5) << "invoke 5x5 dw conv fp32";
    auto strides = param.strides;
    if ((strides[0] == 1 && strides[1] == 1) ||
        (strides[0] == 2 && strides[1] == 2)) {
@@ -104,23 +102,44 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
      w_scale_[i] = scale[i] * in_scale;
    }
  }
+ auto paddings = *param.paddings;
+ auto strides = param.strides;
+ auto x_dims = param.x->dims();
+ int iw = x_dims[3];
+ int ih = x_dims[2];
+ auto act_param = param.activation_param;
+ bool has_act = act_param.has_active;
+ lite_api::ActivationType act_type = act_param.active_type;
+ // no activation and relu activation is supported now
+ bool support_act_type =
+     (has_act == false) ||
+     (has_act == true && act_type == lite_api::ActivationType::kRelu);
+ bool support_pad_type =
+     (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
+     (paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
+ bool support_stride_type = (strides[0] == 1 && strides[1] == 1);
+ bool support_width_type = iw > 9 ? true : false;
  /// select dw conv kernel
  if (kw == 3) {
    // trans weights
-   // VLOG(5) << "invoke 3x3 dw conv int8 kernel fp32 out";
    impl_ = lite::arm::math::conv_depthwise_3x3_int8_fp32;
#ifdef LITE_WITH_PROFILE
    kernel_func_name_ = "conv_depthwise_3x3_int8_fp32";
#endif
+   if (!support_act_type || !support_pad_type || !support_stride_type ||
+       !support_width_type) {
      int cround = ROUNDUP(w_dims[0], 8);
      weights_.Resize({cround / 8, 1, kh * kw, 8});
      auto wptr = param.filter->data<int8_t>();
      auto wptr_new = weights_.mutable_data<int8_t>();
      lite::arm::math::conv_trans_weights_numc(wptr, wptr_new, oc, 1, 8, 9);
      flag_trans_weights_ = true;
+   } else {
+     flag_trans_weights_ = false;
+   }
  } else if (kw == 5) {
    // trans weights
-   // VLOG(5) << "invoke 5x5 dw conv int8 kernel fp32 out";
    impl_ = lite::arm::math::conv_depthwise_5x5_int8_fp32;
#ifdef LITE_WITH_PROFILE
    kernel_func_name_ = "conv_depthwise_5x5_int8_fp32";
@@ -175,23 +194,45 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
    param.activation_param.Relu_clipped_coef =
        param.activation_param.Relu_clipped_coef / param.output_scale;
  }
+ auto paddings = *param.paddings;
+ auto strides = param.strides;
+ auto x_dims = param.x->dims();
+ int iw = x_dims[3];
+ int ih = x_dims[2];
+ auto act_param = param.activation_param;
+ bool has_act = act_param.has_active;
+ lite_api::ActivationType act_type = act_param.active_type;
+ // no activation and relu activation is supported now
+ bool support_act_type =
+     (has_act == false) ||
+     (has_act == true && act_type == lite_api::ActivationType::kRelu);
+ bool support_pad_type =
+     (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
+     (paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
+ bool support_stride_type = (strides[0] == 1 && strides[1] == 1);
+ bool support_width_type = iw > 9 ? true : false;
  /// select dw conv kernel
  if (kw == 3) {
    // trans weights
-   // VLOG(5) << "invoke 3x3 dw conv int8 kernel int8 out";
    impl_ = lite::arm::math::conv_depthwise_3x3_int8_int8;
#ifdef LITE_WITH_PROFILE
    kernel_func_name_ = "conv_depthwise_3x3_int8_int8";
#endif
+   if (!support_act_type || !support_pad_type || !support_stride_type ||
+       !support_width_type) {
      int cround = ROUNDUP(w_dims[0], 8);
      weights_.Resize({cround / 8, 1, kh * kw, 8});
      auto wptr = param.filter->data<int8_t>();
      auto wptr_new = weights_.mutable_data<int8_t>();
      lite::arm::math::conv_trans_weights_numc(wptr, wptr_new, oc, 1, 8, 9);
      flag_trans_weights_ = true;
+   } else {
+     flag_trans_weights_ = false;
+   }
  } else if (kw == 5) {
    // trans weights
-   // VLOG(5) << "invoke 5x5 dw conv int8 kernel int8 out";
    impl_ = lite::arm::math::conv_depthwise_5x5_int8_int8;
#ifdef LITE_WITH_PROFILE
    kernel_func_name_ = "conv_depthwise_5x5_int8_int8";
@@ -283,7 +324,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
  auto w_dims = param.filter->dims();
  auto o_dims = param.output->dims();
- int iw = x_dims[3];  // nchw
+ int iw = x_dims[3];
  int ih = x_dims[2];
  int ic = x_dims[1];
  int bs = x_dims[0];
@@ -333,7 +374,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
  auto w_dims = param.filter->dims();
  auto o_dims = param.output->dims();
- int iw = x_dims[3];  // nchw
+ int iw = x_dims[3];
  int ih = x_dims[2];
  int ic = x_dims[1];
  int bs = x_dims[0];
......
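In the two PrepareForRun hunks above, only the fallback path repacks the int8 depthwise filter into blocks of 8 output channels; when the f3s1 fast path is taken, the filter keeps its original layout and flag_trans_weights_ stays false. The shape arithmetic behind the fallback repack is sketched below as a standalone snippet; the channel count is an example value, not something from the patch.

#include <cstdio>

// Illustrative only: the shape math behind the fallback weight repack above.
// ROUNDUP(oc, 8) pads the output-channel count to a multiple of 8 so the
// generic NEON kernel can process 8 depthwise channels per block; the new
// f3s1 kernels consume the original filter layout and skip this step.
int main() {
  const int oc = 35;                      // example output-channel count
  const int kh = 3, kw = 3;               // 3x3 depthwise filter
  const int cround = ((oc + 7) / 8) * 8;  // ROUNDUP(oc, 8) -> 40
  // The fallback path resizes weights_ to {cround / 8, 1, kh * kw, 8} and
  // fills it with conv_trans_weights_numc(wptr, wptr_new, oc, 1, 8, 9).
  std::printf("repacked weight dims: {%d, 1, %d, 8}\n", cround / 8, kh * kw);
  return 0;
}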
@@ -125,7 +125,7 @@ void release_param(ConvParam* param) {
#ifdef LITE_WITH_ARM
#include "lite/backends/arm/math/funcs.h"
-void test_conv_int8(const std::vector<DDim>& input_dims,
+void test_conv_int8(const DDim& dim_in,
                    const DDim& weight_dim,
                    int group,
                    const std::vector<int>& strides,
@@ -237,241 +237,234 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
    conv_int8_fp32.SetContext(std::move(ctx2));
    /// set param and context
-   for (auto& dim_in : input_dims) {
    param_int8_out.x->Resize(dim_in);
    DDim out_tmp_dims = compute_out_dim(dim_in, param_int8_out);
    if (out_tmp_dims[2] < 1 || out_tmp_dims[3] < 1) {
-     continue;
+     return;
    }
    param_fp32_out.x->Resize(dim_in);
    param_int8_out.output->Resize(out_tmp_dims);
    param_fp32_out.output->Resize(out_tmp_dims);
-     break;
-   }
    conv_int8_int8.SetParam(param_int8_out);
    conv_int8_fp32.SetParam(param_fp32_out);
    /// prepare for run
    conv_int8_int8.PrepareForRun();
    conv_int8_fp32.PrepareForRun();
-   for (auto& dim_in : input_dims) {
    CHECK_EQ(weight_dim[1] * group, dim_in[1])
        << "input channel must equal to weights channel";
    DDim dim_out = compute_out_dim(dim_in, param_int8_out);
    if (dim_out[2] < 1 || dim_out[3] < 1) {
      continue;
    }
    delete param_fp32_out.output;
    param_fp32_out.output = new Tensor;
    param_fp32_out.output->set_precision(PRECISION(kFloat));
    delete param_int8_out.output;
    param_int8_out.output = new Tensor;
    param_int8_out.output->set_precision(PRECISION(kInt8));

    param_int8_out.x->Resize(dim_in);
    param_int8_out.output->Resize(dim_out);
    param_fp32_out.x->Resize(dim_in);
    param_fp32_out.output->Resize(dim_out);

    Tensor tin_fp32;
    tin_fp32.Resize(dim_in);
    tin_fp32.set_precision(PRECISION(kFloat));
    Tensor tout_basic_fp32;
    Tensor tout_basic_int8;

    paddle::lite::fill_tensor_rand(*param_int8_out.x, -127, 127);
    param_fp32_out.x->CopyDataFrom(*param_int8_out.x);

    auto din_fp32 = tin_fp32.mutable_data<float>();
    paddle::lite::arm::math::int8_to_fp32(param_int8_out.x->data<int8_t>(),
                                          din_fp32,
                                          scale_in.data(),
                                          1,
                                          1,
                                          dim_in.production());

    if (FLAGS_check_result) {
      tout_basic_fp32.set_precision(PRECISION(kFloat));
      tout_basic_fp32.Resize(dim_out);
      tout_basic_int8.set_precision(PRECISION(kInt8));
      tout_basic_int8.Resize(dim_out);
      fill_tensor_const(tout_basic_fp32, 0.f);
      auto dout_basic_fp32 = tout_basic_fp32.mutable_data<float>();
      auto dout_basic_int8 = tout_basic_int8.mutable_data<int8_t>();
      conv_basic<float, float>(din_fp32,
                               dout_basic_fp32,
                               dim_in[0],
                               dim_out[1],
                               dim_out[2],
                               dim_out[3],
                               dim_in[1],
                               dim_in[2],
                               dim_in[3],
                               wptr_fp32,
                               bptr_fp32,
                               group,
                               weight_dim[3],
                               weight_dim[2],
                               strides[1],
                               strides[0],
                               dilas[1],
                               dilas[0],
                               pads[2],
                               pads[0],
                               flag_bias,
                               flag_act,
                               six,
                               alpha);
      paddle::lite::arm::math::fp32_to_int8(dout_basic_fp32,
                                            dout_basic_int8,
                                            scale_out.data(),
                                            1,
                                            1,
                                            dim_out.production());
    }
    double gops = 2.0 * dim_out.production() * dim_in[1] * weight_dim[2] *
                  weight_dim[3] / group;
    /// warm up
    for (int i = 0; i < FLAGS_warmup; ++i) {
-     conv_int8_int8.Launch();
+     conv_int8_fp32.Launch();
    }
    /// compute fp32 output
    Timer t0;
    for (int i = 0; i < FLAGS_repeats; ++i) {
      t0.Start();
      conv_int8_fp32.Launch();
      t0.Stop();
    }
    LOG(INFO) << "int8 conv, fp32 output: output shape" << dim_out
              << ",running time, avg: " << t0.LapTimes().Avg() << " ms"
              << ", min time: " << t0.LapTimes().Min() << " ms"
              << ", total GOPS: " << 1e-9 * gops
              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
              << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();

    // compute int8 output
    t0.Reset();
    for (int i = 0; i < FLAGS_repeats; ++i) {
      t0.Start();
      conv_int8_int8.Launch();
      t0.Stop();
    }
    LOG(INFO) << "int8 conv, int8 output: output shape" << dim_out
              << ",running time, avg: " << t0.LapTimes().Avg()
              << ", min time: " << t0.LapTimes().Min()
              << ", total GOPS: " << 1e-9 * gops
              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
              << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();

    /// compare result fp32 output
    if (FLAGS_check_result) {
      double max_ratio = 0;
      double max_diff = 0;
      tensor_cmp_host(
          tout_basic_fp32, *param_fp32_out.output, max_ratio, max_diff);
      LOG(INFO) << "FP32 compare result, max diff: " << max_diff
                << ", max ratio: " << max_ratio;
      if (std::abs(max_ratio) > 1e-5f) {
        if (max_diff > 5e-5f) {
          LOG(WARNING) << "basic result";
          print_tensor(tout_basic_fp32);
          LOG(WARNING) << "lite result";
          print_tensor(*param_fp32_out.output);
          Tensor tdiff;
          tdiff.Resize(tout_basic_fp32.dims());
          tdiff.set_precision(PRECISION(kFloat));
          tensor_diff(tout_basic_fp32, *param_fp32_out.output, tdiff);
          print_tensor(tdiff);
          release_param(&param_int8_out);
          release_param(&param_fp32_out);
          LOG(FATAL) << "test int8 conv, fp32 out: input: " << dim_in
                     << ", output: " << dim_out
                     << ", weight dim: " << weight_dim << ", pad: " << pads[0]
                     << ", " << pads[1] << ", " << pads[2] << ", " << pads[3]
                     << ", stride: " << strides[0] << ", " << strides[1]
                     << ", dila_: " << dilas[0] << ", " << dilas[1]
                     << ", group: " << group
                     << ", bias: " << (flag_bias ? "true" : "false")
                     << ", act: " << flag_act << ", threads: " << th
                     << ", power_mode: " << cls << " failed!!\n";
        }
      }
    }
    // compare result int8 output
    if (FLAGS_check_result) {
      double max_ratio = 0;
      double max_diff = 0;
      // ! int8
      tensor_cmp_host(
          tout_basic_int8, *param_int8_out.output, max_ratio, max_diff);
      LOG(INFO) << "int8 compare result, max diff: " << max_diff
                << ", max ratio: " << max_ratio;
      if (fabs(max_diff) > 0) {
        Tensor tdiff;
        tdiff.Resize(tout_basic_int8.dims());
        tdiff.set_precision(PRECISION(kInt8));
        tensor_diff(tout_basic_int8, *param_int8_out.output, tdiff);
        auto ptr = tdiff.data<int8_t>();
        auto ptr_basic_fp32 = tout_basic_fp32.data<float>();
        float count = 0;
        bool check = true;
        for (int i = 0; i < tdiff.numel(); ++i) {
          if (abs(ptr[i]) > 1) {
            check = false;
            LOG(ERROR) << "basic float data: " << ptr_basic_fp32[i]
                       << ", after scale: "
                       << ptr_basic_fp32[i] / scale_out[0];
            break;
          }
          if (ptr[i] != 0) {
            LOG(ERROR) << "basic float data: " << ptr_basic_fp32[i]
                       << ", after scale: "
                       << ptr_basic_fp32[i] / scale_out[0];
            count += 1;
          }
        }
        check = check &&
                count < std::max(10, static_cast<int>(0.01 * tdiff.numel()));
        if (!check) {
          LOG(WARNING) << "int8 basic result";
          print_tensor(tout_basic_int8);
          LOG(WARNING) << "int8 lite result";
          print_tensor(*param_int8_out.output);
          LOG(WARNING) << "int8 diff tensor";
          print_tensor(tdiff);
          release_param(&param_int8_out);
          release_param(&param_fp32_out);
          LOG(FATAL) << "test int8 conv, int8 out: input: " << dim_in
                     << ", output: " << dim_out
                     << ", weight dim: " << weight_dim << ", pad: " << pads[0]
                     << ", " << pads[1] << ", " << pads[2] << ", " << pads[3]
                     << ", stride: " << strides[0] << ", " << strides[1]
                     << ", dila_: " << dilas[0] << ", " << dilas[1]
                     << ", bias: " << (flag_bias ? "true" : "false")
                     << ", act: " << flag_act << ", threads: " << th
                     << ", power_mode: " << cls << " failed!!\n";
        }
      }
    }
    LOG(INFO) << "test int8 conv: input: " << dim_in
              << ", output: " << dim_out << ", weight dim: " << weight_dim
              << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2]
              << ", " << pads[3] << ", stride: " << strides[0] << ", "
              << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1]
              << ", bias: " << (flag_bias ? "true" : "false")
              << ", act: " << flag_act << ", threads: " << th
              << ", power_mode: " << cls << " successed!!\n";
-     }
    }
  }
  release_param(&param_int8_out);
  release_param(&param_fp32_out);
}
#else
-void test_conv_int8(const std::vector<DDim>& input_dims,
+void test_conv_int8(const DDim& dims_in,
                    const DDim& weight_dim,
                    int group,
                    const std::vector<int>& strides,
@@ -493,25 +486,24 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
  for (auto& flag_bias : {false, true}) {
    for (auto& flag_act : {0, 1, 2, 4}) {
      for (auto& c : {1, 3, 5, 8, 16, 32}) {
-       std::vector<DDim> dims;
        DDim weights_dim({c, 1, 3, 3});
        for (auto& batch : {1, 2}) {
          for (auto& h : {1, 3, 15, 33}) {
-           dims.push_back(DDim({batch, c, h, h}));
+           DDim dims({batch, c, h, h});
+           test_conv_int8(dims,
+                          weights_dim,
+                          c,
+                          {stride, stride},
+                          {pad, pad, pad, pad},
+                          {1, 1},
+                          flag_bias,
+                          flag_act,
+                          {FLAGS_threads},
+                          {FLAGS_power_mode},
+                          FLAGS_clipped_coef,
+                          FLAGS_leakey_relu_alpha);
          }
        }
-       test_conv_int8(dims,
-                      weights_dim,
-                      c,
-                      {stride, stride},
-                      {pad, pad, pad, pad},
-                      {1, 1},
-                      flag_bias,
-                      flag_act,
-                      {4},
-                      {FLAGS_power_mode},
-                      FLAGS_clipped_coef,
-                      FLAGS_leakey_relu_alpha);
      }
    }
  }
@@ -529,25 +521,24 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
  for (auto& flag_bias : {false, true}) {
    for (auto& flag_act : {0, 1, 2, 4}) {
      for (auto& c : {1, 5, 15, 33}) {
-       std::vector<DDim> dims;
        DDim weights_dim({c, 1, 5, 5});
        for (auto& batch : {1, 2}) {
          for (auto& h : {1, 3, 15, 33, 112, 224}) {
-           dims.push_back(DDim({batch, c, h, h}));
+           DDim dims({batch, c, h, h});
            test_conv_int8(dims,
                           weights_dim,
                           c,
                           {stride, stride},
                           {pad, pad, pad, pad},
                           {1, 1},
                           flag_bias,
                           flag_act,
                           {1, 4},
                           {FLAGS_power_mode},
                           FLAGS_clipped_coef,
                           FLAGS_leakey_relu_alpha);
          }
        }
      }
    }
  }
@@ -565,28 +556,27 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) {
  for (auto& g : {1, 2}) {
    for (auto& flag_bias : {false, true}) {
      for (auto& flag_act : {0, 1, 2, 4}) {
-       std::vector<DDim> dims;
        if (cin % g != 0 || cout % g != 0) {
          continue;
        }
        DDim weights_dim({cout, cin / g, 1, 1});
        for (auto& batch : {1, 2}) {
          for (auto& h : {1, 9, 16, 33}) {
-           dims.push_back(DDim({batch, cin, h, h}));
+           DDim dims({batch, cin, h, h});
            test_conv_int8(dims,
                           weights_dim,
                           g,
                           {1, 1},
                           {0, 0, 0, 0},
                           {1, 1},
                           flag_bias,
                           flag_act,
                           {4},
                           {FLAGS_power_mode},
                           FLAGS_clipped_coef,
                           FLAGS_leakey_relu_alpha);
          }
        }
      }
    }
  }
@@ -606,29 +596,29 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
  for (auto& pad_left : {1, 2}) {
    for (auto& pad_right : {1, 2}) {
      for (auto& flag_bias : {false, true}) {
-       for (auto& flag_act : {0, 1, 2, 4}) {
+       for (auto& flag_act : {0, 1}) {
-         std::vector<DDim> dims;
          DDim weights_dim({cout, cin, 3, 3});
          for (auto& batch : {1, 2}) {
            for (auto& h : {1, 7, 17, 33}) {
-             dims.push_back(DDim({batch, cin, h, h}));
+             DDim dims({batch, cin, h, h});
              if (cin == 1 && cout == 1) {
                continue;
              }
              test_conv_int8(
                  dims,
                  weights_dim,
                  1,
                  {1, 1},
                  {pad_top, pad_bottom, pad_left, pad_right},
                  {1, 1},
                  flag_bias,
                  flag_act,
                  {4},
                  {FLAGS_power_mode},
                  FLAGS_clipped_coef,
                  FLAGS_leakey_relu_alpha);
            }
          }
        }
      }
    }
  }
@@ -652,25 +642,25 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
  for (auto& pad_right : {1, 2}) {
    for (auto& flag_bias : {false, true}) {
      for (auto& flag_act : {0, 1, 2, 4}) {
-       std::vector<DDim> dims;
        DDim weights_dim({cout, cin, 3, 3});
        for (auto& batch : {1, 2}) {
          for (auto& h : {1, 7, 19, 33}) {
-           dims.push_back(DDim({batch, cin, h, h}));
+           DDim dims({batch, cin, h, h});
            test_conv_int8(
                dims,
                weights_dim,
                1,
                {2, 2},
                {pad_top, pad_bottom, pad_left, pad_right},
                {1, 1},
                flag_bias,
                flag_act,
                {4},
                {FLAGS_power_mode},
                FLAGS_clipped_coef,
                FLAGS_leakey_relu_alpha);
          }
        }
      }
    }
  }
@@ -702,26 +692,27 @@ TEST(TestConvRandInt8, test_conv_rand) {
        if (cin % g != 0 || cout % g != 0) {
          break;
        }
-       std::vector<DDim> dims;
        DDim weights_dim({cout, cin / g, kh, kw});
        for (auto& batch : {1, 2}) {
          for (auto& h : {1, 3, 5, 19}) {
-           dims.push_back(DDim({batch, cin, h, h}));
+           DDim dims({batch, cin, h, h});
            test_conv_int8(
                dims,
                weights_dim,
                g,
                {stride, stride},
                {pad_top, pad_bottom, pad_left, pad_right},
                {dila, dila},
                flag_bias,
                flag_act,
                {4},
                {FLAGS_power_mode},
                FLAGS_clipped_coef,
                FLAGS_leakey_relu_alpha);
          }
        }
      }
    }
  }
......
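After the refactor above, test_conv_int8 takes one input shape per call instead of a vector of shapes, and each TEST case drives it once per generated DDim. A minimal usage sketch for a 3x3 depthwise case follows; the concrete shape and flag values are illustrative, and the parameter names in the comments are inferred from the call sites in this file rather than taken from the patch.

// Illustrative call of the refactored helper: one DDim per invocation.
DDim dim_in({1, 32, 112, 112});   // example NCHW input shape
DDim weights_dim({32, 1, 3, 3});  // depthwise 3x3 filter, one per channel
test_conv_int8(dim_in,
               weights_dim,
               32,                  // group == channel count for depthwise
               {1, 1},              // strides
               {1, 1, 1, 1},        // paddings
               {1, 1},              // dilations
               true,                // flag_bias
               1,                   // flag_act: ReLU
               {4},                 // thread counts to sweep
               {FLAGS_power_mode},  // power modes to sweep
               FLAGS_clipped_coef,
               FLAGS_leakey_relu_alpha);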