Unverified commit 9e361a4d, authored by Y yiicy, committed by GitHub

[ARM] int8 direct_conv, dw_conv add relu6 and leaky relu fusion, test=develop (#3737)

int8 direct_conv, dw_conv add relu6 and leaky relu fusion
Parent cba42f0d
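
The kernels' old `bool flag_relu` argument is replaced by an `int flag_act` plus a four-lane `float* alpha`, so relu6 and leaky relu can be fused the same way relu already was. A minimal stand-alone sketch of that encoding, using local stand-in types for `operators::ActivationParam` / `lite_api::ActivationType` (the real code reads those directly, as the hunks below show):

// Stand-in types; the production code reads operators::ActivationParam.
enum class Act { kNone, kRelu, kRelu6, kLeakyRelu };

struct ActParam {
  bool has_active{false};
  Act active_type{Act::kNone};
  float Relu_clipped_coef{6.f};  // relu6 clip threshold
  float Leaky_relu_alpha{0.f};   // leaky-relu slope
};

// Returns flag_act (0: none, 1: relu, 2: relu6, 3: leaky relu) and broadcasts
// the coefficient into alpha[0..3] so the NEON kernels can load it as one
// 128-bit vector.
inline int encode_act(const ActParam& p, float alpha[4]) {
  int flag_act = 0;
  float coef = 0.f;
  if (p.has_active) {
    if (p.active_type == Act::kRelu) {
      flag_act = 1;
    } else if (p.active_type == Act::kRelu6) {
      flag_act = 2;
      coef = p.Relu_clipped_coef;
    } else if (p.active_type == Act::kLeakyRelu) {
      flag_act = 3;
      coef = p.Leaky_relu_alpha;
    }
  }
  for (int i = 0; i < 4; ++i) alpha[i] = coef;
  return flag_act;
}

For relu6 the alpha lanes carry the clip threshold, for leaky relu the negative-slope coefficient; plain relu and "no activation" leave alpha at zero.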
@@ -36,7 +36,8 @@ void conv_depthwise_3x3s1_int8(Dtype* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
@@ -434,7 +435,8 @@ void conv_depthwise_3x3s1_int8(Dtype* dout,
 chout,
 hout,
 wout,
-flag_relu,
+flag_act,
+alpha,
 bias_local,
 flag_bias,
 ptr_write,
@@ -450,7 +452,8 @@ template void conv_depthwise_3x3s1_int8<int8_t>(int8_t* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
@@ -467,7 +470,8 @@ template void conv_depthwise_3x3s1_int8<float>(float* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
......
@@ -42,8 +42,30 @@ void conv_3x3s1_direct_int8(const int8_t* din,
 Context<TARGET(kARM)>* ctx,
 const float* scale) {
 auto paddings = *param.paddings;
-bool flag_relu = param.fuse_relu;
 bool flag_bias = param.bias;
+auto act_param = param.activation_param;
+auto act_type = act_param.active_type;
+int flag_act = 0; // relu: 1, relu6: 2, leakey: 3
+float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+if (act_param.has_active) {
+  if (act_type == lite_api::ActivationType::kRelu) {
+    flag_act = 1;
+  } else if (act_type == lite_api::ActivationType::kRelu6) {
+    flag_act = 2;
+    float local_alpha = act_param.Relu_clipped_coef;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+    flag_act = 3;
+    float local_alpha = act_param.Leaky_relu_alpha;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  }
+}
 int pad_h = paddings[0];
 int pad_w = paddings[2];
@@ -442,7 +464,8 @@ void conv_3x3s1_direct_int8(const int8_t* din,
 chout,
 hout,
 wout,
-flag_relu,
+flag_act,
+alpha,
 bias_local,
 flag_bias,
 ptr_write,
......
@@ -36,7 +36,8 @@ void conv_depthwise_3x3s2_int8(Dtype* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
@@ -447,7 +448,8 @@ void conv_depthwise_3x3s2_int8(Dtype* dout,
 chout,
 hout,
 wout,
-flag_relu,
+flag_act,
+alpha,
 bias_local,
 flag_bias,
 ptr_write,
@@ -463,7 +465,8 @@ template void conv_depthwise_3x3s2_int8<int8_t>(int8_t* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
@@ -480,7 +483,8 @@ template void conv_depthwise_3x3s2_int8<float>(float* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
......
@@ -47,8 +47,30 @@ void conv_3x3s2_direct_int8(const int8_t* din,
 //! prepack input to tmp buffer
 //! write output to tmp buffer
 auto paddings = *param.paddings;
-bool flag_relu = param.fuse_relu;
 bool flag_bias = param.bias;
+auto act_param = param.activation_param;
+auto act_type = act_param.active_type;
+int flag_act = 0; // relu: 1, relu6: 2, leakey: 3
+float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+if (act_param.has_active) {
+  if (act_type == lite_api::ActivationType::kRelu) {
+    flag_act = 1;
+  } else if (act_type == lite_api::ActivationType::kRelu6) {
+    flag_act = 2;
+    float local_alpha = act_param.Relu_clipped_coef;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+    flag_act = 3;
+    float local_alpha = act_param.Leaky_relu_alpha;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  }
+}
 int pad_h = paddings[0];
 int pad_w = paddings[2];
@@ -442,7 +464,8 @@ void conv_3x3s2_direct_int8(const int8_t* din,
 chout,
 hout,
 wout,
-flag_relu,
+flag_act,
+alpha,
 bias_local,
 flag_bias,
 ptr_write,
@@ -474,8 +497,30 @@ void conv_3x3s2_direct_int8(const int8_t* din,
 //! prepack input to tmp buffer
 //! write output to tmp buffer
 auto paddings = *param.paddings;
-bool flag_relu = param.fuse_relu;
 bool flag_bias = param.bias;
+auto act_param = param.activation_param;
+auto act_type = act_param.active_type;
+int flag_act = 0; // relu: 1, relu6: 2, leakey: 3
+float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+if (act_param.has_active) {
+  if (act_type == lite_api::ActivationType::kRelu) {
+    flag_act = 1;
+  } else if (act_type == lite_api::ActivationType::kRelu6) {
+    flag_act = 2;
+    float local_alpha = act_param.Relu_clipped_coef;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+    flag_act = 3;
+    float local_alpha = act_param.Leaky_relu_alpha;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  }
+}
 int pad_h = paddings[0];
 int pad_w = paddings[2];
 const int threads = ctx->threads();
@@ -698,7 +743,8 @@ void conv_3x3s2_direct_int8(const int8_t* din,
 chout,
 hout,
 wout,
-flag_relu,
+flag_act,
+alpha,
 bias_local,
 flag_bias,
 ptr_write,
......
@@ -36,7 +36,8 @@ void conv_depthwise_5x5s1_int8(Dtype* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
@@ -726,7 +727,8 @@ void conv_depthwise_5x5s1_int8(Dtype* dout,
 chout,
 hout,
 wout,
-flag_relu,
+flag_act,
+alpha,
 bias_local,
 flag_bias,
 ptr_write,
@@ -742,7 +744,8 @@ template void conv_depthwise_5x5s1_int8<int8_t>(int8_t* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
@@ -759,7 +762,8 @@ template void conv_depthwise_5x5s1_int8<float>(float* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
......
@@ -36,7 +36,8 @@ void conv_depthwise_5x5s2_int8(Dtype* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
@@ -746,7 +747,8 @@ void conv_depthwise_5x5s2_int8(Dtype* dout,
 chout,
 hout,
 wout,
-flag_relu,
+flag_act,
+alpha,
 bias_local,
 flag_bias,
 ptr_write,
@@ -762,7 +764,8 @@ template void conv_depthwise_5x5s2_int8<int8_t>(int8_t* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
@@ -779,7 +782,8 @@ template void conv_depthwise_5x5s2_int8<float>(float* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
......
@@ -94,7 +94,8 @@ void conv_depthwise_3x3s1_int8(Dtype* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
@@ -112,7 +113,8 @@ void conv_depthwise_3x3s2_int8(Dtype* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
@@ -178,7 +180,8 @@ void conv_depthwise_5x5s1_int8(Dtype* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
@@ -196,7 +199,8 @@ void conv_depthwise_5x5s2_int8(Dtype* dout,
 const float* scale,
 const float* bias,
 bool flag_bias,
-bool flag_relu,
+int flag_act,
+float* alpha,
 int num,
 int chin,
 int hin,
......
@@ -790,8 +790,30 @@ void conv_depthwise_3x3_int8_fp32(const void* din,
 int pad_h = paddings[0];
 int pad_w = paddings[2];
 int stride = param.strides[1];
-bool flag_relu = param.fuse_relu;
 bool flag_bias = param.bias != nullptr;
+auto act_param = param.activation_param;
+auto act_type = act_param.active_type;
+int flag_act = 0; // relu: 1, relu6: 2, leakey: 3
+float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+if (act_param.has_active) {
+  if (act_type == lite_api::ActivationType::kRelu) {
+    flag_act = 1;
+  } else if (act_type == lite_api::ActivationType::kRelu6) {
+    flag_act = 2;
+    float local_alpha = act_param.Relu_clipped_coef;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+    flag_act = 3;
+    float local_alpha = act_param.Leaky_relu_alpha;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  }
+}
 if (stride == 1) {
 conv_depthwise_3x3s1_int8(reinterpret_cast<float*>(dout),
 reinterpret_cast<const int8_t*>(din),
@@ -799,7 +821,8 @@ void conv_depthwise_3x3_int8_fp32(const void* din,
 scale,
 bias,
 flag_bias,
-flag_relu,
+flag_act,
+alpha,
 num,
 ch_in,
 h_in,
@@ -816,7 +839,8 @@ void conv_depthwise_3x3_int8_fp32(const void* din,
 scale,
 bias,
 flag_bias,
-flag_relu,
+flag_act,
+alpha,
 num,
 ch_in,
 h_in,
@@ -849,8 +873,30 @@ void conv_depthwise_3x3_int8_int8(const void* din,
 int pad_h = paddings[0];
 int pad_w = paddings[2];
 int stride = param.strides[1];
-bool flag_relu = param.fuse_relu;
 bool flag_bias = param.bias != nullptr;
+auto act_param = param.activation_param;
+auto act_type = act_param.active_type;
+int flag_act = 0; // relu: 1, relu6: 2, leakey: 3
+float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+if (act_param.has_active) {
+  if (act_type == lite_api::ActivationType::kRelu) {
+    flag_act = 1;
+  } else if (act_type == lite_api::ActivationType::kRelu6) {
+    flag_act = 2;
+    float local_alpha = act_param.Relu_clipped_coef;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+    flag_act = 3;
+    float local_alpha = act_param.Leaky_relu_alpha;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  }
+}
 if (stride == 1) {
 conv_depthwise_3x3s1_int8(reinterpret_cast<int8_t*>(dout),
 reinterpret_cast<const int8_t*>(din),
@@ -858,7 +904,8 @@ void conv_depthwise_3x3_int8_int8(const void* din,
 scale,
 bias,
 flag_bias,
-flag_relu,
+flag_act,
+alpha,
 num,
 ch_in,
 h_in,
@@ -875,7 +922,8 @@ void conv_depthwise_3x3_int8_int8(const void* din,
 scale,
 bias,
 flag_bias,
-flag_relu,
+flag_act,
+alpha,
 num,
 ch_in,
 h_in,
@@ -908,8 +956,30 @@ void conv_depthwise_5x5_int8_fp32(const void* din,
 int pad_h = paddings[0];
 int pad_w = paddings[2];
 int stride = param.strides[1];
-bool flag_relu = param.fuse_relu;
 bool flag_bias = param.bias != nullptr;
+auto act_param = param.activation_param;
+auto act_type = act_param.active_type;
+int flag_act = 0; // relu: 1, relu6: 2, leakey: 3
+float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+if (act_param.has_active) {
+  if (act_type == lite_api::ActivationType::kRelu) {
+    flag_act = 1;
+  } else if (act_type == lite_api::ActivationType::kRelu6) {
+    flag_act = 2;
+    float local_alpha = act_param.Relu_clipped_coef;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+    flag_act = 3;
+    float local_alpha = act_param.Leaky_relu_alpha;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  }
+}
 if (stride == 1) {
 conv_depthwise_5x5s1_int8(reinterpret_cast<float*>(dout),
 reinterpret_cast<const int8_t*>(din),
@@ -917,7 +987,8 @@ void conv_depthwise_5x5_int8_fp32(const void* din,
 scale,
 bias,
 flag_bias,
-flag_relu,
+flag_act,
+alpha,
 num,
 ch_in,
 h_in,
@@ -934,7 +1005,8 @@ void conv_depthwise_5x5_int8_fp32(const void* din,
 scale,
 bias,
 flag_bias,
-flag_relu,
+flag_act,
+alpha,
 num,
 ch_in,
 h_in,
@@ -967,8 +1039,30 @@ void conv_depthwise_5x5_int8_int8(const void* din,
 int pad_h = paddings[0];
 int pad_w = paddings[2];
 int stride = param.strides[1];
-bool flag_relu = param.fuse_relu;
 bool flag_bias = param.bias != nullptr;
+auto act_param = param.activation_param;
+auto act_type = act_param.active_type;
+int flag_act = 0; // relu: 1, relu6: 2, leakey: 3
+float alpha[4] = {0.f, 0.f, 0.f, 0.f};
+if (act_param.has_active) {
+  if (act_type == lite_api::ActivationType::kRelu) {
+    flag_act = 1;
+  } else if (act_type == lite_api::ActivationType::kRelu6) {
+    flag_act = 2;
+    float local_alpha = act_param.Relu_clipped_coef;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  } else if (act_type == lite_api::ActivationType::kLeakyRelu) {
+    flag_act = 3;
+    float local_alpha = act_param.Leaky_relu_alpha;
+    alpha[0] = local_alpha;
+    alpha[1] = local_alpha;
+    alpha[2] = local_alpha;
+    alpha[3] = local_alpha;
+  }
+}
 if (stride == 1) {
 conv_depthwise_5x5s1_int8(reinterpret_cast<int8_t*>(dout),
 reinterpret_cast<const int8_t*>(din),
@@ -976,7 +1070,8 @@ void conv_depthwise_5x5_int8_int8(const void* din,
 scale,
 bias,
 flag_bias,
-flag_relu,
+flag_act,
+alpha,
 num,
 ch_in,
 h_in,
@@ -993,7 +1088,8 @@ void conv_depthwise_5x5_int8_int8(const void* din,
 scale,
 bias,
 flag_bias,
-flag_relu,
+flag_act,
+alpha,
 num,
 ch_in,
 h_in,
......
@@ -534,18 +534,18 @@ inline void gemm_int8_kernel(const int8_t* a_ptr,
 "fmin v17.4s, v17.4s, v1.4s\n" /* relu6 */ \
 "fmin v18.4s, v18.4s, v1.4s\n" /* relu6 */ \
 "fmin v19.4s, v19.4s, v1.4s\n" /* relu6 */ \
-"fmin v20.4s, v20.4s, v0.4s\n" /* relu6 */ \
-"fmin v21.4s, v21.4s, v0.4s\n" /* relu6 */ \
-"fmin v22.4s, v22.4s, v0.4s\n" /* relu6 */ \
-"fmin v23.4s, v23.4s, v0.4s\n" /* relu6 */ \
-"fmin v24.4s, v24.4s, v0.4s\n" /* relu6 */ \
-"fmin v25.4s, v25.4s, v0.4s\n" /* relu6 */ \
-"fmin v26.4s, v26.4s, v0.4s\n" /* relu6 */ \
-"fmin v27.4s, v27.4s, v0.4s\n" /* relu6 */ \
-"fmin v28.4s, v28.4s, v0.4s\n" /* relu6 */ \
-"fmin v29.4s, v29.4s, v0.4s\n" /* relu6 */ \
-"fmin v30.4s, v30.4s, v0.4s\n" /* relu6 */ \
-"fmin v31.4s, v31.4s, v0.4s\n" /* relu6 */ \
+"fmin v20.4s, v20.4s, v1.4s\n" /* relu6 */ \
+"fmin v21.4s, v21.4s, v1.4s\n" /* relu6 */ \
+"fmin v22.4s, v22.4s, v1.4s\n" /* relu6 */ \
+"fmin v23.4s, v23.4s, v1.4s\n" /* relu6 */ \
+"fmin v24.4s, v24.4s, v1.4s\n" /* relu6 */ \
+"fmin v25.4s, v25.4s, v1.4s\n" /* relu6 */ \
+"fmin v26.4s, v26.4s, v1.4s\n" /* relu6 */ \
+"fmin v27.4s, v27.4s, v1.4s\n" /* relu6 */ \
+"fmin v28.4s, v28.4s, v1.4s\n" /* relu6 */ \
+"fmin v29.4s, v29.4s, v1.4s\n" /* relu6 */ \
+"fmin v30.4s, v30.4s, v1.4s\n" /* relu6 */ \
+"fmin v31.4s, v31.4s, v1.4s\n" /* relu6 */ \
 "b 9f \n" /* relu end */
 #define GEMM_INT8_LEAKY_RELU \
......
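
The GEMM_INT8_RELU6 block above clamps each fp32 accumulator register against the broadcast relu6 threshold held in v1. A scalar sketch of the fused activations this epilogue applies (illustrative only, not the production NEON path; flag_act follows the kernel convention 1/2/3):

#include <algorithm>

// Scalar reference for the fused activation applied to n fp32 accumulators.
inline void apply_act(float* acc, int n, int flag_act, const float alpha[4]) {
  for (int i = 0; i < n; ++i) {
    float v = acc[i];
    if (flag_act == 1) {         // relu
      v = std::max(v, 0.f);
    } else if (flag_act == 2) {  // relu6: clamp to [0, alpha[0]]
      v = std::min(std::max(v, 0.f), alpha[0]);
    } else if (flag_act == 3) {  // leaky relu
      v = v > 0.f ? v : v * alpha[0];
    }
    acc[i] = v;
  }
}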
@@ -169,6 +169,12 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
 }
 flag_trans_bias_ = true;
 }
+//! update relu6 parameter
+if (param.activation_param.has_active &&
+    param.activation_param.active_type == lite_api::ActivationType::kRelu6) {
+  param.activation_param.Relu_clipped_coef =
+      param.activation_param.Relu_clipped_coef / param.output_scale;
+}
 /// select dw conv kernel
 if (kw == 3) {
 // trans weights
......
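
The relu6 coefficient update above applies only to the int8-output depthwise kernel. A sketch of the apparent rationale, under the assumption that these kernels fold 1/output_scale into the merged per-channel scale, so the activation runs on values already expressed in output-quantization units:

// Assumption: the int8-output path scales accumulators by
// w_scale * in_scale / output_scale before the activation, so the relu6
// threshold must be rescaled into the same domain.
inline float rescale_relu6_coef(float relu_clipped_coef, float output_scale) {
  return relu_clipped_coef / output_scale;  // e.g. 6.f becomes 6.f / output_scale
}

The direct-conv path below does the same rescaling inside direct_conv_trans_weights<PRECISION(kInt8), PRECISION(kInt8)>.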
@@ -39,7 +39,8 @@ inline bool direct_conv_trans_weights(
 const std::vector<float>& w_scale,
 float in_scale,
 float out_scale,
-std::vector<float>& merge_scale) { // NOLINT
+std::vector<float>& merge_scale, // NOLINT
+float* relu_clipped_coef) {
 constexpr int cblock = 4;
 int oc = win->dims()[0];
 int ic = win->dims()[1];
@@ -64,7 +65,8 @@ inline bool direct_conv_trans_weights<PRECISION(kInt8), PRECISION(kFloat)>(
 const std::vector<float>& w_scale,
 float in_scale,
 float out_scale,
-std::vector<float>& merge_scale) { // NOLINT
+std::vector<float>& merge_scale, // NOLINT
+float* relu_clipped_coef) {
 int cblock = 4;
 if (stride == 2) {
 cblock = lite::arm::math::conv_3x3s2_direct_int8_c_num();
@@ -103,7 +105,8 @@ inline bool direct_conv_trans_weights<PRECISION(kInt8), PRECISION(kInt8)>(
 const std::vector<float>& w_scale,
 float in_scale,
 float out_scale,
-std::vector<float>& merge_scale) { // NOLINT
+std::vector<float>& merge_scale, // NOLINT
+float* relu_clipped_coef) {
 int cblock = 4;
 if (stride == 2) {
 cblock = lite::arm::math::conv_3x3s2_direct_int8_c_num();
@@ -130,6 +133,8 @@ inline bool direct_conv_trans_weights<PRECISION(kInt8), PRECISION(kInt8)>(
 merge_scale[i] = w_scale[i] * scale;
 }
 }
+/// update relu_clipped_coef
+*relu_clipped_coef /= out_scale;
 /// update bias
 if (bin) {
 bout->Resize(bin->dims());
@@ -167,16 +172,17 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> {
 << "direct conv only support conv3x3s1 and conv3x3s2";
 CHECK(kw == 3 && kh == 3)
 << "direct conv only support conv3x3s1 and conv3x3s2";
-flag_trans_bias_ =
-direct_conv_trans_weights<Ptype, OutType>(param.filter,
-&weights_,
-param.bias,
-&bias_,
-sw,
-param.weight_scale,
-param.input_scale,
-param.output_scale,
-w_scale_);
+flag_trans_bias_ = direct_conv_trans_weights<Ptype, OutType>(
+param.filter,
+&weights_,
+param.bias,
+&bias_,
+sw,
+param.weight_scale,
+param.input_scale,
+param.output_scale,
+w_scale_,
+&param.activation_param.Relu_clipped_coef);
 }
 virtual void Run();
......
@@ -56,7 +56,7 @@ DEFINE_int32(dila_w, 1, "dilation width");
 DEFINE_bool(flag_act, true, "do act");
 DEFINE_bool(flag_bias, true, "with bias");
 DEFINE_double(clipped_coef, 1.0, "clipped relu coef");
-DEFINE_double(leakey_relu_alpha, 8.88, "leakey relu alpha");
+DEFINE_double(leakey_relu_alpha, 2.22, "leakey relu alpha");
 typedef paddle::lite::DDim DDim;
 typedef paddle::lite::Tensor Tensor;
@@ -188,7 +188,14 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
 }
 std::vector<float> scale_in{1.f / 127};
-std::vector<float> scale_out{weight_dim.count(1, 4) / 127.f};
+std::vector<float> scale_out(1, weight_dim.count(1, 4) / 127.f);
+if (flag_act == 2) {
+  scale_out[0] = six / 127.f;
+} else if (flag_act == 4) {
+  if (std::abs(alpha) > 1) {
+    scale_out[0] *= std::abs(alpha);
+  }
+}
 std::vector<float> scale_w(weight_dim[0], 1.f / 127);
 param_int8_out.input_scale = scale_in[0];
@@ -484,7 +491,7 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
 for (auto& stride : {1, 2}) {
 for (auto& pad : {0, 1}) {
 for (auto& flag_bias : {false, true}) {
-for (auto& flag_act : {0, 1}) {
+for (auto& flag_act : {0, 1, 2, 4}) {
 for (auto& c : {1, 3, 5, 8, 16, 32}) {
 std::vector<DDim> dims;
 DDim weights_dim({c, 1, 3, 3});
@@ -520,7 +527,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
 for (auto& stride : {1, 2}) {
 for (auto& pad : {0, 1, 2, 3, 4}) {
 for (auto& flag_bias : {false, true}) {
-for (auto& flag_act : {0, 1}) {
+for (auto& flag_act : {0, 1, 2, 4}) {
 for (auto& c : {1, 5, 15, 33}) {
 std::vector<DDim> dims;
 DDim weights_dim({c, 1, 5, 5});
@@ -553,7 +560,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
 #if 1 /// conv1x1s1
 TEST(TestConv1x1s1Int8, test_conv1x1s1) {
 if (FLAGS_basic_test) {
-for (auto& cin : {1, 3, 8, 32}) {
+for (auto& cin : {1, 3, 8, 33}) {
 for (auto& cout : {1, 5, 17}) {
 for (auto& g : {1, 2}) {
 for (auto& flag_bias : {false, true}) {
@@ -599,7 +606,7 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
 for (auto& pad_left : {1, 2}) {
 for (auto& pad_right : {1, 2}) {
 for (auto& flag_bias : {false, true}) {
-for (auto& flag_act : {0, 1}) {
+for (auto& flag_act : {0, 1, 2, 4}) {
 std::vector<DDim> dims;
 DDim weights_dim({cout, cin, 3, 3});
 for (auto& batch : {1, 2}) {
@@ -641,7 +648,7 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
 for (auto& pad_left : {1, 2}) {
 for (auto& pad_right : {1, 2}) {
 for (auto& flag_bias : {false, true}) {
-for (auto& flag_act : {0, 1}) {
+for (auto& flag_act : {0, 1, 2, 4}) {
 std::vector<DDim> dims;
 DDim weights_dim({cout, cin, 3, 3});
 for (auto& batch : {1, 2}) {
@@ -673,7 +680,7 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
 }
 #endif /// conv3x3s2
-#if 0 /// random param conv
+#if 1 /// random param conv
 TEST(TestConvRandInt8, test_conv_rand) {
 if (FLAGS_basic_test) {
 for (auto& cin : {1, 17}) {
......
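
In the test harness, flag_act 2 selects relu6 and 4 selects leaky relu, and the int8 output scale is adjusted so the quantized reference covers the post-activation range. A small sketch of that choice (function and parameter names are illustrative; `six` and `alpha` are the test's relu6 threshold and leaky-relu slope):

#include <cmath>

// base is the default output scale, weight_dim.count(1, 4) / 127.f in the test.
inline float pick_output_scale(int flag_act, float base, float six, float alpha) {
  if (flag_act == 2) return six / 127.f;  // relu6 output lies in [0, six]
  if (flag_act == 4 && std::abs(alpha) > 1.f) {
    return base * std::abs(alpha);        // leaky relu can widen the output range
  }
  return base;
}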